add get_glycoshift_per_site to analyze differential glycosylation in glycoproteomics
BojarLab · Jun 17, 2024 · 6919fa5 · 6919fa5
1 parent f60c6d4
commit 6919fa5
Show file tree

Hide file tree

Showing 8 changed files with 6,275 additions and 5,451 deletions.
diff --git a/03_motif.ipynb b/03_motif.ipynb
diff --git a/build/lib/glycowork/motif/analysis.py b/build/lib/glycowork/motif/analysis.py
@@ -29,8 +29,8 @@
                                          sequence_richness, shannon_diversity_index, simpson_diversity_index,
                                          get_equivalence_test, clr_transformation, anosim, permanova_with_permutation,
                                          alpha_biodiversity_stats, get_additive_logratio_transformation, correct_multiple_testing,
-                                         omega_squared, get_glycoform_diff)
-from glycowork.motif.processing import enforce_class
+                                         omega_squared, get_glycoform_diff, process_glm_results)
+from glycowork.motif.processing import enforce_class, process_for_glycoshift
 from glycowork.motif.annotate import (annotate_dataset, quantify_motifs, link_find, create_correlation_network,
                                       group_glycans_core, group_glycans_sia_fuc, group_glycans_N_glycan_type, load_lectin_lib,
                                       create_lectin_and_motif_mappings, lectin_motif_scoring, clean_up_heatmap)
@@ -1451,3 +1451,47 @@ def get_lectin_array(df, group1, group2, paired = False, transform = ''):
   if not group2:
     df_out["change"] = ["different"] * len(df_out)
   return df_out
+
+
+def get_glycoshift_per_site(df, group1, group2, paired = False, impute = True,
+                            min_samples = 0.2, gamma = 0.1, custom_scale = 0):
+  """Calculates differences in glycosylation per glycosite between two groups of samples from glycoproteomics data\n
+  | Arguments:
+  | :-
+  | df (dataframe): dataframe containing glycopeptide identifiers (protein, site, and glycan composition, as parsed by process_for_glycoshift) in first column and relative abundances in subsequent columns [alternative: filepath to .csv or .xlsx]
+  | group1 (list): list of column indices or names for the first group of samples, usually the control
+  | group2 (list): list of column indices or names for the second group of samples
+  | paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False
+  | impute (bool): replaces zeroes with a Random Forest based model; default:True
+  | min_samples (float): Percent of the samples that need to have non-zero values for glycan to be kept; default: 20%
+  | gamma (float): uncertainty parameter to estimate scale uncertainty for CLR transformation; default: 0.1
+  | custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)\n
+  | Returns:
+  | :-
+  | Returns a dataframe with:
+  | (for each condition/interaction feature)
+  | (i) Regression coefficient from the GLM (indicating direction of change in the treatment condition)
+  | (ii) Corrected p-values (two-tailed t-test with two-stage Benjamini-Hochberg correction) for testing the coefficient against zero
+  | (iii) Significance: True/False of whether the corrected p-value lies below the sample size-appropriate significance threshold
+  """
+  df, df_org, group1, group2 = preprocess_data(df, group1, group2, experiment = "diff", motifs = False, impute = impute,
+                                               min_samples = min_samples, transform = "Nothing", paired = paired)
+  alpha = get_alphaN(len(group1+group2))
+  df, glycan_features = process_for_glycoshift(df) # potentially expand this further to infer and label high-Man/Hybrid/complex
+  # groupby below emits glycosites in sorted key order (rows keep their relative order within a site);
+  # a stable sort up front keeps the rows of preserved_data aligned with the concatenated per-site results
+  df = df.sort_values('Glycosite', kind = 'mergesort')
+  necessary_columns = ['Glycoform'] + glycan_features
+  preserved_data = df[necessary_columns]
+  df = df.drop(necessary_columns, axis = 1)
+  df = df.set_index('Glycosite')
+  # rescale every sample column so that it sums to 100 (relative abundances in percent)
+  df = df.div(df.sum(axis = 0), axis = 1) * 100
+  df = df.reset_index()
+  # CLR-transform each glycosite separately and tag the transformed rows with their glycosite
+  results = [
+        clr_transformation(group_df[group1 + group2], group1, group2, gamma = gamma, custom_scale = custom_scale)
+        .assign(Glycosite = glycosite)
+        for glycosite, group_df in df.groupby('Glycosite')
+    ]
+  df = pd.concat(results, ignore_index = True)
+  # re-attach glycoform and composition counts positionally (relies on the row order established by the sort above)
+  df = pd.concat([df, preserved_data.reset_index(drop = True)], axis = 1)
+  # long format: one row per glycoform and sample, as input for the GLM
+  df_long = pd.melt(df, id_vars = ['Glycosite', 'Glycoform'] + glycan_features,
+                  var_name = 'Sample', value_name = 'Abundance')
+  # condition is encoded as 0 for group1 samples and 1 for all other samples
+  df_long['Condition'] = df_long['Sample'].apply(lambda x: 0 if x in group1 else 1)
+  return process_glm_results(df_long, alpha, glycan_features)
diff --git a/build/lib/glycowork/motif/processing.py b/build/lib/glycowork/motif/processing.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pandas as pd
 import copy
 import re
 from functools import wraps
@@ -309,7 +310,7 @@ def canonicalize_composition(comp):
   """converts a composition from any common format into the dictionary that is optimized for glycowork\n
   | Arguments:
   | :-
-  | comp (string): composition formatted either in the style of HexNAc2Hex1Fuc3Neu5Ac1 or N2H1F3A1\n
+  | comp (string): composition formatted either in the style of Hex5HexNAc4Fuc1Neu5Ac2 or H5N4F1A2\n
   | Returns:
   | :-
   | Returns composition as a dictionary of style monosaccharide : count
@@ -321,6 +322,13 @@ def canonicalize_composition(comp):
   elif comp.isdigit():
     temp = {"Hex": int(comp[0]), "HexNAc": int(comp[1]), "Neu5Ac": int(comp[2]), "dHex": int(comp[3])}
     return {k: v for k, v in temp.items() if v}
+  elif comp[0].isdigit():
+    comp = comp.replace(' ', '')
+    if len(comp) < 5:
+      temp = {"Hex": int(comp[0]), "HexNAc": int(comp[1]), "Neu5Ac": int(comp[2]), "dHex": int(comp[3])}
+    else:
+      temp = {"Hex": int(comp[0]), "HexNAc": int(comp[1]), "Neu5Ac": int(comp[2]), "Neu5Gc": int(comp[3]), "dHex": int(comp[4])}
+    return {k: v for k, v in temp.items() if v}
   comp_dict = {}
   i = 0
   replace_dic = {"Neu5Ac": "NeuAc", "Neu5Gc": "NeuGc", '(': '', ')': '', ' ': '', '+': ''}
@@ -959,3 +967,46 @@ def equal_repeats(r1, r2):
   """
   r1_long = r1[:r1.rindex(')')+1] * 2
   return any(r1_long[i:i + len(r2)] == r2 for i in range(len(r1)))
+
+
+@rescue_compositions
+def parse_glycoform(glycoform, glycan_features = ('H', 'N', 'A', 'F', 'G')):
+  """converts composition of style H5N4F1A2 into monosaccharide counts\n
+  | Arguments:
+  | :-
+  | glycoform (string or dict): composition in the single-letter style H5N4F1A2, or an already parsed composition dictionary
+  | glycan_features (iterable): features (e.g., monosaccharide codes) to report counts for; default:('H', 'N', 'A', 'F', 'G')\n
+  | Returns:
+  | :-
+  | Returns a dictionary of style feature : count, with exactly one entry per feature in glycan_features (0 if absent)
+  """
+  if isinstance(glycoform, dict):
+    return {k: glycoform.get(k, 0) for k in glycan_features}
+  components = {c: 0 for c in glycan_features}
+  for match in re.finditer(r'([HNAFG])(\d+)', glycoform):
+    code = match.group(1)
+    # only report requested features, so that string and dict inputs yield the same set of keys
+    if code in components:
+      components[code] = int(match.group(2))
+  return components
+
+
+def process_for_glycoshift(df):
+  """extracts and formats compositions in glycoproteomics dataset\n
+  | Arguments:
+  | :-
+  | df (dataframe): glycoproteomics dataset, expects index to be formatted as protein_site_composition\n
+  | Returns:
+  | :-
+  | (i) glycoproteomics dataset (a copy of df) with new columns for protein_site, composition, and composition counts
+  | (ii) list of identified glycan features, such as different monosaccharides
+  """
+  df = df.copy() # work on a copy so that the caller's dataframe does not gain columns as a side effect
+  # NOTE(review): Glycosite joins the 1st and 3rd '_'-separated fields of each index entry - confirm this matches the actual identifier layout
+  fields = [k.split('_') for k in df.index]
+  df['Glycosite'] = [f[0] + '_' + f[2] for f in fields]
+  if '[' in df.index[0]:
+    # bracketed numeric compositions: take all integers after the first '[' and rebuild a single-letter composition string
+    comps = ['['+k.split('[')[1] for k in df.index]
+    comps = [list(map(int, re.findall(r'\d+', s))) for s in comps]
+    # the 3rd number is written as A and the 4th as F
+    df['Glycoform'] = [f'H{c[0]}N{c[1]}F{c[3]}A{c[2]}' for c in comps]
+    glycan_features = ['H', 'N', 'A', 'F', 'G']
+  else:
+    df['Glycoform'] = [canonicalize_composition(k.split('_')[-1]) for k in df.index]
+    # sorted for a reproducible order of the feature columns (set iteration order is not stable across runs)
+    glycan_features = sorted(set(unwrap([list(c.keys()) for c in df.Glycoform])))
+  org_cols = df.columns.tolist()
+  # expand the per-glycoform count dictionaries into one column per glycan feature
+  df = df.join(df['Glycoform'].apply(parse_glycoform, glycan_features = glycan_features).apply(pd.Series))
+  return df, [c for c in df.columns if c not in org_cols]