Skip to content

Commit

Permalink
add accounting for technical variation via the monte_carlo keyword in glycoDE
Browse files Browse the repository at this point in the history
  • Loading branch information
Bribak committed Jun 21, 2024
1 parent a6087f9 commit ac8c826
Show file tree
Hide file tree
Showing 3 changed files with 148 additions and 37 deletions.
57 changes: 39 additions & 18 deletions build/lib/glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@
sequence_richness, shannon_diversity_index, simpson_diversity_index,
get_equivalence_test, clr_transformation, anosim, permanova_with_permutation,
alpha_biodiversity_stats, get_additive_logratio_transformation, correct_multiple_testing,
omega_squared, get_glycoform_diff, process_glm_results, partial_corr)
omega_squared, get_glycoform_diff, process_glm_results, partial_corr, estimate_technical_variance,
perform_tests_monte_carlo)
from glycowork.motif.processing import enforce_class, process_for_glycoshift
from glycowork.motif.annotate import (annotate_dataset, quantify_motifs, link_find, create_correlation_network,
group_glycans_core, group_glycans_sia_fuc, group_glycans_N_glycan_type, load_lectin_lib,
Expand All @@ -38,7 +39,8 @@


def preprocess_data(df, group1, group2, experiment = "diff", motifs = False, feature_set = ['exhaustive', 'known'], paired = False,
impute = True, min_samples = 0.1, transform = "CLR", gamma = 0.1, custom_scale = 0, custom_motifs = []):
impute = True, min_samples = 0.1, transform = "CLR", gamma = 0.1, custom_scale = 0, custom_motifs = [],
monte_carlo = False):
"""Preprocesses data for analysis by the functions within .motif.analysis\n
| Arguments:
| :-
Expand All @@ -57,7 +59,8 @@ def preprocess_data(df, group1, group2, experiment = "diff", motifs = False, fea
| custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty
| transform (str): transformation to escape Aitchison space; options are CLR and ALR (use ALR if you have many glycans (>100) with low values); default:will be inferred
| gamma (float): uncertainty parameter to estimate scale uncertainty for CLR transformation; default: 0.1
| custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)\n
| custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)
| monte_carlo (bool): whether to account for technical variation via Monte Carlo simulations; will be slower and much more conservative; default:False\n
| Returns:
| :-
| (i) transformed and processed dataset
Expand Down Expand Up @@ -87,7 +90,11 @@ def preprocess_data(df, group1, group2, experiment = "diff", motifs = False, fea
if transform == "ALR":
df = get_additive_logratio_transformation(df, group1 if experiment == "diff" else df.columns[1:], group2, paired = paired, gamma = gamma, custom_scale = custom_scale)
elif transform == "CLR":
df.iloc[:, 1:] = clr_transformation(df.iloc[:, 1:], group1 if experiment == "diff" else df.columns[1:], group2, gamma = gamma, custom_scale = custom_scale)
if monte_carlo and not motifs:
df = pd.concat([df.iloc[:, 0], estimate_technical_variance(df.iloc[:, 1:], group1, group2,
gamma = gamma, custom_scale = custom_scale)], axis = 1)
else:
df.iloc[:, 1:] = clr_transformation(df.iloc[:, 1:], group1 if experiment == "diff" else df.columns[1:], group2, gamma = gamma, custom_scale = custom_scale)
elif transform == "Nothing":
pass
else:
Expand Down Expand Up @@ -570,7 +577,8 @@ def get_differential_expression(df, group1, group2,
motifs = False, feature_set = ['exhaustive', 'known'], paired = False,
impute = True, sets = False, set_thresh = 0.9, effect_size_variance = False,
min_samples = 0.1, grouped_BH = False, custom_motifs = [], transform = None,
gamma = 0.1, custom_scale = 0, glycoproteomics = False, level = 'peptide'):
gamma = 0.1, custom_scale = 0, glycoproteomics = False, level = 'peptide',
monte_carlo = False):
"""Calculates differentially expressed glycans or motifs from glycomics data\n
| Arguments:
| :-
Expand All @@ -594,7 +602,8 @@ def get_differential_expression(df, group1, group2,
| gamma (float): uncertainty parameter to estimate scale uncertainty for CLR transformation; default: 0.1
| custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)
| glycoproteomics (bool): whether the analyzed data in df comes from a glycoproteomics experiment; default:False
| level (string; only relevant if glycoproteomics=True): whether to analyze glycoform differential expression at the level of 'peptide' or 'protein'; default:'peptide'\n
| level (string; only relevant if glycoproteomics=True): whether to analyze glycoform differential expression at the level of 'peptide' or 'protein'; default:'peptide'
| monte_carlo (bool): whether to account for technical variation via Monte Carlo simulations; will be slower and much more conservative; default:False\n
| Returns:
| :-
| Returns a dataframe with:
Expand All @@ -611,11 +620,15 @@ def get_differential_expression(df, group1, group2,
"""
df, df_org, group1, group2 = preprocess_data(df, group1, group2, experiment = "diff", motifs = motifs, impute = impute,
min_samples = min_samples, transform = transform, feature_set = feature_set,
paired = paired, gamma = gamma, custom_scale = custom_scale, custom_motifs = custom_motifs)
paired = paired, gamma = gamma, custom_scale = custom_scale, custom_motifs = custom_motifs,
monte_carlo = monte_carlo)
# Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment
alpha = get_alphaN(len(group1+group2))
# Variance-based filtering of features
df, df_prison = variance_based_filtering(df)
if not monte_carlo:
df, df_prison = variance_based_filtering(df)
else:
df_prison = []
df_org = df_org.loc[df.index]
glycans = df.index.tolist()
mean_abundance = df_org.mean(axis = 1)
Expand Down Expand Up @@ -645,17 +658,23 @@ def get_differential_expression(df, group1, group2,
log2fc = (df_b.values - df_a.values).mean(axis = 1) if paired else (df_b.mean(axis = 1) - df_a.mean(axis = 1))
if paired:
assert len(group1) == len(group2), "For paired samples, the size of group1 and group2 should be the same"
pvals = [ttest_rel(row_b, row_a)[1] if paired else ttest_ind(row_b, row_a, equal_var = False)[1] for row_a, row_b in zip(df_a.values, df_b.values)]
equivalence_pvals = np.array([get_equivalence_test(row_a, row_b, paired = paired) if pvals[i] > 0.05 else np.nan for i, (row_a, row_b) in enumerate(zip(df_a.values, df_b.values))])
valid_equivalence_pvals = equivalence_pvals[~np.isnan(equivalence_pvals)]
corrected_equivalence_pvals = multipletests(valid_equivalence_pvals, method = 'fdr_tsbh')[1] if len(valid_equivalence_pvals) else []
equivalence_pvals[~np.isnan(equivalence_pvals)] = corrected_equivalence_pvals
equivalence_pvals[np.isnan(equivalence_pvals)] = 1.0
levene_pvals = [levene(row_b, row_a)[1] for row_a, row_b in zip(df_a.values, df_b.values)] if (df_a.shape[1] > 2 and df_b.shape[1] > 2) else [1.0]*len(df_a)
effects = [cohen_d(row_b, row_a, paired = paired) for row_a, row_b in zip(df_a.values, df_b.values)]
effect_sizes, variances = list(zip(*effects)) if effects else [[0]*len(glycans), [0]*len(glycans)]
if monte_carlo:
pvals, corrpvals, effect_sizes = perform_tests_monte_carlo(df_a, df_b, paired = paired)
significance = [cp < alpha for cp in corrpvals]
equivalence_pvals = [1.0]*len(pvals)
levene_pvals = [1.0]*len(pvals)
else:
pvals = [ttest_rel(row_b, row_a)[1] if paired else ttest_ind(row_b, row_a, equal_var = False)[1] for row_a, row_b in zip(df_a.values, df_b.values)]
equivalence_pvals = np.array([get_equivalence_test(row_a, row_b, paired = paired) if pvals[i] > 0.05 else np.nan for i, (row_a, row_b) in enumerate(zip(df_a.values, df_b.values))])
valid_equivalence_pvals = equivalence_pvals[~np.isnan(equivalence_pvals)]
corrected_equivalence_pvals = multipletests(valid_equivalence_pvals, method = 'fdr_tsbh')[1] if len(valid_equivalence_pvals) else []
equivalence_pvals[~np.isnan(equivalence_pvals)] = corrected_equivalence_pvals
equivalence_pvals[np.isnan(equivalence_pvals)] = 1.0
levene_pvals = [levene(row_b, row_a)[1] for row_a, row_b in zip(df_a.values, df_b.values)] if (df_a.shape[1] > 2 and df_b.shape[1] > 2) else [1.0]*len(df_a)
effects = [cohen_d(row_b, row_a, paired = paired) for row_a, row_b in zip(df_a.values, df_b.values)]
effect_sizes, variances = list(zip(*effects)) if effects else [[0]*len(glycans), [0]*len(glycans)]
# Multiple testing correction
if pvals:
if not monte_carlo and pvals:
if not motifs and grouped_BH:
grouped_glycans, grouped_pvals = select_grouping(df_b, df_a, glycans, pvals, paired = paired, grouped_BH = grouped_BH)
corrpvals, significance_dict = TST_grouped_benjamini_hochberg(grouped_glycans, grouped_pvals, alpha)
Expand All @@ -665,6 +684,8 @@ def get_differential_expression(df, group1, group2,
else:
corrpvals, significance = correct_multiple_testing(pvals, alpha)
levene_pvals = multipletests(levene_pvals, method = 'fdr_tsbh')[1]
elif monte_carlo:
pass
else:
corrpvals, significance = [1]*len(glycans), [False]*len(glycans)
df_out = pd.DataFrame(list(zip(glycans, mean_abundance, log2fc, pvals, corrpvals, significance, levene_pvals, effect_sizes, equivalence_pvals)),
Expand Down
71 changes: 70 additions & 1 deletion glycowork/glycan_data/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -786,7 +786,7 @@ def clr_transformation(df, group1, group2, gamma = 0.1, custom_scale = 0):
| :-
| Returns a dataframe that is CLR-transformed with scale model adjustment
"""
geometric_mean = gmean(df.replace(0, np.nan), axis = 0)
geometric_mean = gmean(df.replace(0, np.nan), axis = 0, nan_policy = 'omit')
clr_adjusted = np.zeros_like(df.values)
if gamma and not isinstance(custom_scale, dict):
group1i = [df.columns.get_loc(c) for c in group1]
Expand Down Expand Up @@ -1165,3 +1165,72 @@ def partial_corr(x, y, controls, motifs = False):
# Compute correlation of residuals
corr, pval = spearmanr(res_x, res_y)
return corr, pval


def estimate_technical_variance(df, group1, group2, num_instances = 128,
                     gamma = 0.1, custom_scale = 0):
  """Monte Carlo sampling from the Dirichlet distribution with relative abundances as concentration, followed by CLR transformation.\n
  | Arguments:
  | :-
  | df (dataframe): dataframe of abundances, with features as rows and samples as columns
  | group1 (list): list of column indices or names for the first group of samples, usually the control
  | group2 (list): list of column indices or names for the second group of samples
  | num_instances (int): Number of Monte Carlo instances to sample; default:128
  | gamma (float): uncertainty parameter to estimate scale uncertainty for CLR transformation; default: 0.1
  | custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)\n
  | Returns:
  | :-
  | Returns a transformed dataframe of shape (features, samples*num_instances) with CLR-transformed Monte Carlo Dirichlet instances.
  """
  # Rescale each sample column to a common total (5000) to serve as Dirichlet concentrations
  df = df.apply(lambda abundances: (abundances / abundances.sum()) * 5000, axis = 0)
  n_feat, n_samp = df.shape
  out = np.zeros((n_feat, n_samp, num_instances))
  for sample_idx in range(n_samp):
    draws = dirichlet.rvs(alpha = df.iloc[:, sample_idx], size = num_instances)
    # CLR-transform every Monte Carlo draw of this sample individually
    for inst_idx in range(num_instances):
      draw_df = pd.DataFrame(draws[inst_idx, :])
      out[:, sample_idx, inst_idx] = clr_transformation(draw_df, draw_df.columns.tolist(), [],
                                                        gamma = gamma, custom_scale = custom_scale).squeeze()
  # Column labels repeat each original sample name once per Monte Carlo instance,
  # matching the C-order reshape (all instances of sample 0, then sample 1, ...)
  repeated_cols = [c for c in df.columns for _ in range(num_instances)]
  return pd.DataFrame(out.reshape((n_feat, n_samp * num_instances)), columns = repeated_cols)


def perform_tests_monte_carlo(group_a, group_b, num_instances = 128, paired = False):
  """Perform tests on each Monte Carlo instance, apply Benjamini-Hochberg correction, and calculate effect sizes and variances.\n
  | Arguments:
  | :-
  | group_a (dataframe): rows as features and columns as sample instances from one condition
  | group_b (dataframe): rows as features and columns as sample instances from the other condition
  | num_instances (int): Number of Monte Carlo instances to sample; default:128
  | paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False\n
  | Returns:
  | :-
  | (i) array of uncorrected p-values, averaged across instances
  | (ii) array of corrected p-values (two-stage Benjamini-Hochberg), averaged across instances and floored at the uncorrected p-values
  | (iii) array of effect sizes (Cohen's d), averaged across instances
  """
  num_features = group_a.shape[0]
  avg_uncorrected_p_values = np.zeros(num_features)
  avg_corrected_p_values = np.zeros(num_features)
  avg_effect_sizes = np.zeros(num_features)
  for instance in range(num_instances):
    instance_p_values = []
    instance_effect_sizes = []
    for feature in range(num_features):
      # Columns hold num_instances consecutive draws per sample, so a stride of
      # num_instances picks the same Monte Carlo draw across all samples
      sample_a = group_a.iloc[feature, instance::num_instances].values
      sample_b = group_b.iloc[feature, instance::num_instances].values
      p_value = ttest_rel(sample_b, sample_a)[1] if paired else ttest_ind(sample_b, sample_a, equal_var = False)[1]
      effect_size, _ = cohen_d(sample_b, sample_a, paired = paired)
      instance_p_values.append(p_value)
      instance_effect_sizes.append(effect_size)
    # Apply two-stage Benjamini-Hochberg correction for multiple testing within the instance
    avg_uncorrected_p_values += instance_p_values
    avg_corrected_p_values += multipletests(instance_p_values, method = 'fdr_tsbh')[1]
    avg_effect_sizes += instance_effect_sizes
  avg_uncorrected_p_values /= num_instances
  avg_corrected_p_values /= num_instances
  # Averaging corrected p-values across instances can undershoot the averaged raw
  # p-values; clamp so the "corrected" values are never smaller than the uncorrected ones
  avg_corrected_p_values = np.maximum(avg_corrected_p_values, avg_uncorrected_p_values)
  avg_effect_sizes /= num_instances
  return avg_uncorrected_p_values, avg_corrected_p_values, avg_effect_sizes
Loading

0 comments on commit ac8c826

Please sign in to comment.