From 4010a0e845e3bf810c578536153e3ecc698fa367 Mon Sep 17 00:00:00 2001
From: Daniel Bojar
Date: Mon, 15 Apr 2024 19:08:09 +0200
Subject: [PATCH] add informed scale model

---
 build/lib/glycowork/motif/analysis.py |  8 +++++---
 glycowork/glycan_data/stats.py        | 15 +++++++++++----
 glycowork/motif/analysis.py           |  8 +++++---
 3 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/build/lib/glycowork/motif/analysis.py b/build/lib/glycowork/motif/analysis.py
index a5fc0f1..31d9fe9 100644
--- a/build/lib/glycowork/motif/analysis.py
+++ b/build/lib/glycowork/motif/analysis.py
@@ -509,7 +509,8 @@ def select_grouping(cohort_b, cohort_a, glycans, p_values, paired = False, group
 def get_differential_expression(df, group1, group2,
                                 motifs = False, feature_set = ['exhaustive', 'known'], paired = False,
                                 impute = True, sets = False, set_thresh = 0.9, effect_size_variance = False,
-                                min_samples = 0.1, grouped_BH = False, custom_motifs = [], transform = "CLR", gamma = 0.1):
+                                min_samples = 0.1, grouped_BH = False, custom_motifs = [], transform = "CLR",
+                                gamma = 0.1, custom_scale = 0):
   """Calculates differentially expressed glycans or motifs from glycomics data\n
   | Arguments:
   | :-
@@ -530,7 +531,8 @@ def get_differential_expression(df, group1, group2,
   | grouped_BH (bool): whether to perform two-stage adaptive Benjamini-Hochberg as a grouped multiple testing correction; will SIGNIFICANTLY increase runtime; default:False
   | custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty
   | transform (str): transformation to escape Aitchison space; options are CLR and ALR (use ALR if you have many glycans (>100) with low values); default:CLR
-  | gamma (float): uncertainty parameter to estimate scale uncertainty for CLR transformation; default: 0.1\n
+  | gamma (float): uncertainty parameter to estimate scale uncertainty for CLR transformation; default: 0.1
+  | custom_scale (float): if you *know* the difference in scale between groups, provide the ratio of group2/group1 for an informed scale model\n
   | Returns:
   | :-
   | Returns a dataframe with:
@@ -560,7 +562,7 @@ def get_differential_expression(df, group1, group2,
   if transform == "ALR":
     df = get_additive_logratio_transformation(df, group1, group2, paired = paired)
   elif transform == "CLR":
-    df.iloc[:, 1:] = clr_transformation(df.iloc[:, 1:], group1, group2, gamma = gamma)
+    df.iloc[:, 1:] = clr_transformation(df.iloc[:, 1:], group1, group2, gamma = gamma, custom_scale = custom_scale)
   else:
     raise ValueError("Only ALR and CLR are valid transforms for now.")
   # Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment
diff --git a/glycowork/glycan_data/stats.py b/glycowork/glycan_data/stats.py
index 70bbcc9..ebbb6e7 100644
--- a/glycowork/glycan_data/stats.py
+++ b/glycowork/glycan_data/stats.py
@@ -772,14 +772,15 @@ def get_equivalence_test(row_a, row_b, paired = False):
   return ttost_paired(row_a, row_b, low, up)[0] if paired else ttost_ind(row_a, row_b, low, up)[0]


-def clr_transformation(df, group1, group2, gamma = 0.1):
+def clr_transformation(df, group1, group2, gamma = 0.1, custom_scale = 0):
   """performs the Center Log-Ratio (CLR) Transformation with scale model adjustment\n
   | Arguments:
   | :-
   | df (dataframe): dataframe containing features in rows and samples in columns
   | group1 (list): list of column indices or names for the first group of samples, usually the control
   | group2 (list): list of column indices or names for the second group of samples
-  | gamma (float): the degree of uncertainty that the CLR assumption holds; default: 0.1\n
+  | gamma (float): the degree of uncertainty that the CLR assumption holds; default: 0.1
+  | custom_scale (float): if you *know* the difference in scale between groups, provide the ratio of group2/group1 for an informed scale model\n
   | Returns:
   | :-
   | Returns a dataframe that is CLR-transformed with scale model adjustment
@@ -791,6 +792,12 @@ def clr_transformation(df, group1, group2, gamma = 0.1):
   group2i = [col_list.index(k) for k in group2]
   case_control = [0]*len(group1) + [1]*len(group2)
   clr_adjusted = np.zeros_like(df.values)
+  if custom_scale:
+    control = norm.rvs(loc = np.log2(1), scale = gamma, size = (df.shape[0], len(group1)))
+    clr_adjusted[:, group1i] = np.log2(df[group1]) + control
+    condition = norm.rvs(loc = np.log2(custom_scale), scale = gamma, size = (df.shape[0], len(group2)))
+    clr_adjusted[:, group2i] = np.log2(df[group2]) + condition
+    return pd.DataFrame(clr_adjusted, index = df.index, columns = df.columns)
   geometric_mean = -np.log2(geometric_mean)
   clr_adjusted[:, group1i] = np.log2(df[group1]) + geometric_mean[group1i]
   observed = norm.rvs(loc = geometric_mean[group2i], scale = gamma, size = (df.shape[0], len(group2)))
@@ -991,8 +998,8 @@ def correct_multiple_testing(pvals, alpha):
   corrpvals = [p if p >= pvals[i] else pvals[i] for i, p in enumerate(corrpvals)]
   significance = [p < alpha for p in corrpvals]
   if sum(significance) > 0.9*len(significance):
-    print("Significance inflation detected. The CLR/ALR transformation cannot seem to handle this dataset. \
-          Proceed with caution; for now switching to Bonferroni correction for being conservative about this.")
+    print("Significance inflation detected. The CLR/ALR transformation cannot seem to handle this dataset.\
+Proceed with caution; for now switching to Bonferroni correction to be conservative about this.")
     corrpvals = multipletests(pvals, method = 'bonferroni')[1]
     significance = [p < alpha for p in corrpvals]
   return corrpvals, significance
diff --git a/glycowork/motif/analysis.py b/glycowork/motif/analysis.py
index a5fc0f1..31d9fe9 100644
--- a/glycowork/motif/analysis.py
+++ b/glycowork/motif/analysis.py
@@ -509,7 +509,8 @@ def select_grouping(cohort_b, cohort_a, glycans, p_values, paired = False, group
 def get_differential_expression(df, group1, group2,
                                 motifs = False, feature_set = ['exhaustive', 'known'], paired = False,
                                 impute = True, sets = False, set_thresh = 0.9, effect_size_variance = False,
-                                min_samples = 0.1, grouped_BH = False, custom_motifs = [], transform = "CLR", gamma = 0.1):
+                                min_samples = 0.1, grouped_BH = False, custom_motifs = [], transform = "CLR",
+                                gamma = 0.1, custom_scale = 0):
   """Calculates differentially expressed glycans or motifs from glycomics data\n
   | Arguments:
   | :-
@@ -530,7 +531,8 @@ def get_differential_expression(df, group1, group2,
   | grouped_BH (bool): whether to perform two-stage adaptive Benjamini-Hochberg as a grouped multiple testing correction; will SIGNIFICANTLY increase runtime; default:False
   | custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty
   | transform (str): transformation to escape Aitchison space; options are CLR and ALR (use ALR if you have many glycans (>100) with low values); default:CLR
-  | gamma (float): uncertainty parameter to estimate scale uncertainty for CLR transformation; default: 0.1\n
+  | gamma (float): uncertainty parameter to estimate scale uncertainty for CLR transformation; default: 0.1
+  | custom_scale (float): if you *know* the difference in scale between groups, provide the ratio of group2/group1 for an informed scale model\n
   | Returns:
   | :-
   | Returns a dataframe with:
@@ -560,7 +562,7 @@ def get_differential_expression(df, group1, group2,
   if transform == "ALR":
     df = get_additive_logratio_transformation(df, group1, group2, paired = paired)
   elif transform == "CLR":
-    df.iloc[:, 1:] = clr_transformation(df.iloc[:, 1:], group1, group2, gamma = gamma)
+    df.iloc[:, 1:] = clr_transformation(df.iloc[:, 1:], group1, group2, gamma = gamma, custom_scale = custom_scale)
   else:
     raise ValueError("Only ALR and CLR are valid transforms for now.")
   # Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment
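
Usage sketch (not part of the commit): the snippet below shows how the new custom_scale argument could be passed through the public API once this patch is applied. Only get_differential_expression, transform, gamma, and custom_scale come from the code above; the file name, sample column names, and the assumed 2-fold scale difference are hypothetical placeholders.

    import pandas as pd
    from glycowork.motif.analysis import get_differential_expression

    # Glycomics table: first column holds glycan identifiers, remaining columns hold
    # per-sample abundances (hypothetical file and column names).
    df = pd.read_csv("glycomics_abundances.csv")
    group1 = ["ctrl_1", "ctrl_2", "ctrl_3"]   # control samples
    group2 = ["case_1", "case_2", "case_3"]   # case samples

    # Default scale model: only gamma encodes uncertainty about the CLR assumption.
    res_default = get_differential_expression(df, group1, group2, transform = "CLR", gamma = 0.1)

    # Informed scale model: if the total glycan amount in group2 is known to be ~2x
    # that of group1, pass that ratio; clr_transformation then skips the geometric-mean
    # centering and instead shifts group2 by a draw centered at log2(custom_scale)
    # with standard deviation gamma.
    res_informed = get_differential_expression(df, group1, group2, transform = "CLR",
                                               gamma = 0.1, custom_scale = 2)

Because custom_scale defaults to 0 and the new branch only triggers on a truthy ratio, the existing data-driven scale model remains the fallback behaviour.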