make test_inter_vs_intra_group robust to non-paired data and data with differing sample sizes per condition
BojarLab · Feb 26, 2024 · 791175b · 791175b
1 parent c75d61f
commit 791175b
Show file tree

Hide file tree

Showing 3 changed files with 16 additions and 9 deletions.
diff --git a/build/lib/glycowork/motif/analysis.py b/build/lib/glycowork/motif/analysis.py
@@ -477,14 +477,15 @@ def get_pca(df, groups = None, motifs = False, feature_set = ['known', 'exhausti
   plt.show()
 
 
-def select_grouping(cohort_b, cohort_a, glycans, p_values, grouped_BH = False):
+def select_grouping(cohort_b, cohort_a, glycans, p_values, paired = False, grouped_BH = False):
   """test various means of grouping glycans by domain knowledge, to obtain high intra-group correlation\n
   | Arguments:
   | :-
   | cohort_b (dataframe): dataframe of glycans as rows and samples as columns of the case samples
   | cohort_a (dataframe): dataframe of glycans as rows and samples as columns of the control samples
   | glycans (list): list of glycans in IUPAC-condensed nomenclature
   | p_values (list): list of associated p-values
+  | paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False
   | grouped_BH (bool): whether to perform two-stage adaptive Benjamini-Hochberg as a grouped multiple testing correction; will SIGNIFICANTLY increase runtime; default:False\n 
   | Returns:
   | :-
@@ -502,7 +503,7 @@ def select_grouping(cohort_b, cohort_a, glycans, p_values, grouped_BH = False):
     grouped_glycans, grouped_p_values = func(glycans, p_values)
     if any([len(g) < 2 for g in grouped_glycans.values()]):
       continue
-    intra, inter = test_inter_vs_intra_group(cohort_b, cohort_a, glycans, grouped_glycans)
+    intra, inter = test_inter_vs_intra_group(cohort_b, cohort_a, glycans, grouped_glycans, paired = paired)
     out[desc] = ((intra, inter), (grouped_glycans, grouped_p_values))
   desc = list(out.keys())[np.argmax([v[0][0] - v[0][1] for k, v in out.items()])]
   intra, inter = out[desc][0]
@@ -609,7 +610,7 @@ def get_differential_expression(df, group1, group2,
   # Multiple testing correction
   if pvals:
       if not motifs:
-          grouped_glycans, grouped_pvals = select_grouping(df_b, df_a, glycans, pvals, grouped_BH = grouped_BH)
+          grouped_glycans, grouped_pvals = select_grouping(df_b, df_a, glycans, pvals, paired = paired, grouped_BH = grouped_BH)
           corrpvals, significance_dict = TST_grouped_benjamini_hochberg(grouped_glycans, grouped_pvals, alpha)
           corrpvals = [corrpvals[g] for g in glycans]
           significance = [significance_dict[g] for g in glycans]

diff --git a/glycowork/glycan_data/stats.py b/glycowork/glycan_data/stats.py
@@ -594,20 +594,25 @@ def TST_grouped_benjamini_hochberg(identifiers_grouped, p_values_grouped, alpha)
   return adjusted_p_values, significance_dict
 
 
-def test_inter_vs_intra_group(cohort_b, cohort_a, glycans, grouped_glycans):
+def test_inter_vs_intra_group(cohort_b, cohort_a, glycans, grouped_glycans, paired = False):
   """estimates intra- and inter-group correlation of a given grouping of glycans via a mixed-effects model\n
   | Arguments:
   | :-
   | cohort_b (dataframe): dataframe of glycans as rows and samples as columns of the case samples
   | cohort_a (dataframe): dataframe of glycans as rows and samples as columns of the control samples
   | glycans (list): list of glycans in IUPAC-condensed nomenclature
-  | grouped_glycans (dict): dictionary of type group : glycans\n
+  | grouped_glycans (dict): dictionary of type group : glycans
+  | paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False\n
   | Returns:
   | :-
   | Returns floats for the intra-group and inter-group correlation
   """
   reverse_lookup = {k: v for v, l in grouped_glycans.items() for k in l}
-  temp = pd.DataFrame(np.log2(abs((cohort_b.values + 1e-8) / (cohort_a.values + 1e-8))))
+  if paired:
+    temp = pd.DataFrame(np.log2(abs((cohort_b.values + 1e-8) / (cohort_a.values + 1e-8))))
+  else:
+    mean_cohort_a = (cohort_a.mean(axis = 1) + 1e-8).values[:, np.newaxis]
+    temp = pd.DataFrame(np.log2((cohort_b.values + 1e-8) / mean_cohort_a))
   temp.index = glycans
   temp = temp.reset_index()
   # Melt the dataframe to long format

diff --git a/glycowork/motif/analysis.py b/glycowork/motif/analysis.py
@@ -477,14 +477,15 @@ def get_pca(df, groups = None, motifs = False, feature_set = ['known', 'exhausti
   plt.show()
 
 
-def select_grouping(cohort_b, cohort_a, glycans, p_values, grouped_BH = False):
+def select_grouping(cohort_b, cohort_a, glycans, p_values, paired = False, grouped_BH = False):
   """test various means of grouping glycans by domain knowledge, to obtain high intra-group correlation\n
   | Arguments:
   | :-
   | cohort_b (dataframe): dataframe of glycans as rows and samples as columns of the case samples
   | cohort_a (dataframe): dataframe of glycans as rows and samples as columns of the control samples
   | glycans (list): list of glycans in IUPAC-condensed nomenclature
   | p_values (list): list of associated p-values
+  | paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False
   | grouped_BH (bool): whether to perform two-stage adaptive Benjamini-Hochberg as a grouped multiple testing correction; will SIGNIFICANTLY increase runtime; default:False\n 
   | Returns:
   | :-
@@ -502,7 +503,7 @@ def select_grouping(cohort_b, cohort_a, glycans, p_values, grouped_BH = False):
     grouped_glycans, grouped_p_values = func(glycans, p_values)
     if any([len(g) < 2 for g in grouped_glycans.values()]):
       continue
-    intra, inter = test_inter_vs_intra_group(cohort_b, cohort_a, glycans, grouped_glycans)
+    intra, inter = test_inter_vs_intra_group(cohort_b, cohort_a, glycans, grouped_glycans, paired = paired)
     out[desc] = ((intra, inter), (grouped_glycans, grouped_p_values))
   desc = list(out.keys())[np.argmax([v[0][0] - v[0][1] for k, v in out.items()])]
   intra, inter = out[desc][0]
@@ -609,7 +610,7 @@ def get_differential_expression(df, group1, group2,
   # Multiple testing correction
   if pvals:
       if not motifs:
-          grouped_glycans, grouped_pvals = select_grouping(df_b, df_a, glycans, pvals, grouped_BH = grouped_BH)
+          grouped_glycans, grouped_pvals = select_grouping(df_b, df_a, glycans, pvals, paired = paired, grouped_BH = grouped_BH)
           corrpvals, significance_dict = TST_grouped_benjamini_hochberg(grouped_glycans, grouped_pvals, alpha)
           corrpvals = [corrpvals[g] for g in glycans]
           significance = [significance_dict[g] for g in glycans]