Skip to content

Commit

Permalink
make test_inter_vs_intra_group robust to non-paired data and data with differing sample sizes per condition
Browse files Browse the repository at this point in the history
  • Loading branch information
Bribak committed Feb 26, 2024
1 parent c75d61f commit 791175b
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 9 deletions.
7 changes: 4 additions & 3 deletions build/lib/glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,14 +477,15 @@ def get_pca(df, groups = None, motifs = False, feature_set = ['known', 'exhausti
plt.show()


def select_grouping(cohort_b, cohort_a, glycans, p_values, grouped_BH = False):
def select_grouping(cohort_b, cohort_a, glycans, p_values, paired = False, grouped_BH = False):
"""test various means of grouping glycans by domain knowledge, to obtain high intra-group correlation\n
| Arguments:
| :-
| cohort_b (dataframe): dataframe of glycans as rows and samples as columns of the case samples
| cohort_a (dataframe): dataframe of glycans as rows and samples as columns of the control samples
| glycans (list): list of glycans in IUPAC-condensed nomenclature
| p_values (list): list of associated p-values
| paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False
| grouped_BH (bool): whether to perform two-stage adaptive Benjamini-Hochberg as a grouped multiple testing correction; will SIGNIFICANTLY increase runtime; default:False\n
| Returns:
| :-
Expand All @@ -502,7 +503,7 @@ def select_grouping(cohort_b, cohort_a, glycans, p_values, grouped_BH = False):
grouped_glycans, grouped_p_values = func(glycans, p_values)
if any([len(g) < 2 for g in grouped_glycans.values()]):
continue
intra, inter = test_inter_vs_intra_group(cohort_b, cohort_a, glycans, grouped_glycans)
intra, inter = test_inter_vs_intra_group(cohort_b, cohort_a, glycans, grouped_glycans, paired = paired)
out[desc] = ((intra, inter), (grouped_glycans, grouped_p_values))
desc = list(out.keys())[np.argmax([v[0][0] - v[0][1] for k, v in out.items()])]
intra, inter = out[desc][0]
Expand Down Expand Up @@ -609,7 +610,7 @@ def get_differential_expression(df, group1, group2,
# Multiple testing correction
if pvals:
if not motifs:
grouped_glycans, grouped_pvals = select_grouping(df_b, df_a, glycans, pvals, grouped_BH = grouped_BH)
grouped_glycans, grouped_pvals = select_grouping(df_b, df_a, glycans, pvals, paired = paired, grouped_BH = grouped_BH)
corrpvals, significance_dict = TST_grouped_benjamini_hochberg(grouped_glycans, grouped_pvals, alpha)
corrpvals = [corrpvals[g] for g in glycans]
significance = [significance_dict[g] for g in glycans]
Expand Down
11 changes: 8 additions & 3 deletions glycowork/glycan_data/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,20 +594,25 @@ def TST_grouped_benjamini_hochberg(identifiers_grouped, p_values_grouped, alpha)
return adjusted_p_values, significance_dict


def test_inter_vs_intra_group(cohort_b, cohort_a, glycans, grouped_glycans):
def test_inter_vs_intra_group(cohort_b, cohort_a, glycans, grouped_glycans, paired = False):
"""estimates intra- and inter-group correlation of a given grouping of glycans via a mixed-effects model\n
| Arguments:
| :-
| cohort_b (dataframe): dataframe of glycans as rows and samples as columns of the case samples
| cohort_a (dataframe): dataframe of glycans as rows and samples as columns of the control samples
| glycans (list): list of glycans in IUPAC-condensed nomenclature
| grouped_glycans (dict): dictionary of type group : glycans\n
| grouped_glycans (dict): dictionary of type group : glycans
| paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False\n
| Returns:
| :-
| Returns floats for the intra-group and inter-group correlation
"""
reverse_lookup = {k: v for v, l in grouped_glycans.items() for k in l}
temp = pd.DataFrame(np.log2(abs((cohort_b.values + 1e-8) / (cohort_a.values + 1e-8))))
if paired:
temp = pd.DataFrame(np.log2(abs((cohort_b.values + 1e-8) / (cohort_a.values + 1e-8))))
else:
mean_cohort_a = (cohort_a.mean(axis = 1) + 1e-8).values[:, np.newaxis]
temp = pd.DataFrame(np.log2((cohort_b.values + 1e-8) / mean_cohort_a))
temp.index = glycans
temp = temp.reset_index()
# Melt the dataframe to long format
Expand Down
7 changes: 4 additions & 3 deletions glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,14 +477,15 @@ def get_pca(df, groups = None, motifs = False, feature_set = ['known', 'exhausti
plt.show()


def select_grouping(cohort_b, cohort_a, glycans, p_values, grouped_BH = False):
def select_grouping(cohort_b, cohort_a, glycans, p_values, paired = False, grouped_BH = False):
"""test various means of grouping glycans by domain knowledge, to obtain high intra-group correlation\n
| Arguments:
| :-
| cohort_b (dataframe): dataframe of glycans as rows and samples as columns of the case samples
| cohort_a (dataframe): dataframe of glycans as rows and samples as columns of the control samples
| glycans (list): list of glycans in IUPAC-condensed nomenclature
| p_values (list): list of associated p-values
| paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False
| grouped_BH (bool): whether to perform two-stage adaptive Benjamini-Hochberg as a grouped multiple testing correction; will SIGNIFICANTLY increase runtime; default:False\n
| Returns:
| :-
Expand All @@ -502,7 +503,7 @@ def select_grouping(cohort_b, cohort_a, glycans, p_values, grouped_BH = False):
grouped_glycans, grouped_p_values = func(glycans, p_values)
if any([len(g) < 2 for g in grouped_glycans.values()]):
continue
intra, inter = test_inter_vs_intra_group(cohort_b, cohort_a, glycans, grouped_glycans)
intra, inter = test_inter_vs_intra_group(cohort_b, cohort_a, glycans, grouped_glycans, paired = paired)
out[desc] = ((intra, inter), (grouped_glycans, grouped_p_values))
desc = list(out.keys())[np.argmax([v[0][0] - v[0][1] for k, v in out.items()])]
intra, inter = out[desc][0]
Expand Down Expand Up @@ -609,7 +610,7 @@ def get_differential_expression(df, group1, group2,
# Multiple testing correction
if pvals:
if not motifs:
grouped_glycans, grouped_pvals = select_grouping(df_b, df_a, glycans, pvals, grouped_BH = grouped_BH)
grouped_glycans, grouped_pvals = select_grouping(df_b, df_a, glycans, pvals, paired = paired, grouped_BH = grouped_BH)
corrpvals, significance_dict = TST_grouped_benjamini_hochberg(grouped_glycans, grouped_pvals, alpha)
corrpvals = [corrpvals[g] for g in glycans]
significance = [significance_dict[g] for g in glycans]
Expand Down

0 comments on commit 791175b

Please sign in to comment.