Skip to content

Commit

Permalink
add support for multigroup custom_scale
Browse files (browse the repository at this point in the history)
  • Loading branch information
Bribak committed May 6, 2024
1 parent 3bc00d7 commit e40ed78
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 62 deletions.
40 changes: 21 additions & 19 deletions build/lib/glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def get_heatmap(df, motifs = False, feature_set = ['known'], transform = '',
df = clr_transformation(df, [], [], gamma = 0)
elif transform == "ALR":
df = df.replace(0, np.nan).dropna(thresh = np.max([np.round(rarity_filter * df.shape[0]), 1]), axis = 1).fillna(1e-6)
df = get_additive_logratio_transformation(df.reset_index(), [], df.columns.tolist(), paired = False, gamma = 0)
df = get_additive_logratio_transformation(df.reset_index(), df.columns.tolist(), [], paired = False, gamma = 0)
df = df.set_index(df.columns[0])
if motifs:
if 'custom' in feature_set and len(feature_set) == 1 and len(custom_motifs) < 2:
Expand Down Expand Up @@ -530,7 +530,7 @@ def get_differential_expression(df, group1, group2,
| custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty
| transform (str): transformation to escape Aitchison space; options are CLR and ALR (use ALR if you have many glycans (>100) with low values); default:will be inferred
| gamma (float): uncertainty parameter to estimate scale uncertainty for CLR transformation; default: 0.1
| custom_scale (float): if you *know* the difference in scale between groups, provide the ratio of group2/group1 for an informed scale model\n
| custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)\n
| Returns:
| :-
| Returns a dataframe with:
Expand Down Expand Up @@ -769,8 +769,7 @@ def get_glycanova(df, groups, impute = True, motifs = False, feature_set = ['exh
if isinstance(df, str):
df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
results, posthoc_results = [], {}
df = df.iloc[:, :len(groups)+1]
df = df.fillna(0)
df = df.iloc[:, :len(groups)+1].fillna(0)
df = df.loc[~(df.iloc[:, 1:] == 0).all(axis = 1)]
df = df.apply(replace_outliers_winsorization, axis = 1)
groups_unq = sorted(set(groups))
Expand All @@ -779,9 +778,9 @@ def get_glycanova(df, groups, impute = True, motifs = False, feature_set = ['exh
if transform is None:
transform = "ALR" if enforce_class(df.iloc[0, 0], "N") and len(df) > 50 else "CLR"
if transform == "ALR":
df = get_additive_logratio_transformation(df, [], df.columns[1:].tolist(), paired = False, gamma = gamma)
df = get_additive_logratio_transformation(df, df.columns[1:].tolist(), [], paired = False, gamma = gamma)
elif transform == "CLR":
df.iloc[:, 1:] = clr_transformation(df.iloc[:, 1:], [], df.columns[1:].tolist(), gamma = gamma)
df.iloc[:, 1:] = clr_transformation(df.iloc[:, 1:], df.columns[1:].tolist(), [], gamma = gamma)
elif transform == "Nothing":
pass
else:
Expand Down Expand Up @@ -951,9 +950,9 @@ def get_time_series(df, impute = True, motifs = False, feature_set = ['known', '
if transform is None:
transform = "ALR" if enforce_class(df.iloc[0, 0], "N") and len(df) > 50 else "CLR"
if transform == "ALR":
df = get_additive_logratio_transformation(df, [], df.columns[1:].tolist(), paired = False, gamma = gamma)
df = get_additive_logratio_transformation(df, df.columns[1:].tolist(), [], paired = False, gamma = gamma)
elif transform == "CLR":
df.iloc[:, 1:] = clr_transformation(df.iloc[:, 1:], [], df.columns[1:].tolist(), gamma = gamma)
df.iloc[:, 1:] = clr_transformation(df.iloc[:, 1:], df.columns[1:].tolist(), [], gamma = gamma)
elif transform == "Nothing":
pass
else:
Expand Down Expand Up @@ -1023,9 +1022,9 @@ def get_jtk(df_in, timepoints, periods, interval, motifs = False, feature_set =
if transform is None:
transform = "ALR" if enforce_class(df.iloc[0, 0], "N") and len(df) > 50 else "CLR"
if transform == "ALR":
df = get_additive_logratio_transformation(df, [], df.columns[1:].tolist(), paired = False, gamma = gamma)
df = get_additive_logratio_transformation(df, df.columns[1:].tolist(), [], paired = False, gamma = gamma)
elif transform == "CLR":
df.iloc[:, 1:] = clr_transformation(df.iloc[:, 1:], [], df.columns[1:].tolist(), gamma = gamma)
df.iloc[:, 1:] = clr_transformation(df.iloc[:, 1:], df.columns[1:].tolist(), [], gamma = gamma)
elif transform == "Nothing":
pass
else:
Expand Down Expand Up @@ -1132,8 +1131,6 @@ def get_biodiversity(df, group1, group2, metrics = ['alpha', 'beta'], motifs = F
if transform is None:
transform = "ALR" if enforce_class(df_org.iloc[0, 0], "N") and len(df_org) > 50 else "CLR"
if transform == "ALR":
if not group2:
group1, group2 = group2, group1
df_org = get_additive_logratio_transformation(df_org, group1, group2, paired = paired, gamma = gamma)
elif transform == "CLR":
df_org.iloc[:, 1:] = clr_transformation(df_org.iloc[:, 1:], group1, group2, gamma = gamma)
Expand Down Expand Up @@ -1212,11 +1209,11 @@ def get_SparCC(df1, df2, motifs = False, feature_set = ["known", "exhaustive"],
if transform is None:
transform = "ALR" if (enforce_class(df1.iloc[0, 0], "N") and len(df1) > 50) and (enforce_class(df2.iloc[0, 0], "N") and len(df2) > 50) else "CLR"
if transform == "ALR":
df1 = get_additive_logratio_transformation(df1, [], df1.columns[1:].tolist(), paired = False, gamma = gamma)
df2 = get_additive_logratio_transformation(df2, [], df2.columns[1:].tolist(), paired = False, gamma = gamma)
df1 = get_additive_logratio_transformation(df1, df1.columns[1:].tolist(), [], paired = False, gamma = gamma)
df2 = get_additive_logratio_transformation(df2, df2.columns[1:].tolist(), [], paired = False, gamma = gamma)
elif transform == "CLR":
df1.iloc[:, 1:] = clr_transformation(df1.iloc[:, 1:], [], df1.columns.tolist()[1:], gamma = gamma)
df2.iloc[:, 1:] = clr_transformation(df2.iloc[:, 1:], [], df2.columns.tolist()[1:], gamma = gamma)
df1.iloc[:, 1:] = clr_transformation(df1.iloc[:, 1:], df1.columns.tolist()[1:], [], gamma = gamma)
df2.iloc[:, 1:] = clr_transformation(df2.iloc[:, 1:], df2.columns.tolist()[1:], [], gamma = gamma)
elif transform == "Nothing":
pass
else:
Expand Down Expand Up @@ -1267,7 +1264,7 @@ def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["kn
| custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty
| transform (str): transformation to escape Aitchison space; options are CLR and ALR (use ALR if you have many glycans (>100) with low values); default:will be inferred
| gamma (float): uncertainty parameter to estimate scale uncertainty for CLR transformation; default: 0.1
| custom_scale (float): if you *know* the difference in scale between groups, provide the ratio of group2/group1 for an informed scale model\n
| custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)\n
| Returns:
| :-
| Returns a sorted list of tuples of type (glycan, AUC score) and, optionally, ROC curve for best feature
Expand All @@ -1288,7 +1285,10 @@ def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["kn
if transform is None:
transform = "ALR" if enforce_class(df.iloc[0, 0], "N") and len(df) > 50 else "CLR"
if transform == "ALR":
df = get_additive_logratio_transformation(df, group1, group2, paired = paired, gamma = gamma, custom_scale = custom_scale)
if group2:
df = get_additive_logratio_transformation(df, group1, group2, paired = paired, gamma = gamma, custom_scale = custom_scale)
else:
df = get_additive_logratio_transformation(df, df.columns[1:].tolist(), [], paired = paired, gamma = gamma, custom_scale = custom_scale)
elif transform == "CLR":
df.iloc[:, 1:] = clr_transformation(df.iloc[:, 1:], group1, group2, gamma = gamma, custom_scale = custom_scale)
elif transform == "Nothing":
Expand Down Expand Up @@ -1377,7 +1377,7 @@ def get_lectin_array(df, group1, group2, paired = False):
| (i) Deduced glycan motifs altered between groups
| (ii) human names for features identified in the motifs from (i)
| (iii) Lectins supporting the change in (i)
| (iv) Direction of the change (e.g., "up" means higher in group2; IGNORE THIS if you have more than two groups)
| (iv) Direction of the change (e.g., "up" means higher in group2)
| (v) Score/Magnitude of the change (remember, if you have more than two groups this reports on any pairwise combination, like an ANOVA)
| (vi) Clustering of the scores into highly/moderate/low significance findings
"""
Expand Down Expand Up @@ -1426,4 +1426,6 @@ def get_lectin_array(df, group1, group2, paired = False):
temp = annotate_dataset(df_out.iloc[:, 0], condense = True)
occurring_motifs = [temp.columns[temp.iloc[idx].astype(bool)].tolist() for idx in range(len(temp))]
df_out.insert(1, "named_motifs", occurring_motifs)
if not group2:
df_out["change"] = ["different"] * len(df_out)
return df_out
46 changes: 22 additions & 24 deletions glycowork/glycan_data/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -784,31 +784,29 @@ def clr_transformation(df, group1, group2, gamma = 0.1, custom_scale = 0):
| group1 (list): list of column indices or names for the first group of samples, usually the control
| group2 (list): list of column indices or names for the second group of samples
| gamma (float): the degree of uncertainty that the CLR assumption holds; default: 0.1
| custom_scale (float): if you *know* the difference in scale between groups, provide the ratio of group2/group1 for an informed scale model\n
| custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)\n
| Returns:
| :-
| Returns a dataframe that is CLR-transformed with scale model adjustment
"""
geometric_mean = gmean(df.replace(0, np.nan), axis = 0)
if gamma and group2:
col_list = df.columns.tolist()
group1i = [col_list.index(k) for k in group1]
group2i = [col_list.index(k) for k in group2]
case_control = [0]*len(group1) + [1]*len(group2)
clr_adjusted = np.zeros_like(df.values)
if custom_scale:
control = norm.rvs(loc = np.log2(1), scale = gamma, size = (df.shape[0], len(group1)))
clr_adjusted[:, group1i] = np.log2(df[group1]) + control
condition = norm.rvs(loc = np.log2(custom_scale), scale = gamma, size = (df.shape[0], len(group2)))
clr_adjusted[:, group2i] = np.log2(df[group2]) + condition
return pd.DataFrame(clr_adjusted, index = df.index, columns = df.columns)
clr_adjusted = np.zeros_like(df.values)
if gamma and isinstance(custom_scale, float):
group1i = [df.columns.get_loc(c) for c in group1]
group2i = [df.columns.get_loc(c) for c in group2] if group2 else group1i
geometric_mean = -np.log2(geometric_mean)
clr_adjusted[:, group1i] = np.log2(df[group1]) + geometric_mean[group1i]
observed = norm.rvs(loc = geometric_mean[group2i], scale = gamma, size = (df.shape[0], len(group2)))
clr_adjusted[:, group2i] = np.log2(df[group2]) + observed
return pd.DataFrame(clr_adjusted, index = df.index, columns = df.columns)
clr_adjusted[:, group1i] = np.log2(df[group1]) + (geometric_mean[group1i] if not custom_scale else norm.rvs(loc = np.log2(1), scale = gamma, size = (df.shape[0], len(group1))))
condition = norm.rvs(loc = geometric_mean[group2i], scale = gamma, size = (df.shape[0], len(group2))) if not custom_scale else norm.rvs(loc = np.log2(custom_scale), scale = gamma, size = (df.shape[0], len(group2)))
clr_adjusted[:, group2i] = np.log2(df[group2]) + condition
elif not group2 and isinstance(custom_scale, dict):
gamma = max(gamma, 0.1)
for idx, col in enumerate(df.columns):
group_id = group1[idx]
scale_factor = custom_scale.get(group_id, 1)
clr_adjusted[:, idx] = np.log2(df.iloc[:, idx]) + norm.rvs(loc = np.log2(scale_factor), scale = gamma)
else:
return (np.log2(df) - np.log2(geometric_mean))
clr_adjusted = np.log2(df) - np.log2(geometric_mean)
return pd.DataFrame(clr_adjusted, index = df.index, columns = df.columns)


def anosim(df, group_labels_in, permutations = 999):
Expand Down Expand Up @@ -940,18 +938,18 @@ def get_procrustes_scores(df, group1, group2, paired = False, custom_scale = 0):
| group1 (list): list of column indices or names for the first group of samples, usually the control
| group2 (list): list of column indices or names for the second group of samples
| paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False
| custom_scale (float): if you *know* the difference in scale between groups, provide the ratio of group2/group1 for an informed scale model\n
| custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)\n
| Returns:
| :-
| List of Procrustes scores (Procrustes correlation * inverse of feature variance)
"""
if isinstance(group2[0], int):
if isinstance(group1[0], int):
group1 = [df.columns.tolist()[k] for k in group1]
group2 = [df.columns.tolist()[k] for k in group2]
df = df.iloc[:, 1:]
ref_matrix = clr_transformation(df, [], [], gamma = 0, custom_scale = custom_scale)
ref_matrix = clr_transformation(df, group1, group2, gamma = 0.01, custom_scale = custom_scale)
df = np.log2(df)
if group1:
if group2:
if paired:
differences = df[group1].values - df[group2].values
variances = np.var(differences, axis = 1, ddof = 1)
Expand All @@ -960,7 +958,7 @@ def get_procrustes_scores(df, group1, group2, paired = False, custom_scale = 0):
var_group2 = df[group2].var(axis = 1)
variances = abs(var_group1 - var_group2)
else:
variances = abs(df[group2].var(axis = 1))
variances = abs(df[group1].var(axis = 1))
procrustes_corr = [1 - procrustes(ref_matrix.drop(ref_matrix.index[i]), alr_transformation(df, i))[2] for i in range(df.shape[0])]
return [a * (1/b) for a, b in zip(procrustes_corr, variances)], procrustes_corr, variances

Expand All @@ -974,7 +972,7 @@ def get_additive_logratio_transformation(df, group1, group2, paired = False, gam
| group2 (list): list of column indices or names for the second group of samples
| paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False
| gamma (float): the degree of uncertainty that the CLR assumption holds; in case of CLR; default: 0.1
| custom_scale (float): if you *know* the difference in scale between groups, provide the ratio of group2/group1 for an informed scale model\n
| custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)\n
| Returns:
| :-
| ALR-transformed dataframe
Expand Down
Loading

0 comments on commit e40ed78

Please sign in to comment.