Skip to content

Commit

Permalink
extend get_time_series to polynomial fits
Browse files Browse the repository at this point in the history
  • Loading branch information
Bribak committed Jul 26, 2023
1 parent 118efa2 commit 53ee1fc
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 34 deletions.
43 changes: 29 additions & 14 deletions build/lib/glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import matplotlib.pyplot as plt
plt.style.use('default')
from collections import Counter
from scipy.stats import ttest_ind, ttest_rel, f, norm, levene
from scipy.stats import ttest_ind, ttest_rel, f, norm, levene, f_oneway
from statsmodels.formula.api import ols
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.multicomp import pairwise_tukeyhsd
Expand Down Expand Up @@ -835,11 +835,12 @@ def get_meta_analysis(effect_sizes, variances, model = 'fixed', filepath = '',
return combined_effect_size, p_value


def get_glycan_change_over_time(data):
def get_glycan_change_over_time(data, degree = 1):
"""Tests if the abundance of a glycan changes significantly over time using an OLS model\n
| Arguments:
| :-
| data (numpy array): a 2D numpy array with two columns (time and glycan abundance) and one row per observation\n
| data (numpy array): a 2D numpy array with two columns (time and glycan abundance) and one row per observation
| degree (int): degree of the polynomial for regression, default:1 for linear regression\n
| Returns:
| :-
| (i) slope -- the slope of the regression line (i.e., the rate of change of glycan expression over time); for degree > 1, the array of fitted polynomial coefficients (highest power first)
Expand All @@ -849,29 +850,43 @@ def get_glycan_change_over_time(data):
# Extract arrays for time and glycan abundance from the 2D input array
time = data[:, 0]
glycan_abundance = data[:, 1]

if degree == 1:
# Add a constant (for the intercept term)
time_with_intercept = sm.add_constant(time)

# Fit the OLS model
model = sm.OLS(glycan_abundance, time_with_intercept)
results = model.fit()

# Add a constant (for the intercept term)
time_with_intercept = sm.add_constant(time)
# Get the slope & the p-value for the slope from the model summary
coefficients = results.params[1]
p_value = results.pvalues[1]

# Fit the OLS model
model = sm.OLS(glycan_abundance, time_with_intercept)
results = model.fit()
else:
# Polynomial Regression
coeffs = np.polyfit(time, glycan_abundance, degree)
p = np.poly1d(coeffs)

# Get the slope & the p-value for the slope from the model summary
slope = results.params[1]
p_value = results.pvalues[1]
# Calculate the residuals
residuals = glycan_abundance - p(time)

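# Compare abundances against the fit residuals via one-way ANOVA (f_oneway) to get p_value; NOTE: this tests a difference in group means, not the significance of the polynomial terms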
_, p_value = f_oneway(glycan_abundance, residuals)
coefficients = coeffs

return slope, p_value
return coefficients, p_value


def get_time_series(df, impute = True, motifs = False, feature_set = ['known', 'exhaustive'], min_samples = None):
def get_time_series(df, impute = True, motifs = False, feature_set = ['known', 'exhaustive'], degree = 1, min_samples = None):
"""Analyzes time series data of glycans using an OLS model\n
| Arguments:
| :-
| df (dataframe): dataframe containing sample IDs of style sampleID_UnitTimepoint_replicate (e.g., T1_h5_r1) in first column and glycan relative abundances in subsequent columns
| impute (bool): replaces zeroes with draws from left-shifted distribution or KNN-Imputer; default:True
| motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False
| feature_set (list): which feature set to use for annotations, add more to list to expand; default is ['known','exhaustive']; options are: 'known' (hand-crafted glycan features), 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), and 'chemical' (molecular properties of glycan)
| degree (int): degree of the polynomial for regression, default:1 for linear regression
| min_samples (int): How many samples per group need to have non-zero values for glycan to be kept; default: at least half per group\n
| Returns:
| :-
Expand Down Expand Up @@ -906,7 +921,7 @@ def get_time_series(df, impute = True, motifs = False, feature_set = ['known', '
for c in df.columns.tolist()[1:]:
glycan_abundance = np.array(df.loc[:, c].values.tolist()) # Glycan abundances for each time point and replicate
data = np.column_stack((time, glycan_abundance))
s, p = get_glycan_change_over_time(data)
s, p = get_glycan_change_over_time(data, degree = degree)
res.append((c, s, p))
res = pd.DataFrame(res)
res.columns = ['Glycan', 'Change', 'p-val']
Expand Down
5 changes: 3 additions & 2 deletions build/lib/glycowork/motif/draw.py
Original file line number Diff line number Diff line change
Expand Up @@ -2300,10 +2300,11 @@ def annotate_figure(svg_input, scale_range = (25, 80), compact = False, glycan_s
| scale_range (tuple): tuple of two integers defining min/max glycan dim; default:(25,80)
| compact (bool): if True, draw compact glycan figures; default:False
| glycan_size (string): modify glycan size; default:'medium'; options are 'small', 'medium', 'large'
| filepath (string): absolute path including full filename allows for saving the plot
| scale_by_DE_res (df): result table from motif_analysis.get_differential_expression. Include to scale glycan figure size by -10logp
| y_thresh (float): corr p threshold for datapoints included for scaling, set to match get_differential_expression; default:0.05
| x_thresh (float): absolute x metric threshold for datapoints included for scaling, set to match get_differential_expression; default:1.0
| filepath (string): absolute path including full filename allows for saving the plot\n
| y_thresh (float): corr p threshold for datapoints included for scaling, set to match get_differential_expression; default:0.05
| x_metric (string): x-axis metric; default:'Log2FC'; options are 'Log2FC', 'Effect size'\n
| Returns:
| :-
| Modified figure svg code
Expand Down
1 change: 0 additions & 1 deletion build/lib/glycowork/motif/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,6 @@ def impute_and_normalize(df, groups, impute = True, min_samples = None):
old_cols = df.columns
df.columns = df.columns.astype(str)
if impute:
#df = replace_zero_with_random_gaussian_knn(df, [len(group) for group in groups])
mf = MissForest()
df.replace(0, np.nan, inplace = True)
df = mf.fit_transform(df)
Expand Down
43 changes: 29 additions & 14 deletions glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import matplotlib.pyplot as plt
plt.style.use('default')
from collections import Counter
from scipy.stats import ttest_ind, ttest_rel, f, norm, levene
from scipy.stats import ttest_ind, ttest_rel, f, norm, levene, f_oneway
from statsmodels.formula.api import ols
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.multicomp import pairwise_tukeyhsd
Expand Down Expand Up @@ -835,11 +835,12 @@ def get_meta_analysis(effect_sizes, variances, model = 'fixed', filepath = '',
return combined_effect_size, p_value


def get_glycan_change_over_time(data):
def get_glycan_change_over_time(data, degree = 1):
"""Tests if the abundance of a glycan changes significantly over time using an OLS model\n
| Arguments:
| :-
| data (numpy array): a 2D numpy array with two columns (time and glycan abundance) and one row per observation\n
| data (numpy array): a 2D numpy array with two columns (time and glycan abundance) and one row per observation
| degree (int): degree of the polynomial for regression, default:1 for linear regression\n
| Returns:
| :-
| (i) slope -- the slope of the regression line (i.e., the rate of change of glycan expression over time); for degree > 1, the array of fitted polynomial coefficients (highest power first)
Expand All @@ -849,29 +850,43 @@ def get_glycan_change_over_time(data):
# Extract arrays for time and glycan abundance from the 2D input array
time = data[:, 0]
glycan_abundance = data[:, 1]

if degree == 1:
# Add a constant (for the intercept term)
time_with_intercept = sm.add_constant(time)

# Fit the OLS model
model = sm.OLS(glycan_abundance, time_with_intercept)
results = model.fit()

# Add a constant (for the intercept term)
time_with_intercept = sm.add_constant(time)
# Get the slope & the p-value for the slope from the model summary
coefficients = results.params[1]
p_value = results.pvalues[1]

# Fit the OLS model
model = sm.OLS(glycan_abundance, time_with_intercept)
results = model.fit()
else:
# Polynomial Regression
coeffs = np.polyfit(time, glycan_abundance, degree)
p = np.poly1d(coeffs)

# Get the slope & the p-value for the slope from the model summary
slope = results.params[1]
p_value = results.pvalues[1]
# Calculate the residuals
residuals = glycan_abundance - p(time)

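# Compare abundances against the fit residuals via one-way ANOVA (f_oneway) to get p_value; NOTE: this tests a difference in group means, not the significance of the polynomial terms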
_, p_value = f_oneway(glycan_abundance, residuals)
coefficients = coeffs

return slope, p_value
return coefficients, p_value


def get_time_series(df, impute = True, motifs = False, feature_set = ['known', 'exhaustive'], min_samples = None):
def get_time_series(df, impute = True, motifs = False, feature_set = ['known', 'exhaustive'], degree = 1, min_samples = None):
"""Analyzes time series data of glycans using an OLS model\n
| Arguments:
| :-
| df (dataframe): dataframe containing sample IDs of style sampleID_UnitTimepoint_replicate (e.g., T1_h5_r1) in first column and glycan relative abundances in subsequent columns
| impute (bool): replaces zeroes with draws from left-shifted distribution or KNN-Imputer; default:True
| motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False
| feature_set (list): which feature set to use for annotations, add more to list to expand; default is ['known','exhaustive']; options are: 'known' (hand-crafted glycan features), 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), and 'chemical' (molecular properties of glycan)
| degree (int): degree of the polynomial for regression, default:1 for linear regression
| min_samples (int): How many samples per group need to have non-zero values for glycan to be kept; default: at least half per group\n
| Returns:
| :-
Expand Down Expand Up @@ -906,7 +921,7 @@ def get_time_series(df, impute = True, motifs = False, feature_set = ['known', '
for c in df.columns.tolist()[1:]:
glycan_abundance = np.array(df.loc[:, c].values.tolist()) # Glycan abundances for each time point and replicate
data = np.column_stack((time, glycan_abundance))
s, p = get_glycan_change_over_time(data)
s, p = get_glycan_change_over_time(data, degree = degree)
res.append((c, s, p))
res = pd.DataFrame(res)
res.columns = ['Glycan', 'Change', 'p-val']
Expand Down
5 changes: 3 additions & 2 deletions glycowork/motif/draw.py
Original file line number Diff line number Diff line change
Expand Up @@ -2300,10 +2300,11 @@ def annotate_figure(svg_input, scale_range = (25, 80), compact = False, glycan_s
| scale_range (tuple): tuple of two integers defining min/max glycan dim; default:(25,80)
| compact (bool): if True, draw compact glycan figures; default:False
| glycan_size (string): modify glycan size; default:'medium'; options are 'small', 'medium', 'large'
| filepath (string): absolute path including full filename allows for saving the plot
| scale_by_DE_res (df): result table from motif_analysis.get_differential_expression. Include to scale glycan figure size by -10logp
| y_thresh (float): corr p threshold for datapoints included for scaling, set to match get_differential_expression; default:0.05
| x_thresh (float): absolute x metric threshold for datapoints included for scaling, set to match get_differential_expression; default:1.0
| filepath (string): absolute path including full filename allows for saving the plot\n
| y_thresh (float): corr p threshold for datapoints included for scaling, set to match get_differential_expression; default:0.05
| x_metric (string): x-axis metric; default:'Log2FC'; options are 'Log2FC', 'Effect size'\n
| Returns:
| :-
| Modified figure svg code
Expand Down
1 change: 0 additions & 1 deletion glycowork/motif/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,6 @@ def impute_and_normalize(df, groups, impute = True, min_samples = None):
old_cols = df.columns
df.columns = df.columns.astype(str)
if impute:
#df = replace_zero_with_random_gaussian_knn(df, [len(group) for group in groups])
mf = MissForest()
df.replace(0, np.nan, inplace = True)
df = mf.fit_transform(df)
Expand Down

0 comments on commit 53ee1fc

Please sign in to comment.