extend get_time_series to polynomial fits

BojarLab · Jul 26, 2023 · 53ee1fc · 53ee1fc
1 parent 118efa2
commit 53ee1fc
Show file tree

Hide file tree

Showing 6 changed files with 64 additions and 34 deletions.
diff --git a/build/lib/glycowork/motif/analysis.py b/build/lib/glycowork/motif/analysis.py
@@ -7,7 +7,7 @@
 import matplotlib.pyplot as plt
 plt.style.use('default')
 from collections import Counter
-from scipy.stats import ttest_ind, ttest_rel, f, norm, levene
+from scipy.stats import ttest_ind, ttest_rel, f, norm, levene, f_oneway
 from statsmodels.formula.api import ols
 from statsmodels.stats.multitest import multipletests
 from statsmodels.stats.multicomp import pairwise_tukeyhsd
@@ -835,11 +835,12 @@ def get_meta_analysis(effect_sizes, variances, model = 'fixed', filepath = '',
     return combined_effect_size, p_value
 
 
-def get_glycan_change_over_time(data):
+def get_glycan_change_over_time(data, degree = 1):
     """Tests if the abundance of a glycan changes significantly over time using an OLS model\n
     | Arguments:
     | :-
-    | data (numpy array): a 2D numpy array with two columns (time and glycan abundance) and one row per observation\n
+    | data (numpy array): a 2D numpy array with two columns (time and glycan abundance) and one row per observation
+    | degree (int): degree of the polynomial for regression, default:1 for linear regression\n
     | Returns:
     | :-
     | (i) slope -- the slope of the regression line (i.e., the rate of change of glycan expression over time)
@@ -849,29 +850,43 @@ def get_glycan_change_over_time(data):
     # Extract arrays for time and glycan abundance from the 2D input array
     time = data[:, 0]
     glycan_abundance = data[:, 1]
+
+    if degree == 1:
+        # Add a constant (for the intercept term)
+        time_with_intercept = sm.add_constant(time)
+
+        # Fit the OLS model
+        model = sm.OLS(glycan_abundance, time_with_intercept)
+        results = model.fit()
 
-    # Add a constant (for the intercept term)
-    time_with_intercept = sm.add_constant(time)
+        # Get the slope & the p-value for the slope from the model summary
+        coefficients = results.params[1]
+        p_value = results.pvalues[1]
 
-    # Fit the OLS model
-    model = sm.OLS(glycan_abundance, time_with_intercept)
-    results = model.fit()
+    else:
+        # Polynomial Regression
+        coeffs = np.polyfit(time, glycan_abundance, degree)
+        p = np.poly1d(coeffs)
 
-    # Get the slope & the p-value for the slope from the model summary
-    slope = results.params[1]
-    p_value = results.pvalues[1]
+        # Calculate the residuals
+        residuals = glycan_abundance - p(time)
+
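+        # Get p_value via f_oneway (NOTE(review): this is a one-way ANOVA comparing the means of abundances vs. residuals, not a regression F-test -- confirm this is intended)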
+        _, p_value = f_oneway(glycan_abundance, residuals)
+        coefficients = coeffs
 
-    return slope, p_value
+    return coefficients, p_value
 
 
-def get_time_series(df, impute = True, motifs = False, feature_set = ['known', 'exhaustive'], min_samples = None):
+def get_time_series(df, impute = True, motifs = False, feature_set = ['known', 'exhaustive'], degree = 1, min_samples = None):
     """Analyzes time series data of glycans using an OLS model\n
     | Arguments:
     | :-
     | df (dataframe): dataframe containing sample IDs of style sampleID_UnitTimepoint_replicate (e.g., T1_h5_r1) in first column and glycan relative abundances in subsequent columns
     | impute (bool): replaces zeroes with draws from left-shifted distribution or KNN-Imputer; default:True
     | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False
     | feature_set (list): which feature set to use for annotations, add more to list to expand; default is ['exhaustive','known']; options are: 'known' (hand-crafted glycan features), 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), and 'chemical' (molecular properties of glycan)
+    | degree (int): degree of the polynomial for regression, default:1 for linear regression
     | min_samples (int): How many samples per group need to have non-zero values for glycan to be kept; default: at least half per group\n
     | Returns:
     | :-
@@ -906,7 +921,7 @@ def get_time_series(df, impute = True, motifs = False, feature_set = ['known', '
     for c in df.columns.tolist()[1:]:
         glycan_abundance = np.array(df.loc[:, c].values.tolist())  # Glycan abundances for each time point and replicate
         data = np.column_stack((time, glycan_abundance))
-        s, p = get_glycan_change_over_time(data)
+        s, p = get_glycan_change_over_time(data, degree = degree)
         res.append((c, s, p))
     res = pd.DataFrame(res)
     res.columns = ['Glycan', 'Change', 'p-val']

diff --git a/build/lib/glycowork/motif/draw.py b/build/lib/glycowork/motif/draw.py
@@ -2300,10 +2300,11 @@ def annotate_figure(svg_input, scale_range = (25, 80), compact = False, glycan_s
   | scale_range (tuple): tuple of two integers defining min/max glycan dim; default:(25,80)
   | compact (bool): if True, draw compact glycan figures; default:False
   | glycan_size (string): modify glycan size; default:'medium'; options are 'small', 'medium', 'large'
+  | filepath (string): absolute path including full filename allows for saving the plot
   | scale_by_DE_res (df): result table from motif_analysis.get_differential_expression. Include to scale glycan figure size by -10logp
-  | y_thresh (float): corr p threshhold for datapoints included for scaling, set to match get_differential_expression; default:0.05
   | x_thresh (float): absolute x metric threshold for datapoints included for scaling, set to match get_differential_expression; default:1.0
-  | filepath (string): absolute path including full filename allows for saving the plot\n
+  | y_thresh (float): corr p threshold for datapoints included for scaling, set to match get_differential_expression; default:0.05
+  | x_metric (string): x-axis metric; default:'Log2FC'; options are 'Log2FC', 'Effect size'\n
   | Returns:
   | :-
   | Modified figure svg code

diff --git a/build/lib/glycowork/motif/processing.py b/build/lib/glycowork/motif/processing.py
@@ -541,7 +541,6 @@ def impute_and_normalize(df, groups, impute = True, min_samples = None):
       old_cols = df.columns
       df.columns = df.columns.astype(str)
     if impute:
-      #df = replace_zero_with_random_gaussian_knn(df, [len(group) for group in groups])
       mf = MissForest()
       df.replace(0, np.nan, inplace = True)
       df = mf.fit_transform(df)

diff --git a/glycowork/motif/analysis.py b/glycowork/motif/analysis.py
@@ -7,7 +7,7 @@
 import matplotlib.pyplot as plt
 plt.style.use('default')
 from collections import Counter
-from scipy.stats import ttest_ind, ttest_rel, f, norm, levene
+from scipy.stats import ttest_ind, ttest_rel, f, norm, levene, f_oneway
 from statsmodels.formula.api import ols
 from statsmodels.stats.multitest import multipletests
 from statsmodels.stats.multicomp import pairwise_tukeyhsd
@@ -835,11 +835,12 @@ def get_meta_analysis(effect_sizes, variances, model = 'fixed', filepath = '',
     return combined_effect_size, p_value
 
 
-def get_glycan_change_over_time(data):
+def get_glycan_change_over_time(data, degree = 1):
     """Tests if the abundance of a glycan changes significantly over time using an OLS model\n
     | Arguments:
     | :-
-    | data (numpy array): a 2D numpy array with two columns (time and glycan abundance) and one row per observation\n
+    | data (numpy array): a 2D numpy array with two columns (time and glycan abundance) and one row per observation
+    | degree (int): degree of the polynomial for regression, default:1 for linear regression\n
     | Returns:
     | :-
     | (i) slope -- the slope of the regression line (i.e., the rate of change of glycan expression over time)
@@ -849,29 +850,43 @@ def get_glycan_change_over_time(data):
     # Extract arrays for time and glycan abundance from the 2D input array
     time = data[:, 0]
     glycan_abundance = data[:, 1]
+
+    if degree == 1:
+        # Add a constant (for the intercept term)
+        time_with_intercept = sm.add_constant(time)
+
+        # Fit the OLS model
+        model = sm.OLS(glycan_abundance, time_with_intercept)
+        results = model.fit()
 
-    # Add a constant (for the intercept term)
-    time_with_intercept = sm.add_constant(time)
+        # Get the slope & the p-value for the slope from the model summary
+        coefficients = results.params[1]
+        p_value = results.pvalues[1]
 
-    # Fit the OLS model
-    model = sm.OLS(glycan_abundance, time_with_intercept)
-    results = model.fit()
+    else:
+        # Polynomial Regression
+        coeffs = np.polyfit(time, glycan_abundance, degree)
+        p = np.poly1d(coeffs)
 
-    # Get the slope & the p-value for the slope from the model summary
-    slope = results.params[1]
-    p_value = results.pvalues[1]
+        # Calculate the residuals
+        residuals = glycan_abundance - p(time)
+
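+        # Get p_value via f_oneway (NOTE(review): this is a one-way ANOVA comparing the means of abundances vs. residuals, not a regression F-test -- confirm this is intended)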
+        _, p_value = f_oneway(glycan_abundance, residuals)
+        coefficients = coeffs
 
-    return slope, p_value
+    return coefficients, p_value
 
 
-def get_time_series(df, impute = True, motifs = False, feature_set = ['known', 'exhaustive'], min_samples = None):
+def get_time_series(df, impute = True, motifs = False, feature_set = ['known', 'exhaustive'], degree = 1, min_samples = None):
     """Analyzes time series data of glycans using an OLS model\n
     | Arguments:
     | :-
     | df (dataframe): dataframe containing sample IDs of style sampleID_UnitTimepoint_replicate (e.g., T1_h5_r1) in first column and glycan relative abundances in subsequent columns
     | impute (bool): replaces zeroes with draws from left-shifted distribution or KNN-Imputer; default:True
     | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False
     | feature_set (list): which feature set to use for annotations, add more to list to expand; default is ['exhaustive','known']; options are: 'known' (hand-crafted glycan features), 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), and 'chemical' (molecular properties of glycan)
+    | degree (int): degree of the polynomial for regression, default:1 for linear regression
     | min_samples (int): How many samples per group need to have non-zero values for glycan to be kept; default: at least half per group\n
     | Returns:
     | :-
@@ -906,7 +921,7 @@ def get_time_series(df, impute = True, motifs = False, feature_set = ['known', '
     for c in df.columns.tolist()[1:]:
         glycan_abundance = np.array(df.loc[:, c].values.tolist())  # Glycan abundances for each time point and replicate
         data = np.column_stack((time, glycan_abundance))
-        s, p = get_glycan_change_over_time(data)
+        s, p = get_glycan_change_over_time(data, degree = degree)
         res.append((c, s, p))
     res = pd.DataFrame(res)
     res.columns = ['Glycan', 'Change', 'p-val']

diff --git a/glycowork/motif/draw.py b/glycowork/motif/draw.py
@@ -2300,10 +2300,11 @@ def annotate_figure(svg_input, scale_range = (25, 80), compact = False, glycan_s
   | scale_range (tuple): tuple of two integers defining min/max glycan dim; default:(25,80)
   | compact (bool): if True, draw compact glycan figures; default:False
   | glycan_size (string): modify glycan size; default:'medium'; options are 'small', 'medium', 'large'
+  | filepath (string): absolute path including full filename allows for saving the plot
   | scale_by_DE_res (df): result table from motif_analysis.get_differential_expression. Include to scale glycan figure size by -10logp
-  | y_thresh (float): corr p threshhold for datapoints included for scaling, set to match get_differential_expression; default:0.05
   | x_thresh (float): absolute x metric threshold for datapoints included for scaling, set to match get_differential_expression; default:1.0
-  | filepath (string): absolute path including full filename allows for saving the plot\n
+  | y_thresh (float): corr p threshold for datapoints included for scaling, set to match get_differential_expression; default:0.05
+  | x_metric (string): x-axis metric; default:'Log2FC'; options are 'Log2FC', 'Effect size'\n
   | Returns:
   | :-
   | Modified figure svg code

diff --git a/glycowork/motif/processing.py b/glycowork/motif/processing.py
@@ -541,7 +541,6 @@ def impute_and_normalize(df, groups, impute = True, min_samples = None):
       old_cols = df.columns
       df.columns = df.columns.astype(str)
     if impute:
-      #df = replace_zero_with_random_gaussian_knn(df, [len(group) for group in groups])
       mf = MissForest()
       df.replace(0, np.nan, inplace = True)
       df = mf.fit_transform(df)