Switch to fits based on raw counts rather than CPMs--simpler but requires test rewrites
AmandaBirmingham · Apr 24, 2024 · 9cb4b9d · 9cb4b9d
1 parent 0670957
commit 9cb4b9d
Show file tree

Hide file tree

Showing 8 changed files with 424 additions and 875 deletions.
diff --git a/docs/absolute_quant_example.xlsx b/docs/absolute_quant_example.xlsx
diff --git a/pysyndna/src/calc_cell_counts.py b/pysyndna/src/calc_cell_counts.py
@@ -31,8 +31,7 @@
 SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY = 'sequenced_sample_gdna_mass_ng'
 OGU_ID_KEY = 'ogu_id'
 OGU_READ_COUNT_KEY = 'ogu_read_count'
-OGU_CPM_KEY = 'ogu_CPM'
-LOG_10_OGU_CPM_KEY = 'log10_ogu_CPM'
+LOG_10_OGU_READ_COUNT_KEY = 'log10_ogu_read_count'
 OGU_PERCENT_COVERAGE_KEY = 'percent_coverage_of_ogu'
 TOTAL_OGU_READS_KEY = 'total_reads_per_ogu'
 LOG_10_OGU_GDNA_MASS_NG_KEY = 'log10_ogu_gdna_mass_ng'
@@ -365,16 +364,11 @@ def _calc_ogu_cell_counts_df_for_sample(
     sample_df = working_df[
         working_df[SAMPLE_ID_KEY] == sample_id].copy()
 
-    # get the total reads sequenced for this sample
-    sample_total_reads = per_sample_info_df.loc[
-        per_sample_info_df[SAMPLE_ID_KEY] == sample_id,
-        SAMPLE_TOTAL_READS_KEY].values[0]
-
     # predict mass of each OGU's gDNA in this sample from its counts
     # using the linear model
     ogu_gdna_masses = _calc_ogu_gdna_mass_ng_series_for_sample(
             sample_df, linregress_result[SLOPE_KEY],
-            linregress_result[INTERCEPT_KEY], sample_total_reads)
+            linregress_result[INTERCEPT_KEY])
     sample_df[OGU_GDNA_MASS_NG_KEY] = \
         sample_df[OGU_ID_KEY].map(ogu_gdna_masses)
 
@@ -413,8 +407,7 @@ def _calc_ogu_cell_counts_df_for_sample(
 def _calc_ogu_gdna_mass_ng_series_for_sample(
         sample_df: pd.DataFrame,
         sample_linregress_slope: float,
-        sample_linregress_intercept: float,
-        sample_total_reads: int) -> pd.Series:
+        sample_linregress_intercept: float) -> pd.Series:
 
     """Calculates mass of OGU gDNA in ng for each OGU in a sample.
 
@@ -427,9 +420,6 @@ def _calc_ogu_gdna_mass_ng_series_for_sample(
         Slope of the linear regression model for the sample.
     sample_linregress_intercept: float
         Intercept of the linear regression model for the sample.
-    sample_total_reads: int
-        Total number of reads for the sample (including all reads, not just
-        aligned ones).
 
     Returns
     -------
@@ -439,23 +429,26 @@ def _calc_ogu_gdna_mass_ng_series_for_sample(
     """
     working_df = sample_df.copy()
 
-    # add a column of counts per million (CPM) for each ogu by dividing
-    # each read_count by the total number of reads for this sample
-    # and then multiplying by a million (1,000,000)
-    # NB: dividing int/int in python gives float
-    working_df[OGU_CPM_KEY] = (working_df[OGU_READ_COUNT_KEY] /
-                               sample_total_reads) * 1000000
+    # NOTE that the linear regressions were originally done as described in
+    # the Zaramela et al notebooks, where the log10 of the CPM values were
+    # used as the independent variable.  Later scripts by Oriane Moyne
+    # showed that this is not necessary and that it is equivalent to simply
+    # use log10 of the read counts as the independent variable (as long as it
+    # is used for *both* the fit and the prediction, of course!).  Please see
+    # documentation on the src.fit_syndna_models._fit_linear_regression_models
+    # function for a full description of this change.
 
-    # add column of log10(ogu CPM) by taking log base 10 of the ogu CPM column
-    working_df[LOG_10_OGU_CPM_KEY] = np.log10(working_df[OGU_CPM_KEY])
+    # add column of log10(ogu read counts)
+    working_df[LOG_10_OGU_READ_COUNT_KEY] = \
+        np.log10(working_df[OGU_READ_COUNT_KEY])
 
     # calculate log10(ogu gdna mass) of each OGU's gDNA in this sample
-    # by multiplying each OGU's log10(ogu CPM) by the slope of this sample's
-    # regression model and adding the model's intercept.
+    # by multiplying each OGU's log10(ogu read count) by the slope of this
+    # sample's regression model and adding the model's intercept.
     # NB: this requires that the linear regression models were derived
     # using synDNA masses *in ng* and not in some other unit.
     working_df[LOG_10_OGU_GDNA_MASS_NG_KEY] = (
-            working_df[LOG_10_OGU_CPM_KEY] *
+            working_df[LOG_10_OGU_READ_COUNT_KEY] *
             sample_linregress_slope +
             sample_linregress_intercept)
 

diff --git a/pysyndna/src/fit_syndna_models.py b/pysyndna/src/fit_syndna_models.py
@@ -22,8 +22,7 @@
 SYNDNA_POOL_MASS_NG_KEY = 'mass_syndna_input_ng'
 SAMPLE_TOTAL_READS_KEY = 'raw_reads_r1r2'
 SYNDNA_COUNTS_KEY = 'read_count'
-COUNTS_PER_MIL_KEY = 'CPM'
-LOG10_COUNTS_PER_MIL_KEY = 'log10_CPM'
+LOG10_SYNDNA_COUNTS_KEY = 'log10_read_count'
 SYNDNA_INDIV_NG_KEY = 'syndna_ng'
 LOG10_SYNDNA_INDIV_NG_KEY = 'log10_syndna_ng'
 LIN_REGRESS_RESULT_KEY = 'lin_regress_by_sample_id'
@@ -206,15 +205,32 @@ def _fit_linear_regression_models(working_df: pd.DataFrame) -> \
 
     This function fits a linear regression model for each sample,
     predicting log10(mass of instances of a sequence) within a sample
-    from log10(counts per million for that sequence) within the sample,
+    from log10(read counts for that sequence) within the sample,
     using spike-in data from synDNAs.
 
+    Note that this function originally followed the R notebooks for Zaramela
+    et al. and fit the log10 of the mass of the syndna in the sample to the
+    log10 of the counts per million of the read for that syndna in the sample.
+    However, later work by Oriane Moyne left out the CPMs and demonstrated
+    that one can achieve the same end result (of OGU mass predictions) by
+    fitting the log10 mass of the syndna in the sample to log10 of the read
+    counts themselves.  (The slopes of the fits are the same in both cases,
+    while the intercepts differ by a constant offset because the CPM
+    calculation modifies the read count by a constant factor--dividing by
+    total reads in the sample and multiplying by a million.)  When one uses
+    the resulting fits on the raw read counts for OGUs, one gets the same
+    mass predictions as when using the CPM-based fits on the CPMs for the
+    OGUs. (HOWEVER, since I know it will come up: note that this is not true if
+    one looks at the mass predictions from the original Zaramela R notebooks,
+    which unintentionally used a different total counts value for the CPM
+    calculation in the fit than they used for the CPM calculation in the
+    OGU mass prediction, leading to inconsistent results.)
+
     Parameters
     ----------
     working_df: pd.DataFrame
         Long-form dataframe containing at least SAMPLE_ID_KEY,
-        SYNDNA_COUNTS_KEY, SAMPLE_TOTAL_READS_KEY, and
-        SYNDNA_INDIV_NG_KEY columns.
+        SYNDNA_COUNTS_KEY, and SYNDNA_INDIV_NG_KEY columns.
 
     Returns
     -------
@@ -230,23 +246,16 @@ def _fit_linear_regression_models(working_df: pd.DataFrame) -> \
     # drop any rows where the count value is 0--can't take log of 0
     working_df = working_df[working_df[SYNDNA_COUNTS_KEY] > 0].copy()
 
-    # add a column of counts per million (CPM) by dividing the count value
-    # in each read_count by the total number of reads for its sample_id and
-    # then multiplying by a million (1,000,000)
-    working_df.loc[:, COUNTS_PER_MIL_KEY] = \
-        (working_df[SYNDNA_COUNTS_KEY] /
-         working_df[SAMPLE_TOTAL_READS_KEY]) * 1000000
-
-    # add a column of log10(CMP) by taking the log base 10 of the CPM column
-    working_df.loc[:, LOG10_COUNTS_PER_MIL_KEY] = \
-        np.log10(working_df[COUNTS_PER_MIL_KEY])
+    # add a column for the log10 of the syndna read count column
+    working_df.loc[:, LOG10_SYNDNA_COUNTS_KEY] = \
+        np.log10(working_df[SYNDNA_COUNTS_KEY])
 
     # add a column for the log10 of the syndna ng column
     working_df.loc[:, LOG10_SYNDNA_INDIV_NG_KEY] = \
         np.log10(working_df[SYNDNA_INDIV_NG_KEY])
 
     # loop over each sample id and fit a linear regression model predicting
-    # log10(dna ng) from log10(counts per million)
+    # log10(dna ng) from log10(syndna read counts)
     linregress_by_sample_id = {}
     log_msgs_list = []
     for curr_sample_id in working_df[SAMPLE_ID_KEY].unique():
@@ -255,7 +264,7 @@ def _fit_linear_regression_models(working_df: pd.DataFrame) -> \
 
         try:
             curr_linregress_result = scipy.stats.linregress(
-                curr_sample_df[LOG10_COUNTS_PER_MIL_KEY],
+                curr_sample_df[LOG10_SYNDNA_COUNTS_KEY],
                 curr_sample_df[LOG10_SYNDNA_INDIV_NG_KEY])
         except Exception:
             # TODO: I need to know what kind of errors this can throw;