Skip to content

Commit

Permalink
Switch to fits based on raw counts rather than CPMS--simpler but requ…
Browse files Browse the repository at this point in the history
…ires test rewrites
  • Loading branch information
AmandaBirmingham committed Apr 24, 2024
1 parent 0670957 commit 9cb4b9d
Show file tree
Hide file tree
Showing 8 changed files with 424 additions and 875 deletions.
Binary file modified docs/absolute_quant_example.xlsx
Binary file not shown.
41 changes: 17 additions & 24 deletions pysyndna/src/calc_cell_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,7 @@
SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY = 'sequenced_sample_gdna_mass_ng'
OGU_ID_KEY = 'ogu_id'
OGU_READ_COUNT_KEY = 'ogu_read_count'
OGU_CPM_KEY = 'ogu_CPM'
LOG_10_OGU_CPM_KEY = 'log10_ogu_CPM'
LOG_10_OGU_READ_COUNT_KEY = 'log10_ogu_read_count'
OGU_PERCENT_COVERAGE_KEY = 'percent_coverage_of_ogu'
TOTAL_OGU_READS_KEY = 'total_reads_per_ogu'
LOG_10_OGU_GDNA_MASS_NG_KEY = 'log10_ogu_gdna_mass_ng'
Expand Down Expand Up @@ -365,16 +364,11 @@ def _calc_ogu_cell_counts_df_for_sample(
sample_df = working_df[
working_df[SAMPLE_ID_KEY] == sample_id].copy()

# get the total reads sequenced for this sample
sample_total_reads = per_sample_info_df.loc[
per_sample_info_df[SAMPLE_ID_KEY] == sample_id,
SAMPLE_TOTAL_READS_KEY].values[0]

# predict mass of each OGU's gDNA in this sample from its counts
# using the linear model
ogu_gdna_masses = _calc_ogu_gdna_mass_ng_series_for_sample(
sample_df, linregress_result[SLOPE_KEY],
linregress_result[INTERCEPT_KEY], sample_total_reads)
linregress_result[INTERCEPT_KEY])
sample_df[OGU_GDNA_MASS_NG_KEY] = \
sample_df[OGU_ID_KEY].map(ogu_gdna_masses)

Expand Down Expand Up @@ -413,8 +407,7 @@ def _calc_ogu_cell_counts_df_for_sample(
def _calc_ogu_gdna_mass_ng_series_for_sample(
sample_df: pd.DataFrame,
sample_linregress_slope: float,
sample_linregress_intercept: float,
sample_total_reads: int) -> pd.Series:
sample_linregress_intercept: float) -> pd.Series:

"""Calculates mass of OGU gDNA in ng for each OGU in a sample.
Expand All @@ -427,9 +420,6 @@ def _calc_ogu_gdna_mass_ng_series_for_sample(
Slope of the linear regression model for the sample.
sample_linregress_intercept: float
Intercept of the linear regression model for the sample.
sample_total_reads: int
Total number of reads for the sample (including all reads, not just
aligned ones).
Returns
-------
Expand All @@ -439,23 +429,26 @@ def _calc_ogu_gdna_mass_ng_series_for_sample(
"""
working_df = sample_df.copy()

# add a column of counts per million (CPM) for each ogu by dividing
# each read_count by the total number of reads for this sample
# and then multiplying by a million (1,000,000)
# NB: dividing int/int in python gives float
working_df[OGU_CPM_KEY] = (working_df[OGU_READ_COUNT_KEY] /
sample_total_reads) * 1000000
# NOTE that the linear regressions were originally done as described in
# the Zaramela et al notebooks, where the log10 of the CPM values were
# used as the independent variable. Later scripts by Oriane Moyne
# showed that this is not necessary and that it is equivalent to simply
# use log10 of the read counts as the independent variable (as long as it
# is used for *both* the fit and the prediction, of course!). Please see
# documentation on the _fit_linear_regression_models function in
# pysyndna/src/fit_syndna_models.py for a full description of this change.

# add column of log10(ogu CPM) by taking log base 10 of the ogu CPM column
working_df[LOG_10_OGU_CPM_KEY] = np.log10(working_df[OGU_CPM_KEY])
# add column of log10(ogu read counts)
working_df[LOG_10_OGU_READ_COUNT_KEY] = \
np.log10(working_df[OGU_READ_COUNT_KEY])

# calculate log10(ogu gdna mass) of each OGU's gDNA in this sample
# by multiplying each OGU's log10(ogu CPM) by the slope of this sample's
# regression model and adding the model's intercept.
# by multiplying each OGU's log10(ogu read count) by the slope of this
# sample's regression model and adding the model's intercept.
# NB: this requires that the linear regression models were derived
# using synDNA masses *in ng* and not in some other unit.
working_df[LOG_10_OGU_GDNA_MASS_NG_KEY] = (
working_df[LOG_10_OGU_CPM_KEY] *
working_df[LOG_10_OGU_READ_COUNT_KEY] *
sample_linregress_slope +
sample_linregress_intercept)

Expand Down
43 changes: 26 additions & 17 deletions pysyndna/src/fit_syndna_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,7 @@
SYNDNA_POOL_MASS_NG_KEY = 'mass_syndna_input_ng'
SAMPLE_TOTAL_READS_KEY = 'raw_reads_r1r2'
SYNDNA_COUNTS_KEY = 'read_count'
COUNTS_PER_MIL_KEY = 'CPM'
LOG10_COUNTS_PER_MIL_KEY = 'log10_CPM'
LOG10_SYNDNA_COUNTS_KEY = 'log10_read_count'
SYNDNA_INDIV_NG_KEY = 'syndna_ng'
LOG10_SYNDNA_INDIV_NG_KEY = 'log10_syndna_ng'
LIN_REGRESS_RESULT_KEY = 'lin_regress_by_sample_id'
Expand Down Expand Up @@ -206,15 +205,32 @@ def _fit_linear_regression_models(working_df: pd.DataFrame) -> \
This function fits a linear regression model for each sample,
predicting log10(mass of instances of a sequence) within a sample
from log10(counts per million for that sequence) within the sample,
from log10(read counts for that sequence) within the sample,
using spike-in data from synDNAs.
Note that this function originally followed the R notebooks for Zaramela
et al. and fit the log10 of the mass of the syndna in the sample to the
log10 of the counts per million of the read for that syndna in the sample.
However, later work by Oriane Moyne left out the CPMs and demonstrated
that one can achieve the same end result (of OGU mass predictions) by
fitting the log10 mass of the syndna in the sample to log10 of the read
counts themselves. (The slopes of the fits are the same in both cases,
while the intercepts differ by a constant offset because the CPM
calculation scales the read count by a constant factor--dividing by
total reads in the sample and multiplying by a million--and taking the
log10 turns that factor into an additive constant.) When one uses
the read-count fits on the raw read counts for OGUs, one gets the same
mass predictions as when using the CPM fits on the CPMs for the
OGUs. (HOWEVER, since I know it will come up: note that this is not true if
one looks at the mass predictions from the original Zaramela R notebooks,
which unintentionally used a different total counts value for the CPM
calculation in the fit than they used for the CPM calculation in the
OGU mass prediction, leading to inconsistent results.)
Parameters
----------
working_df: pd.DataFrame
Long-form dataframe containing at least SAMPLE_ID_KEY,
SYNDNA_COUNTS_KEY, SAMPLE_TOTAL_READS_KEY, and
SYNDNA_INDIV_NG_KEY columns.
SYNDNA_COUNTS_KEY, and SYNDNA_INDIV_NG_KEY columns.
Returns
-------
Expand All @@ -230,23 +246,16 @@ def _fit_linear_regression_models(working_df: pd.DataFrame) -> \
# drop any rows where the count value is 0--can't take log of 0
working_df = working_df[working_df[SYNDNA_COUNTS_KEY] > 0].copy()

# add a column of counts per million (CPM) by dividing the count value
# in each read_count by the total number of reads for its sample_id and
# then multiplying by a million (1,000,000)
working_df.loc[:, COUNTS_PER_MIL_KEY] = \
(working_df[SYNDNA_COUNTS_KEY] /
working_df[SAMPLE_TOTAL_READS_KEY]) * 1000000

# add a column of log10(CPM) by taking the log base 10 of the CPM column
working_df.loc[:, LOG10_COUNTS_PER_MIL_KEY] = \
np.log10(working_df[COUNTS_PER_MIL_KEY])
# add a column for the log10 of the syndna read count column
working_df.loc[:, LOG10_SYNDNA_COUNTS_KEY] = \
np.log10(working_df[SYNDNA_COUNTS_KEY])

# add a column for the log10 of the syndna ng column
working_df.loc[:, LOG10_SYNDNA_INDIV_NG_KEY] = \
np.log10(working_df[SYNDNA_INDIV_NG_KEY])

# loop over each sample id and fit a linear regression model predicting
# log10(dna ng) from log10(counts per million)
# log10(dna ng) from log10(syndna read counts)
linregress_by_sample_id = {}
log_msgs_list = []
for curr_sample_id in working_df[SAMPLE_ID_KEY].unique():
Expand All @@ -255,7 +264,7 @@ def _fit_linear_regression_models(working_df: pd.DataFrame) -> \

try:
curr_linregress_result = scipy.stats.linregress(
curr_sample_df[LOG10_COUNTS_PER_MIL_KEY],
curr_sample_df[LOG10_SYNDNA_COUNTS_KEY],
curr_sample_df[LOG10_SYNDNA_INDIV_NG_KEY])
except Exception:
# TODO: I need to know what kind of errors this can throw;
Expand Down
Loading

0 comments on commit 9cb4b9d

Please sign in to comment.