Merge pull request #3 from AmandaBirmingham/per_gram_fix

Per gram fix
AmandaBirmingham · Jan 15, 2024 · c7f64b9
2 parents 0f19e40 + de841d7
commit c7f64b9
Show file tree

Hide file tree

Showing 7 changed files with 1,001 additions and 1,280 deletions.
diff --git a/absolute_quant_example.xlsx b/absolute_quant_example.xlsx
diff --git a/pysyndna/src/calc_cell_counts.py b/pysyndna/src/calc_cell_counts.py
diff --git a/pysyndna/src/fit_syndna_models.py b/pysyndna/src/fit_syndna_models.py
@@ -131,6 +131,7 @@ def _validate_required_columns_exist(
 
     missing_cols = set(required_cols_list) - set(input_df.columns)
     if len(missing_cols) > 0:
+        missing_cols = sorted(missing_cols)
         raise ValueError(
             f"{error_msg}: {missing_cols}")
 
@@ -208,8 +209,8 @@ def fit_linear_regression_models(
 
     log_messages_list = []
 
-    # id any samples that have an inadequate total number of reads aligned
-    # to syndna (i.e. less than min_sample_counts). Don't drop yet.
+    # id any syndnas that have an inadequate total number of reads aligned
+    # to them across all samples (less than min_sample_counts). Don't drop yet.
     # Gathering this now bc it is easier while syndna id is still in the index,
     # but we want the full column set while doing the validation checks.
     # Note: synDNA author also made passing mention of dropping samples with
@@ -403,7 +404,7 @@ def _calc_indiv_syndna_weights(
     # by summing up the concentrations of each individual syndna
     total_syndna_ng_per_ul = syndna_concs_df[SYNDNA_INDIV_NG_UL_KEY].sum()
 
-    # add a column for the unitless fraction of the syndna pool made up of
+    # add a column for the fraction of the syndna pool made up of
     # each individual syndna by dividing the syndna_ng_per_uL of each
     # syndna by the total_syndna_ng_per_ul for the pool
     syndna_concs_df[SYNDNA_FRACTION_OF_POOL_KEY] = (

diff --git a/pysyndna/tests/data/modelling_output.tsv b/pysyndna/tests/data/modelling_output.tsv
@@ -1,9 +1,9 @@
 # This file is based on 
 # https://github.com/lzaramela/SynDNA/blob/main/data/saliva_linear_models.tsv
 # with the values of the a_intercept and b_intercept columns negated 
-# (because the lzaramela code generates regression models that predict the
+# (because the Zaramela code generates regression models that predict the
 # *negative* log10 of the read weight while the code under test predicts just
-# log10 of the read weight.
+# log10 of the read weight.)
 # All other columns were deleted.
 ID	a_intercept	b_slope
 A1_pool1_Fwd	-6.775395054	1.244876524

diff --git a/pysyndna/tests/data/models.yml b/pysyndna/tests/data/models.yml
@@ -1,11 +1,19 @@
-"A":
+"example1":
   "slope": 1.24487652379132
   "intercept": -6.77539505390338
   "rvalue": 0.9865030975156575
   "pvalue": 1.428443560659758e-07
   "stderr": 0.07305408550335003
   "intercept_stderr": 0.2361976278251443
-"B":
+"example2":
+  "slope": 1.24675913604407
+  "intercept": -7.155318973708384
+  "rvalue": 0.9863241797356326
+  "pvalue": 1.505381146809759e-07
+  "stderr": 0.07365795255302438
+  "intercept_stderr": 0.2563956755844754
+# example4 is a copy of example2
+"example4":
   "slope": 1.24675913604407
   "intercept": -7.155318973708384
   "rvalue": 0.9863241797356326

diff --git a/pysyndna/tests/test_calc_cell_counts.py b/pysyndna/tests/test_calc_cell_counts.py