diff --git a/absolute_quant_example.xlsx b/docs/absolute_quant_example.xlsx similarity index 100% rename from absolute_quant_example.xlsx rename to docs/absolute_quant_example.xlsx diff --git a/docs/rna_copy_quant_example.xlsx b/docs/rna_copy_quant_example.xlsx new file mode 100644 index 0000000..256df07 Binary files /dev/null and b/docs/rna_copy_quant_example.xlsx differ diff --git a/docs/rna_copy_quant_workflow.pdf b/docs/rna_copy_quant_workflow.pdf new file mode 100644 index 0000000..9956aad Binary files /dev/null and b/docs/rna_copy_quant_workflow.pdf differ diff --git a/pysyndna/__init__.py b/pysyndna/__init__.py index 5e98a98..d0146c9 100644 --- a/pysyndna/__init__.py +++ b/pysyndna/__init__.py @@ -2,11 +2,16 @@ fit_linear_regression_models_for_qiita from pysyndna.src.calc_cell_counts import calc_ogu_cell_counts_biom, \ calc_ogu_cell_counts_per_g_of_sample_for_qiita +from pysyndna.src.quant_orfs import \ + calc_copies_of_ogu_orf_ssrna_per_g_sample, \ + calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita __all__ = ['fit_linear_regression_models', 'fit_linear_regression_models_for_qiita', 'calc_ogu_cell_counts_biom', - 'calc_ogu_cell_counts_per_g_of_sample_for_qiita'] + 'calc_ogu_cell_counts_per_g_of_sample_for_qiita', + 'calc_copies_of_ogu_orf_ssrna_per_g_sample', + 'calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita'] from . import _version __version__ = _version.get_versions()['version'] diff --git a/pysyndna/src/calc_cell_counts.py b/pysyndna/src/calc_cell_counts.py index 1f97bf5..46d7db6 100644 --- a/pysyndna/src/calc_cell_counts.py +++ b/pysyndna/src/calc_cell_counts.py @@ -1,13 +1,19 @@ -from __future__ import annotations - import biom import numpy as np import pandas as pd import yaml -from typing import Optional - -from pysyndna.src.fit_syndna_models import SAMPLE_ID_KEY, \ - SYNDNA_POOL_MASS_NG_KEY, _validate_required_columns_exist +from typing import Optional, Union, Dict, List +from pysyndna.src.util import calc_copies_genomic_element_per_g_series, \ + calc_gs_genomic_element_in_aliquot, \ + validate_required_columns_exist, \ + validate_metadata_vs_reads_id_consistency, \ + validate_metadata_vs_prep_id_consistency, \ + DNA_BASEPAIR_G_PER_MOLE, NANOGRAMS_PER_GRAM, \ + SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, ELUTE_VOL_UL_KEY, \ + REQUIRED_SAMPLE_INFO_KEYS + +from pysyndna.src.fit_syndna_models import SYNDNA_POOL_MASS_NG_KEY, \ + SLOPE_KEY, INTERCEPT_KEY, SAMPLE_TOTAL_READS_KEY DEFAULT_SYNDNA_MASS_FRACTION_OF_SAMPLE = 0.05 DEFAULT_READ_LENGTH = 150 @@ -18,8 +24,6 @@ CELL_COUNT_LOG_KEY = 'calc_cell_counts_log' GDNA_CONCENTRATION_NG_UL_KEY = 'extracted_gdna_concentration_ng_ul' -SAMPLE_IN_ALIQUOT_MASS_G_KEY = 'calc_mass_sample_aliquot_input_g' -ELUTE_VOL_UL_KEY = 'vol_extracted_elution_ul' GDNA_FROM_ALIQUOT_MASS_G_KEY = 'extracted_gdna_concentration_g' # NB: below is NOT the full mass of gDNA extracted from the sample, but # ONLY the mass of gDNA that was put into sequencing. 
This mass should @@ -41,209 +45,8 @@ # (NOT limited to the amount of gDNA that was put into sequencing, unlike # SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY) GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY = 'gdna_mass_to_sample_mass_ratio' - - -def calc_ogu_cell_counts_per_g_of_sample_for_qiita( - sample_info_df: pd.DataFrame, - prep_info_df: pd.DataFrame, - linregress_by_sample_id_fp: str, - ogu_counts_per_sample_biom: biom.Table, - ogu_lengths_fp: str, - read_length: int = DEFAULT_READ_LENGTH, - min_coverage: float = DEFAULT_MIN_COVERAGE, - min_rsquared: float = DEFAULT_MIN_RSQUARED, - syndna_mass_fraction_of_sample: float = - DEFAULT_SYNDNA_MASS_FRACTION_OF_SAMPLE) \ - -> dict[str, str | biom.Table]: - - """Gets # of cells of each OGU/g of sample for samples from Qiita. - - Parameters - ---------- - sample_info_df: pd.DataFrame - Dataframe containing sample info for all samples in the prep, - including SAMPLE_ID_KEY and SAMPLE_IN_ALIQUOT_MASS_G_KEY - prep_info_df: pd.DataFrame - Dataframe containing prep info for all samples in the prep, - including SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, - ELUTE_VOL_UL_KEY, and SYNDNA_POOL_MASS_NG_KEY. - linregress_by_sample_id_fp: str - String containing the filepath to the yaml file holding the - dictionary keyed by sample id, containing for each sample a dictionary - representation of the sample's LinregressResult. - ogu_counts_per_sample_biom: biom.Table - Biom table holding the read counts aligned to each OGU in each sample. - ogu_lengths_fp : str - String containing the filepath to a tab-separated, two-column, - no-header file in which the first column is the OGU id and the - second is the OGU length in basepairs - read_length : int - Length of reads in bp (usually but not always 150). - min_coverage : float - Minimum allowable coverage of an OGU needed to include that OGU - in the output. - min_rsquared: float - Minimum allowable R^2 value for the linear regression model for a - sample; any sample with an R^2 value less than this will be excluded - from the output. - syndna_mass_fraction_of_sample: float - Fraction of the mass of the sample that is added as syndna (usually - 0.05, which is to say 5%). - - Returns - ------- - output_by_out_type : dict of str or biom.Table - Dictionary of outputs keyed by their type Currently, the following keys - are defined: - CELL_COUNT_RESULT_KEY: biom.Table holding the calculated number of - cells per gram of sample material for each OGU in each sample. - CELL_COUNT_LOG_KEY: log of messages from the cell count calc process. - """ - - # check if the inputs all have the required columns - required_sample_info_cols = [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY] - _validate_required_columns_exist( - sample_info_df, required_sample_info_cols, - "sample info is missing required column(s)") - - required_prep_info_cols = [SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, - ELUTE_VOL_UL_KEY, SYNDNA_POOL_MASS_NG_KEY] - _validate_required_columns_exist( - prep_info_df, required_prep_info_cols, - "prep info is missing required column(s)") - - # calculate the mass of gDNA sequenced for each sample. We have the - # mass of syndna pool that was added to each sample, and we know that the - # syndna pool mass is calculated to be a certain percentage of the mass of - # the sample (added into the library prep in addition to the sample mass). - # Therefore, if the syndna fraction is 0.05 or 5%, the mass of the sample - # gDNA put into sequencing is 1/0.05 = 20x the mass of syndna pool added. 
- prep_info_df[SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY] = \ - prep_info_df[SYNDNA_POOL_MASS_NG_KEY] * \ - (1 / syndna_mass_fraction_of_sample) - - # merge the sample info and prep info dataframes - absolute_quant_params_per_sample_df = \ - sample_info_df.merge(prep_info_df, on=SAMPLE_ID_KEY, how='left') - - # read in the linregress_by_sample_id yaml file - with open(linregress_by_sample_id_fp) as f: - linregress_by_sample_id = yaml.load(f, Loader=yaml.FullLoader) - - # read in the ogu_lengths file - ogu_lengths_df = pd.read_csv(ogu_lengths_fp, sep='\t', header=None, - names=[OGU_ID_KEY, OGU_LEN_IN_BP_KEY]) - - # calculate # cells per gram of sample material of each OGU in each sample - output_biom, log_msgs_list = calc_ogu_cell_counts_biom( - absolute_quant_params_per_sample_df, linregress_by_sample_id, - ogu_counts_per_sample_biom, ogu_lengths_df, read_length, min_coverage, - min_rsquared, OGU_CELLS_PER_G_OF_SAMPLE_KEY) - - out_txt_by_out_type = { - CELL_COUNT_RESULT_KEY: output_biom, - CELL_COUNT_LOG_KEY: '\n'.join(log_msgs_list)} - - return out_txt_by_out_type - - -def calc_ogu_cell_counts_biom( - absolute_quant_params_per_sample_df: pd.DataFrame, - linregress_by_sample_id: dict[str, dict[str, float]], - ogu_counts_per_sample_biom: biom.Table, - ogu_lengths_df: pd.DataFrame, - read_length: int, - min_coverage: float, - min_rsquared: float, - output_cell_counts_metric: str) -> (biom.Table, list[str]): - - """Calcs input cell count metric for each ogu & sample via linear models. - - Parameters - ---------- - absolute_quant_params_per_sample_df: pd.DataFrame - Dataframe of at least SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, - SAMPLE_IN_ALIQUOT_MASS_G_KEY, ELUTE_VOL_UL_KEY, and - SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY for each sample. - linregress_by_sample_id : dict[str, dict[str: float]] - Dictionary keyed by sample id, containing for each sample either None - (if no model could be trained for that SAMPLE_ID_KEY) or a dictionary - representation of the sample's LinregressResult. - ogu_counts_per_sample_biom: biom.Table - Biom table holding the read counts aligned to each OGU in each sample. - ogu_lengths_df : pd.DataFrame - Dataframe of OGU_ID_KEY and OGU_LEN_IN_BP_KEY for each OGU. - read_length : int - Length of reads in bp (usually but not always 150). - min_coverage : float - Minimum allowable coverage of an OGU needed to include that OGU - in the output. - min_rsquared: float - Minimum allowable R^2 value for the linear regression model for a - sample; any sample with an R^2 value less than this will be excluded - from the output. - output_cell_counts_metric : str - Name of the desired output cell count metric; options are - OGU_CELLS_PER_G_OF_GDNA_KEY and OGU_CELLS_PER_G_OF_SAMPLE_KEY. - - Returns - ------- - ogu_cell_counts_biom : biom.Table - Dataframe with a column for OGU_ID_KEY and then one additional column - for each sample id, which holds the predicted number of cells per gram - of sample material of that OGU in that sample. - log_messages_list : list[str] - List of strings containing log messages generated by this function. 
- """ - - working_params_df = absolute_quant_params_per_sample_df.copy() - - # cast the GDNA_CONCENTRATION_NG_UL_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, - # ELUTE_VOL_UL_KEY, and SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY columns of - # params df to float if they aren't already - for col in [GDNA_CONCENTRATION_NG_UL_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, - ELUTE_VOL_UL_KEY, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY]: - if working_params_df[col].dtype != float: - working_params_df[col] = \ - working_params_df[col].astype(float) - - # calculate the ratio of extracted gDNA mass to sample mass put into - # extraction for each sample - gdna_mass_to_sample_mass_by_sample_series = \ - _calc_gdna_mass_to_sample_mass_by_sample_df(working_params_df) - per_sample_mass_info_df = _series_to_df( - gdna_mass_to_sample_mass_by_sample_series, SAMPLE_ID_KEY, - GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY) - - # merge only the SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY column of - # working_params_df into gdna_mass_to_sample_mass_df by SAMPLE_ID_KEY - per_sample_mass_info_df = per_sample_mass_info_df.merge( - working_params_df[[SAMPLE_ID_KEY, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY]], - on=SAMPLE_ID_KEY, how='left') - - # convert input biom table to a dataframe with sparse columns, which - # should act basically the same as a dense dataframe but use less memory - ogu_counts_per_sample_df = ogu_counts_per_sample_biom.to_dataframe( - dense=False) - - ogu_cell_counts_long_format_df, log_msgs_list = ( - _calc_long_format_ogu_cell_counts_df( - linregress_by_sample_id, ogu_counts_per_sample_df, - ogu_lengths_df, per_sample_mass_info_df, read_length, - min_coverage, min_rsquared)) - - ogu_cell_counts_wide_format_df = ogu_cell_counts_long_format_df.pivot( - index=OGU_ID_KEY, columns=SAMPLE_ID_KEY)[output_cell_counts_metric] - - # convert dataframe to biom table; input params are - # data (the "output_cell_count_metric"s), observation_ids (the "ogu_id"s), - # and sample_ids (er, the "sample_id"s) - ogu_cell_counts_biom = biom.Table( - ogu_cell_counts_wide_format_df.values, - ogu_cell_counts_wide_format_df.index, - ogu_cell_counts_wide_format_df.columns) - - return ogu_cell_counts_biom, log_msgs_list +REQUIRED_DNA_PREP_INFO_KEYS = [SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, + ELUTE_VOL_UL_KEY, SAMPLE_TOTAL_READS_KEY] def _calc_gdna_mass_to_sample_mass_by_sample_df( @@ -258,26 +61,22 @@ def _calc_gdna_mass_to_sample_mass_by_sample_df( Parameters ---------- absolute_quant_params_per_sample_df: pd.DataFrame - Dataframe of at least SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, + A Dataframe of at least SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, and ELUTE_VOL_UL_KEY for each sample. Returns ------- gdna_mass_to_sample_mass_by_sample_series : pd.Series - Series with index of sample id and values of the ratio of gDNA mass + A Series with index of sample id and values of the ratio of gDNA mass units extracted from each mass unit of input sample (only) mass. 
""" - working_df = absolute_quant_params_per_sample_df.copy() - # get the total grams of gDNA that are in the elute after extraction; - # this is sample-specific: - # concentration of gDNA after extraction in ng/uL times volume of elute - # from the extraction in uL, times 1/10^9 g/ng - working_df[GDNA_FROM_ALIQUOT_MASS_G_KEY] = \ - working_df[GDNA_CONCENTRATION_NG_UL_KEY] * \ - working_df[ELUTE_VOL_UL_KEY] / 10 ** 9 + # this is sample-specific + working_df = calc_gs_genomic_element_in_aliquot( + absolute_quant_params_per_sample_df, GDNA_CONCENTRATION_NG_UL_KEY, + GDNA_FROM_ALIQUOT_MASS_G_KEY) # determine how many mass units of gDNA are produced from the extraction of # each mass unit of sample material; this is sample-specific: @@ -299,7 +98,7 @@ def _series_to_df(a_series, index_col_name, val_col_name): Parameters ---------- a_series : pd.Series - Series to be converted to a dataframe. + A Series to be converted to a dataframe. index_col_name : str Name of the index-derived in the resulting dataframe. val_col_name : str @@ -308,7 +107,7 @@ def _series_to_df(a_series, index_col_name, val_col_name): Returns ------- a_df : pd.DataFrame - Dataframe with two columns, one from the index and one containing the + A Dataframe with two columns, one from the index and one containing the values from the input series. """ @@ -319,13 +118,13 @@ def _series_to_df(a_series, index_col_name, val_col_name): def _calc_long_format_ogu_cell_counts_df( - linregress_by_sample_id: dict[str, dict[str, float]], + linregress_by_sample_id: Dict[str, Dict[str, float]], ogu_counts_per_sample_df: pd.DataFrame, ogu_lengths_df: pd.DataFrame, - per_sample_mass_info_df: pd.DataFrame, + per_sample_calc_info_df: pd.DataFrame, read_length: int, min_coverage: float, - min_rsquared: float) -> (pd.DataFrame | None, list[str]): + min_rsquared: float) -> (Union[pd.DataFrame, None], List[str]): """Predicts the # of cells of each OGU in each sample from the read counts. @@ -336,14 +135,15 @@ def _calc_long_format_ogu_cell_counts_df( (if no model could be trained for that SAMPLE_ID_KEY) or a dictionary representation of the sample's LinregressResult. ogu_counts_per_sample_df: pd.DataFrame - Dataframe with a column for OGU_ID_KEY and then one additional column + A Dataframe with a column for OGU_ID_KEY and then one additional column for each sample id, which holds the read counts aligned to that OGU in that sample. ogu_lengths_df : pd.DataFrame - Dataframe of OGU_ID_KEY and OGU_LEN_IN_BP_KEY for each OGU. - per_sample_mass_info_df : pd.DataFrame - Dataframe of SAMPLE_ID_KEY, GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY, and - SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY for each sample. + A Dataframe of OGU_ID_KEY and OGU_LEN_IN_BP_KEY for each OGU. + per_sample_calc_info_df : pd.DataFrame + A Dataframe of SAMPLE_ID_KEY, GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY, + SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY, and SAMPLE_TOTAL_READS_KEY + for each sample. read_length : int Length of reads in bp (usually but not always 150). 
    min_coverage : float
@@ -381,11 +181,14 @@ def _calc_long_format_ogu_cell_counts_df(
         # gDNA in this sample and also per gram of stool in this sample
         curr_sample_df, curr_log_msgs = _calc_ogu_cell_counts_df_for_sample(
             curr_sample_id, linregress_by_sample_id,
-            per_sample_mass_info_df, working_df, min_rsquared)
+            per_sample_calc_info_df, working_df, min_rsquared)
         log_messages_list.extend(curr_log_msgs)
         if curr_sample_df is None:
             log_messages_list.append(f"No cell counts calculated for "
                                      f"sample {curr_sample_id}")
+
+            # NB: if no cell counts were calculated for this sample,
+            # this sample is left out of the final cell_counts_df.
             continue
 
         # if cell_counts_df does not yet exist, create it from curr_sample_df;
@@ -407,7 +210,7 @@ def _prepare_cell_counts_calc_df(
         ogu_counts_per_sample_df: pd.DataFrame,
         ogu_lengths_df: pd.DataFrame,
         read_length: int,
-        min_coverage: float) -> (pd.DataFrame, list[str]):
+        min_coverage: float) -> (pd.DataFrame, List[str]):
 
     """Prepares long-format dataframe containing fields needed for later calcs.
 
@@ -418,7 +221,7 @@ def _prepare_cell_counts_calc_df(
         column for each sample id, which holds the read counts aligned to that
         OGU in that sample.
     ogu_lengths_df : pd.DataFrame
-        Dataframe of OGU_ID_KEY and OGU_LEN_IN_BP_KEY for each OGU.
+        A Dataframe of OGU_ID_KEY and OGU_LEN_IN_BP_KEY for each OGU.
     read_length : int
         Length of reads in bp (usually but not always 150).
     min_coverage : float
@@ -486,11 +289,12 @@
 
 def _calc_ogu_cell_counts_df_for_sample(
         sample_id: str,
-        linregress_by_sample_id: dict[str, dict[str, float]],
-        per_sample_mass_info_df: pd.DataFrame,
+        linregress_by_sample_id: Dict[str, Dict[str, float]],
+        per_sample_info_df: pd.DataFrame,
         working_df: pd.DataFrame,
         min_rsquared: float,
-        is_test: Optional[bool] = False) -> (pd.DataFrame | None, list[str]):
+        is_test: Optional[bool] = False) \
+        -> (Union[pd.DataFrame, None], List[str]):
 
     """Calculates # cells of each OGU per gram of sample material for sample.
 
@@ -502,9 +306,10 @@ def _calc_ogu_cell_counts_df_for_sample(
         Dictionary keyed by sample id, containing for each sample either None
         (if no model could be trained for that SAMPLE_ID_KEY) or a dictionary
         representation of the sample's LinregressResult.
-    per_sample_mass_info_df : pd.DataFrame
-        Dataframe of SAMPLE_ID_KEY, GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY, and
-        SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY for each sample.
+    per_sample_info_df : pd.DataFrame
+        A Dataframe of SAMPLE_ID_KEY, GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY,
+        SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY, and SAMPLE_TOTAL_READS_KEY
+        for each sample.
    working_df : pd.DataFrame
        Long-format dataframe with columns for OGU_ID_KEY, SAMPLE_ID_KEY,
        OGU_READ_COUNT_KEY, and OGU_LEN_IN_BP_KEY
@@ -552,16 +357,22 @@ def _calc_ogu_cell_counts_df_for_sample(
     sample_df = working_df[
         working_df[SAMPLE_ID_KEY] == sample_id].copy()
 
-    # predict mass of each OGU's gDNA in this sample using the linear model
+    # get the total reads sequenced for this sample
+    sample_total_reads = per_sample_info_df.loc[
+        per_sample_info_df[SAMPLE_ID_KEY] == sample_id,
+        SAMPLE_TOTAL_READS_KEY].values[0]
+
+    # predict mass of each OGU's gDNA in this sample from its counts
+    # using the linear model
     ogu_gdna_masses = _calc_ogu_gdna_mass_ng_series_for_sample(
-        sample_df, linregress_result["slope"],
-        linregress_result["intercept"])
+        sample_df, linregress_result[SLOPE_KEY],
+        linregress_result[INTERCEPT_KEY], sample_total_reads)
     sample_df[OGU_GDNA_MASS_NG_KEY] = \
         sample_df[OGU_ID_KEY].map(ogu_gdna_masses)
 
     # get the mass of gDNA put into sequencing for this sample
-    sequenced_sample_gdna_mass_ng = per_sample_mass_info_df.loc[
-        per_sample_mass_info_df[SAMPLE_ID_KEY] == sample_id,
+    sequenced_sample_gdna_mass_ng = per_sample_info_df.loc[
+        per_sample_info_df[SAMPLE_ID_KEY] == sample_id,
         SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY].values[0]
 
     # calc the # of genomes of each OGU per gram of gDNA in this sample
@@ -581,8 +392,8 @@
     # calc the # of cells of each OGU per gram of actual sample material
     # (e.g., per gram of stool if these are fecal samples) for this sample
-    mass_ratio_for_sample = per_sample_mass_info_df.loc[
-        per_sample_mass_info_df[SAMPLE_ID_KEY] == sample_id,
+    mass_ratio_for_sample = per_sample_info_df.loc[
+        per_sample_info_df[SAMPLE_ID_KEY] == sample_id,
         GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY].values[0]
     sample_df[OGU_CELLS_PER_G_OF_SAMPLE_KEY] = \
         sample_df[OGU_CELLS_PER_G_OF_GDNA_KEY] * \
@@ -594,38 +405,38 @@
 
 def _calc_ogu_gdna_mass_ng_series_for_sample(
         sample_df: pd.DataFrame,
         sample_linregress_slope: float,
-        sample_linregress_intercept: float) -> pd.Series:
+        sample_linregress_intercept: float,
+        sample_total_reads: int) -> pd.Series:
 
     """Calculates mass of OGU gDNA in ng for each OGU in a sample.
 
     Parameters
     ----------
     sample_df: pd.DataFrame
-        Dataframe with rows for a single sample, containing at least columns
+        A Dataframe with rows for a single sample, containing at least columns
         for OGU_ID_KEY and OGU_READ_COUNT_KEY.
     sample_linregress_slope: float
         Slope of the linear regression model for the sample.
     sample_linregress_intercept: float
        Intercept of the linear regression model for the sample.
+    sample_total_reads: int
+        Total number of reads for the sample (including all reads, not just
+        aligned ones).
 
     Returns
     -------
-    ogu_genomes_per_g_of_gdna_series : pd.Series
-        Series with index of OGU_ID_KEY and values of the number of genomes
-        of each OGU per gram of gDNA in the sample.
+    ogu_gdna_mass_ng_series : pd.Series
+        A Series with index of OGU_ID_KEY and values of the predicted mass,
+        in ng, of the gDNA of each OGU in the sample.
""" working_df = sample_df.copy() - # calculate the total number of reads for this sample (a scalar) - # by summing read counts for all the rows in the sample table - total_reads_per_sample = working_df[OGU_READ_COUNT_KEY].sum() - # add a column of counts per million (CPM) for each ogu by dividing # each read_count by the total number of reads for this sample # and then multiplying by a million (1,000,000) # NB: dividing int/int in python gives float working_df[OGU_CPM_KEY] = (working_df[OGU_READ_COUNT_KEY] / - total_reads_per_sample) * 1000000 + sample_total_reads) * 1000000 # add column of log10(ogu CPM) by taking log base 10 of the ogu CPM column working_df[LOG_10_OGU_CPM_KEY] = np.log10(working_df[OGU_CPM_KEY]) @@ -660,7 +471,7 @@ def _calc_ogu_genomes_per_g_of_gdna_series_for_sample( Parameters ---------- sample_df: pd.DataFrame - Dataframe with rows related to only a single sample, containing + A Dataframe with rows related to only a single sample, containing at least columns for OGU_ID_KEY, OGU_LEN_IN_BP_KEY, and OGU_GDNA_MASS_NG_KEY. total_sample_gdna_mass_ng: float @@ -675,7 +486,7 @@ def _calc_ogu_genomes_per_g_of_gdna_series_for_sample( Returns ------- ogu_genomes_per_g_of_gdna_series : pd.Series - Series with index of OGU_ID_KEY and values of the number of genomes + A Series with index of OGU_ID_KEY and values of the number of genomes of each OGU per gram of gDNA of the sample. """ @@ -707,7 +518,7 @@ def _calc_ogu_genomes_series_for_sample( Parameters ---------- sample_df: pd.DataFrame - Dataframe with rows related to only a single sample, containing + A Dataframe with rows related to only a single sample, containing at least columns for OGU_ID_KEY, OGU_LEN_IN_BP_KEY, and OGU_GDNA_MASS_NG_KEY. is_test: Optional[bool] @@ -720,7 +531,7 @@ def _calc_ogu_genomes_series_for_sample( Returns ------- ogu_genomes_series : pd.Series - Series with index of OGU_ID_KEY and values of the number of genomes + A Series with index of OGU_ID_KEY and values of the number of genomes of each OGU in the sequenced sample. This calculates the total number of genomes for each OGU in the sequenced @@ -738,34 +549,243 @@ def _calc_ogu_genomes_series_for_sample( molecules--in this case, genomes--in a mole of a substance. """ - # seems weird to make this a variable since it's famously a constant, but.. - avogadros_num = 6.02214076e23 - # this is done so we can test against Livia's results, which use - # a truncated version of the constant. This should NOT be done in - # production. In testing, makes a difference of e.g., about 10 cells - # out of 25K for the first OGU in the first sample in Livia's dataset. - if is_test: - avogadros_num = 6.022e23 - - # TODO: do we have to worry about integer overflow here? - # Dan H. said, "if you use ints, the length * 650 * 10^9 - # can overflow integers with very long genomes". HOWEVER, - # the internet says that python *3* , "[o]nly floats have a hard - # limit in python. Integers are implemented as “long” integer - # objects of arbitrary size"(https://stackoverflow.com/a/52151786) - # HOWEVER HOWEVER, *numpy* integer types are fixed width, and - # "Some pandas and numpy functions, such as sum on arrays or - # Series return an np.int64 so this might be the reason you are - # seeing int overflows in Python3." - # (https://stackoverflow.com/a/58640340) - # What to do? 
- - numerator_series = sample_df[OGU_GDNA_MASS_NG_KEY] * avogadros_num - denominator_series = sample_df[OGU_LEN_IN_BP_KEY] * 650 * 1e9 - - ogu_genomes_series = numerator_series/denominator_series + ogu_copies_per_g_series = calc_copies_genomic_element_per_g_series( + sample_df[OGU_LEN_IN_BP_KEY], DNA_BASEPAIR_G_PER_MOLE, is_test=is_test) + ogu_copies_per_extracted_sample_series = \ + sample_df[OGU_GDNA_MASS_NG_KEY] * \ + ogu_copies_per_g_series / NANOGRAMS_PER_GRAM # Set the index of the series to be the OGU_ID_KEY - ogu_genomes_series.index = sample_df[OGU_ID_KEY] + ogu_copies_per_extracted_sample_series.index = sample_df[OGU_ID_KEY] + return ogu_copies_per_extracted_sample_series + + +def calc_ogu_cell_counts_biom( + absolute_quant_params_per_sample_df: pd.DataFrame, + linregress_by_sample_id: Dict[str, Dict[str, float]], + ogu_counts_per_sample_biom: biom.Table, + ogu_lengths_df: pd.DataFrame, + read_length: int, + min_coverage: float, + min_rsquared: float, + output_cell_counts_metric: str) -> (biom.Table, List[str]): + + """Calcs input cell count metric for each ogu & sample via linear models. + + Parameters + ---------- + absolute_quant_params_per_sample_df: pd.DataFrame + A Dataframe of at least SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, + SAMPLE_IN_ALIQUOT_MASS_G_KEY, ELUTE_VOL_UL_KEY, + SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY, and SAMPLE_TOTAL_READS_KEY + for each sample. + linregress_by_sample_id : dict[str, dict[str: float]] + Dictionary keyed by sample id, containing for each sample either None + (if no model could be trained for that SAMPLE_ID_KEY) or a dictionary + representation of the sample's LinregressResult. + ogu_counts_per_sample_biom: biom.Table + Biom table holding the read counts aligned to each OGU in each sample. + ogu_lengths_df : pd.DataFrame + A Dataframe of OGU_ID_KEY and OGU_LEN_IN_BP_KEY for each OGU. + read_length : int + Length of reads in bp (usually but not always 150). + min_coverage : float + Minimum allowable coverage of an OGU needed to include that OGU + in the output. + min_rsquared: float + Minimum allowable R^2 value for the linear regression model for a + sample; any sample with an R^2 value less than this will be excluded + from the output. + output_cell_counts_metric : str + Name of the desired output cell count metric; options are + OGU_CELLS_PER_G_OF_GDNA_KEY and OGU_CELLS_PER_G_OF_SAMPLE_KEY. + + Returns + ------- + ogu_cell_counts_biom : biom.Table + Dataframe with a column for OGU_ID_KEY and then one additional column + for each sample id, which holds the predicted number of cells per gram + of sample material of that OGU in that sample. + log_messages_list : list[str] + List of strings containing log messages generated by this function. + """ + + # check if the inputs all have the required columns + required_cols_list = list( + {SAMPLE_ID_KEY, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY} | + set(REQUIRED_DNA_PREP_INFO_KEYS)) + validate_required_columns_exist( + absolute_quant_params_per_sample_df, required_cols_list, + "sample info is missing required column(s)") + + # Check if any samples in the reads data are missing from the metadata; + # Not bothering to report samples that are in metadata but not the reads-- + # maybe those failed the sequencing run. 
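+    # (i.e., a sample with reads but no metadata row is treated as an error,
+    # while a sample with a metadata row but no reads is tolerated here.)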
+    _ = validate_metadata_vs_reads_id_consistency(
+        absolute_quant_params_per_sample_df, ogu_counts_per_sample_biom)
+
+    working_params_df = absolute_quant_params_per_sample_df.copy()
+
+    # cast the GDNA_CONCENTRATION_NG_UL_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY,
+    # ELUTE_VOL_UL_KEY, and SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY columns of
+    # params df to float if they aren't already
+    for col in [GDNA_CONCENTRATION_NG_UL_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY,
+                ELUTE_VOL_UL_KEY, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY]:
+        if working_params_df[col].dtype != float:
+            working_params_df[col] = \
+                working_params_df[col].astype(float)
+
+    # calculate the ratio of extracted gDNA mass to sample mass put into
+    # extraction for each sample
+    gdna_mass_to_sample_mass_by_sample_series = \
+        _calc_gdna_mass_to_sample_mass_by_sample_df(working_params_df)
+    per_sample_calc_info_df = _series_to_df(
+        gdna_mass_to_sample_mass_by_sample_series, SAMPLE_ID_KEY,
+        GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY)
+
+    # merge the SAMPLE_TOTAL_READS_KEY and SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY
+    # columns of working_params_df into per_sample_calc_info_df
+    # by SAMPLE_ID_KEY
+    per_sample_calc_info_df = per_sample_calc_info_df.merge(
+        working_params_df[[SAMPLE_ID_KEY, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY,
+                           SAMPLE_TOTAL_READS_KEY]],
+        on=SAMPLE_ID_KEY, how='left')
+
+    # convert input biom table to a dataframe with sparse columns, which
+    # should act basically the same as a dense dataframe but use less memory
+    ogu_counts_per_sample_df = ogu_counts_per_sample_biom.to_dataframe(
+        dense=False)
+
+    ogu_cell_counts_long_format_df, log_msgs_list = (
+        _calc_long_format_ogu_cell_counts_df(
+            linregress_by_sample_id, ogu_counts_per_sample_df,
+            ogu_lengths_df, per_sample_calc_info_df, read_length,
+            min_coverage, min_rsquared))
+
+    ogu_cell_counts_wide_format_df = ogu_cell_counts_long_format_df.pivot(
+        index=OGU_ID_KEY, columns=SAMPLE_ID_KEY)[output_cell_counts_metric]
+
+    # replace NaNs with 0s; per Daniel McDonald, much downstream analysis
+    # cannot handle NaNs, and it is preferable to set invalid values
+    # to 0 and provide a log message saying they are not usable than to leave
+    # them as NaNs
+    ogu_cell_counts_wide_format_df.fillna(0, inplace=True)
+
+    # convert dataframe to biom table; input params are
+    # data (the "output_cell_count_metric"s), observation_ids (the "ogu_id"s),
+    # and sample_ids (er, the "sample_id"s)
+    ogu_cell_counts_biom = biom.Table(
+        ogu_cell_counts_wide_format_df.values,
+        ogu_cell_counts_wide_format_df.index,
+        ogu_cell_counts_wide_format_df.columns)
+
+    return ogu_cell_counts_biom, log_msgs_list
+
+
+def calc_ogu_cell_counts_per_g_of_sample_for_qiita(
+        sample_info_df: pd.DataFrame,
+        prep_info_df: pd.DataFrame,
+        linregress_by_sample_id_fp: str,
+        ogu_counts_per_sample_biom: biom.Table,
+        ogu_lengths_fp: str,
+        read_length: int = DEFAULT_READ_LENGTH,
+        min_coverage: float = DEFAULT_MIN_COVERAGE,
+        min_rsquared: float = DEFAULT_MIN_RSQUARED,
+        syndna_mass_fraction_of_sample: float =
+        DEFAULT_SYNDNA_MASS_FRACTION_OF_SAMPLE) \
+        -> Dict[str, Union[str, biom.Table]]:
+
+    """Gets # of cells of each OGU/g of sample for samples from Qiita.
+
+    Parameters
+    ----------
+    sample_info_df: pd.DataFrame
+        A Dataframe containing sample info for all samples in the prep,
+        including SAMPLE_ID_KEY and SAMPLE_IN_ALIQUOT_MASS_G_KEY.
+    prep_info_df: pd.DataFrame
+        A Dataframe containing prep info for all samples in the prep,
+        including SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY,
+        ELUTE_VOL_UL_KEY, SYNDNA_POOL_MASS_NG_KEY, and SAMPLE_TOTAL_READS_KEY.
+    linregress_by_sample_id_fp: str
+        String containing the filepath to the yaml file holding the
+        dictionary keyed by sample id, containing for each sample a dictionary
+        representation of the sample's LinregressResult.
+    ogu_counts_per_sample_biom: biom.Table
+        Biom table holding the read counts aligned to each OGU in each sample.
+    ogu_lengths_fp : str
+        String containing the filepath to a tab-separated, two-column,
+        no-header file in which the first column is the OGU id and the
+        second is the OGU length in basepairs.
+    read_length : int
+        Length of reads in bp (usually but not always 150).
+    min_coverage : float
+        Minimum allowable coverage of an OGU needed to include that OGU
+        in the output.
+    min_rsquared: float
+        Minimum allowable R^2 value for the linear regression model for a
+        sample; any sample with an R^2 value less than this will be excluded
+        from the output.
+    syndna_mass_fraction_of_sample: float
+        Fraction of the mass of the sample that is added as syndna (usually
+        0.05, which is to say 5%).
+
+    Returns
+    -------
+    output_by_out_type : dict of str or biom.Table
+        Dictionary of outputs keyed by their type. Currently, the following
+        keys are defined:
+        CELL_COUNT_RESULT_KEY: biom.Table holding the calculated number of
+        cells per gram of sample material for each OGU in each sample.
+        CELL_COUNT_LOG_KEY: log of messages from the cell count calc process.
+    """
+
+    # check if the inputs all have the required columns
+    validate_required_columns_exist(
+        sample_info_df, REQUIRED_SAMPLE_INFO_KEYS,
+        "sample info is missing required column(s)")
+
+    required_prep_cols = list(
+        {SYNDNA_POOL_MASS_NG_KEY} | set(REQUIRED_DNA_PREP_INFO_KEYS))
+    validate_required_columns_exist(
+        prep_info_df, required_prep_cols,
+        "prep info is missing required column(s)")
+
+    # Check if any samples in the prep are missing from the sample info;
+    # Not bothering to report samples that are in sample info but not the prep
+    # --maybe those just weren't included in this prep.
+    _ = validate_metadata_vs_prep_id_consistency(
+        sample_info_df, prep_info_df)
+
+    # calculate the mass of gDNA sequenced for each sample. We have the
+    # mass of syndna pool that was added to each sample, and we know that the
+    # syndna pool mass is calculated to be a certain percentage of the mass of
+    # the sample (added into the library prep in addition to the sample mass).
+    # Therefore, if the syndna fraction is 0.05 or 5%, the mass of the sample
+    # gDNA put into sequencing is 1/0.05 = 20x the mass of syndna pool added.
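+    # (Illustrative numbers: at the default fraction of 0.05, a sample whose
+    # prep lists 2 ng of syndna pool mass is taken to have contributed
+    # 2 * (1 / 0.05) = 40 ng of gDNA to sequencing.)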
+ prep_info_df[SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY] = \ + prep_info_df[SYNDNA_POOL_MASS_NG_KEY] * \ + (1 / syndna_mass_fraction_of_sample) + + # merge the sample info and prep info dataframes + absolute_quant_params_per_sample_df = \ + sample_info_df.merge(prep_info_df, on=SAMPLE_ID_KEY, how='left') + + # read in the linregress_by_sample_id yaml file + with open(linregress_by_sample_id_fp) as f: + linregress_by_sample_id = yaml.load(f, Loader=yaml.FullLoader) + + # read in the ogu_lengths file + ogu_lengths_df = pd.read_csv(ogu_lengths_fp, sep='\t', header=None, + names=[OGU_ID_KEY, OGU_LEN_IN_BP_KEY]) + + # calculate # cells per gram of sample material of each OGU in each sample + output_biom, log_msgs_list = calc_ogu_cell_counts_biom( + absolute_quant_params_per_sample_df, linregress_by_sample_id, + ogu_counts_per_sample_biom, ogu_lengths_df, read_length, min_coverage, + min_rsquared, OGU_CELLS_PER_G_OF_SAMPLE_KEY) + + out_txt_by_out_type = { + CELL_COUNT_RESULT_KEY: output_biom, + CELL_COUNT_LOG_KEY: '\n'.join(log_msgs_list)} + + return out_txt_by_out_type diff --git a/pysyndna/src/fit_syndna_models.py b/pysyndna/src/fit_syndna_models.py index b0ddb05..f791808 100644 --- a/pysyndna/src/fit_syndna_models.py +++ b/pysyndna/src/fit_syndna_models.py @@ -5,20 +5,21 @@ import os import pandas as pd import scipy +import traceback import yaml -from typing import Optional +from typing import Optional, List, Dict, Union +from pysyndna.src.util import validate_required_columns_exist, \ + validate_metadata_vs_reads_id_consistency, SAMPLE_ID_KEY DEFAULT_MIN_SAMPLE_COUNTS = 1 -# NB: sample_name instead of sample_id bc that's what qiita uses -SAMPLE_ID_KEY = 'sample_name' SYNDNA_ID_KEY = 'syndna_id' SYNDNA_POOL_NUM_KEY = 'syndna_pool_number' SYNDNA_INDIV_NG_UL_KEY = 'syndna_indiv_ng_ul' SYNDNA_FRACTION_OF_POOL_KEY = 'syndna_fraction_of_pool' SYNDNA_POOL_MASS_NG_KEY = 'mass_syndna_input_ng' -SYNDNA_TOTAL_READS_KEY = 'raw_reads_r1r2' +SAMPLE_TOTAL_READS_KEY = 'raw_reads_r1r2' SYNDNA_COUNTS_KEY = 'read_count' COUNTS_PER_MIL_KEY = 'CPM' LOG10_COUNTS_PER_MIL_KEY = 'log10_CPM' @@ -26,114 +27,14 @@ LOG10_SYNDNA_INDIV_NG_KEY = 'log10_syndna_ng' LIN_REGRESS_RESULT_KEY = 'lin_regress_by_sample_id' FIT_SYNDNA_MODELS_LOG_KEY = 'fit_syndna_models_log' - - -# TODO: if they sequenced over multiple lanes, would be different prep -# info files--talk to lab about whether they will ever do this :( -# this would require merge of multiple preparations -def fit_linear_regression_models_for_qiita( - prep_info_df: pd.DataFrame, - reads_per_syndna_per_sample_biom: biom.Table, - min_sample_counts: int = DEFAULT_MIN_SAMPLE_COUNTS, - syndna_pool_config_fp: Optional[str] = None) -> dict[str: str]: - - """Fits linear regressions predicting mass from counts using Qiita inputs. - - Parameters - ---------- - prep_info_df: pd.DataFrame - Dataframe containing prep info for all samples in the prep, - including SAMPLE_ID, SYNDNA_POOL_NUM_KEY, SYNDNA_POOL_MASS_NG_KEY, - and SYNDNA_TOTAL_READS_KEY - reads_per_syndna_per_sample_biom: biom.Table - Biom table holding read counts aligned to each synDNA in each sample. - Note: should already have combined forward and reverse counts. - min_sample_counts: int - Minimum number of counts required for a sample to be included in - the regression. Samples with fewer counts will be excluded. - syndna_pool_config_fp: str, optional - Path to the yaml file holding the concentrations of each syndna - in the syndna pool used in this experiment. 
If not provided, will - look for the config.yml file in the parent directory of this file. - - Returns - ------- - out_txt_by_out_type : dict of str - Dictionary of output strings (ready to be written to files) keyed - by the type of output they contain. Currently, the following keys - are defined: - LIN_REGRESS_RESULT_KEY: yaml of dict[str, dict[str, float]] - FIT_SYNDNA_MODELS_LOG_KEY: txt log of messages from the fitting process - """ - - # check that the prep_info_df has the expected columns - expected_prep_info_cols = [ - SAMPLE_ID_KEY, SYNDNA_POOL_NUM_KEY, SYNDNA_POOL_MASS_NG_KEY, - SYNDNA_TOTAL_READS_KEY] - _validate_required_columns_exist( - prep_info_df, expected_prep_info_cols, - "prep info is missing required column(s)") - - # pull the syndna pool number from the prep info, ensure it is the same for - # all samples, and convert to the pool name - syndna_pool_number = prep_info_df[SYNDNA_POOL_NUM_KEY].unique() - if len(syndna_pool_number) > 1: - raise ValueError( - f"Multiple syndna_pool_numbers found in prep info: " - f"{syndna_pool_number}") - syndna_pool_name = f"pool{syndna_pool_number[0]}" - - # look in the SYNDNA_INDIV_NG_UL_KEY section of the config file to find the - # individual syndna concentrations associated with the relevant syndna - # pool name and turn the resulting dictionary into a dataframe - config_dict = _extract_config_dict(syndna_pool_config_fp) - conc_ng_ul_per_indiv_syndna = \ - config_dict[SYNDNA_INDIV_NG_UL_KEY][syndna_pool_name] - syndna_concs_df = pd.DataFrame( - conc_ng_ul_per_indiv_syndna.items(), - columns=[SYNDNA_ID_KEY, SYNDNA_INDIV_NG_UL_KEY]) - - # convert input biom table to a pd.SparseDataFrame, which is should act - # basically like a pd.DataFrame but take up less memory - reads_per_syndna_per_sample_df = \ - reads_per_syndna_per_sample_biom.to_dataframe(dense=False) - - # fit linear regression models for each sample - linregress_by_sample_id, msg_list = fit_linear_regression_models( - syndna_concs_df, prep_info_df, reads_per_syndna_per_sample_df, - min_sample_counts) - linregress_results_dict = _convert_linregressresults_to_dict( - linregress_by_sample_id) - - out_txt_by_out_type = { - LIN_REGRESS_RESULT_KEY: yaml.safe_dump(linregress_results_dict), - FIT_SYNDNA_MODELS_LOG_KEY: '\n'.join(msg_list)} - - return out_txt_by_out_type - - -def _validate_required_columns_exist( - input_df: pd.DataFrame, - required_cols_list: list[str], - error_msg: str): - - """Checks that the input dataframe has the required columns. - - Parameters - ---------- - input_df: pd.DataFrame - Dataframe to be checked. - required_cols_list: list[str] - List of column names that must be present in the dataframe. - error_msg: str - Error message to be raised if any of the required columns are missing. 
- """ - - missing_cols = set(required_cols_list) - set(input_df.columns) - if len(missing_cols) > 0: - missing_cols = sorted(missing_cols) - raise ValueError( - f"{error_msg}: {missing_cols}") +SLOPE_KEY = 'slope' +INTERCEPT_KEY = 'intercept' +RVALUE_KEY = 'rvalue' +PVALUE_KEY = 'pvalue' +STDERR_KEY = 'stderr' +INTERCEPT_STDERR_KEY = 'intercept_stderr' +REGRESSION_KEYS = [SLOPE_KEY, INTERCEPT_KEY, RVALUE_KEY, PVALUE_KEY, + STDERR_KEY, INTERCEPT_STDERR_KEY] def _extract_config_dict(config_fp=None): @@ -161,113 +62,6 @@ def _extract_config_dict(config_fp=None): return config_dict -def fit_linear_regression_models( - syndna_concs_df: pd.DataFrame, - sample_syndna_weights_and_total_reads_df: pd.DataFrame, - reads_per_syndna_per_sample_df: pd.DataFrame, - min_sample_counts: int) -> \ - (dict[str, scipy.stats.LinregressResult], list[str]): - - """Fits per-sample linear regression models predicting mass from counts. - - This fits a linear regression model for each sample, predicting - log10(mass of instances of a sequence) within a sample from - log10(counts per million for that sequence) within the sample, - using spike-in data from synDNAs. - - Parameters - ---------- - syndna_concs_df: pd.DataFrame - Dataframe containing SYNDNA_ID_KEY and SYNDNA_INDIV_NG_UL_KEY - (e.g. 1, 0.1, 0.01, 0.001, 0.0001) for all syndnas in the syndna pool - used in this experiment - sample_syndna_weights_and_total_reads_df: pd.DataFrame - Dataframe containing at least SAMPLE_ID_KEY, SYNDNA_POOL_MASS_NG_KEY - (the total weight of all syndnas in the sample combined, in ng), and - SYNDNA_TOTAL_READS_KEY (the number of total reads--not just aligned - reads--for all syndnas in the sample, including both r1 and r2) - reads_per_syndna_per_sample_df: pd.DataFrame - Wide-format dataframe with syndna ids as index and one - column for each sample id, which holds the read counts - aligned to that syndna in that sample. Note: should already have - combined forward and reverse counts. - min_sample_counts : int - Minimum number of counts required for a sample to be included in - the regression. Samples with fewer counts will be excluded. - - Returns - ------- - linregress_by_sample_id : dict[str, scipy.stats.LinregressResult] - returns a dictionary keyed by sample_id, for each sample_id in - reads_per_syndna_per_sample_df. Dictionary values are either None - (if no model could be trained for that sample_id) or a - scipy.stats.LinregressResult object defining the trained model. - Suitable for pickling to a file. - log_messages_list : list[str] - List of log messages generated during the fitting process. - """ - - log_messages_list = [] - - # id any syndnas that have an inadequate total number of reads aligned - # to them across all samples (less than min_sample_counts). Don't drop yet. - # Gathering this now bc it is easier while syndna id is still in the index, - # but we want the full column set while doing the validation checks. - # Note: synDNA author also made passing mention of dropping samples with - # inadequate "quality" but didn't provide any guidance on that. 
- too_low_counts_mask = \ - reads_per_syndna_per_sample_df.sum(axis=1) < min_sample_counts - syndnas_to_drop = \ - reads_per_syndna_per_sample_df[too_low_counts_mask].index.tolist() - - # move the syndna ids from the index to a column, bc I hate implicit - reads_per_syndna_per_sample_df = \ - reads_per_syndna_per_sample_df.reset_index(names=[SYNDNA_ID_KEY]) - - # validate that the syndna ids in the config and the data are consistent - _validate_syndna_id_consistency(syndna_concs_df, - reads_per_syndna_per_sample_df) - - # validate that sample ids in the experiment info and data are consistent - missing_sample_ids = _validate_sample_id_consistency( - sample_syndna_weights_and_total_reads_df, - reads_per_syndna_per_sample_df) - if missing_sample_ids is not None: - log_messages_list.append(f'The following sample ids were in the ' - f'experiment info but not in the data: ' - f'{missing_sample_ids}') - - # NOW remove any syndnas with too few counts from the dataframe, - # and log if there were any - filtered_reads_per_syndna_per_sample_df = \ - reads_per_syndna_per_sample_df[ - ~reads_per_syndna_per_sample_df[SYNDNA_ID_KEY].isin( - syndnas_to_drop)] - if len(syndnas_to_drop) > 0: - log_messages_list.append(f'The following syndnas were dropped ' - f'because they had fewer than ' - f'{min_sample_counts} total reads aligned:' - f'{syndnas_to_drop}') - - # reformat filtered_reads_per_syndna_per_sample_df into "long form": - # columns for syndna id, sample id, and read count - working_df = filtered_reads_per_syndna_per_sample_df.melt( - id_vars=[SYNDNA_ID_KEY], var_name=SAMPLE_ID_KEY, - value_name=SYNDNA_COUNTS_KEY) - - # merge w sample_total_reads_df to include total_reads column - working_df = working_df.merge(sample_syndna_weights_and_total_reads_df, - on=SAMPLE_ID_KEY, how='left') - - # calculate the weight in ng of *each* syndna in each sample - working_df = _calc_indiv_syndna_weights(syndna_concs_df, working_df) - - # fit linear regression models for each sample - linregress_by_sample_id = _fit_linear_regression_models(working_df) - - return linregress_by_sample_id, log_messages_list - - def _validate_syndna_id_consistency( syndna_concs_df: pd.DataFrame, reads_per_syndna_per_sample_df: pd.DataFrame): @@ -317,19 +111,20 @@ def _validate_syndna_id_consistency( def _validate_sample_id_consistency( sample_syndna_weights_and_total_reads_df: pd.DataFrame, - reads_per_syndna_per_sample_df: pd.DataFrame) -> list[str] | None: + reads_per_syndna_per_sample_df: pd.DataFrame) -> \ + Union[List[str], None]: """ Checks that the sample ids in the experiment info and data are consistent. Parameters ---------- sample_syndna_weights_and_total_reads_df: pd.DataFrame - Dataframe containing at least SAMPLE_ID_KEY, SYNDNA_POOL_MASS_NG_KEY + A Dataframe containing at least SAMPLE_ID_KEY, SYNDNA_POOL_MASS_NG_KEY (the total weight of all syndnas in the sample combined, in ng), and - SYNDNA_TOTAL_READS_KEY (the number of total reads--not just aligned - reads--for all syndnas in the sample, including both r1 and r2) + SAMPLE_TOTAL_READS_KEY (the number of total reads--not just aligned + reads--for the sample, including both r1 and r2) reads_per_syndna_per_sample_df: pd.DataFrame - Dataframe with a column for syndna_id and then one additional column + A Dataframe with a column for syndna_id and then one additional column for each sample_id, which holds the read counts aligned to that syndna in that sample. Note: should already have combined forward and reverse counts. 
@@ -346,30 +141,11 @@ def _validate_sample_id_consistency( data. None if all sample ids in the experiment info were in the data. """ - sample_ids_in_info = \ - set(sample_syndna_weights_and_total_reads_df[SAMPLE_ID_KEY]) - sample_ids_in_data = set(reads_per_syndna_per_sample_df.columns) - sample_ids_in_data.remove(SYNDNA_ID_KEY) + simplified_reads_df = reads_per_syndna_per_sample_df.copy() + simplified_reads_df.drop(columns=[SYNDNA_ID_KEY], inplace=True) - # if there are sample ids in the data that are not in the info, raise - # an error, since we don't know how to process that - data_only_samples = sample_ids_in_data - sample_ids_in_info - if len(data_only_samples) > 0: - raise ValueError( - f"Found sample ids in reads_per_syndna_per_sample_df that were " - f"not in sample_syndna_weights_and_total_reads_df: " - f"{data_only_samples}") - - # check if there are sample ids in the info that are not in the data and - # if so, capture a list of them. Sometimes a sample just fails sequencing - # and that shouldn't preclude processing the others that did work, but we - # want to know about it. - missing_sample_ids_set = sample_ids_in_info - sample_ids_in_data - - if len(missing_sample_ids_set) > 0: - missing_sample_ids = list(missing_sample_ids_set) - else: - missing_sample_ids = None + missing_sample_ids = validate_metadata_vs_reads_id_consistency( + sample_syndna_weights_and_total_reads_df, simplified_reads_df) return missing_sample_ids @@ -383,7 +159,7 @@ def _calc_indiv_syndna_weights( Parameters ---------- syndna_concs_df: pd.DataFrame - Dataframe containing SYNDNA_ID_KEY and SYNDNA_INDIV_NG_UL_KEY + A Dataframe containing SYNDNA_ID_KEY and SYNDNA_INDIV_NG_UL_KEY (e.g. 1, 0.1, 0.01, 0.001, 0.0001) for all syndnas in the syndna pool used in this experiment working_df: pd.DataFrame @@ -423,7 +199,7 @@ def _calc_indiv_syndna_weights( def _fit_linear_regression_models(working_df: pd.DataFrame) -> \ - dict[str, scipy.stats.LinregressResult]: + (Dict[str, Union[scipy.stats.LinregressResult, None]], List[str]): """Fits per-sample linear regression models predicting mass from counts. @@ -436,15 +212,18 @@ def _fit_linear_regression_models(working_df: pd.DataFrame) -> \ ---------- working_df: pd.DataFrame Long-form dataframe containing at least SAMPLE_ID_KEY, - SYNDNA_COUNTS_KEY, SYNDNA_TOTAL_READS_KEY, and + SYNDNA_COUNTS_KEY, SAMPLE_TOTAL_READS_KEY, and SYNDNA_INDIV_NG_KEY columns. Returns ------- - linregress_by_sample_id : dict[str, scipy.stats.LinregressResult] + linregress_by_sample_id : dict[str, scipy.stats.LinregressResult | None] returns a dictionary keyed by sample_id, for each sample_id in reads_per_syndna_per_sample_df. Dictionary values are - scipy.stats.LinregressResult objects defining the trained models. + scipy.stats.LinregressResult objects defining the trained models, or + None if no model could be fit. + log_msgs_list : list[str] + List of messages generated during the fitting process. 
""" # drop any rows where the count value is 0--can't take log of 0 @@ -455,7 +234,7 @@ def _fit_linear_regression_models(working_df: pd.DataFrame) -> \ # then multiplying by a million (1,000,000) working_df.loc[:, COUNTS_PER_MIL_KEY] = \ (working_df[SYNDNA_COUNTS_KEY] / - working_df[SYNDNA_TOTAL_READS_KEY]) * 1000000 + working_df[SAMPLE_TOTAL_READS_KEY]) * 1000000 # add a column of log10(CMP) by taking the log base 10 of the CPM column working_df.loc[:, LOG10_COUNTS_PER_MIL_KEY] = \ @@ -468,44 +247,48 @@ def _fit_linear_regression_models(working_df: pd.DataFrame) -> \ # loop over each sample id and fit a linear regression model predicting # log10(dna ng) from log10(counts per million) linregress_by_sample_id = {} + log_msgs_list = [] for curr_sample_id in working_df[SAMPLE_ID_KEY].unique(): curr_sample_df = \ working_df[working_df[SAMPLE_ID_KEY] == curr_sample_id] - # TODO: I need to know what kind of errors this can throw; some of them - # may just mean a linear regression can't be fit for this sample, but - # others may mean something is wrong with the data or the code. - # Once I know which is which, I can decide whether to try/catch - # anything silently. - try: curr_linregress_result = scipy.stats.linregress( curr_sample_df[LOG10_COUNTS_PER_MIL_KEY], curr_sample_df[LOG10_SYNDNA_INDIV_NG_KEY]) except Exception: + # TODO: I need to know what kind of errors this can throw; + # some of them may just mean a linear regression can't be fit for + # this sample, but others may mean something is wrong with the + # data (or the code). Once I know which is which, I can decide + # whether to try/catch things silently. + # if the regression fails, log the error and set the result to None + log_msgs_list.append( + f"Error fitting regression model for '{curr_sample_id}': ") + log_msgs_list.append(traceback.format_exc()) curr_linregress_result = None # record the whole lingregress result object in the output dictionary linregress_by_sample_id[curr_sample_id] = curr_linregress_result # next sample_id - return linregress_by_sample_id + return linregress_by_sample_id, log_msgs_list def _convert_linregressresults_to_dict( - linregress_by_sample_id: dict[str, scipy.stats.LinregressResult]) -> \ - dict[str, dict[str, float]]: + linregress_by_sample_id: Dict[str, Union[scipy.stats.LinregressResult, None]] + ) -> Dict[str, Union[Dict[str, float], None]]: - """Converts a scipy.stats.LinregressResult object to a dictionary. + """Converts scipy.stats.LinregressResult dict to dict of primitives. Returns ------- - linregress_result_dict : dict[str, dict[str, float]] + linregress_result_dict : dict[str, dict[str, float] | None] Dictionary keyed by sample id, containing for each sample either None (if no model could be trained for that SAMPLE_ID_KEY) or a dictionary representation of the sample's LinregressResult, with each property - name as a key and that property's value as the value. Values are - rounded to no more than 15 decimal places. + name as a key and that property's value as the value, as a float. + Values are rounded to no more than 15 decimal places. """ linregress_result_dict = {} @@ -530,6 +313,13 @@ def _convert_linregressresults_to_dict( # and sometimes differs between mac/ubuntu past this point. 
new_dict[k] = truncate(new_float, 12) + # if there are any values in REGRESSION_KEYS that are not in the + # keys of new_dict, then raise an error + missing_keys = set(REGRESSION_KEYS) - set(new_dict.keys()) + if len(missing_keys) > 0: + raise ValueError( + f"Regression for sample {curr_sample_id} does not " + f"include the following required keys: {missing_keys}") linregress_result_dict[curr_sample_id] = new_dict return linregress_result_dict @@ -541,17 +331,217 @@ def truncate(a_float, num_decimals): Parameters ---------- a_float : float - Float to be truncated. + A Float to be truncated. num_decimals : int Number of decimal places to which the float should be truncated. Returns ------- truncated_float : float - Float truncated to the specified number of decimal places. + A Float truncated to the specified number of decimal places. """ # multiply a_float by 10^num_decimals, convert to an integer, then divide # by 10^num_decimals to get the truncated float truncated_float = int(a_float * 10 ** num_decimals) / 10 ** num_decimals return truncated_float + + +def fit_linear_regression_models( + syndna_concs_df: pd.DataFrame, + sample_syndna_weights_and_total_reads_df: pd.DataFrame, + reads_per_syndna_per_sample_df: pd.DataFrame, + min_sample_counts: int) -> \ + (Dict[str, Union[Dict[str, float], None]], List[str]): + + """Fits per-sample linear regression models predicting mass from counts. + + This fits a linear regression model for each sample, predicting + log10(mass of instances of a sequence) within a sample from + log10(counts per million for that sequence) within the sample, + using spike-in data from synDNAs. + + Parameters + ---------- + syndna_concs_df: pd.DataFrame + A Dataframe containing SYNDNA_ID_KEY and SYNDNA_INDIV_NG_UL_KEY + (e.g. 1, 0.1, 0.01, 0.001, 0.0001) for all syndnas in the syndna pool + used in this experiment + sample_syndna_weights_and_total_reads_df: pd.DataFrame + A Dataframe containing at least SAMPLE_ID_KEY, SYNDNA_POOL_MASS_NG_KEY + (the total weight of all syndnas in the sample combined, in ng), and + SAMPLE_TOTAL_READS_KEY (the number of total reads--not just aligned + reads--for the sample, including both r1 and r2) + reads_per_syndna_per_sample_df: pd.DataFrame + Wide-format dataframe with syndna ids as index and one + column for each sample id, which holds the read counts + aligned to that syndna in that sample. Note: should already have + combined forward and reverse counts. + min_sample_counts : int + Minimum number of counts required for a sample to be included in + the regression. Samples with fewer counts will be excluded. + + Returns + ------- + linregress_result_dict : dict[str, dict[str, float] | None] + Dictionary keyed by sample id, containing for each sample either None + (if no model could be trained for that SAMPLE_ID_KEY) or a dictionary + representation of the sample's LinregressResult, with each property + name as a key and that property's value as the value, as a float. + Values are rounded to no more than 15 decimal places. + log_messages_list : list[str] + List of log messages generated during the fitting process. 
+    """
+
+    log_messages_list = []
+
+    # check sample_syndna_weights_and_total_reads_df has the expected columns
+    expected_info_cols = [
+        SAMPLE_ID_KEY, SYNDNA_POOL_MASS_NG_KEY, SAMPLE_TOTAL_READS_KEY]
+    validate_required_columns_exist(
+        sample_syndna_weights_and_total_reads_df, expected_info_cols,
+        "sample metadata is missing required column(s)")
+
+    # id any syndnas that have an inadequate total number of reads aligned
+    # to them across all samples (less than min_sample_counts). Don't drop yet.
+    # Gathering this now bc it is easier while syndna id is still in the index,
+    # but we want the full column set while doing the validation checks.
+    # Note: synDNA author also made passing mention of dropping samples with
+    # inadequate "quality" but didn't provide any guidance on that.
+    too_low_counts_mask = \
+        reads_per_syndna_per_sample_df.sum(axis=1) < min_sample_counts
+    syndnas_to_drop = \
+        reads_per_syndna_per_sample_df[too_low_counts_mask].index.tolist()
+
+    # move the syndna ids from the index to a column, bc I hate implicit
+    reads_per_syndna_per_sample_df = \
+        reads_per_syndna_per_sample_df.reset_index(names=[SYNDNA_ID_KEY])
+
+    # validate that the syndna ids in the config and the data are consistent
+    _validate_syndna_id_consistency(syndna_concs_df,
+                                    reads_per_syndna_per_sample_df)
+
+    # validate that sample ids in the experiment info and data are consistent
+    missing_sample_ids = _validate_sample_id_consistency(
+        sample_syndna_weights_and_total_reads_df,
+        reads_per_syndna_per_sample_df)
+    if missing_sample_ids is not None:
+        log_messages_list.append(f'The following sample ids were in the '
+                                 f'experiment info but not in the data: '
+                                 f'{missing_sample_ids}')
+
+    # NOW remove any syndnas with too few counts from the dataframe,
+    # and log if there were any
+    filtered_reads_per_syndna_per_sample_df = \
+        reads_per_syndna_per_sample_df[
+            ~reads_per_syndna_per_sample_df[SYNDNA_ID_KEY].isin(
+                syndnas_to_drop)]
+    if len(syndnas_to_drop) > 0:
+        log_messages_list.append(f'The following syndnas were dropped '
+                                 f'because they had fewer than '
+                                 f'{min_sample_counts} total reads aligned: '
+                                 f'{syndnas_to_drop}')
+
+    # reformat filtered_reads_per_syndna_per_sample_df into "long form":
+    # columns for syndna id, sample id, and read count
+    working_df = filtered_reads_per_syndna_per_sample_df.melt(
+        id_vars=[SYNDNA_ID_KEY], var_name=SAMPLE_ID_KEY,
+        value_name=SYNDNA_COUNTS_KEY)
+
+    # merge w sample_syndna_weights_and_total_reads_df to bring in the
+    # total reads column
+    working_df = working_df.merge(sample_syndna_weights_and_total_reads_df,
+                                  on=SAMPLE_ID_KEY, how='left')
+
+    # calculate the weight in ng of *each* syndna in each sample
+    working_df = _calc_indiv_syndna_weights(syndna_concs_df, working_df)
+
+    # fit linear regression models for each sample
+    linregress_by_sample_id, fit_msgs_list = \
+        _fit_linear_regression_models(working_df)
+    log_messages_list.extend(fit_msgs_list)
+    linregress_results_dict = _convert_linregressresults_to_dict(
+        linregress_by_sample_id)
+
+    return linregress_results_dict, log_messages_list
+
+
+# TODO: if they sequenced over multiple lanes, would be different prep
+# info files--talk to lab about whether they will ever do this :(
+# this would require merge of multiple preparations
+def fit_linear_regression_models_for_qiita(
+        prep_info_df: pd.DataFrame,
+        reads_per_syndna_per_sample_biom: biom.Table,
+        min_sample_counts: int = DEFAULT_MIN_SAMPLE_COUNTS,
+        syndna_pool_config_fp: Optional[str] = None) -> Dict[str, str]:
+
+    """Fits linear regressions predicting mass from
+# TODO: if they sequenced over multiple lanes, there would be different prep
+#  info files--talk to lab about whether they will ever do this :(
+#  this would require merging multiple preparations
+def fit_linear_regression_models_for_qiita(
+        prep_info_df: pd.DataFrame,
+        reads_per_syndna_per_sample_biom: biom.Table,
+        min_sample_counts: int = DEFAULT_MIN_SAMPLE_COUNTS,
+        syndna_pool_config_fp: Optional[str] = None) -> dict[str, str]:
+
+    """Fits linear regressions predicting mass from counts using Qiita inputs.
+
+    Parameters
+    ----------
+    prep_info_df: pd.DataFrame
+        A Dataframe containing prep info for all samples in the prep,
+        including SAMPLE_ID_KEY, SYNDNA_POOL_NUM_KEY,
+        SYNDNA_POOL_MASS_NG_KEY, and SAMPLE_TOTAL_READS_KEY
+    reads_per_syndna_per_sample_biom: biom.Table
+        Biom table holding read counts aligned to each synDNA in each sample.
+        Note: should already have combined forward and reverse counts.
+    min_sample_counts: int
+        Minimum number of reads that must be aligned to a syndna, summed
+        across all samples, for that syndna to be included in the
+        regressions; syndnas with fewer total reads will be excluded.
+    syndna_pool_config_fp: str, optional
+        Path to the yaml file holding the concentrations of each syndna
+        in the syndna pool used in this experiment. If not provided, will
+        look for the config.yml file in the parent directory of this file.
+
+    Returns
+    -------
+    out_txt_by_out_type : dict of str
+        Dictionary of output strings (ready to be written to files) keyed
+        by the type of output they contain. Currently, the following keys
+        are defined:
+        LIN_REGRESS_RESULT_KEY: yaml of dict[str, dict[str, float] | None]
+        FIT_SYNDNA_MODELS_LOG_KEY: txt log of messages from the fitting
+        process
+    """
+
+    # check that the prep_info_df has the expected columns
+    expected_prep_info_cols = [
+        SAMPLE_ID_KEY, SYNDNA_POOL_NUM_KEY, SYNDNA_POOL_MASS_NG_KEY,
+        SAMPLE_TOTAL_READS_KEY]
+    validate_required_columns_exist(
+        prep_info_df, expected_prep_info_cols,
+        "prep info is missing required column(s)")
+
+    # pull the syndna pool number from the prep info, ensure it is the same
+    # for all samples, and convert it to the pool name
+    syndna_pool_number = prep_info_df[SYNDNA_POOL_NUM_KEY].unique()
+    if len(syndna_pool_number) > 1:
+        raise ValueError(
+            f"Multiple syndna_pool_numbers found in prep info: "
+            f"{syndna_pool_number}")
+    syndna_pool_name = f"pool{syndna_pool_number[0]}"
+
+    # look in the SYNDNA_INDIV_NG_UL_KEY section of the config file to find
+    # the individual syndna concentrations associated with the relevant
+    # syndna pool name and turn the resulting dictionary into a dataframe
+    config_dict = _extract_config_dict(syndna_pool_config_fp)
+    conc_ng_ul_per_indiv_syndna = \
+        config_dict[SYNDNA_INDIV_NG_UL_KEY][syndna_pool_name]
+    syndna_concs_df = pd.DataFrame(
+        conc_ng_ul_per_indiv_syndna.items(),
+        columns=[SYNDNA_ID_KEY, SYNDNA_INDIV_NG_UL_KEY])
+
+    # convert the input biom table to a pd.DataFrame with sparse values,
+    # which should act basically like a dense pd.DataFrame but take up
+    # less memory
+    reads_per_syndna_per_sample_df = \
+        reads_per_syndna_per_sample_biom.to_dataframe(dense=False)
+
+    # fit linear regression models for each sample
+    linregress_results_dict, msg_list = fit_linear_regression_models(
+        syndna_concs_df, prep_info_df, reads_per_syndna_per_sample_df,
+        min_sample_counts)
+
+    out_txt_by_out_type = {
+        LIN_REGRESS_RESULT_KEY: yaml.safe_dump(linregress_results_dict),
+        FIT_SYNDNA_MODELS_LOG_KEY: '\n'.join(msg_list)}
+
+    return out_txt_by_out_type
diff --git a/pysyndna/src/quant_orfs.py b/pysyndna/src/quant_orfs.py
new file mode 100644
index 0000000..29e6606
--- /dev/null
+++ b/pysyndna/src/quant_orfs.py
@@ -0,0 +1,336 @@
+import biom.table
+import pandas
+from pysyndna.src.util import calc_copies_genomic_element_per_g_series, \
+    calc_gs_genomic_element_in_aliquot, \
+    validate_required_columns_exist, \
+    validate_metadata_vs_reads_id_consistency, \
+    validate_metadata_vs_prep_id_consistency, SAMPLE_ID_KEY, \
+    SAMPLE_IN_ALIQUOT_MASS_G_KEY, ELUTE_VOL_UL_KEY, RNA_BASE_G_PER_MOLE, \
+    REQUIRED_SAMPLE_INFO_KEYS
+
+OGU_ORF_ID_KEY = "ogu_orf_id" +OGU_ORF_START_KEY = "ogu_orf_start" +OGU_ORF_END_KEY = "ogu_orf_end" +OGU_ORF_LEN_KEY = "ogu_orf_len" +COPIES_PER_G_OGU_ORF_SSRNA_KEY = "copies_per_g_ogu_orf_ss_rna" +TOTAL_BIOLOGICAL_READS_KEY = "total_biological_reads_r1r2" +SSRNA_CONCENTRATION_NG_UL_KEY = "total_rna_concentration_ng_ul" +SSRNA_FROM_ALIQUOT_MASS_G_KEY = "ssrna_from_aliquot_mass_g" +REQUIRED_RNA_PREP_INFO_KEYS = [SAMPLE_ID_KEY, SSRNA_CONCENTRATION_NG_UL_KEY, + ELUTE_VOL_UL_KEY, TOTAL_BIOLOGICAL_READS_KEY] + + +def _read_ogu_orf_coords_to_df(wol_reannotations_fp: str) -> pandas.DataFrame: + """Read the OGU+ORF coordinates file into a DataFrame. + + Parameters + ---------- + wol_reannotations_fp : str + Filepath to the ORF coordinates file in the wol reannotations format, e.g.: + >G000005825 + 1 816 2168 + 2 2348 3490 + 3 3744 3959 + 4 3971 5086 + 5 5098 5373 + 6 5432 7372 + 7 7399 9966 + + Returns + ------- + ogu_orf_coords_df : pandas.DataFrame + A DataFrame containing columns for OGU_ORF_ID_KEY, OGU_ORF_START_KEY, + and OGU_ORF_END_KEY. + """ + curr_ogu_id, curr_ogu_orf_id = None, None + curr_ogu_orf_start, curr_ogu_orf_end = None, None + ogu_orf_ids, ogu_orf_starts, ogu_orf_ends = [], [], [] + + with open(wol_reannotations_fp, "r") as fh: + for line in fh.readlines(): + line = line.strip() + if line.startswith(">G"): + curr_ogu_id = line.replace(">", "") + else: + line_pieces = line.split("\t") + curr_orf_id = line_pieces[0] + curr_ogu_orf_start = int(line_pieces[1]) + curr_ogu_orf_end = int(line_pieces[2]) + curr_ogu_orf_id = curr_ogu_id + "_" + curr_orf_id + ogu_orf_ids.append(curr_ogu_orf_id) + ogu_orf_starts.append(curr_ogu_orf_start) + ogu_orf_ends.append(curr_ogu_orf_end) + # endif what to do with this line + # next line + + ogu_orf_coords_dict = { + OGU_ORF_ID_KEY: ogu_orf_ids, + OGU_ORF_START_KEY: ogu_orf_starts, + OGU_ORF_END_KEY: ogu_orf_ends + } + coords_df = pandas.DataFrame(ogu_orf_coords_dict) + return coords_df + + +def _calc_ogu_orf_copies_per_g_from_coords( + ogu_orf_coords_df: pandas.DataFrame) -> pandas.DataFrame: + """Calculate the copies per gram of each OGU+ORF ssRNA. + + Note that this not (necessarily) the same as the copies per gram of the + ssRNA *transcript* containing each OGU+ORF, since the latter might also + contain other OGU+ORFs and thus be heavier. + Parameters + ---------- + ogu_orf_coords_df : pandas.DataFrame + A DataFrame with columns for OGU_ORF_ID_KEY, OGU_ORF_START_KEY, and + OGU_ORF_END_KEY. + + Returns + ------- + ogu_orf_copies_per_g_df: pandas.DataFrame + A DataFrame with columns for OGU_ORF_ID_KEY and + COPIES_PER_G_OGU_ORF_SSRNA_KEY. 
+ """ + + output_df = ogu_orf_coords_df.copy() + + # calculate the length of each OGU+ORF ssRNA: + # abs(ogu_orf_end - ogu_orf_start) + 1 + # abs because sometimes the start is greater than the end, + # +1 because the length is inclusive + output_df[OGU_ORF_LEN_KEY] = \ + output_df[OGU_ORF_END_KEY] - \ + output_df[OGU_ORF_START_KEY] + output_df[OGU_ORF_LEN_KEY] = \ + output_df[OGU_ORF_LEN_KEY].abs() + output_df[OGU_ORF_LEN_KEY] = \ + output_df[OGU_ORF_LEN_KEY] + 1 + + # calculate the copies per gram of each OGU+ORF ssRNA + ogu_orf_copies_per_g_series = calc_copies_genomic_element_per_g_series( + output_df[OGU_ORF_LEN_KEY], RNA_BASE_G_PER_MOLE) + + output_df[COPIES_PER_G_OGU_ORF_SSRNA_KEY] = \ + ogu_orf_copies_per_g_series + output_df.index = output_df[OGU_ORF_ID_KEY] + + return output_df + + +def _calc_copies_of_ogu_orf_ssrna_per_g_sample( + quant_params_per_sample_df: pandas.DataFrame, + reads_per_ogu_orf_per_sample_biom: biom.Table, + ogu_orf_copies_per_g_ssrna_df: pandas.DataFrame) -> biom.Table: + + """Calculate the copies of each OGU+ORF ssRNA per gram of sample. + + Parameters + ---------- + quant_params_per_sample_df : pandas.DataFrame + A DataFrame containing at least SAMPLE_ID_KEY, + SAMPLE_IN_ALIQUOT_MASS_G_KEY, SSRNA_CONCENTRATION_NG_UL_KEY, + ELUTE_VOL_UL_KEY, and TOTAL_BIOLOGICAL_READS_KEY. + reads_per_ogu_orf_per_sample_biom : biom.Table + A biom.Table with the number of reads per OGU+ORF per sample, such + as that output by woltka. + ogu_orf_copies_per_g_ssrna_df: pandas.DataFrame + A DataFrame with columns for OGU_ORF_ID_KEY and + COPIES_PER_G_OGU_ORF_SSRNA_KEY. + + Returns + ------- + copies_of_ogu_orf_ssrna_per_g_sample : biom.Table + A biom.Table with the copies of each OGU+ORF ssRNA per gram of sample. + """ + + # turn REQUIRED_SAMPLE_INFO_KEYS and REQUIRED_RNA_PREP_INFO_KEYS into sets + # and combine them into a single set, then turn it back into a list + required_cols_list = list( + set(REQUIRED_SAMPLE_INFO_KEYS) | set(REQUIRED_RNA_PREP_INFO_KEYS)) + validate_required_columns_exist( + quant_params_per_sample_df, required_cols_list, + "parameters dataframe is missing required column(s)") + + # validate that the sample ids in the quant_params_per_sample_df match the + # sample ids in the reads_per_ogu_orf_per_sample_biom. Ignore sample ids + # in the quant_params_per_sample_df that are not in the biom table; those + # could just be samples that failed sequencing/etc. + _ = validate_metadata_vs_reads_id_consistency( + quant_params_per_sample_df, reads_per_ogu_orf_per_sample_biom) + + # Set index on quant_params_per_sample_df to be SAMPLE_ID_KEY for easy + # lookup of values by sample id during biom lambda functions + quant_params_per_sample_df.index = \ + quant_params_per_sample_df[SAMPLE_ID_KEY] + + # Calculate the grams of total ssRNA from each sample that are in the elute + # after extraction + g_total_ssrna_per_sample_df = calc_gs_genomic_element_in_aliquot( + quant_params_per_sample_df, SSRNA_CONCENTRATION_NG_UL_KEY, + SSRNA_FROM_ALIQUOT_MASS_G_KEY) + + # step 1 of OGU+ORF quantitation is upstream of this function: + # Run woltka to get the reads_per_ogu_orf_per_sample_biom. + # Calculations below are done directly on biom tables, since they are + # expected to be very large and very sparse. + + # step 2: + # Calculate fraction of total biological reads per OGU+ORF per sample: + # Divide every value in reads_per_ogu_orf_per_sample_biom by the + # value of the TOTAL_BIOLOGICAL_READS_KEY for that value's OGU_ORF_ID_KEY + # in quant_params_per_sample_df. 
+ # See https://biom-format.org/documentation/generated/biom.table.Table.transform.html + # for details of how to write and use a function for biom.transform(). + def get_fraction_of_sample_reads(data, id_, _): + # df.at[] is fast to get a single value by a row/column label pair + return data / quant_params_per_sample_df.at[id_, TOTAL_BIOLOGICAL_READS_KEY] + fraction_of_sample_reads_per_sample_biom = \ + reads_per_ogu_orf_per_sample_biom.transform( + f=get_fraction_of_sample_reads, axis='sample', inplace=False) + + # step 3: + # Calculate grams of ssRNA per OGU+ORF per sample: + # Multiply the fraction of total biological reads per OGU+ORF per sample + # by the total grams of ssRNA from each sample that are in the elute after + # extraction. + def get_ogu_orf_ssrna_g_in_sample(data, id_, _): + return data * g_total_ssrna_per_sample_df.at[id_, SSRNA_FROM_ALIQUOT_MASS_G_KEY] + g_ssrna_per_ogu_orf_per_sample_biom = \ + fraction_of_sample_reads_per_sample_biom.transform( + f=get_ogu_orf_ssrna_g_in_sample, axis='sample', inplace=False) + + # step 4: + # Calculate copies per OGU+ORF per sample + # Multiply the grams of ssRNA of each OGU+ORF per sample by the copies per + # gram of each OGU+ORF ssRNA. + # This gives you the copies of each OGU+ORF ssRNA present in the whole + # extracted sample. + def get_copies_per_ogu_orf_per_sample(data, id_, _): + return data * ogu_orf_copies_per_g_ssrna_df.at[id_, COPIES_PER_G_OGU_ORF_SSRNA_KEY] + copies_per_ogu_orf_per_sample_biom = \ + g_ssrna_per_ogu_orf_per_sample_biom.transform( + f=get_copies_per_ogu_orf_per_sample, axis='observation', inplace=False) + + # Step 5: + # Calculate the copies of each OGU+ORF ssRNA per gram of sample material + # Divide the copies per OGU+ORF in each extracted sample by the grams of + # sample material put into the extraction for the relevant sample + def get_copies_per_g_sample(data, id_, _): + return data / quant_params_per_sample_df.at[id_, SAMPLE_IN_ALIQUOT_MASS_G_KEY] + copies_of_ogu_orf_ssrna_per_g_sample_biom = \ + copies_per_ogu_orf_per_sample_biom.transform( + f=get_copies_per_g_sample, axis='sample', inplace=False) + + return copies_of_ogu_orf_ssrna_per_g_sample_biom + + +def calc_copies_of_ogu_orf_ssrna_per_g_sample( + quant_params_per_sample_df: pandas.DataFrame, + reads_per_ogu_orf_per_sample_biom: biom.Table, + ogu_orf_coords_fp: str) -> biom.Table: + """Calculate the copies of each OGU+ORF ssRNA per gram of sample. + + Parameters + ---------- + quant_params_per_sample_df : pandas.DataFrame + A DataFrame containing at least SAMPLE_ID_KEY, + SAMPLE_IN_ALIQUOT_MASS_G_KEY, SSRNA_CONCENTRATION_NG_UL_KEY, + ELUTE_VOL_UL_KEY, and TOTAL_BIOLOGICAL_READS_KEY. + reads_per_ogu_orf_per_sample_biom : biom.Table + A biom.Table with the number of reads per OGU+ORF per sample, such + as that output by woltka. + ogu_orf_coords_fp : str + Filepath to the OGU+ORF coordinates file, such as the coords.txt + file used by woltka, in the format shown below: + >G000005825 + 1 816 2168 + 2 2348 3490 + 3 3744 3959 + 4 3971 5086 + 5 5098 5373 + 6 5432 7372 + 7 7399 9966 + + + Returns + ------- + copies_of_ogu_orf_ssrna_per_g_sample : biom.Table + A biom.Table with the copies of each OGU+ORF ssRNA per gram of sample. 
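+
+    Example usage (editorial sketch; the file paths and loading steps here
+    are hypothetical, not from the original source):
+    >>> params_df = pandas.read_csv("quant_params.tsv", sep="\t")
+    >>> reads_biom = biom.load_table("ogu_orf_read_counts.biom")
+    >>> result_biom = calc_copies_of_ogu_orf_ssrna_per_g_sample(
+    ...     params_df, reads_biom, "coords.txt")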
+ """ + + # Calculate the copies per gram of each OGU+ORF ssRNA + ogu_orf_coords_df = _read_ogu_orf_coords_to_df(ogu_orf_coords_fp) + ogu_orf_copies_per_g_ssrna_df = _calc_ogu_orf_copies_per_g_from_coords( + ogu_orf_coords_df) + + copies_of_ogu_orf_ssrna_per_g_sample_biom = \ + _calc_copies_of_ogu_orf_ssrna_per_g_sample( + quant_params_per_sample_df, reads_per_ogu_orf_per_sample_biom, + ogu_orf_copies_per_g_ssrna_df) + + return copies_of_ogu_orf_ssrna_per_g_sample_biom + + +def calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita( + sample_info_df: pandas.DataFrame, + prep_info_df: pandas.DataFrame, + reads_per_ogu_orf_per_sample_biom: biom.Table, + ogu_orf_coords_fp: str) -> biom.Table: + + """Calculate the copies of each OGU+ORF ssRNA per gram of sample for Qiita. + + Parameters + ---------- + sample_info_df : pandas.DataFrame + A DataFrame containing sample info for all samples in the prep, + including SAMPLE_ID_KEY and SAMPLE_IN_ALIQUOT_MASS_G_KEY + prep_info_df : pandas.DataFrame + A DataFrame containing prep info for all samples in the prep, + including SAMPLE_ID_KEY, SSRNA_CONCENTRATION_NG_UL_KEY, + ELUTE_VOL_UL_KEY, and TOTAL_BIOLOGICAL_READS_KEY. + reads_per_ogu_orf_per_sample_biom : biom.Table + A biom.Table with the number of reads per OGU+ORF per sample, such + as that output by woltka. + ogu_orf_coords_fp : str + Filepath to the OGU+ORF coordinates file, such as the coords.txt + file used by woltka, in the format shown below: + >G000005825 + 1 816 2168 + 2 2348 3490 + 3 3744 3959 + 4 3971 5086 + 5 5098 5373 + 6 5432 7372 + 7 7399 9966 + + + Returns + ------- + copies_of_ogu_orf_ssrna_per_g_sample : biom.Table + A biom.Table with the copies of each OGU+ORF ssRNA per gram of sample. + """ + + # check if the inputs all have the required columns + validate_required_columns_exist( + sample_info_df, REQUIRED_SAMPLE_INFO_KEYS, + "sample info is missing required column(s)") + + validate_required_columns_exist( + prep_info_df, REQUIRED_RNA_PREP_INFO_KEYS, + "prep info is missing required column(s)") + + # validate that the sample ids in the sample_info_df match the sample ids + # in the prep_info_df. Ignore sample ids in sample_info_df that are not in + # the prep_info_df; these could just not be included in this prep. 
+    _ = validate_metadata_vs_prep_id_consistency(
+        sample_info_df, prep_info_df)
+
+    quant_params_per_sample_df = prep_info_df.merge(
+        sample_info_df, on=SAMPLE_ID_KEY, how="inner")
+
+    copies_of_ogu_orf_ssrna_per_g_sample_biom = \
+        calc_copies_of_ogu_orf_ssrna_per_g_sample(
+            quant_params_per_sample_df, reads_per_ogu_orf_per_sample_biom,
+            ogu_orf_coords_fp)
+
+    return copies_of_ogu_orf_ssrna_per_g_sample_biom
diff --git a/pysyndna/src/util.py b/pysyndna/src/util.py
new file mode 100644
index 0000000..786862f
--- /dev/null
+++ b/pysyndna/src/util.py
@@ -0,0 +1,274 @@
+from typing import Optional, Union, List
+
+import biom
+import pandas as pd
+
+DNA_BASEPAIR_G_PER_MOLE = 650
+RNA_BASE_G_PER_MOLE = 340
+NANOGRAMS_PER_GRAM = 1e9
+
+# NB: sample_name instead of sample_id because that's what qiita uses
+SAMPLE_ID_KEY = 'sample_name'
+SAMPLE_IN_ALIQUOT_MASS_G_KEY = 'calc_mass_sample_aliquot_input_g'
+ELUTE_VOL_UL_KEY = 'vol_extracted_elution_ul'
+REQUIRED_SAMPLE_INFO_KEYS = [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY]
+
+
+def _validate_sample_id_consistency(
+        sample_ids_in_metadata: set,
+        sample_ids_in_data: set,
+        metadata_name: str,
+        data_set_name: str) \
+        -> Union[List[str], None]:
+    """
+    Checks that the sample ids in the metadata and data are consistent.
+
+    Parameters
+    ----------
+    sample_ids_in_metadata: set
+        A set of the sample ids in the metadata
+    sample_ids_in_data: set
+        A set of the sample ids in the data
+    metadata_name: str
+        A string identifying the metadata being checked, for use in error
+        messages.
+    data_set_name: str
+        A string identifying the data set being checked, for use in error
+        messages.
+
+    Raises
+    ------
+    ValueError
+        If there are sample ids in the data that aren't in the metadata
+
+    Returns
+    -------
+    missing_sample_ids : List[str] | None
+        List of sample ids that are in the metadata but not in the data;
+        None if all sample ids in the metadata were in the data.
+    """
+
+    # if there are sample ids in the data that are not in the metadata, raise
+    # an error, since we don't know how to process that
+    data_only_samples = sample_ids_in_data - sample_ids_in_metadata
+    if len(data_only_samples) > 0:
+        raise ValueError(
+            f"Found sample ids in {data_set_name} that were "
+            f"not in {metadata_name}: {data_only_samples}")
+
+    # check if there are sample ids in the metadata that are not in the data
+    # and, if so, capture a list of them. Sometimes a sample just fails
+    # sequencing, and that shouldn't preclude processing the others that did
+    # work, but we want to know about it.
+    missing_sample_ids_set = sample_ids_in_metadata - sample_ids_in_data
+
+    if len(missing_sample_ids_set) > 0:
+        missing_sample_ids = list(missing_sample_ids_set)
+    else:
+        missing_sample_ids = None
+
+    return missing_sample_ids
+
+
+def validate_required_columns_exist(
+        input_df: pd.DataFrame,
+        required_cols_list: List[str],
+        error_msg: str):
+
+    """Checks that the input dataframe has the required columns.
+
+    Parameters
+    ----------
+    input_df: pd.DataFrame
+        A Dataframe to be checked.
+    required_cols_list: list[str]
+        List of column names that must be present in the dataframe.
+    error_msg: str
+        Error message to be raised if any of the required columns are
+        missing.
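+
+    Example (editorial sketch, not from the original source):
+    >>> df = pd.DataFrame({'sample_name': ['s1']})
+    >>> validate_required_columns_exist(
+    ...     df, ['sample_name', 'mass_g'], "info is missing")
+    Traceback (most recent call last):
+    ...
+    ValueError: info is missing: ['mass_g']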
+ """ + + missing_cols = set(required_cols_list) - set(input_df.columns) + if len(missing_cols) > 0: + missing_cols = sorted(missing_cols) + raise ValueError( + f"{error_msg}: {missing_cols}") + + +def validate_metadata_vs_reads_id_consistency( + metadata_df: pd.DataFrame, + reads_df: Union[pd.DataFrame, biom.Table]) \ + -> Union[List[str], None]: + """ + Checks that the sample ids in the sample metadata and data are consistent. + + Parameters + ---------- + metadata_df: pd.DataFrame + A Dataframe containing at least SAMPLE_ID_KEY column + reads_df: pd.DataFrame | biom.Table + Either a Dataframe with a column for each SAMPLE_ID_KEY or a biom.Table + with a column for each SAMPLE_ID_KEY + + Raises + ------ + ValueError + If there are sample ids in the data that aren't in the metadata df + + Returns + ------- + missing_sample_ids : List[str] | None + List of sample ids that are in the sample info but not in the + data. None if all sample ids in the experiment info were in the data. + """ + + sample_ids_in_metadata = set(metadata_df[SAMPLE_ID_KEY]) + if isinstance(reads_df, biom.Table): + sample_ids_in_reads = set(reads_df.ids(axis='sample')) + else: + sample_ids_in_reads = set(reads_df.columns) + missing_reads_ids = _validate_sample_id_consistency( + sample_ids_in_metadata, sample_ids_in_reads, "sample info", + "reads data") + + return missing_reads_ids + + +def validate_metadata_vs_prep_id_consistency( + metadata_df: pd.DataFrame, + prep_df: pd.DataFrame) \ + -> Union[List[str], None]: + """ + Checks that sample ids in the sample metadata and prep info are consistent. + + Parameters + ---------- + metadata_df: pd.DataFrame + A Dataframe of sample metadata containing at least SAMPLE_ID_KEY column + prep_df: pd.DataFrame + A Dataframe of prep info with a column for SAMPLE_ID_KEY + + Raises + ------ + ValueError + If there are sample ids in prep info that aren't in sample metadata + + Returns + ------- + missing_sample_ids : List[str] | None + List of sample ids that are in the sample metadata but not in the + prep info. None if all sample ids in the sample metadata were in the + prep info. + """ + + sample_ids_in_metadata = set(metadata_df[SAMPLE_ID_KEY]) + sample_ids_in_prep = set(prep_df[SAMPLE_ID_KEY]) + missing_prep_ids = _validate_sample_id_consistency( + sample_ids_in_metadata, sample_ids_in_prep, + "sample info", "prep info") + return missing_prep_ids + + +def calc_copies_genomic_element_per_g_series( + genomic_elements_lengths_series: pd.Series, + genomic_element_unit_avg_g_per_mole: float, + is_test: Optional[bool] = False) -> pd.Series: + + """Calculates copies of genomic unit per gram of genomic element's unit. + + For example, get copies of OGU genomes per gram of double-stranded OGU gDNA + or copies of OGU+ORF RNAs per gram of single-stranded OGU+ORF RNA. + + Parameters + ---------- + genomic_elements_lengths_series: pd.Series + A Series with index identifying each genomic element, containing length + of each element in genomic element units. For example, length in DNA + basepairs for OGUs or length in (single-stranded) RNA bases for + OGU+ORF RNAs. + genomic_element_unit_avg_g_per_mole: float + Average mass in grams per mole of a genomic element unit. For example, + 650 g/mole for a DNA basepair or 340 g/mole for an RNA base. + is_test: Optional[bool] + Default is False. 
+        If True, the function will use the less-precise value of Avogadro's
+        number (6.022*(10^23)) used in cell [16] of the
+        https://github.com/lzaramela/SynDNA/blob/main/SynDNA_saliva_samples_analysis.ipynb
+        notebook, rather than the more precise value (6.02214076*(10^23))
+        used if False. Set this to True in testing ONLY.
+
+    Returns
+    -------
+    copies_per_g_series : pd.Series
+        A Series with the same index as genomic_elements_lengths_series,
+        holding the number of copies of each genomic element per gram of
+        genomic element units.
+
+    Terminology:
+    genomic_element: a distinct element measured on a genome, such as an OGU
+        (i.e., the whole genome) or an ORF on an OGU (called "OGU+ORF")
+    genomic_element_unit: the units in which the genomic element is measured;
+        in the case of OGUs, this is DNA basepairs, while in the case of
+        OGU+ORFs, the units are RNA bases (i.e., single-stranded).
+
+    This calculates the total number of copies of genomic element X per gram
+    of genomic element units by the equation:
+
+          Avogadro's number in (copies of genomic element X)/mole
+        = -------------------------------------------------------
+          (length of genomic element X in genomic element units) *
+          (average g/mole per genomic element unit)
+
+    Avogadro's number is 6.02214076 × 10^23, and is the number of
+    molecules--such as OGU genomes or OGU+ORF RNAs--in a mole of the genomic
+    element.
+    """
+
+    # Avogadro's number is famously a constant, so it seems odd to assign it
+    # to a variable, but doing so lets the tests swap in the truncated value
+    # below.
+    avogadros_num = 6.02214076e23
+    # The truncated value exists so we can test against Livia's results,
+    # which use a truncated version of the constant. This should NOT be done
+    # in production. In testing, it makes a difference of, e.g., about 10
+    # cells out of 25K for the first OGU in the first sample in Livia's
+    # dataset.
+    if is_test:
+        avogadros_num = 6.022e23
+
+    # TODO: do we have to worry about integer overflow here?
+    #  Dan H. said, "if you use ints, the length * 650 * 10^9
+    #  can overflow integers with very long genomes". HOWEVER,
+    #  the internet says that in Python *3*, "[o]nly floats have a hard
+    #  limit in python. Integers are implemented as 'long' integer
+    #  objects of arbitrary size" (https://stackoverflow.com/a/52151786).
+    #  HOWEVER HOWEVER, *numpy* integer types are fixed width, and
+    #  "Some pandas and numpy functions, such as sum on arrays or
+    #  Series return an np.int64 so this might be the reason you are
+    #  seeing int overflows in Python3."
+    #  (https://stackoverflow.com/a/58640340)
+    #  What to do?
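+    #  Editorial note (a suggestion, not from the original source): one
+    #  defensive option is to cast the lengths to float64 before
+    #  multiplying, e.g.
+    #      genomic_elements_lengths_series = \
+    #          genomic_elements_lengths_series.astype('float64')
+    #  since float64 easily holds these magnitudes.
+    # Editorial worked example (hypothetical numbers): a 1,000-bp OGU at
+    # 650 g/mole per basepair gives 6.02214076e23 / (1000 * 650)
+    # ≈ 9.265e17 genome copies per gram of double-stranded gDNA.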
+
+    denominator_series = \
+        genomic_elements_lengths_series * genomic_element_unit_avg_g_per_mole
+
+    copies_of_genomic_element_per_g_of_genomic_element_unit = \
+        avogadros_num / denominator_series
+
+    return copies_of_genomic_element_per_g_of_genomic_element_unit
+
+
+def calc_gs_genomic_element_in_aliquot(
+        genomic_elements_df: pd.DataFrame,
+        genomic_element_conc_key: str,
+        genomic_element_mass_key: str) -> pd.DataFrame:
+
+    """Calculates the grams of a genomic element in each sample's elute.
+
+    Parameters
+    ----------
+    genomic_elements_df: pd.DataFrame
+        A Dataframe containing at least genomic_element_conc_key and
+        ELUTE_VOL_UL_KEY columns.
+    genomic_element_conc_key: str
+        Name of the column holding the concentration of the genomic element
+        in ng/uL.
+    genomic_element_mass_key: str
+        Name of the column to add, holding the calculated mass in g.
+
+    Returns
+    -------
+    working_df : pd.DataFrame
+        A copy of genomic_elements_df with genomic_element_mass_key added.
+    """
+
+    working_df = genomic_elements_df.copy()
+
+    # get the total grams of the genomic element that are in the elute after
+    # extraction; this is sample-specific:
+    # concentration of genomic element after extraction in ng/uL times
+    # volume of elute from the extraction in uL, divided by 10^9 ng/g
+    # (which is the same as multiplied by 1/10^9 g/ng)
+    working_df[genomic_element_mass_key] = \
+        working_df[genomic_element_conc_key] * \
+        working_df[ELUTE_VOL_UL_KEY] / NANOGRAMS_PER_GRAM
+
+    return working_df
diff --git a/pysyndna/tests/data/coords.txt b/pysyndna/tests/data/coords.txt
new file mode 100644
index 0000000..2dc6850
--- /dev/null
+++ b/pysyndna/tests/data/coords.txt
@@ -0,0 +1,12 @@
+>G000005825
+1	816	2168
+2	2348	3490
+3	3744	3959
+4	3971	5086
+5	5098	5373
+>G900163845
+3247	3392209	3390413
+3248	3393051	3392206
+3249	3393938	3393048
+3250	3394702	3393935
+3251	3395077	3395721
\ No newline at end of file
diff --git a/pysyndna/tests/test_calc_cell_counts.py b/pysyndna/tests/test_calc_cell_counts.py
index ab0468b..18edf78 100644
--- a/pysyndna/tests/test_calc_cell_counts.py
+++ b/pysyndna/tests/test_calc_cell_counts.py
@@ -9,15 +9,15 @@ from unittest import TestCase
 from pysyndna import calc_ogu_cell_counts_biom, \
     calc_ogu_cell_counts_per_g_of_sample_for_qiita
-from pysyndna.src.fit_syndna_models import SAMPLE_ID_KEY
-from pysyndna.src.calc_cell_counts import OGU_ID_KEY, OGU_READ_COUNT_KEY, \
+from pysyndna.src.calc_cell_counts import SAMPLE_ID_KEY, ELUTE_VOL_UL_KEY, \
+    OGU_ID_KEY, OGU_READ_COUNT_KEY, \
     OGU_LEN_IN_BP_KEY, OGU_GDNA_MASS_NG_KEY, \
     SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY, OGU_GENOMES_PER_G_OF_GDNA_KEY, \
     OGU_CELLS_PER_G_OF_GDNA_KEY, SYNDNA_POOL_MASS_NG_KEY, \
     GDNA_CONCENTRATION_NG_UL_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, \
-    ELUTE_VOL_UL_KEY, GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY, \
+    GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY, \
     OGU_CELLS_PER_G_OF_SAMPLE_KEY, TOTAL_OGU_READS_KEY, OGU_COVERAGE_KEY, \
-    CELL_COUNT_RESULT_KEY, CELL_COUNT_LOG_KEY, \
+    CELL_COUNT_RESULT_KEY, CELL_COUNT_LOG_KEY, SAMPLE_TOTAL_READS_KEY, \
     _calc_long_format_ogu_cell_counts_df, \
     _prepare_cell_counts_calc_df, \
     _calc_ogu_cell_counts_df_for_sample, \
@@ -64,12 +64,15 @@ class TestCalcCellCounts(TestCase):
         SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY: [5, 4.76],
         GDNA_CONCENTRATION_NG_UL_KEY: [2, 1.4],
         ELUTE_VOL_UL_KEY: [100, 100],
-        SYNDNA_POOL_MASS_NG_KEY: [0.25, 0.238]
+        SYNDNA_POOL_MASS_NG_KEY: [0.25, 0.238],
     }
 
-    # Values from "absolute_quant_example.xlsx"
-    mass_ratio_dict = {
+    # Values from "absolute_quant_example.xlsx" EXCEPT for the
+    # SAMPLE_TOTAL_READS_KEY values, which come from summing
+    # the OGU_READ_COUNT_KEY values for each sample
+    mass_and_totals_dict = {
         SAMPLE_ID_KEY: ["example1", "example2"],
+        SAMPLE_TOTAL_READS_KEY: [472140, 611913],
         SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY: [5, 4.76],
         GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY: [7.1867431342E-06,
                                              4.7470988923E-06]
@@ -245,7 +248,7 @@ class TestCalcCellCounts(TestCase):
             "Escherichia coli", "Tyzzerella nexilis", "Prevotella sp. 
oral taxon 299", "Streptococcus mitis", "Leptolyngbya valderiana", - #"Neisseria subflava", + # "Neisseria subflava", "Neisseria flavescens", "Fusobacterium periodonticum", "Streptococcus pneumoniae", @@ -261,7 +264,7 @@ class TestCalcCellCounts(TestCase): 1975, # 0, 22303, 197830, 14478, - #12, + # 12, 14609], # These count values are the same as those in # self.example1_ogu_full_outputs_full_avogadro_dict @@ -323,14 +326,16 @@ class TestCalcCellCounts(TestCase): # self.example2_ogu_filtered_inputs_outputs_full_avogadro_dict. Note # that with reordering, the 4th sub-array is the one for L. gasseri, # the 5th is for L. valderiana, and the 9th is for R. albus. + # The two 0 values are for N. subflava and H. influenzae, which were + # removed from example2 data due to low coverage. OGU_CELLS_PER_G_OF_GDNA_KEY: [ [21897704979729.094, 7101240813289.261], [7100063146106.998, 40527863244164.32], - [5718752608946.0205, np.nan], + [5718752608946.0205, 0], [52695192015949.67, 17086455403978.045], [11223075218306.252, 3613767901730.258], [9289882608698.639, 3004973286163.8184], - [10879422748260.775, np.nan], + [10879422748260.775, 0], [12674159207435.06, 4102264162505.8833], [27710822536547.69, 8987677125515.266], [11582576292095.531, 3747369928484.789], @@ -392,9 +397,6 @@ def _generate_sample_names_list(self, use_filtered_ex2=True): output.extend(curr_names_list) return output - def setUp(self): - self.test_data_dir = os.path.join(os.path.dirname(__file__), 'data') - # The built-in self.assertEqual works fine to compare biom tables that # don't have NaNs, but it doesn't work for tables that do have NaNs # because NaN != NaN so two tables that contain NaNs are by definition @@ -423,18 +425,23 @@ def assert_biom_tables_equal(self, expected_out_biom, output_biom, output_biom.matrix_data.data[obs_an], decimal=decimal_precision) + def setUp(self): + self.test_data_dir = os.path.join(os.path.dirname(__file__), 'data') + def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita(self): # example4 is the same as example2 except that the elute volume is 70; # see "absolute_quant_example.xlsx" for details. example4_elute_vol = 70 sample_ids = ["example1", "example4"] - sample_info_dict = {k: self.sample_and_prep_input_dict[k].copy() for k in - [SAMPLE_IN_ALIQUOT_MASS_G_KEY]} + sample_info_dict = {k: self.sample_and_prep_input_dict[k].copy() for + k in [SAMPLE_IN_ALIQUOT_MASS_G_KEY]} sample_info_dict[SAMPLE_ID_KEY] = sample_ids prep_info_dict = {k: self.sample_and_prep_input_dict[k].copy() for k in [GDNA_CONCENTRATION_NG_UL_KEY, - ELUTE_VOL_UL_KEY, SYNDNA_POOL_MASS_NG_KEY]} + ELUTE_VOL_UL_KEY, SYNDNA_POOL_MASS_NG_KEY]} + prep_info_dict[SAMPLE_TOTAL_READS_KEY] = \ + self.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] prep_info_dict[SAMPLE_ID_KEY] = sample_ids prep_info_dict[ELUTE_VOL_UL_KEY][1] = example4_elute_vol @@ -445,7 +452,7 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita(self): # those in self.example1_ogu_full_outputs_full_avogadro_dict because # the gdna-to-sample mass ratio calculated internally during this # soup-to-nuts function has more digits past the decimal than does the - # example1 entry in the manually-populated self.mass_ratio_dict. + # example1 entry in the manually-populated self.mass_and_totals_dict. 
# Since we are multiplying/dividing by large numbers like e.g., 10^9 # (to change ng to g), this ends up making a slight difference in the # end product: for example, for L.gasseri, @@ -463,11 +470,11 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita(self): ogu_cell_counts_per_g_sample = np.array([ [157373183.3914873, 23597204.3149076], [51026330.8697321, 134672840.2210325], - [41099206.6945521, np.nan], + [41099206.6945521, 0], [378706815.3787082, 56777764.5887874], [80657360.0375914, 12008439.3369959], [66764001.1050239, 9985433.5965833], - [78187617.9691203, np.nan], + [78187617.9691203, 0], [91085928.0975326, 13631697.3528372], [199150566.7379318, 29865774.0278729], [83241001.9519951, 12452394.7533948], @@ -546,6 +553,8 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_prep_err(self): # missing required columns prep_info_dict = {k: self.sample_and_prep_input_dict[k] for k in [SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY]} + prep_info_dict[SAMPLE_TOTAL_READS_KEY] = \ + self.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] counts_vals = self._make_combined_counts_np_array() @@ -569,11 +578,50 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_prep_err(self): sample_info_df, prep_info_df, models_fp, counts_biom, lengths_fp, read_len, min_coverage, min_rsquared) + def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_ids_err(self): + sample_info_dict = {k: self.sample_and_prep_input_dict[k] for k in + [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY]} + + prep_info_dict = {k: self.sample_and_prep_input_dict[k] for k in + [SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, + ELUTE_VOL_UL_KEY, SYNDNA_POOL_MASS_NG_KEY]} + prep_info_dict[SAMPLE_TOTAL_READS_KEY] = \ + self.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] + + counts_vals = self._make_combined_counts_np_array() + + # remove one of the sample ids from the sample info; this will cause + # an error (whereas the reverse--sample id in sample info but not in + # prep info--will NOT) + sample_info_df = pd.DataFrame(sample_info_dict) + sample_info_df.drop(index=0, axis=0, inplace=True) + + prep_info_df = pd.DataFrame(prep_info_dict) + counts_biom = biom.table.Table( + counts_vals, + self.ogu_lengths_dict[OGU_ID_KEY], + prep_info_dict[SAMPLE_ID_KEY]) + models_fp = os.path.join(self.test_data_dir, "models.yml") + lengths_fp = os.path.join(self.test_data_dir, "ogu_lengths.tsv") + + read_len = 150 + min_coverage = 1 + min_rsquared = 0.8 + + err_msg = (r"Found sample ids in prep info that were not in" + r" sample info: \{'example1'\}") + with self.assertRaisesRegex(ValueError, err_msg): + calc_ogu_cell_counts_per_g_of_sample_for_qiita( + sample_info_df, prep_info_df, models_fp, counts_biom, + lengths_fp, read_len, min_coverage, min_rsquared) + def test_calc_ogu_cell_counts_biom(self): params_dict = {k: self.sample_and_prep_input_dict[k] for k in [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, GDNA_CONCENTRATION_NG_UL_KEY, ELUTE_VOL_UL_KEY, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY]} + params_dict[SAMPLE_TOTAL_READS_KEY] = \ + self.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] counts_vals = self._make_combined_counts_np_array() @@ -616,6 +664,69 @@ def test_calc_ogu_cell_counts_biom(self): "'example2;Haemophilus influenzae']"], output_msgs) + def test_calc_ogu_cell_counts_biom_w_col_err(self): + # missing SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY col + params_dict = {k: self.sample_and_prep_input_dict[k] for k in + [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, + GDNA_CONCENTRATION_NG_UL_KEY, ELUTE_VOL_UL_KEY]} + params_dict[SAMPLE_TOTAL_READS_KEY] 
= \ + self.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] + + counts_vals = self._make_combined_counts_np_array() + + params_df = pd.DataFrame(params_dict) + counts_biom = biom.table.Table( + counts_vals, + self.ogu_lengths_dict[OGU_ID_KEY], + params_dict[SAMPLE_ID_KEY]) + lengths_df = pd.DataFrame(self.ogu_lengths_dict) + + read_len = 150 + min_coverage = 1 + min_rsquared = 0.8 + output_metric = OGU_CELLS_PER_G_OF_GDNA_KEY + + err_msg = r"sample info is missing required column\(s\): " \ + r"\['sequenced_sample_gdna_mass_ng'\]" + with self.assertRaisesRegex(ValueError, err_msg): + calc_ogu_cell_counts_biom( + params_df, self.linregresses_dict, counts_biom, lengths_df, + read_len, min_coverage, min_rsquared, output_metric) + + def test_calc_ogu_cell_counts_biom_w_id_err(self): + params_dict = {k: self.sample_and_prep_input_dict[k] for k in + [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, + GDNA_CONCENTRATION_NG_UL_KEY, ELUTE_VOL_UL_KEY, + SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY]} + params_dict[SAMPLE_TOTAL_READS_KEY] = \ + self.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] + + counts_vals = self._make_combined_counts_np_array() + + # remove one of the sample ids from the params info; this will cause + # an error (whereas the reverse--sample id in params info but not in + # reads data--will NOT) + params_df = pd.DataFrame(params_dict) + params_df.drop(index=0, axis=0, inplace=True) + + counts_biom = biom.table.Table( + counts_vals, + self.ogu_lengths_dict[OGU_ID_KEY], + params_dict[SAMPLE_ID_KEY]) + lengths_df = pd.DataFrame(self.ogu_lengths_dict) + + read_len = 150 + min_coverage = 1 + min_rsquared = 0.8 + output_metric = OGU_CELLS_PER_G_OF_GDNA_KEY + + err_msg = (r"Found sample ids in reads data that were not in " + r"sample info: \{'example1'\}") + with self.assertRaisesRegex(ValueError, err_msg): + calc_ogu_cell_counts_biom( + params_df, self.linregresses_dict, counts_biom, lengths_df, + read_len, min_coverage, min_rsquared, output_metric) + def test_calc_ogu_cell_counts_biom_w_cast(self): # these values are the same as those in self.sample_and_prep_input_dict # except that some of them are represented as strings instead of #s @@ -624,7 +735,8 @@ def test_calc_ogu_cell_counts_biom_w_cast(self): GDNA_CONCENTRATION_NG_UL_KEY: ["2", 1.4], SAMPLE_IN_ALIQUOT_MASS_G_KEY: [0.027829017, "0.029491697"], ELUTE_VOL_UL_KEY: ["100", "70"], - SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY: [5, "4.76"] + SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY: [5, "4.76"], + SAMPLE_TOTAL_READS_KEY: self.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] } counts_vals = self._make_combined_counts_np_array() @@ -713,7 +825,7 @@ def test__calc_long_format_ogu_cell_counts_df(self): counts_df = pd.DataFrame(counts_dict) counts_df.set_index(OGU_ID_KEY, inplace=True) - mass_ratio_df = pd.DataFrame(self.mass_ratio_dict) + per_sample_calc_info_df = pd.DataFrame(self.mass_and_totals_dict) lengths_df = pd.DataFrame(self.ogu_lengths_dict) expected_df = pd.DataFrame(expected_dict) @@ -722,8 +834,8 @@ def test__calc_long_format_ogu_cell_counts_df(self): min_rsquared = 0.8 output_df, output_msgs = _calc_long_format_ogu_cell_counts_df( - self.linregresses_dict, counts_df, lengths_df, mass_ratio_df, - read_len, min_coverage, min_rsquared) + self.linregresses_dict, counts_df, lengths_df, + per_sample_calc_info_df, read_len, min_coverage, min_rsquared) pd.testing.assert_frame_equal(expected_df, output_df) self.assertListEqual( @@ -741,7 +853,7 @@ def test__calc_long_format_ogu_cell_counts_df_error(self): self.example2_ogu_full_inputs_dict[OGU_READ_COUNT_KEY]), } - 
mass_ratio_dict = {k: self.mass_ratio_dict[k] for k in + mass_ratio_dict = {k: self.mass_and_totals_dict[k] for k in (SAMPLE_ID_KEY, GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY)} linregresses_dict = { @@ -880,7 +992,7 @@ def test__prepare_cell_counts_calc_df_v_sparse(self): def test__calc_ogu_cell_counts_df_for_sample(self): input_dict = self._combine_inputs() input_df = pd.DataFrame(input_dict) - mass_ratio_df = pd.DataFrame(self.mass_ratio_dict) + per_sample_info_df = pd.DataFrame(self.mass_and_totals_dict) expected_additions_dict = { k: self.example1_ogu_full_outputs_short_avogadro_dict[k] for k in @@ -897,7 +1009,7 @@ def test__calc_ogu_cell_counts_df_for_sample(self): min_rsquared = 0.8 output_df, output_msgs = _calc_ogu_cell_counts_df_for_sample( - sample_id, self.linregresses_dict, mass_ratio_df, input_df, + sample_id, self.linregresses_dict, per_sample_info_df, input_df, min_rsquared, is_test=True) pd.testing.assert_frame_equal(expected_out_df, output_df) @@ -949,7 +1061,7 @@ def test__calc_gdna_mass_to_sample_mass_by_sample_df(self): (SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, ELUTE_VOL_UL_KEY)} - expected_dict = {k: self.mass_ratio_dict[k] for k in + expected_dict = {k: self.mass_and_totals_dict[k] for k in (SAMPLE_ID_KEY, GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY)} inputs_df = pd.DataFrame(inputs_dict) @@ -976,6 +1088,12 @@ def test__calc_ogu_gdna_mass_ng_series_for_sample(self): slope = 1.24487652379132 intercept = -6.77539505390338 + # This number comes from summing all the reads in the input_df. + # This matches what was done for the Zaramela calculations. I + # suspect that this should perhaps be the total reads for the + # whole sample, but for testing this will do. + sample_total_reads = 472140 + input_df = pd.DataFrame(input_dict) expected_series = pd.Series( self.example1_ogu_full_outputs_short_avogadro_dict[ @@ -985,7 +1103,7 @@ def test__calc_ogu_gdna_mass_ng_series_for_sample(self): expected_series.index.name = OGU_ID_KEY output_series = _calc_ogu_gdna_mass_ng_series_for_sample( - input_df, slope, intercept) + input_df, slope, intercept, sample_total_reads) assert_series_equal(expected_series, output_series) diff --git a/pysyndna/tests/test_fit_syndna_models.py b/pysyndna/tests/test_fit_syndna_models.py index fa57b7b..04c5795 100644 --- a/pysyndna/tests/test_fit_syndna_models.py +++ b/pysyndna/tests/test_fit_syndna_models.py @@ -11,7 +11,7 @@ from pysyndna.src.fit_syndna_models import SAMPLE_ID_KEY, SYNDNA_ID_KEY, \ SYNDNA_POOL_MASS_NG_KEY, SYNDNA_INDIV_NG_UL_KEY, \ SYNDNA_FRACTION_OF_POOL_KEY, SYNDNA_INDIV_NG_KEY, \ - SYNDNA_TOTAL_READS_KEY, SYNDNA_POOL_NUM_KEY, \ + SAMPLE_TOTAL_READS_KEY, SYNDNA_POOL_NUM_KEY, \ _validate_syndna_id_consistency, _validate_sample_id_consistency, \ _calc_indiv_syndna_weights, _fit_linear_regression_models @@ -36,7 +36,7 @@ class FitSyndnaModelsTest(TestCase): # system. a_sample_syndna_weights_and_total_reads_dict = { SAMPLE_ID_KEY: [sample_ids[0]], - SYNDNA_TOTAL_READS_KEY: [3216923], + SAMPLE_TOTAL_READS_KEY: [3216923], SYNDNA_POOL_MASS_NG_KEY: [0.25], } @@ -46,7 +46,7 @@ class FitSyndnaModelsTest(TestCase): # Syndna pool masses are plausible values for our experimental system. a_b_sample_syndna_weights_and_total_reads_dict = { SAMPLE_ID_KEY: sample_ids, - SYNDNA_TOTAL_READS_KEY: [3216923, 1723417], + SAMPLE_TOTAL_READS_KEY: [3216923, 1723417], SYNDNA_POOL_MASS_NG_KEY: [0.25, 0.2], } @@ -57,11 +57,12 @@ class FitSyndnaModelsTest(TestCase): # Syndna pool masses are plausible values for our experimental system. 
a_b_c_sample_syndna_weights_and_total_reads_dict = { SAMPLE_ID_KEY: [sample_ids[0], sample_ids[1], "C"], - SYNDNA_TOTAL_READS_KEY: [3216923, 1723417, 2606004], + SAMPLE_TOTAL_READS_KEY: [3216923, 1723417, 2606004], SYNDNA_POOL_MASS_NG_KEY: [0.25, 0.2, 0.3], } - # The below sample values come from the "A1_pool1_S21_L001_R1_001.fastq_output_forward_paired.fq.sam.bam.f13_r1.fq_synDNA" + # The below sample values come from the + # "A1_pool1_S21_L001_R1_001.fastq_output_forward_paired.fq.sam.bam.f13_r1.fq_synDNA" # and "A1_pool2_S22_L001_R1_001.fastq_output_forward_paired.fq.sam.bam.f13_r1.fq_synDNA" # columns of https://github.com/lzaramela/SynDNA/blob/main/data/synDNA_Fwd_Rev_sam.biom.tsv , # while the syndna ids are inferred from the contents of the "OTUID" @@ -85,16 +86,20 @@ class FitSyndnaModelsTest(TestCase): # "A1_pool1_Fwd" *but* we use a different pool mass than Zaramela, # so the same syndna counts are based on different masses. lingress_results = { - 'A': LinregressResult( - slope=1.244876523791319, intercept=-6.7242381884894655, - rvalue=0.9865030975156575, pvalue=1.428443560659758e-07, - stderr=0.07305408550335003, - intercept_stderr=0.2361976278251443), - 'B': LinregressResult( - slope=1.24675913604407, intercept=-7.155318973708384, - rvalue=0.9863241797356326, pvalue=1.505381146809759e-07, - stderr=0.07365795255302438, - intercept_stderr=0.2563956755844754) + 'A': { + "slope": 1.244876523791319, + "intercept": -6.7242381884894655, + "rvalue": 0.9865030975156575, + "pvalue": 1.428443560659758e-07, + "stderr": 0.07305408550335003, + "intercept_stderr": 0.2361976278251443}, + 'B': { + "slope": 1.24675913604407, + "intercept": -7.155318973708384, + "rvalue": 0.9863241797356326, + "pvalue": 1.505381146809759e-07, + "stderr": 0.07365795255302438, + "intercept_stderr": 0.2563956755844754} } prep_info_dict = copy.deepcopy( @@ -111,14 +116,14 @@ class FitSyndnaModelsTest(TestCase): reads_per_syndna_per_sample_dict["B"])]) def assert_lingressresult_dict_almost_equal(self, d1, d2, places=7): - """Assert that two dicts of LinregressResult are almost equal. + """Assert that two dicts of lingress results are almost equal. Parameters ---------- d1 : dict - The first dict of LinregressResult to compare + The first dict to compare d2 : dict - The second dict of LinregressResult to compare + The second dict to compare places : int, optional The number of decimal places to compare to @@ -131,30 +136,10 @@ def assert_lingressresult_dict_almost_equal(self, d1, d2, places=7): self.assertIsInstance(d2, dict) self.assertEqual(d1.keys(), d2.keys()) for k in d1.keys(): - self.assert_linregressresult_almost_equal(d1[k], d2[k], places) - - def assert_linregressresult_almost_equal(self, l1, l2, places=7): - """Assert that two LinregressResult are almost equal. 
-
-        Parameters
-        ----------
-        l1 : dict
-            The first LinregressResult to compare
-        l2 : dict
-            The second LinregressResult to compare
-        places : int, optional
-            The number of decimal places to compare to
-
-        Raises
-        ------
-        AssertionError
-            If the LinregressResults are not almost equal
-        """
-        self.assertIsInstance(l1, LinregressResult)
-        self.assertIsInstance(l2, LinregressResult)
-        self.assertEqual(len(l1), len(l2))
-        for i in range(0, len(l1)):
-            self.assertAlmostEqual(l1[i], l2[i], places=places)
+            for m in d1[k].keys():
+                m1 = d1[k][m]
+                m2 = d2[k][m]
+                self.assertAlmostEqual(m1, m2, places=places)
 
     def setUp(self):
         self.maxDiff = None
@@ -292,7 +277,7 @@ def test_fit_linear_regression_models_for_qiita_w_col_error(self):
         prep_info_dict = {
             SAMPLE_ID_KEY: ["A", "B"],
             "sequencing_type": ["shotgun", "shotgun"],
-            SYNDNA_TOTAL_READS_KEY: [3216923, 1723417],
+            SAMPLE_TOTAL_READS_KEY: [3216923, 1723417],
             SYNDNA_POOL_MASS_NG_KEY: [0.25, 0.2],
             # missing the SYNDNA_POOL_NUM_KEY column
         }
@@ -340,16 +325,20 @@ def test_fit_linear_regression_models_w_log_msgs(self):
         # syndnas with <200 total counts removed on "linear regressions" sheet
         # of "absolute_quant_example.xlsx").
         expected_out_dict = {
-            'A': LinregressResult(
-                slope=1.2561949109446753, intercept=-6.7671601206840855,
-                rvalue=0.982777689569875, pvalue=2.1705143708536327e-06,
-                stderr=0.08927614710714807,
-                intercept_stderr=0.30147987595768355),
-            'B': LinregressResult(
-                slope=1.2568191864801976, intercept=-7.196128673001381,
-                rvalue=0.9825127010266727, pvalue=2.2890733334160456e-06,
-                stderr=0.09002330756867402,
-                intercept_stderr=0.32657986324660143)
+            'A': {
+                "slope": 1.2561949109446753,
+                "intercept": -6.7671601206840855,
+                "rvalue": 0.982777689569875,
+                "pvalue": 2.1705143708536327e-06,
+                "stderr": 0.08927614710714807,
+                "intercept_stderr": 0.30147987595768355},
+            'B': {
+                "slope": 1.2568191864801976,
+                "intercept": -7.196128673001381,
+                "rvalue": 0.9825127010266727,
+                "pvalue": 2.2890733334160456e-06,
+                "stderr": 0.09002330756867402,
+                "intercept_stderr": 0.32657986324660143}
         }
         expected_out_msgs = [
             "The following sample ids were in the experiment info but not in "
@@ -382,8 +371,8 @@ def test_fit_linear_regression_models_w_sample_error(self):
         # which is in the data
         expected_err_msg = \
-            r"Found sample ids in reads_per_syndna_per_sample_df that were " \
-            r"not in sample_syndna_weights_and_total_reads_df: \{'B'\}"
+            (r"Found sample ids in reads data that were not in sample info: "
+             r"\{'B'\}")
 
         syndna_concs_df = pd.DataFrame(self.syndna_concs_dict)
         sample_syndna_weights_and_total_reads_df = pd.DataFrame(
@@ -559,8 +548,8 @@ def test__validate_sample_id_consistency_w_error(self):
         reads_per_syndna_per_sample_df = pd.DataFrame(
             self.reads_per_syndna_per_sample_dict)
 
-        err_msg = "Found sample ids in reads_per_syndna_per_sample_df " \
-                  "that were not in sample_syndna_weights_and_total_reads_df"
+        err_msg = (r"Found sample ids in reads data that were not in sample "
+                   r"info: \{'B'\}")
         with self.assertRaisesRegex(ValueError, err_msg):
             _validate_sample_id_consistency(
                 sample_syndna_weights_and_total_reads_df,
@@ -630,7 +619,7 @@ def test__fit_linear_regression_models(self):
         input_fp = os.path.join(self.data_dir, 'modelling_input.tsv')
         working_df = pd.read_csv(input_fp, sep="\t", comment="#")
 
-        output = _fit_linear_regression_models(working_df)
+        output, out_msgs_list = _fit_linear_regression_models(working_df)
 
         expected_fp = os.path.join(self.data_dir, 'modelling_output.tsv')
         expected_df = pd.read_csv(expected_fp, sep="\t", comment="#")
@@ -645,3 
+634,5 @@ def test__fit_linear_regression_models(self): self.assertAlmostEqual(expected_slope, v.slope) self.assertAlmostEqual(expected_intercept, v.intercept) # next model + + self.assertEqual([], out_msgs_list) diff --git a/pysyndna/tests/test_quant_orfs.py b/pysyndna/tests/test_quant_orfs.py new file mode 100644 index 0000000..b4724a1 --- /dev/null +++ b/pysyndna/tests/test_quant_orfs.py @@ -0,0 +1,316 @@ +import biom.table +import numpy as np +import os +import pandas +from pandas.testing import assert_frame_equal +from unittest import TestCase +from pysyndna import calc_copies_of_ogu_orf_ssrna_per_g_sample, \ + calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita +from pysyndna.src.quant_orfs import _read_ogu_orf_coords_to_df, \ + _calc_ogu_orf_copies_per_g_from_coords, \ + _calc_copies_of_ogu_orf_ssrna_per_g_sample, \ + OGU_ORF_ID_KEY, OGU_ORF_START_KEY, OGU_ORF_END_KEY, OGU_ORF_LEN_KEY, \ + COPIES_PER_G_OGU_ORF_SSRNA_KEY, SAMPLE_ID_KEY, \ + SAMPLE_IN_ALIQUOT_MASS_G_KEY, SSRNA_CONCENTRATION_NG_UL_KEY, \ + ELUTE_VOL_UL_KEY, TOTAL_BIOLOGICAL_READS_KEY + + +class TestQuantOrfs(TestCase): + COORDS_DICT = { + OGU_ORF_ID_KEY: ["G000005825_1", "G000005825_2", "G000005825_3", + "G000005825_4", "G000005825_5", "G900163845_3247", + "G900163845_3248", "G900163845_3249", + "G900163845_3250", "G900163845_3251"], + OGU_ORF_START_KEY: [816, 2348, 3744, 3971, 5098, 3392209, 3393051, + 3393938, 3394702, 3395077], + OGU_ORF_END_KEY: [2168, 3490, 3959, 5086, 5373, 3390413, 3392206, + 3393048, 3393935, 3395721] + } + + LEN_AND_COPIES_DICT = { + OGU_ORF_ID_KEY: ["G000005825_1", "G000005825_2", "G000005825_3", + "G000005825_4", "G000005825_5", "G900163845_3247", + "G900163845_3248", "G900163845_3249", + "G900163845_3250", "G900163845_3251"], + OGU_ORF_LEN_KEY: [1353, 1143, 216, 1116, 276, 1797, 846, 891, + 768, 645], + COPIES_PER_G_OGU_ORF_SSRNA_KEY: [1.3091041E+18, 1.5496219E+18, + 8.2000827E+18, 1.5871128E+18, + 6.4174561E+18, 9.8565268E+17, + 2.0936381E+18, 1.9878988E+18, + 2.3062733E+18, 2.7460742E+18] + } + + SAMPLE_IDS = ["IBSRS3526007", "IQSRS3526010"] + COUNT_VALS = np.array([ + [0, 0], + [2, 0], + [0, 1], + [35, 0], + [0, 694], + [10292, 382], + [0, 0], + [190, 10], + [0, 630], + [34, 1003]]) + + PARAMS_DICT = { + SAMPLE_ID_KEY: SAMPLE_IDS, + SAMPLE_IN_ALIQUOT_MASS_G_KEY: [0.003, 0.00082], + SSRNA_CONCENTRATION_NG_UL_KEY: [0.132714286, 0.0042], + ELUTE_VOL_UL_KEY: [70, 70], + TOTAL_BIOLOGICAL_READS_KEY: [213988, 3028580] + } + + COPIES_PER_G_SAMPLE_VALS = np.array([ + [0, 0], + [4.4849829E+07, 0], + [0, 9.7076176E+05], + [8.0386085E+08, 0], + [0, 5.2725026E+08], + [1.4680090E+11, 4.4574009E+07], + [0, 0], + [5.4657898E+09, 2.3533619E+06], + [0, 1.7200685E+08], + [1.3511272E+09, 3.2606759E+08]]) + + def setUp(self): + self.maxDiff = None + self.data_dir = os.path.join(os.path.dirname(__file__), 'data') + + def test__read_ogu_orf_coords_to_df(self): + expected_df = pandas.DataFrame(self.COORDS_DICT) + + ogu_orf_coords_fp = os.path.join(self.data_dir, "coords.txt") + output_df = _read_ogu_orf_coords_to_df(ogu_orf_coords_fp) + assert_frame_equal(output_df, expected_df) + + def test__calc_ogu_orf_copies_per_g_from_coords(self): + expected_dict = self.COORDS_DICT.copy() + expected_dict.update(self.LEN_AND_COPIES_DICT) + expected_df = pandas.DataFrame( + expected_dict, index=expected_dict[OGU_ORF_ID_KEY]) + expected_df.index.name = OGU_ORF_ID_KEY + + input_df = pandas.DataFrame(self.COORDS_DICT) + output_df = _calc_ogu_orf_copies_per_g_from_coords(input_df) + + assert_frame_equal(expected_df, output_df) 
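+
+    # Editorial worked example of the values above (not from the original
+    # source): for G000005825_1, len = abs(2168 - 816) + 1 = 1353 bases, so
+    # copies/g = 6.02214076e23 / (1353 * 340) ≈ 1.3091041e18, matching the
+    # first entry of COPIES_PER_G_OGU_ORF_SSRNA_KEY in LEN_AND_COPIES_DICT.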
+ + def test__calc_copies_of_ogu_orf_ssrna_per_g_sample(self): + input_quant_params_per_sample_df = pandas.DataFrame(self.PARAMS_DICT) + input_ogu_orf_copies_per_g_ssrna_df = pandas.DataFrame( + self.LEN_AND_COPIES_DICT, + index=self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY]) + + input_reads_per_ogu_orf_per_sample_biom = biom.table.Table( + self.COUNT_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + expected_biom = biom.table.Table( + self.COPIES_PER_G_SAMPLE_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + output_biom = _calc_copies_of_ogu_orf_ssrna_per_g_sample( + input_quant_params_per_sample_df, + input_reads_per_ogu_orf_per_sample_biom, + input_ogu_orf_copies_per_g_ssrna_df) + + # NB: Comparing the bioms as dataframes because the biom equality + # compare does not allow "almost equal" checking for float values, + # whereas rtol and atol are built in to assert_frame_equal + output_df = output_biom.to_dataframe() + expected_df = expected_biom.to_dataframe() + pandas.testing.assert_frame_equal(output_df, expected_df) + + def test__calc_copies_of_ogu_orf_ssrna_per_g_sample_ids_err(self): + # drop the first sample from the params dataframe; now the reads + # will contain a sample that the params dataframe does not + input_quant_params_per_sample_df = pandas.DataFrame(self.PARAMS_DICT) + input_quant_params_per_sample_df.drop(index=0, axis=0, inplace=True) + + input_ogu_orf_copies_per_g_ssrna_df = pandas.DataFrame( + self.LEN_AND_COPIES_DICT, + index=self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY]) + + input_reads_per_ogu_orf_per_sample_biom = biom.table.Table( + self.COUNT_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + expected_msg = r"Found sample ids in reads data that were not in" \ + r" sample info: \{'IBSRS3526007'\}" + with self.assertRaisesRegex(ValueError, expected_msg): + _ = _calc_copies_of_ogu_orf_ssrna_per_g_sample( + input_quant_params_per_sample_df, + input_reads_per_ogu_orf_per_sample_biom, + input_ogu_orf_copies_per_g_ssrna_df) + + def test__calc_copies_of_ogu_orf_ssrna_per_g_sample_col_err(self): + params_dict = self.PARAMS_DICT.copy() + + # drop a necessary column from the params dict + del params_dict[TOTAL_BIOLOGICAL_READS_KEY] + input_quant_params_per_sample_df = pandas.DataFrame(params_dict) + input_quant_params_per_sample_df.drop(index=0, axis=0, inplace=True) + + input_ogu_orf_copies_per_g_ssrna_df = pandas.DataFrame( + self.LEN_AND_COPIES_DICT, + index=self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY]) + + input_reads_per_ogu_orf_per_sample_biom = biom.table.Table( + self.COUNT_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + expected_msg = r"parameters dataframe is missing required " \ + r"column\(s\): \['total_biological_reads_r1r2'\]" + with self.assertRaisesRegex(ValueError, expected_msg): + _ = _calc_copies_of_ogu_orf_ssrna_per_g_sample( + input_quant_params_per_sample_df, + input_reads_per_ogu_orf_per_sample_biom, + input_ogu_orf_copies_per_g_ssrna_df) + + def test_calc_copies_of_ogu_orf_ssrna_per_g_sample(self): + input_quant_params_per_sample_df = pandas.DataFrame(self.PARAMS_DICT) + ogu_orf_coords_fp = os.path.join(self.data_dir, "coords.txt") + + input_reads_per_ogu_orf_per_sample_biom = biom.table.Table( + self.COUNT_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + expected_biom = biom.table.Table( + self.COPIES_PER_G_SAMPLE_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + output_biom = calc_copies_of_ogu_orf_ssrna_per_g_sample( + 
input_quant_params_per_sample_df, + input_reads_per_ogu_orf_per_sample_biom, + ogu_orf_coords_fp) + + # NB: Comparing the bioms as dataframes because the biom equality + # compare does not allow "almost equal" checking for float values, + # whereas rtol and atol are built in to assert_frame_equal + output_df = output_biom.to_dataframe() + expected_df = expected_biom.to_dataframe() + pandas.testing.assert_frame_equal(output_df, expected_df) + + def test_calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita(self): + sample_info_dict = {k: self.PARAMS_DICT[k] for k in + [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY]} + + prep_info_dict = {k: self.PARAMS_DICT[k] for k in + [SAMPLE_ID_KEY, ELUTE_VOL_UL_KEY, + SSRNA_CONCENTRATION_NG_UL_KEY, + TOTAL_BIOLOGICAL_READS_KEY]} + + sample_info_df = pandas.DataFrame(sample_info_dict) + prep_info_df = pandas.DataFrame(prep_info_dict) + ogu_orf_coords_fp = os.path.join(self.data_dir, "coords.txt") + + input_reads_per_ogu_orf_per_sample_biom = biom.table.Table( + self.COUNT_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + expected_biom = biom.table.Table( + self.COPIES_PER_G_SAMPLE_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + output_biom = calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita( + sample_info_df, prep_info_df, + input_reads_per_ogu_orf_per_sample_biom, + ogu_orf_coords_fp) + + # NB: Comparing the bioms as dataframes because the biom equality + # compare does not allow "almost equal" checking for float values, + # whereas rtol and atol are built in to assert_frame_equal + output_df = output_biom.to_dataframe() + expected_df = expected_biom.to_dataframe() + pandas.testing.assert_frame_equal(output_df, expected_df) + + def test_calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita_col_err(self): + sample_info_dict = {k: self.PARAMS_DICT[k] for k in + [SAMPLE_ID_KEY]} + + prep_info_dict = {k: self.PARAMS_DICT[k] for k in + [SAMPLE_ID_KEY, ELUTE_VOL_UL_KEY, + SSRNA_CONCENTRATION_NG_UL_KEY, + TOTAL_BIOLOGICAL_READS_KEY]} + + sample_info_df = pandas.DataFrame(sample_info_dict) + prep_info_df = pandas.DataFrame(prep_info_dict) + ogu_orf_coords_fp = os.path.join(self.data_dir, "coords.txt") + + input_reads_per_ogu_orf_per_sample_biom = biom.table.Table( + self.COUNT_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + expected_msg = r"sample info is missing required " \ + r"column\(s\): \['calc_mass_sample_aliquot_input_g'\]" + with self.assertRaisesRegex(ValueError, expected_msg): + _ = calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita( + sample_info_df, prep_info_df, + input_reads_per_ogu_orf_per_sample_biom, + ogu_orf_coords_fp) + + def test_calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita_col_err2(self): + sample_info_dict = {k: self.PARAMS_DICT[k] for k in + [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY]} + + prep_info_dict = {k: self.PARAMS_DICT[k] for k in + [SAMPLE_ID_KEY, ELUTE_VOL_UL_KEY, + SSRNA_CONCENTRATION_NG_UL_KEY]} + + sample_info_df = pandas.DataFrame(sample_info_dict) + prep_info_df = pandas.DataFrame(prep_info_dict) + ogu_orf_coords_fp = os.path.join(self.data_dir, "coords.txt") + + input_reads_per_ogu_orf_per_sample_biom = biom.table.Table( + self.COUNT_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + expected_msg = r"prep info is missing required " \ + r"column\(s\): \['total_biological_reads_r1r2'\]" + with self.assertRaisesRegex(ValueError, expected_msg): + _ = calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita( + sample_info_df, 
+                input_reads_per_ogu_orf_per_sample_biom,
+                ogu_orf_coords_fp)
+
+    def test_calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita_id_err(self):
+        sample_info_dict = {k: self.PARAMS_DICT[k] for k in
+                            [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY]}
+
+        prep_info_dict = {k: self.PARAMS_DICT[k] for k in
+                          [SAMPLE_ID_KEY, ELUTE_VOL_UL_KEY,
+                           SSRNA_CONCENTRATION_NG_UL_KEY,
+                           TOTAL_BIOLOGICAL_READS_KEY]}
+
+        sample_info_df = pandas.DataFrame(sample_info_dict)
+
+        # drop the first sample from the prep dataframe; now the sample info
+        # will contain a sample that the prep dataframe does not.
+        prep_info_df = pandas.DataFrame(prep_info_dict)
+        prep_info_df.drop(index=0, axis=0, inplace=True)
+
+        ogu_orf_coords_fp = os.path.join(self.data_dir, "coords.txt")
+        input_reads_per_ogu_orf_per_sample_biom = biom.table.Table(
+            self.COUNT_VALS,
+            self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY],
+            self.SAMPLE_IDS)
+
+        expected_msg = (r"Found sample ids in reads data that were not in "
+                        r"sample info: \{'IBSRS3526007'\}")
+        with self.assertRaisesRegex(ValueError, expected_msg):
+            _ = calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita(
+                sample_info_df, prep_info_df,
+                input_reads_per_ogu_orf_per_sample_biom,
+                ogu_orf_coords_fp)
diff --git a/pysyndna/tests/test_util.py b/pysyndna/tests/test_util.py
new file mode 100644
index 0000000..7289999
--- /dev/null
+++ b/pysyndna/tests/test_util.py
@@ -0,0 +1,235 @@
+import biom
+import numpy as np
+import pandas
+from pandas.testing import assert_series_equal, assert_frame_equal
+from unittest import TestCase
+from pysyndna.src.util import calc_copies_genomic_element_per_g_series, \
+    calc_gs_genomic_element_in_aliquot, \
+    validate_metadata_vs_prep_id_consistency, \
+    validate_metadata_vs_reads_id_consistency, \
+    validate_required_columns_exist, SAMPLE_ID_KEY, ELUTE_VOL_UL_KEY
+
+
+class TestUtil(TestCase):
+    def test_validate_required_columns_exist_true(self):
+        input_dict = {
+            'sample_id': ['sample1'],
+            'prep_id': ['prep1'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+        required_columns = ['sample_id', 'prep_id']
+
+        validate_required_columns_exist(
+            input_df, required_columns, "missing")
+
+        # Pass test if we made it this far
+        self.assertTrue(True)
+
+    def test_validate_required_columns_exist_err(self):
+        input_dict = {
+            'sample_id': ['sample1'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+        required_columns = ['sample_id', 'prep_id']
+
+        expected_err = r"missing: \['prep_id'\]"
+        with self.assertRaisesRegex(ValueError, expected_err):
+            validate_required_columns_exist(
+                input_df, required_columns, "missing")
+
+    def test_validate_metadata_vs_prep_id_consistency_true(self):
+        input_dict = {
+            SAMPLE_ID_KEY: ['sample1'],
+            'color': ['blue'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+
+        prep_dict = {
+            SAMPLE_ID_KEY: ['sample1'],
+            'prep_id': ['prep1'],
+        }
+        prep_df = pandas.DataFrame(prep_dict)
+
+        _ = validate_metadata_vs_prep_id_consistency(input_df, prep_df)
+
+        # Pass test if we made it this far
+        self.assertTrue(True)
+
+    def test_validate_metadata_vs_prep_id_consistency_true_w_msg(self):
+        input_dict = {
+            SAMPLE_ID_KEY: ['sample1', 'sample2'],
+            'color': ['blue', 'aqua'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+
+        prep_dict = {
+            SAMPLE_ID_KEY: ['sample1'],
+            'prep_id': ['prep1'],
+        }
+        prep_df = pandas.DataFrame(prep_dict)
+
+        not_in_prep_ids = validate_metadata_vs_prep_id_consistency(
+            input_df, prep_df)
+
+        expected_not_in_prep_ids = ['sample2']
+        self.assertEqual(not_in_prep_ids, expected_not_in_prep_ids)
+
+    def
test_validate_metadata_vs_prep_id_consistency_err(self):
+        input_dict = {
+            SAMPLE_ID_KEY: ['sample1'],
+            'color': ['blue'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+
+        prep_dict = {
+            SAMPLE_ID_KEY: ['sample1', 'sample2'],
+            'prep_id': ['prep1', 'prep2'],
+        }
+        prep_df = pandas.DataFrame(prep_dict)
+
+        expected_err = (r"Found sample ids in prep info that were not in "
+                        r"sample info: \{'sample2'\}")
+        with self.assertRaisesRegex(ValueError, expected_err):
+            _ = validate_metadata_vs_prep_id_consistency(
+                input_df, prep_df)
+
+    def test_validate_metadata_vs_reads_id_consistency_df_true(self):
+        input_dict = {
+            SAMPLE_ID_KEY: ['sample1', 'sample2'],
+            'color': ['blue', 'aqua'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+
+        reads_dict = {
+            'sample1': [1, 2],
+            'sample2': [3, 4],
+        }
+        reads_df = pandas.DataFrame(reads_dict)
+
+        _ = validate_metadata_vs_reads_id_consistency(input_df, reads_df)
+
+        # Pass test if we made it this far
+        self.assertTrue(True)
+
+    def test_validate_metadata_vs_reads_id_consistency_df_true_w_msg(self):
+        input_dict = {
+            SAMPLE_ID_KEY: ['sample1', 'sample2', 'sample3'],
+            'color': ['blue', 'aqua', 'cerulean'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+
+        reads_dict = {
+            'sample1': [1, 2],
+            'sample2': [3, 4],
+        }
+        reads_df = pandas.DataFrame(reads_dict)
+
+        not_in_reads_ids = validate_metadata_vs_reads_id_consistency(
+            input_df, reads_df)
+
+        expected_not_in_reads_ids = ['sample3']
+        self.assertEqual(not_in_reads_ids, expected_not_in_reads_ids)
+
+    def test_validate_metadata_vs_reads_id_consistency_df_err(self):
+        input_dict = {
+            SAMPLE_ID_KEY: ['sample1'],
+            'color': ['blue'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+
+        reads_dict = {
+            'sample1': [1, 2],
+            'sample2': [3, 4],
+        }
+        reads_df = pandas.DataFrame(reads_dict)
+
+        expected_err = (r"Found sample ids in reads data that were not in "
+                        r"sample info: \{'sample2'\}")
+        with self.assertRaisesRegex(ValueError, expected_err):
+            _ = validate_metadata_vs_reads_id_consistency(
+                input_df, reads_df)
+
+    def test_validate_metadata_vs_reads_id_consistency_biom_true(self):
+        input_dict = {
+            SAMPLE_ID_KEY: ['sample1', 'sample2'],
+            'color': ['blue', 'aqua'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+
+        reads_biom = biom.table.Table(
+            np.array([[1, 2], [3, 4]]),
+            ['obs1', 'obs2'],
+            ['sample1', 'sample2'])
+
+        _ = validate_metadata_vs_reads_id_consistency(input_df, reads_biom)
+
+        # Pass test if we made it this far
+        self.assertTrue(True)
+
+    def test_validate_metadata_vs_reads_id_consistency_biom_true_w_msg(self):
+        input_dict = {
+            SAMPLE_ID_KEY: ['sample1', 'sample2', 'sample3'],
+            'color': ['blue', 'aqua', 'cerulean'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+
+        reads_biom = biom.table.Table(
+            np.array([[1, 2], [3, 4]]),
+            ['obs1', 'obs2'],
+            ['sample1', 'sample2'])
+
+        not_in_reads_ids = validate_metadata_vs_reads_id_consistency(
+            input_df, reads_biom)
+
+        expected_not_in_reads_ids = ['sample3']
+        self.assertEqual(not_in_reads_ids, expected_not_in_reads_ids)
+
+    def test_validate_metadata_vs_reads_id_consistency_biom_err(self):
+        input_dict = {
+            SAMPLE_ID_KEY: ['sample1'],
+            'color': ['blue'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+
+        reads_biom = biom.table.Table(
+            np.array([[1, 2], [3, 4]]),
+            ['obs1', 'obs2'],
+            ['sample1', 'sample2'])
+
+        expected_err = (r"Found sample ids in reads data that were not in "
+                        r"sample info: \{'sample2'\}")
+        with self.assertRaisesRegex(ValueError, expected_err):
+            _ = validate_metadata_vs_reads_id_consistency(
+                input_df, reads_biom)
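[Editor's note: the id-consistency tests above collectively pin down the validators' contract: reads sample ids may come either from a DataFrame's columns or from a biom.Table's sample axis; reads ids absent from the metadata raise a ValueError, while metadata ids absent from the reads are merely returned to the caller. The following is a minimal sketch consistent with that contract, not the package's actual implementation; the helper name, the assumed value of SAMPLE_ID_KEY, and the internals are illustrative only.]

    # sketch of the contract exercised by the tests above (hypothetical code)
    from typing import List, Union
    import biom
    import pandas as pd

    SAMPLE_ID_KEY = 'sample_id'  # assumed value of the shared constant

    def _sketch_validate_ids(metadata_df: pd.DataFrame,
                             reads: Union[pd.DataFrame, biom.Table]) \
            -> List[str]:
        # reads sample ids come from the biom sample axis or the df columns
        if isinstance(reads, biom.Table):
            reads_ids = set(reads.ids(axis='sample'))
        else:
            reads_ids = set(reads.columns)
        metadata_ids = set(metadata_df[SAMPLE_ID_KEY])

        # reads samples missing from the metadata are fatal ...
        extra_in_reads = reads_ids - metadata_ids
        if extra_in_reads:
            raise ValueError(f"Found sample ids in reads data that were "
                             f"not in sample info: {extra_in_reads}")

        # ... while metadata samples missing from the reads are only reported
        return sorted(metadata_ids - reads_ids)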
+
+    def test_calc_copies_genomic_element_per_g_series(self):
+        # example from "rna_copy_quant_example.xlsx" "full_calc" tab,
+        # ogu_orf_calculations table
+        elements_lens = [1353, 1143, 216, 1116, 276, 1797, 846, 891, 768,
+                         645]
+        copies_per_g = [1.309104e+18, 1.549622e+18, 8.200083e+18,
+                        1.587113e+18, 6.417456e+18, 9.856527e+17,
+                        2.093638e+18, 1.987899e+18, 2.306273e+18,
+                        2.746074e+18]
+        expected_series = pandas.Series(copies_per_g)
+        obs_series = calc_copies_genomic_element_per_g_series(
+            pandas.Series(elements_lens), 340)
+        assert_series_equal(expected_series, obs_series)
+
+    def test_calc_gs_genomic_element_in_aliquot(self):
+        # example from "rna_copy_quant_example.xlsx" "full_calc" tab,
+        # quant_params_per_sample table
+        input_dict = {
+            SAMPLE_ID_KEY: ["IBSRS3526007", "IQSRS3526010"],
+            'conc_ng_ul': [0.132714, 0.004200],
+            ELUTE_VOL_UL_KEY: [70, 70]
+        }
+
+        added_dict = {'mass_key': [9.290000e-09, 2.940000e-10]}
+        expected_dict = input_dict.copy()
+        expected_dict.update(added_dict)
+
+        input_df = pandas.DataFrame(input_dict)
+        expected_df = pandas.DataFrame(expected_dict)
+
+        obs_df = calc_gs_genomic_element_in_aliquot(
+            input_df, 'conc_ng_ul', 'mass_key')
+        assert_frame_equal(expected_df, obs_df)
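[Editor's note: the expected figures in these last two tests follow from simple unit arithmetic: copies per gram of a genomic element is Avogadro's number divided by the element's molar mass (length in bases times the per-base g/mole figure passed as the second argument, 340 here), and the mass of element in an aliquot is its concentration in ng/uL times the elution volume in uL, divided by 10^9 ng per g. A quick self-contained check; the constant names are illustrative, and the interpretation of 340 as g/mole per base is inferred from the numbers, not stated in the diff.]

    # standalone check of the test fixtures above (hypothetical names)
    AVOGADRO = 6.02214076e23  # molecules per mole

    # test_calc_copies_genomic_element_per_g_series, first element:
    # a 1353-base element at an assumed ~340 g/mole per base
    copies_per_g = AVOGADRO / (1353 * 340)
    print(f"{copies_per_g:.6e}")  # ~1.309104e+18, matching copies_per_g[0]

    # test_calc_gs_genomic_element_in_aliquot, first sample:
    # 0.132714 ng/uL * 70 uL elution volume, converted from ng to g
    mass_g = 0.132714 * 70 / 1e9
    print(f"{mass_g:.2e}")  # ~9.29e-09, matching the first 'mass_key' value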