diff --git a/absolute_quant_example.xlsx b/docs/absolute_quant_example.xlsx similarity index 100% rename from absolute_quant_example.xlsx rename to docs/absolute_quant_example.xlsx diff --git a/docs/rna_copy_quant_example.xlsx b/docs/rna_copy_quant_example.xlsx new file mode 100644 index 0000000..256df07 Binary files /dev/null and b/docs/rna_copy_quant_example.xlsx differ diff --git a/docs/rna_copy_quant_workflow.pdf b/docs/rna_copy_quant_workflow.pdf new file mode 100644 index 0000000..9956aad Binary files /dev/null and b/docs/rna_copy_quant_workflow.pdf differ diff --git a/pysyndna/__init__.py b/pysyndna/__init__.py index 5e98a98..d0146c9 100644 --- a/pysyndna/__init__.py +++ b/pysyndna/__init__.py @@ -2,11 +2,16 @@ fit_linear_regression_models_for_qiita from pysyndna.src.calc_cell_counts import calc_ogu_cell_counts_biom, \ calc_ogu_cell_counts_per_g_of_sample_for_qiita +from pysyndna.src.quant_orfs import \ + calc_copies_of_ogu_orf_ssrna_per_g_sample, \ + calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita __all__ = ['fit_linear_regression_models', 'fit_linear_regression_models_for_qiita', 'calc_ogu_cell_counts_biom', - 'calc_ogu_cell_counts_per_g_of_sample_for_qiita'] + 'calc_ogu_cell_counts_per_g_of_sample_for_qiita', + 'calc_copies_of_ogu_orf_ssrna_per_g_sample', + 'calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita'] from . import _version __version__ = _version.get_versions()['version'] diff --git a/pysyndna/src/calc_cell_counts.py b/pysyndna/src/calc_cell_counts.py index 1f97bf5..46d7db6 100644 --- a/pysyndna/src/calc_cell_counts.py +++ b/pysyndna/src/calc_cell_counts.py @@ -1,13 +1,19 @@ -from __future__ import annotations - import biom import numpy as np import pandas as pd import yaml -from typing import Optional - -from pysyndna.src.fit_syndna_models import SAMPLE_ID_KEY, \ - SYNDNA_POOL_MASS_NG_KEY, _validate_required_columns_exist +from typing import Optional, Union, Dict, List +from pysyndna.src.util import calc_copies_genomic_element_per_g_series, \ + calc_gs_genomic_element_in_aliquot, \ + validate_required_columns_exist, \ + validate_metadata_vs_reads_id_consistency, \ + validate_metadata_vs_prep_id_consistency, \ + DNA_BASEPAIR_G_PER_MOLE, NANOGRAMS_PER_GRAM, \ + SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, ELUTE_VOL_UL_KEY, \ + REQUIRED_SAMPLE_INFO_KEYS + +from pysyndna.src.fit_syndna_models import SYNDNA_POOL_MASS_NG_KEY, \ + SLOPE_KEY, INTERCEPT_KEY, SAMPLE_TOTAL_READS_KEY DEFAULT_SYNDNA_MASS_FRACTION_OF_SAMPLE = 0.05 DEFAULT_READ_LENGTH = 150 @@ -18,8 +24,6 @@ CELL_COUNT_LOG_KEY = 'calc_cell_counts_log' GDNA_CONCENTRATION_NG_UL_KEY = 'extracted_gdna_concentration_ng_ul' -SAMPLE_IN_ALIQUOT_MASS_G_KEY = 'calc_mass_sample_aliquot_input_g' -ELUTE_VOL_UL_KEY = 'vol_extracted_elution_ul' GDNA_FROM_ALIQUOT_MASS_G_KEY = 'extracted_gdna_concentration_g' # NB: below is NOT the full mass of gDNA extracted from the sample, but # ONLY the mass of gDNA that was put into sequencing. 
This mass should @@ -41,209 +45,8 @@ # (NOT limited to the amount of gDNA that was put into sequencing, unlike # SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY) GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY = 'gdna_mass_to_sample_mass_ratio' - - -def calc_ogu_cell_counts_per_g_of_sample_for_qiita( - sample_info_df: pd.DataFrame, - prep_info_df: pd.DataFrame, - linregress_by_sample_id_fp: str, - ogu_counts_per_sample_biom: biom.Table, - ogu_lengths_fp: str, - read_length: int = DEFAULT_READ_LENGTH, - min_coverage: float = DEFAULT_MIN_COVERAGE, - min_rsquared: float = DEFAULT_MIN_RSQUARED, - syndna_mass_fraction_of_sample: float = - DEFAULT_SYNDNA_MASS_FRACTION_OF_SAMPLE) \ - -> dict[str, str | biom.Table]: - - """Gets # of cells of each OGU/g of sample for samples from Qiita. - - Parameters - ---------- - sample_info_df: pd.DataFrame - Dataframe containing sample info for all samples in the prep, - including SAMPLE_ID_KEY and SAMPLE_IN_ALIQUOT_MASS_G_KEY - prep_info_df: pd.DataFrame - Dataframe containing prep info for all samples in the prep, - including SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, - ELUTE_VOL_UL_KEY, and SYNDNA_POOL_MASS_NG_KEY. - linregress_by_sample_id_fp: str - String containing the filepath to the yaml file holding the - dictionary keyed by sample id, containing for each sample a dictionary - representation of the sample's LinregressResult. - ogu_counts_per_sample_biom: biom.Table - Biom table holding the read counts aligned to each OGU in each sample. - ogu_lengths_fp : str - String containing the filepath to a tab-separated, two-column, - no-header file in which the first column is the OGU id and the - second is the OGU length in basepairs - read_length : int - Length of reads in bp (usually but not always 150). - min_coverage : float - Minimum allowable coverage of an OGU needed to include that OGU - in the output. - min_rsquared: float - Minimum allowable R^2 value for the linear regression model for a - sample; any sample with an R^2 value less than this will be excluded - from the output. - syndna_mass_fraction_of_sample: float - Fraction of the mass of the sample that is added as syndna (usually - 0.05, which is to say 5%). - - Returns - ------- - output_by_out_type : dict of str or biom.Table - Dictionary of outputs keyed by their type Currently, the following keys - are defined: - CELL_COUNT_RESULT_KEY: biom.Table holding the calculated number of - cells per gram of sample material for each OGU in each sample. - CELL_COUNT_LOG_KEY: log of messages from the cell count calc process. - """ - - # check if the inputs all have the required columns - required_sample_info_cols = [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY] - _validate_required_columns_exist( - sample_info_df, required_sample_info_cols, - "sample info is missing required column(s)") - - required_prep_info_cols = [SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, - ELUTE_VOL_UL_KEY, SYNDNA_POOL_MASS_NG_KEY] - _validate_required_columns_exist( - prep_info_df, required_prep_info_cols, - "prep info is missing required column(s)") - - # calculate the mass of gDNA sequenced for each sample. We have the - # mass of syndna pool that was added to each sample, and we know that the - # syndna pool mass is calculated to be a certain percentage of the mass of - # the sample (added into the library prep in addition to the sample mass). - # Therefore, if the syndna fraction is 0.05 or 5%, the mass of the sample - # gDNA put into sequencing is 1/0.05 = 20x the mass of syndna pool added. 
- prep_info_df[SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY] = \ - prep_info_df[SYNDNA_POOL_MASS_NG_KEY] * \ - (1 / syndna_mass_fraction_of_sample) - - # merge the sample info and prep info dataframes - absolute_quant_params_per_sample_df = \ - sample_info_df.merge(prep_info_df, on=SAMPLE_ID_KEY, how='left') - - # read in the linregress_by_sample_id yaml file - with open(linregress_by_sample_id_fp) as f: - linregress_by_sample_id = yaml.load(f, Loader=yaml.FullLoader) - - # read in the ogu_lengths file - ogu_lengths_df = pd.read_csv(ogu_lengths_fp, sep='\t', header=None, - names=[OGU_ID_KEY, OGU_LEN_IN_BP_KEY]) - - # calculate # cells per gram of sample material of each OGU in each sample - output_biom, log_msgs_list = calc_ogu_cell_counts_biom( - absolute_quant_params_per_sample_df, linregress_by_sample_id, - ogu_counts_per_sample_biom, ogu_lengths_df, read_length, min_coverage, - min_rsquared, OGU_CELLS_PER_G_OF_SAMPLE_KEY) - - out_txt_by_out_type = { - CELL_COUNT_RESULT_KEY: output_biom, - CELL_COUNT_LOG_KEY: '\n'.join(log_msgs_list)} - - return out_txt_by_out_type - - -def calc_ogu_cell_counts_biom( - absolute_quant_params_per_sample_df: pd.DataFrame, - linregress_by_sample_id: dict[str, dict[str, float]], - ogu_counts_per_sample_biom: biom.Table, - ogu_lengths_df: pd.DataFrame, - read_length: int, - min_coverage: float, - min_rsquared: float, - output_cell_counts_metric: str) -> (biom.Table, list[str]): - - """Calcs input cell count metric for each ogu & sample via linear models. - - Parameters - ---------- - absolute_quant_params_per_sample_df: pd.DataFrame - Dataframe of at least SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, - SAMPLE_IN_ALIQUOT_MASS_G_KEY, ELUTE_VOL_UL_KEY, and - SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY for each sample. - linregress_by_sample_id : dict[str, dict[str: float]] - Dictionary keyed by sample id, containing for each sample either None - (if no model could be trained for that SAMPLE_ID_KEY) or a dictionary - representation of the sample's LinregressResult. - ogu_counts_per_sample_biom: biom.Table - Biom table holding the read counts aligned to each OGU in each sample. - ogu_lengths_df : pd.DataFrame - Dataframe of OGU_ID_KEY and OGU_LEN_IN_BP_KEY for each OGU. - read_length : int - Length of reads in bp (usually but not always 150). - min_coverage : float - Minimum allowable coverage of an OGU needed to include that OGU - in the output. - min_rsquared: float - Minimum allowable R^2 value for the linear regression model for a - sample; any sample with an R^2 value less than this will be excluded - from the output. - output_cell_counts_metric : str - Name of the desired output cell count metric; options are - OGU_CELLS_PER_G_OF_GDNA_KEY and OGU_CELLS_PER_G_OF_SAMPLE_KEY. - - Returns - ------- - ogu_cell_counts_biom : biom.Table - Dataframe with a column for OGU_ID_KEY and then one additional column - for each sample id, which holds the predicted number of cells per gram - of sample material of that OGU in that sample. - log_messages_list : list[str] - List of strings containing log messages generated by this function. 
- """ - - working_params_df = absolute_quant_params_per_sample_df.copy() - - # cast the GDNA_CONCENTRATION_NG_UL_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, - # ELUTE_VOL_UL_KEY, and SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY columns of - # params df to float if they aren't already - for col in [GDNA_CONCENTRATION_NG_UL_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, - ELUTE_VOL_UL_KEY, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY]: - if working_params_df[col].dtype != float: - working_params_df[col] = \ - working_params_df[col].astype(float) - - # calculate the ratio of extracted gDNA mass to sample mass put into - # extraction for each sample - gdna_mass_to_sample_mass_by_sample_series = \ - _calc_gdna_mass_to_sample_mass_by_sample_df(working_params_df) - per_sample_mass_info_df = _series_to_df( - gdna_mass_to_sample_mass_by_sample_series, SAMPLE_ID_KEY, - GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY) - - # merge only the SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY column of - # working_params_df into gdna_mass_to_sample_mass_df by SAMPLE_ID_KEY - per_sample_mass_info_df = per_sample_mass_info_df.merge( - working_params_df[[SAMPLE_ID_KEY, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY]], - on=SAMPLE_ID_KEY, how='left') - - # convert input biom table to a dataframe with sparse columns, which - # should act basically the same as a dense dataframe but use less memory - ogu_counts_per_sample_df = ogu_counts_per_sample_biom.to_dataframe( - dense=False) - - ogu_cell_counts_long_format_df, log_msgs_list = ( - _calc_long_format_ogu_cell_counts_df( - linregress_by_sample_id, ogu_counts_per_sample_df, - ogu_lengths_df, per_sample_mass_info_df, read_length, - min_coverage, min_rsquared)) - - ogu_cell_counts_wide_format_df = ogu_cell_counts_long_format_df.pivot( - index=OGU_ID_KEY, columns=SAMPLE_ID_KEY)[output_cell_counts_metric] - - # convert dataframe to biom table; input params are - # data (the "output_cell_count_metric"s), observation_ids (the "ogu_id"s), - # and sample_ids (er, the "sample_id"s) - ogu_cell_counts_biom = biom.Table( - ogu_cell_counts_wide_format_df.values, - ogu_cell_counts_wide_format_df.index, - ogu_cell_counts_wide_format_df.columns) - - return ogu_cell_counts_biom, log_msgs_list +REQUIRED_DNA_PREP_INFO_KEYS = [SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, + ELUTE_VOL_UL_KEY, SAMPLE_TOTAL_READS_KEY] def _calc_gdna_mass_to_sample_mass_by_sample_df( @@ -258,26 +61,22 @@ def _calc_gdna_mass_to_sample_mass_by_sample_df( Parameters ---------- absolute_quant_params_per_sample_df: pd.DataFrame - Dataframe of at least SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, + A Dataframe of at least SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, and ELUTE_VOL_UL_KEY for each sample. Returns ------- gdna_mass_to_sample_mass_by_sample_series : pd.Series - Series with index of sample id and values of the ratio of gDNA mass + A Series with index of sample id and values of the ratio of gDNA mass units extracted from each mass unit of input sample (only) mass. 
""" - working_df = absolute_quant_params_per_sample_df.copy() - # get the total grams of gDNA that are in the elute after extraction; - # this is sample-specific: - # concentration of gDNA after extraction in ng/uL times volume of elute - # from the extraction in uL, times 1/10^9 g/ng - working_df[GDNA_FROM_ALIQUOT_MASS_G_KEY] = \ - working_df[GDNA_CONCENTRATION_NG_UL_KEY] * \ - working_df[ELUTE_VOL_UL_KEY] / 10 ** 9 + # this is sample-specific + working_df = calc_gs_genomic_element_in_aliquot( + absolute_quant_params_per_sample_df, GDNA_CONCENTRATION_NG_UL_KEY, + GDNA_FROM_ALIQUOT_MASS_G_KEY) # determine how many mass units of gDNA are produced from the extraction of # each mass unit of sample material; this is sample-specific: @@ -299,7 +98,7 @@ def _series_to_df(a_series, index_col_name, val_col_name): Parameters ---------- a_series : pd.Series - Series to be converted to a dataframe. + A Series to be converted to a dataframe. index_col_name : str Name of the index-derived in the resulting dataframe. val_col_name : str @@ -308,7 +107,7 @@ def _series_to_df(a_series, index_col_name, val_col_name): Returns ------- a_df : pd.DataFrame - Dataframe with two columns, one from the index and one containing the + A Dataframe with two columns, one from the index and one containing the values from the input series. """ @@ -319,13 +118,13 @@ def _series_to_df(a_series, index_col_name, val_col_name): def _calc_long_format_ogu_cell_counts_df( - linregress_by_sample_id: dict[str, dict[str, float]], + linregress_by_sample_id: Dict[str, Dict[str, float]], ogu_counts_per_sample_df: pd.DataFrame, ogu_lengths_df: pd.DataFrame, - per_sample_mass_info_df: pd.DataFrame, + per_sample_calc_info_df: pd.DataFrame, read_length: int, min_coverage: float, - min_rsquared: float) -> (pd.DataFrame | None, list[str]): + min_rsquared: float) -> (Union[pd.DataFrame, None], List[str]): """Predicts the # of cells of each OGU in each sample from the read counts. @@ -336,14 +135,15 @@ def _calc_long_format_ogu_cell_counts_df( (if no model could be trained for that SAMPLE_ID_KEY) or a dictionary representation of the sample's LinregressResult. ogu_counts_per_sample_df: pd.DataFrame - Dataframe with a column for OGU_ID_KEY and then one additional column + A Dataframe with a column for OGU_ID_KEY and then one additional column for each sample id, which holds the read counts aligned to that OGU in that sample. ogu_lengths_df : pd.DataFrame - Dataframe of OGU_ID_KEY and OGU_LEN_IN_BP_KEY for each OGU. - per_sample_mass_info_df : pd.DataFrame - Dataframe of SAMPLE_ID_KEY, GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY, and - SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY for each sample. + A Dataframe of OGU_ID_KEY and OGU_LEN_IN_BP_KEY for each OGU. + per_sample_calc_info_df : pd.DataFrame + A Dataframe of SAMPLE_ID_KEY, GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY, + SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY, and SAMPLE_TOTAL_READS_KEY + for each sample. read_length : int Length of reads in bp (usually but not always 150). 
    min_coverage : float
@@ -381,11 +181,14 @@ def _calc_long_format_ogu_cell_counts_df(
         # gDNA in this sample and also per gram of stool in this sample
         curr_sample_df, curr_log_msgs = _calc_ogu_cell_counts_df_for_sample(
             curr_sample_id, linregress_by_sample_id,
-            per_sample_mass_info_df, working_df, min_rsquared)
+            per_sample_calc_info_df, working_df, min_rsquared)
         log_messages_list.extend(curr_log_msgs)
         if curr_sample_df is None:
             log_messages_list.append(f"No cell counts calculated for "
                                      f"sample {curr_sample_id}")
+
+            # NB: if no cell counts were calculated for this sample,
+            # this sample is left out of the final cell_counts_df.
             continue
 
         # if cell_counts_df does not yet exist, create it from curr_sample_df;
@@ -407,7 +210,7 @@ def _prepare_cell_counts_calc_df(
         ogu_counts_per_sample_df: pd.DataFrame,
         ogu_lengths_df: pd.DataFrame,
         read_length: int,
-        min_coverage: float) -> (pd.DataFrame, list[str]):
+        min_coverage: float) -> (pd.DataFrame, List[str]):
 
     """Prepares long-format dataframe containing fields needed for later calcs.
 
@@ -418,7 +221,7 @@ def _prepare_cell_counts_calc_df(
         column for each sample id, which holds the read counts aligned to that
         OGU in that sample.
     ogu_lengths_df : pd.DataFrame
-        Dataframe of OGU_ID_KEY and OGU_LEN_IN_BP_KEY for each OGU.
+        A Dataframe of OGU_ID_KEY and OGU_LEN_IN_BP_KEY for each OGU.
     read_length : int
         Length of reads in bp (usually but not always 150).
     min_coverage : float
@@ -486,11 +289,12 @@
 
 def _calc_ogu_cell_counts_df_for_sample(
         sample_id: str,
-        linregress_by_sample_id: dict[str, dict[str, float]],
-        per_sample_mass_info_df: pd.DataFrame,
+        linregress_by_sample_id: Dict[str, Dict[str, float]],
+        per_sample_info_df: pd.DataFrame,
         working_df: pd.DataFrame,
         min_rsquared: float,
-        is_test: Optional[bool] = False) -> (pd.DataFrame | None, list[str]):
+        is_test: Optional[bool] = False) \
+        -> (Union[pd.DataFrame, None], List[str]):
 
     """Calculates # cells of each OGU per gram of sample material for sample.
 
@@ -502,9 +306,10 @@ def _calc_ogu_cell_counts_df_for_sample(
         Dictionary keyed by sample id, containing for each sample either None
         (if no model could be trained for that SAMPLE_ID_KEY) or a dictionary
         representation of the sample's LinregressResult.
-    per_sample_mass_info_df : pd.DataFrame
-        Dataframe of SAMPLE_ID_KEY, GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY, and
-        SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY for each sample.
+    per_sample_info_df : pd.DataFrame
+        A Dataframe of SAMPLE_ID_KEY, GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY,
+        SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY, and SAMPLE_TOTAL_READS_KEY
+        for each sample.
    working_df : pd.DataFrame
        Long-format dataframe with columns for OGU_ID_KEY, SAMPLE_ID_KEY,
        OGU_READ_COUNT_KEY, and OGU_LEN_IN_BP_KEY
@@ -552,16 +357,22 @@ def _calc_ogu_cell_counts_df_for_sample(
     sample_df = working_df[
         working_df[SAMPLE_ID_KEY] == sample_id].copy()
 
-    # predict mass of each OGU's gDNA in this sample using the linear model
+    # get the total reads sequenced for this sample
+    sample_total_reads = per_sample_info_df.loc[
+        per_sample_info_df[SAMPLE_ID_KEY] == sample_id,
+        SAMPLE_TOTAL_READS_KEY].values[0]
+
+    # predict mass of each OGU's gDNA in this sample from its counts
+    # using the linear model
     ogu_gdna_masses = _calc_ogu_gdna_mass_ng_series_for_sample(
-        sample_df, linregress_result["slope"],
-        linregress_result["intercept"])
+        sample_df, linregress_result[SLOPE_KEY],
+        linregress_result[INTERCEPT_KEY], sample_total_reads)
     sample_df[OGU_GDNA_MASS_NG_KEY] = \
         sample_df[OGU_ID_KEY].map(ogu_gdna_masses)
 
     # get the mass of gDNA put into sequencing for this sample
-    sequenced_sample_gdna_mass_ng = per_sample_mass_info_df.loc[
-        per_sample_mass_info_df[SAMPLE_ID_KEY] == sample_id,
+    sequenced_sample_gdna_mass_ng = per_sample_info_df.loc[
+        per_sample_info_df[SAMPLE_ID_KEY] == sample_id,
         SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY].values[0]
 
     # calc the # of genomes of each OGU per gram of gDNA in this sample
@@ -581,8 +392,8 @@
     # calc the # of cells of each OGU per gram of actual sample material
     # (e.g., per gram of stool if these are fecal samples) for this sample
-    mass_ratio_for_sample = per_sample_mass_info_df.loc[
-        per_sample_mass_info_df[SAMPLE_ID_KEY] == sample_id,
+    mass_ratio_for_sample = per_sample_info_df.loc[
+        per_sample_info_df[SAMPLE_ID_KEY] == sample_id,
         GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY].values[0]
     sample_df[OGU_CELLS_PER_G_OF_SAMPLE_KEY] = \
         sample_df[OGU_CELLS_PER_G_OF_GDNA_KEY] * \
@@ -594,38 +405,38 @@
 
 def _calc_ogu_gdna_mass_ng_series_for_sample(
         sample_df: pd.DataFrame,
         sample_linregress_slope: float,
-        sample_linregress_intercept: float) -> pd.Series:
+        sample_linregress_intercept: float,
+        sample_total_reads: int) -> pd.Series:
 
     """Calculates mass of OGU gDNA in ng for each OGU in a sample.
 
     Parameters
     ----------
     sample_df: pd.DataFrame
-        Dataframe with rows for a single sample, containing at least columns
+        A Dataframe with rows for a single sample, containing at least columns
         for OGU_ID_KEY and OGU_READ_COUNT_KEY.
     sample_linregress_slope: float
         Slope of the linear regression model for the sample.
     sample_linregress_intercept: float
        Intercept of the linear regression model for the sample.
+    sample_total_reads: int
+        Total number of reads for the sample (including all reads, not just
+        aligned ones).
 
     Returns
     -------
-    ogu_genomes_per_g_of_gdna_series : pd.Series
-        Series with index of OGU_ID_KEY and values of the number of genomes
-        of each OGU per gram of gDNA in the sample.
+    ogu_gdna_mass_ng_series : pd.Series
+        A Series with index of OGU_ID_KEY and values of the predicted mass,
+        in ng, of the gDNA of each OGU in the sample.
""" working_df = sample_df.copy() - # calculate the total number of reads for this sample (a scalar) - # by summing read counts for all the rows in the sample table - total_reads_per_sample = working_df[OGU_READ_COUNT_KEY].sum() - # add a column of counts per million (CPM) for each ogu by dividing # each read_count by the total number of reads for this sample # and then multiplying by a million (1,000,000) # NB: dividing int/int in python gives float working_df[OGU_CPM_KEY] = (working_df[OGU_READ_COUNT_KEY] / - total_reads_per_sample) * 1000000 + sample_total_reads) * 1000000 # add column of log10(ogu CPM) by taking log base 10 of the ogu CPM column working_df[LOG_10_OGU_CPM_KEY] = np.log10(working_df[OGU_CPM_KEY]) @@ -660,7 +471,7 @@ def _calc_ogu_genomes_per_g_of_gdna_series_for_sample( Parameters ---------- sample_df: pd.DataFrame - Dataframe with rows related to only a single sample, containing + A Dataframe with rows related to only a single sample, containing at least columns for OGU_ID_KEY, OGU_LEN_IN_BP_KEY, and OGU_GDNA_MASS_NG_KEY. total_sample_gdna_mass_ng: float @@ -675,7 +486,7 @@ def _calc_ogu_genomes_per_g_of_gdna_series_for_sample( Returns ------- ogu_genomes_per_g_of_gdna_series : pd.Series - Series with index of OGU_ID_KEY and values of the number of genomes + A Series with index of OGU_ID_KEY and values of the number of genomes of each OGU per gram of gDNA of the sample. """ @@ -707,7 +518,7 @@ def _calc_ogu_genomes_series_for_sample( Parameters ---------- sample_df: pd.DataFrame - Dataframe with rows related to only a single sample, containing + A Dataframe with rows related to only a single sample, containing at least columns for OGU_ID_KEY, OGU_LEN_IN_BP_KEY, and OGU_GDNA_MASS_NG_KEY. is_test: Optional[bool] @@ -720,7 +531,7 @@ def _calc_ogu_genomes_series_for_sample( Returns ------- ogu_genomes_series : pd.Series - Series with index of OGU_ID_KEY and values of the number of genomes + A Series with index of OGU_ID_KEY and values of the number of genomes of each OGU in the sequenced sample. This calculates the total number of genomes for each OGU in the sequenced @@ -738,34 +549,243 @@ def _calc_ogu_genomes_series_for_sample( molecules--in this case, genomes--in a mole of a substance. """ - # seems weird to make this a variable since it's famously a constant, but.. - avogadros_num = 6.02214076e23 - # this is done so we can test against Livia's results, which use - # a truncated version of the constant. This should NOT be done in - # production. In testing, makes a difference of e.g., about 10 cells - # out of 25K for the first OGU in the first sample in Livia's dataset. - if is_test: - avogadros_num = 6.022e23 - - # TODO: do we have to worry about integer overflow here? - # Dan H. said, "if you use ints, the length * 650 * 10^9 - # can overflow integers with very long genomes". HOWEVER, - # the internet says that python *3* , "[o]nly floats have a hard - # limit in python. Integers are implemented as “long” integer - # objects of arbitrary size"(https://stackoverflow.com/a/52151786) - # HOWEVER HOWEVER, *numpy* integer types are fixed width, and - # "Some pandas and numpy functions, such as sum on arrays or - # Series return an np.int64 so this might be the reason you are - # seeing int overflows in Python3." - # (https://stackoverflow.com/a/58640340) - # What to do? 
- - numerator_series = sample_df[OGU_GDNA_MASS_NG_KEY] * avogadros_num - denominator_series = sample_df[OGU_LEN_IN_BP_KEY] * 650 * 1e9 - - ogu_genomes_series = numerator_series/denominator_series + ogu_copies_per_g_series = calc_copies_genomic_element_per_g_series( + sample_df[OGU_LEN_IN_BP_KEY], DNA_BASEPAIR_G_PER_MOLE, is_test=is_test) + ogu_copies_per_extracted_sample_series = \ + sample_df[OGU_GDNA_MASS_NG_KEY] * \ + ogu_copies_per_g_series / NANOGRAMS_PER_GRAM # Set the index of the series to be the OGU_ID_KEY - ogu_genomes_series.index = sample_df[OGU_ID_KEY] + ogu_copies_per_extracted_sample_series.index = sample_df[OGU_ID_KEY] + return ogu_copies_per_extracted_sample_series + + +def calc_ogu_cell_counts_biom( + absolute_quant_params_per_sample_df: pd.DataFrame, + linregress_by_sample_id: Dict[str, Dict[str, float]], + ogu_counts_per_sample_biom: biom.Table, + ogu_lengths_df: pd.DataFrame, + read_length: int, + min_coverage: float, + min_rsquared: float, + output_cell_counts_metric: str) -> (biom.Table, List[str]): + + """Calcs input cell count metric for each ogu & sample via linear models. + + Parameters + ---------- + absolute_quant_params_per_sample_df: pd.DataFrame + A Dataframe of at least SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, + SAMPLE_IN_ALIQUOT_MASS_G_KEY, ELUTE_VOL_UL_KEY, + SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY, and SAMPLE_TOTAL_READS_KEY + for each sample. + linregress_by_sample_id : dict[str, dict[str: float]] + Dictionary keyed by sample id, containing for each sample either None + (if no model could be trained for that SAMPLE_ID_KEY) or a dictionary + representation of the sample's LinregressResult. + ogu_counts_per_sample_biom: biom.Table + Biom table holding the read counts aligned to each OGU in each sample. + ogu_lengths_df : pd.DataFrame + A Dataframe of OGU_ID_KEY and OGU_LEN_IN_BP_KEY for each OGU. + read_length : int + Length of reads in bp (usually but not always 150). + min_coverage : float + Minimum allowable coverage of an OGU needed to include that OGU + in the output. + min_rsquared: float + Minimum allowable R^2 value for the linear regression model for a + sample; any sample with an R^2 value less than this will be excluded + from the output. + output_cell_counts_metric : str + Name of the desired output cell count metric; options are + OGU_CELLS_PER_G_OF_GDNA_KEY and OGU_CELLS_PER_G_OF_SAMPLE_KEY. + + Returns + ------- + ogu_cell_counts_biom : biom.Table + Dataframe with a column for OGU_ID_KEY and then one additional column + for each sample id, which holds the predicted number of cells per gram + of sample material of that OGU in that sample. + log_messages_list : list[str] + List of strings containing log messages generated by this function. + """ + + # check if the inputs all have the required columns + required_cols_list = list( + {SAMPLE_ID_KEY, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY} | + set(REQUIRED_DNA_PREP_INFO_KEYS)) + validate_required_columns_exist( + absolute_quant_params_per_sample_df, required_cols_list, + "sample info is missing required column(s)") + + # Check if any samples in the reads data are missing from the metadata; + # Not bothering to report samples that are in metadata but not the reads-- + # maybe those failed the sequencing run. 
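+    # (i.e., a sample with reads but no metadata row is treated as an error,
+    # while a sample with a metadata row but no reads is tolerated here.)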
+    _ = validate_metadata_vs_reads_id_consistency(
+        absolute_quant_params_per_sample_df, ogu_counts_per_sample_biom)
+
+    working_params_df = absolute_quant_params_per_sample_df.copy()
+
+    # cast the GDNA_CONCENTRATION_NG_UL_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY,
+    # ELUTE_VOL_UL_KEY, and SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY columns of
+    # params df to float if they aren't already
+    for col in [GDNA_CONCENTRATION_NG_UL_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY,
+                ELUTE_VOL_UL_KEY, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY]:
+        if working_params_df[col].dtype != float:
+            working_params_df[col] = \
+                working_params_df[col].astype(float)
+
+    # calculate the ratio of extracted gDNA mass to sample mass put into
+    # extraction for each sample
+    gdna_mass_to_sample_mass_by_sample_series = \
+        _calc_gdna_mass_to_sample_mass_by_sample_df(working_params_df)
+    per_sample_calc_info_df = _series_to_df(
+        gdna_mass_to_sample_mass_by_sample_series, SAMPLE_ID_KEY,
+        GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY)
+
+    # merge the SAMPLE_TOTAL_READS_KEY and SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY
+    # columns of working_params_df into per_sample_calc_info_df
+    # by SAMPLE_ID_KEY
+    per_sample_calc_info_df = per_sample_calc_info_df.merge(
+        working_params_df[[SAMPLE_ID_KEY, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY,
+                           SAMPLE_TOTAL_READS_KEY]],
+        on=SAMPLE_ID_KEY, how='left')
+
+    # convert input biom table to a dataframe with sparse columns, which
+    # should act basically the same as a dense dataframe but use less memory
+    ogu_counts_per_sample_df = ogu_counts_per_sample_biom.to_dataframe(
+        dense=False)
+
+    ogu_cell_counts_long_format_df, log_msgs_list = (
+        _calc_long_format_ogu_cell_counts_df(
+            linregress_by_sample_id, ogu_counts_per_sample_df,
+            ogu_lengths_df, per_sample_calc_info_df, read_length,
+            min_coverage, min_rsquared))
+
+    ogu_cell_counts_wide_format_df = ogu_cell_counts_long_format_df.pivot(
+        index=OGU_ID_KEY, columns=SAMPLE_ID_KEY)[output_cell_counts_metric]
+
+    # replace NaNs with 0s; per Daniel McDonald, much downstream analysis
+    # cannot handle NaNs, and it is preferable to set invalid values
+    # to 0 and provide a log message saying they are not usable than to leave
+    # them as NaNs
+    ogu_cell_counts_wide_format_df.fillna(0, inplace=True)
+
+    # convert dataframe to biom table; input params are
+    # data (the "output_cell_count_metric"s), observation_ids (the "ogu_id"s),
+    # and sample_ids (er, the "sample_id"s)
+    ogu_cell_counts_biom = biom.Table(
+        ogu_cell_counts_wide_format_df.values,
+        ogu_cell_counts_wide_format_df.index,
+        ogu_cell_counts_wide_format_df.columns)
+
+    return ogu_cell_counts_biom, log_msgs_list
+
+
+def calc_ogu_cell_counts_per_g_of_sample_for_qiita(
+        sample_info_df: pd.DataFrame,
+        prep_info_df: pd.DataFrame,
+        linregress_by_sample_id_fp: str,
+        ogu_counts_per_sample_biom: biom.Table,
+        ogu_lengths_fp: str,
+        read_length: int = DEFAULT_READ_LENGTH,
+        min_coverage: float = DEFAULT_MIN_COVERAGE,
+        min_rsquared: float = DEFAULT_MIN_RSQUARED,
+        syndna_mass_fraction_of_sample: float =
+        DEFAULT_SYNDNA_MASS_FRACTION_OF_SAMPLE) \
+        -> Dict[str, Union[str, biom.Table]]:
+
+    """Gets # of cells of each OGU/g of sample for samples from Qiita.
+
+    Parameters
+    ----------
+    sample_info_df: pd.DataFrame
+        A Dataframe containing sample info for all samples in the prep,
+        including SAMPLE_ID_KEY and SAMPLE_IN_ALIQUOT_MASS_G_KEY.
+    prep_info_df: pd.DataFrame
+        A Dataframe containing prep info for all samples in the prep,
+        including SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY,
+        ELUTE_VOL_UL_KEY, SYNDNA_POOL_MASS_NG_KEY, and SAMPLE_TOTAL_READS_KEY.
+    linregress_by_sample_id_fp: str
+        String containing the filepath to the yaml file holding the
+        dictionary keyed by sample id, containing for each sample a dictionary
+        representation of the sample's LinregressResult.
+    ogu_counts_per_sample_biom: biom.Table
+        Biom table holding the read counts aligned to each OGU in each sample.
+    ogu_lengths_fp : str
+        String containing the filepath to a tab-separated, two-column,
+        no-header file in which the first column is the OGU id and the
+        second is the OGU length in basepairs.
+    read_length : int
+        Length of reads in bp (usually but not always 150).
+    min_coverage : float
+        Minimum allowable coverage of an OGU needed to include that OGU
+        in the output.
+    min_rsquared: float
+        Minimum allowable R^2 value for the linear regression model for a
+        sample; any sample with an R^2 value less than this will be excluded
+        from the output.
+    syndna_mass_fraction_of_sample: float
+        Fraction of the mass of the sample that is added as syndna (usually
+        0.05, which is to say 5%).
+
+    Returns
+    -------
+    output_by_out_type : dict of str or biom.Table
+        Dictionary of outputs keyed by their type. Currently, the following
+        keys are defined:
+        CELL_COUNT_RESULT_KEY: biom.Table holding the calculated number of
+        cells per gram of sample material for each OGU in each sample.
+        CELL_COUNT_LOG_KEY: log of messages from the cell count calc process.
+    """
+
+    # check if the inputs all have the required columns
+    validate_required_columns_exist(
+        sample_info_df, REQUIRED_SAMPLE_INFO_KEYS,
+        "sample info is missing required column(s)")
+
+    required_prep_cols = list(
+        {SYNDNA_POOL_MASS_NG_KEY} | set(REQUIRED_DNA_PREP_INFO_KEYS))
+    validate_required_columns_exist(
+        prep_info_df, required_prep_cols,
+        "prep info is missing required column(s)")
+
+    # Check if any samples in the prep are missing from the sample info;
+    # Not bothering to report samples that are in sample info but not the prep
+    # --maybe those just weren't included in this prep.
+    _ = validate_metadata_vs_prep_id_consistency(
+        sample_info_df, prep_info_df)
+
+    # calculate the mass of gDNA sequenced for each sample. We have the
+    # mass of syndna pool that was added to each sample, and we know that the
+    # syndna pool mass is calculated to be a certain percentage of the mass of
+    # the sample (added into the library prep in addition to the sample mass).
+    # Therefore, if the syndna fraction is 0.05 or 5%, the mass of the sample
+    # gDNA put into sequencing is 1/0.05 = 20x the mass of syndna pool added.
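+    # (Illustrative numbers: at the default fraction of 0.05, a sample whose
+    # prep lists 2 ng of syndna pool mass is taken to have contributed
+    # 2 * (1 / 0.05) = 40 ng of gDNA to sequencing.)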
+ prep_info_df[SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY] = \ + prep_info_df[SYNDNA_POOL_MASS_NG_KEY] * \ + (1 / syndna_mass_fraction_of_sample) + + # merge the sample info and prep info dataframes + absolute_quant_params_per_sample_df = \ + sample_info_df.merge(prep_info_df, on=SAMPLE_ID_KEY, how='left') + + # read in the linregress_by_sample_id yaml file + with open(linregress_by_sample_id_fp) as f: + linregress_by_sample_id = yaml.load(f, Loader=yaml.FullLoader) + + # read in the ogu_lengths file + ogu_lengths_df = pd.read_csv(ogu_lengths_fp, sep='\t', header=None, + names=[OGU_ID_KEY, OGU_LEN_IN_BP_KEY]) + + # calculate # cells per gram of sample material of each OGU in each sample + output_biom, log_msgs_list = calc_ogu_cell_counts_biom( + absolute_quant_params_per_sample_df, linregress_by_sample_id, + ogu_counts_per_sample_biom, ogu_lengths_df, read_length, min_coverage, + min_rsquared, OGU_CELLS_PER_G_OF_SAMPLE_KEY) + + out_txt_by_out_type = { + CELL_COUNT_RESULT_KEY: output_biom, + CELL_COUNT_LOG_KEY: '\n'.join(log_msgs_list)} + + return out_txt_by_out_type diff --git a/pysyndna/src/fit_syndna_models.py b/pysyndna/src/fit_syndna_models.py index b0ddb05..f791808 100644 --- a/pysyndna/src/fit_syndna_models.py +++ b/pysyndna/src/fit_syndna_models.py @@ -5,20 +5,21 @@ import os import pandas as pd import scipy +import traceback import yaml -from typing import Optional +from typing import Optional, List, Dict, Union +from pysyndna.src.util import validate_required_columns_exist, \ + validate_metadata_vs_reads_id_consistency, SAMPLE_ID_KEY DEFAULT_MIN_SAMPLE_COUNTS = 1 -# NB: sample_name instead of sample_id bc that's what qiita uses -SAMPLE_ID_KEY = 'sample_name' SYNDNA_ID_KEY = 'syndna_id' SYNDNA_POOL_NUM_KEY = 'syndna_pool_number' SYNDNA_INDIV_NG_UL_KEY = 'syndna_indiv_ng_ul' SYNDNA_FRACTION_OF_POOL_KEY = 'syndna_fraction_of_pool' SYNDNA_POOL_MASS_NG_KEY = 'mass_syndna_input_ng' -SYNDNA_TOTAL_READS_KEY = 'raw_reads_r1r2' +SAMPLE_TOTAL_READS_KEY = 'raw_reads_r1r2' SYNDNA_COUNTS_KEY = 'read_count' COUNTS_PER_MIL_KEY = 'CPM' LOG10_COUNTS_PER_MIL_KEY = 'log10_CPM' @@ -26,114 +27,14 @@ LOG10_SYNDNA_INDIV_NG_KEY = 'log10_syndna_ng' LIN_REGRESS_RESULT_KEY = 'lin_regress_by_sample_id' FIT_SYNDNA_MODELS_LOG_KEY = 'fit_syndna_models_log' - - -# TODO: if they sequenced over multiple lanes, would be different prep -# info files--talk to lab about whether they will ever do this :( -# this would require merge of multiple preparations -def fit_linear_regression_models_for_qiita( - prep_info_df: pd.DataFrame, - reads_per_syndna_per_sample_biom: biom.Table, - min_sample_counts: int = DEFAULT_MIN_SAMPLE_COUNTS, - syndna_pool_config_fp: Optional[str] = None) -> dict[str: str]: - - """Fits linear regressions predicting mass from counts using Qiita inputs. - - Parameters - ---------- - prep_info_df: pd.DataFrame - Dataframe containing prep info for all samples in the prep, - including SAMPLE_ID, SYNDNA_POOL_NUM_KEY, SYNDNA_POOL_MASS_NG_KEY, - and SYNDNA_TOTAL_READS_KEY - reads_per_syndna_per_sample_biom: biom.Table - Biom table holding read counts aligned to each synDNA in each sample. - Note: should already have combined forward and reverse counts. - min_sample_counts: int - Minimum number of counts required for a sample to be included in - the regression. Samples with fewer counts will be excluded. - syndna_pool_config_fp: str, optional - Path to the yaml file holding the concentrations of each syndna - in the syndna pool used in this experiment. 
If not provided, will - look for the config.yml file in the parent directory of this file. - - Returns - ------- - out_txt_by_out_type : dict of str - Dictionary of output strings (ready to be written to files) keyed - by the type of output they contain. Currently, the following keys - are defined: - LIN_REGRESS_RESULT_KEY: yaml of dict[str, dict[str, float]] - FIT_SYNDNA_MODELS_LOG_KEY: txt log of messages from the fitting process - """ - - # check that the prep_info_df has the expected columns - expected_prep_info_cols = [ - SAMPLE_ID_KEY, SYNDNA_POOL_NUM_KEY, SYNDNA_POOL_MASS_NG_KEY, - SYNDNA_TOTAL_READS_KEY] - _validate_required_columns_exist( - prep_info_df, expected_prep_info_cols, - "prep info is missing required column(s)") - - # pull the syndna pool number from the prep info, ensure it is the same for - # all samples, and convert to the pool name - syndna_pool_number = prep_info_df[SYNDNA_POOL_NUM_KEY].unique() - if len(syndna_pool_number) > 1: - raise ValueError( - f"Multiple syndna_pool_numbers found in prep info: " - f"{syndna_pool_number}") - syndna_pool_name = f"pool{syndna_pool_number[0]}" - - # look in the SYNDNA_INDIV_NG_UL_KEY section of the config file to find the - # individual syndna concentrations associated with the relevant syndna - # pool name and turn the resulting dictionary into a dataframe - config_dict = _extract_config_dict(syndna_pool_config_fp) - conc_ng_ul_per_indiv_syndna = \ - config_dict[SYNDNA_INDIV_NG_UL_KEY][syndna_pool_name] - syndna_concs_df = pd.DataFrame( - conc_ng_ul_per_indiv_syndna.items(), - columns=[SYNDNA_ID_KEY, SYNDNA_INDIV_NG_UL_KEY]) - - # convert input biom table to a pd.SparseDataFrame, which is should act - # basically like a pd.DataFrame but take up less memory - reads_per_syndna_per_sample_df = \ - reads_per_syndna_per_sample_biom.to_dataframe(dense=False) - - # fit linear regression models for each sample - linregress_by_sample_id, msg_list = fit_linear_regression_models( - syndna_concs_df, prep_info_df, reads_per_syndna_per_sample_df, - min_sample_counts) - linregress_results_dict = _convert_linregressresults_to_dict( - linregress_by_sample_id) - - out_txt_by_out_type = { - LIN_REGRESS_RESULT_KEY: yaml.safe_dump(linregress_results_dict), - FIT_SYNDNA_MODELS_LOG_KEY: '\n'.join(msg_list)} - - return out_txt_by_out_type - - -def _validate_required_columns_exist( - input_df: pd.DataFrame, - required_cols_list: list[str], - error_msg: str): - - """Checks that the input dataframe has the required columns. - - Parameters - ---------- - input_df: pd.DataFrame - Dataframe to be checked. - required_cols_list: list[str] - List of column names that must be present in the dataframe. - error_msg: str - Error message to be raised if any of the required columns are missing. 
- """ - - missing_cols = set(required_cols_list) - set(input_df.columns) - if len(missing_cols) > 0: - missing_cols = sorted(missing_cols) - raise ValueError( - f"{error_msg}: {missing_cols}") +SLOPE_KEY = 'slope' +INTERCEPT_KEY = 'intercept' +RVALUE_KEY = 'rvalue' +PVALUE_KEY = 'pvalue' +STDERR_KEY = 'stderr' +INTERCEPT_STDERR_KEY = 'intercept_stderr' +REGRESSION_KEYS = [SLOPE_KEY, INTERCEPT_KEY, RVALUE_KEY, PVALUE_KEY, + STDERR_KEY, INTERCEPT_STDERR_KEY] def _extract_config_dict(config_fp=None): @@ -161,113 +62,6 @@ def _extract_config_dict(config_fp=None): return config_dict -def fit_linear_regression_models( - syndna_concs_df: pd.DataFrame, - sample_syndna_weights_and_total_reads_df: pd.DataFrame, - reads_per_syndna_per_sample_df: pd.DataFrame, - min_sample_counts: int) -> \ - (dict[str, scipy.stats.LinregressResult], list[str]): - - """Fits per-sample linear regression models predicting mass from counts. - - This fits a linear regression model for each sample, predicting - log10(mass of instances of a sequence) within a sample from - log10(counts per million for that sequence) within the sample, - using spike-in data from synDNAs. - - Parameters - ---------- - syndna_concs_df: pd.DataFrame - Dataframe containing SYNDNA_ID_KEY and SYNDNA_INDIV_NG_UL_KEY - (e.g. 1, 0.1, 0.01, 0.001, 0.0001) for all syndnas in the syndna pool - used in this experiment - sample_syndna_weights_and_total_reads_df: pd.DataFrame - Dataframe containing at least SAMPLE_ID_KEY, SYNDNA_POOL_MASS_NG_KEY - (the total weight of all syndnas in the sample combined, in ng), and - SYNDNA_TOTAL_READS_KEY (the number of total reads--not just aligned - reads--for all syndnas in the sample, including both r1 and r2) - reads_per_syndna_per_sample_df: pd.DataFrame - Wide-format dataframe with syndna ids as index and one - column for each sample id, which holds the read counts - aligned to that syndna in that sample. Note: should already have - combined forward and reverse counts. - min_sample_counts : int - Minimum number of counts required for a sample to be included in - the regression. Samples with fewer counts will be excluded. - - Returns - ------- - linregress_by_sample_id : dict[str, scipy.stats.LinregressResult] - returns a dictionary keyed by sample_id, for each sample_id in - reads_per_syndna_per_sample_df. Dictionary values are either None - (if no model could be trained for that sample_id) or a - scipy.stats.LinregressResult object defining the trained model. - Suitable for pickling to a file. - log_messages_list : list[str] - List of log messages generated during the fitting process. - """ - - log_messages_list = [] - - # id any syndnas that have an inadequate total number of reads aligned - # to them across all samples (less than min_sample_counts). Don't drop yet. - # Gathering this now bc it is easier while syndna id is still in the index, - # but we want the full column set while doing the validation checks. - # Note: synDNA author also made passing mention of dropping samples with - # inadequate "quality" but didn't provide any guidance on that. 
- too_low_counts_mask = \ - reads_per_syndna_per_sample_df.sum(axis=1) < min_sample_counts - syndnas_to_drop = \ - reads_per_syndna_per_sample_df[too_low_counts_mask].index.tolist() - - # move the syndna ids from the index to a column, bc I hate implicit - reads_per_syndna_per_sample_df = \ - reads_per_syndna_per_sample_df.reset_index(names=[SYNDNA_ID_KEY]) - - # validate that the syndna ids in the config and the data are consistent - _validate_syndna_id_consistency(syndna_concs_df, - reads_per_syndna_per_sample_df) - - # validate that sample ids in the experiment info and data are consistent - missing_sample_ids = _validate_sample_id_consistency( - sample_syndna_weights_and_total_reads_df, - reads_per_syndna_per_sample_df) - if missing_sample_ids is not None: - log_messages_list.append(f'The following sample ids were in the ' - f'experiment info but not in the data: ' - f'{missing_sample_ids}') - - # NOW remove any syndnas with too few counts from the dataframe, - # and log if there were any - filtered_reads_per_syndna_per_sample_df = \ - reads_per_syndna_per_sample_df[ - ~reads_per_syndna_per_sample_df[SYNDNA_ID_KEY].isin( - syndnas_to_drop)] - if len(syndnas_to_drop) > 0: - log_messages_list.append(f'The following syndnas were dropped ' - f'because they had fewer than ' - f'{min_sample_counts} total reads aligned:' - f'{syndnas_to_drop}') - - # reformat filtered_reads_per_syndna_per_sample_df into "long form": - # columns for syndna id, sample id, and read count - working_df = filtered_reads_per_syndna_per_sample_df.melt( - id_vars=[SYNDNA_ID_KEY], var_name=SAMPLE_ID_KEY, - value_name=SYNDNA_COUNTS_KEY) - - # merge w sample_total_reads_df to include total_reads column - working_df = working_df.merge(sample_syndna_weights_and_total_reads_df, - on=SAMPLE_ID_KEY, how='left') - - # calculate the weight in ng of *each* syndna in each sample - working_df = _calc_indiv_syndna_weights(syndna_concs_df, working_df) - - # fit linear regression models for each sample - linregress_by_sample_id = _fit_linear_regression_models(working_df) - - return linregress_by_sample_id, log_messages_list - - def _validate_syndna_id_consistency( syndna_concs_df: pd.DataFrame, reads_per_syndna_per_sample_df: pd.DataFrame): @@ -317,19 +111,20 @@ def _validate_syndna_id_consistency( def _validate_sample_id_consistency( sample_syndna_weights_and_total_reads_df: pd.DataFrame, - reads_per_syndna_per_sample_df: pd.DataFrame) -> list[str] | None: + reads_per_syndna_per_sample_df: pd.DataFrame) -> \ + Union[List[str], None]: """ Checks that the sample ids in the experiment info and data are consistent. Parameters ---------- sample_syndna_weights_and_total_reads_df: pd.DataFrame - Dataframe containing at least SAMPLE_ID_KEY, SYNDNA_POOL_MASS_NG_KEY + A Dataframe containing at least SAMPLE_ID_KEY, SYNDNA_POOL_MASS_NG_KEY (the total weight of all syndnas in the sample combined, in ng), and - SYNDNA_TOTAL_READS_KEY (the number of total reads--not just aligned - reads--for all syndnas in the sample, including both r1 and r2) + SAMPLE_TOTAL_READS_KEY (the number of total reads--not just aligned + reads--for the sample, including both r1 and r2) reads_per_syndna_per_sample_df: pd.DataFrame - Dataframe with a column for syndna_id and then one additional column + A Dataframe with a column for syndna_id and then one additional column for each sample_id, which holds the read counts aligned to that syndna in that sample. Note: should already have combined forward and reverse counts. 
@@ -346,30 +141,11 @@ def _validate_sample_id_consistency( data. None if all sample ids in the experiment info were in the data. """ - sample_ids_in_info = \ - set(sample_syndna_weights_and_total_reads_df[SAMPLE_ID_KEY]) - sample_ids_in_data = set(reads_per_syndna_per_sample_df.columns) - sample_ids_in_data.remove(SYNDNA_ID_KEY) + simplified_reads_df = reads_per_syndna_per_sample_df.copy() + simplified_reads_df.drop(columns=[SYNDNA_ID_KEY], inplace=True) - # if there are sample ids in the data that are not in the info, raise - # an error, since we don't know how to process that - data_only_samples = sample_ids_in_data - sample_ids_in_info - if len(data_only_samples) > 0: - raise ValueError( - f"Found sample ids in reads_per_syndna_per_sample_df that were " - f"not in sample_syndna_weights_and_total_reads_df: " - f"{data_only_samples}") - - # check if there are sample ids in the info that are not in the data and - # if so, capture a list of them. Sometimes a sample just fails sequencing - # and that shouldn't preclude processing the others that did work, but we - # want to know about it. - missing_sample_ids_set = sample_ids_in_info - sample_ids_in_data - - if len(missing_sample_ids_set) > 0: - missing_sample_ids = list(missing_sample_ids_set) - else: - missing_sample_ids = None + missing_sample_ids = validate_metadata_vs_reads_id_consistency( + sample_syndna_weights_and_total_reads_df, simplified_reads_df) return missing_sample_ids @@ -383,7 +159,7 @@ def _calc_indiv_syndna_weights( Parameters ---------- syndna_concs_df: pd.DataFrame - Dataframe containing SYNDNA_ID_KEY and SYNDNA_INDIV_NG_UL_KEY + A Dataframe containing SYNDNA_ID_KEY and SYNDNA_INDIV_NG_UL_KEY (e.g. 1, 0.1, 0.01, 0.001, 0.0001) for all syndnas in the syndna pool used in this experiment working_df: pd.DataFrame @@ -423,7 +199,7 @@ def _calc_indiv_syndna_weights( def _fit_linear_regression_models(working_df: pd.DataFrame) -> \ - dict[str, scipy.stats.LinregressResult]: + (Dict[str, Union[scipy.stats.LinregressResult, None]], List[str]): """Fits per-sample linear regression models predicting mass from counts. @@ -436,15 +212,18 @@ def _fit_linear_regression_models(working_df: pd.DataFrame) -> \ ---------- working_df: pd.DataFrame Long-form dataframe containing at least SAMPLE_ID_KEY, - SYNDNA_COUNTS_KEY, SYNDNA_TOTAL_READS_KEY, and + SYNDNA_COUNTS_KEY, SAMPLE_TOTAL_READS_KEY, and SYNDNA_INDIV_NG_KEY columns. Returns ------- - linregress_by_sample_id : dict[str, scipy.stats.LinregressResult] + linregress_by_sample_id : dict[str, scipy.stats.LinregressResult | None] returns a dictionary keyed by sample_id, for each sample_id in reads_per_syndna_per_sample_df. Dictionary values are - scipy.stats.LinregressResult objects defining the trained models. + scipy.stats.LinregressResult objects defining the trained models, or + None if no model could be fit. + log_msgs_list : list[str] + List of messages generated during the fitting process. 
""" # drop any rows where the count value is 0--can't take log of 0 @@ -455,7 +234,7 @@ def _fit_linear_regression_models(working_df: pd.DataFrame) -> \ # then multiplying by a million (1,000,000) working_df.loc[:, COUNTS_PER_MIL_KEY] = \ (working_df[SYNDNA_COUNTS_KEY] / - working_df[SYNDNA_TOTAL_READS_KEY]) * 1000000 + working_df[SAMPLE_TOTAL_READS_KEY]) * 1000000 # add a column of log10(CMP) by taking the log base 10 of the CPM column working_df.loc[:, LOG10_COUNTS_PER_MIL_KEY] = \ @@ -468,44 +247,48 @@ def _fit_linear_regression_models(working_df: pd.DataFrame) -> \ # loop over each sample id and fit a linear regression model predicting # log10(dna ng) from log10(counts per million) linregress_by_sample_id = {} + log_msgs_list = [] for curr_sample_id in working_df[SAMPLE_ID_KEY].unique(): curr_sample_df = \ working_df[working_df[SAMPLE_ID_KEY] == curr_sample_id] - # TODO: I need to know what kind of errors this can throw; some of them - # may just mean a linear regression can't be fit for this sample, but - # others may mean something is wrong with the data or the code. - # Once I know which is which, I can decide whether to try/catch - # anything silently. - try: curr_linregress_result = scipy.stats.linregress( curr_sample_df[LOG10_COUNTS_PER_MIL_KEY], curr_sample_df[LOG10_SYNDNA_INDIV_NG_KEY]) except Exception: + # TODO: I need to know what kind of errors this can throw; + # some of them may just mean a linear regression can't be fit for + # this sample, but others may mean something is wrong with the + # data (or the code). Once I know which is which, I can decide + # whether to try/catch things silently. + # if the regression fails, log the error and set the result to None + log_msgs_list.append( + f"Error fitting regression model for '{curr_sample_id}': ") + log_msgs_list.append(traceback.format_exc()) curr_linregress_result = None # record the whole lingregress result object in the output dictionary linregress_by_sample_id[curr_sample_id] = curr_linregress_result # next sample_id - return linregress_by_sample_id + return linregress_by_sample_id, log_msgs_list def _convert_linregressresults_to_dict( - linregress_by_sample_id: dict[str, scipy.stats.LinregressResult]) -> \ - dict[str, dict[str, float]]: + linregress_by_sample_id: Dict[str, Union[scipy.stats.LinregressResult, None]] + ) -> Dict[str, Union[Dict[str, float], None]]: - """Converts a scipy.stats.LinregressResult object to a dictionary. + """Converts scipy.stats.LinregressResult dict to dict of primitives. Returns ------- - linregress_result_dict : dict[str, dict[str, float]] + linregress_result_dict : dict[str, dict[str, float] | None] Dictionary keyed by sample id, containing for each sample either None (if no model could be trained for that SAMPLE_ID_KEY) or a dictionary representation of the sample's LinregressResult, with each property - name as a key and that property's value as the value. Values are - rounded to no more than 15 decimal places. + name as a key and that property's value as the value, as a float. + Values are rounded to no more than 15 decimal places. """ linregress_result_dict = {} @@ -530,6 +313,13 @@ def _convert_linregressresults_to_dict( # and sometimes differs between mac/ubuntu past this point. 
new_dict[k] = truncate(new_float, 12) + # if there are any values in REGRESSION_KEYS that are not in the + # keys of new_dict, then raise an error + missing_keys = set(REGRESSION_KEYS) - set(new_dict.keys()) + if len(missing_keys) > 0: + raise ValueError( + f"Regression for sample {curr_sample_id} does not " + f"include the following required keys: {missing_keys}") linregress_result_dict[curr_sample_id] = new_dict return linregress_result_dict @@ -541,17 +331,217 @@ def truncate(a_float, num_decimals): Parameters ---------- a_float : float - Float to be truncated. + A Float to be truncated. num_decimals : int Number of decimal places to which the float should be truncated. Returns ------- truncated_float : float - Float truncated to the specified number of decimal places. + A Float truncated to the specified number of decimal places. """ # multiply a_float by 10^num_decimals, convert to an integer, then divide # by 10^num_decimals to get the truncated float truncated_float = int(a_float * 10 ** num_decimals) / 10 ** num_decimals return truncated_float + + +def fit_linear_regression_models( + syndna_concs_df: pd.DataFrame, + sample_syndna_weights_and_total_reads_df: pd.DataFrame, + reads_per_syndna_per_sample_df: pd.DataFrame, + min_sample_counts: int) -> \ + (Dict[str, Union[Dict[str, float], None]], List[str]): + + """Fits per-sample linear regression models predicting mass from counts. + + This fits a linear regression model for each sample, predicting + log10(mass of instances of a sequence) within a sample from + log10(counts per million for that sequence) within the sample, + using spike-in data from synDNAs. + + Parameters + ---------- + syndna_concs_df: pd.DataFrame + A Dataframe containing SYNDNA_ID_KEY and SYNDNA_INDIV_NG_UL_KEY + (e.g. 1, 0.1, 0.01, 0.001, 0.0001) for all syndnas in the syndna pool + used in this experiment + sample_syndna_weights_and_total_reads_df: pd.DataFrame + A Dataframe containing at least SAMPLE_ID_KEY, SYNDNA_POOL_MASS_NG_KEY + (the total weight of all syndnas in the sample combined, in ng), and + SAMPLE_TOTAL_READS_KEY (the number of total reads--not just aligned + reads--for the sample, including both r1 and r2) + reads_per_syndna_per_sample_df: pd.DataFrame + Wide-format dataframe with syndna ids as index and one + column for each sample id, which holds the read counts + aligned to that syndna in that sample. Note: should already have + combined forward and reverse counts. + min_sample_counts : int + Minimum number of counts required for a sample to be included in + the regression. Samples with fewer counts will be excluded. + + Returns + ------- + linregress_result_dict : dict[str, dict[str, float] | None] + Dictionary keyed by sample id, containing for each sample either None + (if no model could be trained for that SAMPLE_ID_KEY) or a dictionary + representation of the sample's LinregressResult, with each property + name as a key and that property's value as the value, as a float. + Values are rounded to no more than 15 decimal places. + log_messages_list : list[str] + List of log messages generated during the fitting process. 
+    """
+
+    log_messages_list = []
+
+    # check sample_syndna_weights_and_total_reads_df has the expected columns
+    expected_info_cols = [
+        SAMPLE_ID_KEY, SYNDNA_POOL_MASS_NG_KEY, SAMPLE_TOTAL_READS_KEY]
+    validate_required_columns_exist(
+        sample_syndna_weights_and_total_reads_df, expected_info_cols,
+        "sample metadata is missing required column(s)")
+
+    # id any syndnas that have an inadequate total number of reads aligned
+    # to them across all samples (less than min_sample_counts). Don't drop yet.
+    # Gathering this now bc it is easier while syndna id is still in the index,
+    # but we want the full column set while doing the validation checks.
+    # Note: synDNA author also made passing mention of dropping samples with
+    # inadequate "quality" but didn't provide any guidance on that.
+    too_low_counts_mask = \
+        reads_per_syndna_per_sample_df.sum(axis=1) < min_sample_counts
+    syndnas_to_drop = \
+        reads_per_syndna_per_sample_df[too_low_counts_mask].index.tolist()
+
+    # move the syndna ids from the index to a column, bc I hate implicit
+    reads_per_syndna_per_sample_df = \
+        reads_per_syndna_per_sample_df.reset_index(names=[SYNDNA_ID_KEY])
+
+    # validate that the syndna ids in the config and the data are consistent
+    _validate_syndna_id_consistency(syndna_concs_df,
+                                    reads_per_syndna_per_sample_df)
+
+    # validate that sample ids in the experiment info and data are consistent
+    missing_sample_ids = _validate_sample_id_consistency(
+        sample_syndna_weights_and_total_reads_df,
+        reads_per_syndna_per_sample_df)
+    if missing_sample_ids is not None:
+        log_messages_list.append(f'The following sample ids were in the '
+                                 f'experiment info but not in the data: '
+                                 f'{missing_sample_ids}')
+
+    # NOW remove any syndnas with too few counts from the dataframe,
+    # and log if there were any
+    filtered_reads_per_syndna_per_sample_df = \
+        reads_per_syndna_per_sample_df[
+            ~reads_per_syndna_per_sample_df[SYNDNA_ID_KEY].isin(
+                syndnas_to_drop)]
+    if len(syndnas_to_drop) > 0:
+        log_messages_list.append(f'The following syndnas were dropped '
+                                 f'because they had fewer than '
+                                 f'{min_sample_counts} total reads aligned: '
+                                 f'{syndnas_to_drop}')
+
+    # reformat filtered_reads_per_syndna_per_sample_df into "long form":
+    # columns for syndna id, sample id, and read count
+    working_df = filtered_reads_per_syndna_per_sample_df.melt(
+        id_vars=[SYNDNA_ID_KEY], var_name=SAMPLE_ID_KEY,
+        value_name=SYNDNA_COUNTS_KEY)
+
+    # merge w sample_syndna_weights_and_total_reads_df to bring in the
+    # total reads column
+    working_df = working_df.merge(sample_syndna_weights_and_total_reads_df,
+                                  on=SAMPLE_ID_KEY, how='left')
+
+    # calculate the weight in ng of *each* syndna in each sample
+    working_df = _calc_indiv_syndna_weights(syndna_concs_df, working_df)
+
+    # fit linear regression models for each sample
+    linregress_by_sample_id, fit_msgs_list = \
+        _fit_linear_regression_models(working_df)
+    log_messages_list.extend(fit_msgs_list)
+    linregress_results_dict = _convert_linregressresults_to_dict(
+        linregress_by_sample_id)
+
+    return linregress_results_dict, log_messages_list
+
+
+# TODO: if they sequenced over multiple lanes, would be different prep
+# info files--talk to lab about whether they will ever do this :(
+# this would require merge of multiple preparations
+def fit_linear_regression_models_for_qiita(
+        prep_info_df: pd.DataFrame,
+        reads_per_syndna_per_sample_biom: biom.Table,
+        min_sample_counts: int = DEFAULT_MIN_SAMPLE_COUNTS,
+        syndna_pool_config_fp: Optional[str] = None) -> Dict[str, str]:
+
+    """Fits linear regressions predicting mass from
+# TODO: if they sequenced over multiple lanes, there would be different prep
+#  info files--talk to lab about whether they will ever do this :(
+#  this would require merging multiple preparations
+def fit_linear_regression_models_for_qiita(
+        prep_info_df: pd.DataFrame,
+        reads_per_syndna_per_sample_biom: biom.Table,
+        min_sample_counts: int = DEFAULT_MIN_SAMPLE_COUNTS,
+        syndna_pool_config_fp: Optional[str] = None) -> dict[str, str]:
+
+    """Fits linear regressions predicting mass from counts using Qiita inputs.
+
+    Parameters
+    ----------
+    prep_info_df: pd.DataFrame
+        A Dataframe containing prep info for all samples in the prep,
+        including SAMPLE_ID_KEY, SYNDNA_POOL_NUM_KEY,
+        SYNDNA_POOL_MASS_NG_KEY, and SAMPLE_TOTAL_READS_KEY
+    reads_per_syndna_per_sample_biom: biom.Table
+        Biom table holding read counts aligned to each synDNA in each sample.
+        Note: should already have combined forward and reverse counts.
+    min_sample_counts: int
+        Minimum number of reads that must be aligned to a syndna, summed
+        across all samples, for that syndna to be included in the
+        regressions; syndnas with fewer total reads will be excluded.
+    syndna_pool_config_fp: str, optional
+        Path to the yaml file holding the concentrations of each syndna
+        in the syndna pool used in this experiment. If not provided, will
+        look for the config.yml file in the parent directory of this file.
+
+    Returns
+    -------
+    out_txt_by_out_type : dict of str
+        Dictionary of output strings (ready to be written to files) keyed
+        by the type of output they contain. Currently, the following keys
+        are defined:
+        LIN_REGRESS_RESULT_KEY: yaml of dict[str, dict[str, float] | None]
+        FIT_SYNDNA_MODELS_LOG_KEY: txt log of messages from the fitting
+        process
+    """
+
+    # check that the prep_info_df has the expected columns
+    expected_prep_info_cols = [
+        SAMPLE_ID_KEY, SYNDNA_POOL_NUM_KEY, SYNDNA_POOL_MASS_NG_KEY,
+        SAMPLE_TOTAL_READS_KEY]
+    validate_required_columns_exist(
+        prep_info_df, expected_prep_info_cols,
+        "prep info is missing required column(s)")
+
+    # pull the syndna pool number from the prep info, ensure it is the same
+    # for all samples, and convert it to the pool name
+    syndna_pool_number = prep_info_df[SYNDNA_POOL_NUM_KEY].unique()
+    if len(syndna_pool_number) > 1:
+        raise ValueError(
+            f"Multiple syndna_pool_numbers found in prep info: "
+            f"{syndna_pool_number}")
+    syndna_pool_name = f"pool{syndna_pool_number[0]}"
+
+    # look in the SYNDNA_INDIV_NG_UL_KEY section of the config file to find
+    # the individual syndna concentrations associated with the relevant
+    # syndna pool name and turn the resulting dictionary into a dataframe
+    config_dict = _extract_config_dict(syndna_pool_config_fp)
+    conc_ng_ul_per_indiv_syndna = \
+        config_dict[SYNDNA_INDIV_NG_UL_KEY][syndna_pool_name]
+    syndna_concs_df = pd.DataFrame(
+        conc_ng_ul_per_indiv_syndna.items(),
+        columns=[SYNDNA_ID_KEY, SYNDNA_INDIV_NG_UL_KEY])
+
+    # convert the input biom table to a pd.DataFrame with sparse values,
+    # which should act basically like a dense pd.DataFrame but take up
+    # less memory
+    reads_per_syndna_per_sample_df = \
+        reads_per_syndna_per_sample_biom.to_dataframe(dense=False)
+
+    # fit linear regression models for each sample
+    linregress_results_dict, msg_list = fit_linear_regression_models(
+        syndna_concs_df, prep_info_df, reads_per_syndna_per_sample_df,
+        min_sample_counts)
+
+    out_txt_by_out_type = {
+        LIN_REGRESS_RESULT_KEY: yaml.safe_dump(linregress_results_dict),
+        FIT_SYNDNA_MODELS_LOG_KEY: '\n'.join(msg_list)}
+
+    return out_txt_by_out_type
diff --git a/pysyndna/src/quant_orfs.py b/pysyndna/src/quant_orfs.py
new file mode 100644
index 0000000..29e6606
--- /dev/null
+++ b/pysyndna/src/quant_orfs.py
@@ -0,0 +1,336 @@
+import biom.table
+import pandas
+from pysyndna.src.util import calc_copies_genomic_element_per_g_series, \
+    calc_gs_genomic_element_in_aliquot, \
+    validate_required_columns_exist, \
+    validate_metadata_vs_reads_id_consistency, \
+    validate_metadata_vs_prep_id_consistency, SAMPLE_ID_KEY, \
+    SAMPLE_IN_ALIQUOT_MASS_G_KEY, ELUTE_VOL_UL_KEY, RNA_BASE_G_PER_MOLE, \
+    REQUIRED_SAMPLE_INFO_KEYS
+
+OGU_ORF_ID_KEY = "ogu_orf_id" +OGU_ORF_START_KEY = "ogu_orf_start" +OGU_ORF_END_KEY = "ogu_orf_end" +OGU_ORF_LEN_KEY = "ogu_orf_len" +COPIES_PER_G_OGU_ORF_SSRNA_KEY = "copies_per_g_ogu_orf_ss_rna" +TOTAL_BIOLOGICAL_READS_KEY = "total_biological_reads_r1r2" +SSRNA_CONCENTRATION_NG_UL_KEY = "total_rna_concentration_ng_ul" +SSRNA_FROM_ALIQUOT_MASS_G_KEY = "ssrna_from_aliquot_mass_g" +REQUIRED_RNA_PREP_INFO_KEYS = [SAMPLE_ID_KEY, SSRNA_CONCENTRATION_NG_UL_KEY, + ELUTE_VOL_UL_KEY, TOTAL_BIOLOGICAL_READS_KEY] + + +def _read_ogu_orf_coords_to_df(wol_reannotations_fp: str) -> pandas.DataFrame: + """Read the OGU+ORF coordinates file into a DataFrame. + + Parameters + ---------- + wol_reannotations_fp : str + Filepath to the ORF coordinates file in the wol reannotations format, e.g.: + >G000005825 + 1 816 2168 + 2 2348 3490 + 3 3744 3959 + 4 3971 5086 + 5 5098 5373 + 6 5432 7372 + 7 7399 9966 + + Returns + ------- + ogu_orf_coords_df : pandas.DataFrame + A DataFrame containing columns for OGU_ORF_ID_KEY, OGU_ORF_START_KEY, + and OGU_ORF_END_KEY. + """ + curr_ogu_id, curr_ogu_orf_id = None, None + curr_ogu_orf_start, curr_ogu_orf_end = None, None + ogu_orf_ids, ogu_orf_starts, ogu_orf_ends = [], [], [] + + with open(wol_reannotations_fp, "r") as fh: + for line in fh.readlines(): + line = line.strip() + if line.startswith(">G"): + curr_ogu_id = line.replace(">", "") + else: + line_pieces = line.split("\t") + curr_orf_id = line_pieces[0] + curr_ogu_orf_start = int(line_pieces[1]) + curr_ogu_orf_end = int(line_pieces[2]) + curr_ogu_orf_id = curr_ogu_id + "_" + curr_orf_id + ogu_orf_ids.append(curr_ogu_orf_id) + ogu_orf_starts.append(curr_ogu_orf_start) + ogu_orf_ends.append(curr_ogu_orf_end) + # endif what to do with this line + # next line + + ogu_orf_coords_dict = { + OGU_ORF_ID_KEY: ogu_orf_ids, + OGU_ORF_START_KEY: ogu_orf_starts, + OGU_ORF_END_KEY: ogu_orf_ends + } + coords_df = pandas.DataFrame(ogu_orf_coords_dict) + return coords_df + + +def _calc_ogu_orf_copies_per_g_from_coords( + ogu_orf_coords_df: pandas.DataFrame) -> pandas.DataFrame: + """Calculate the copies per gram of each OGU+ORF ssRNA. + + Note that this not (necessarily) the same as the copies per gram of the + ssRNA *transcript* containing each OGU+ORF, since the latter might also + contain other OGU+ORFs and thus be heavier. + Parameters + ---------- + ogu_orf_coords_df : pandas.DataFrame + A DataFrame with columns for OGU_ORF_ID_KEY, OGU_ORF_START_KEY, and + OGU_ORF_END_KEY. + + Returns + ------- + ogu_orf_copies_per_g_df: pandas.DataFrame + A DataFrame with columns for OGU_ORF_ID_KEY and + COPIES_PER_G_OGU_ORF_SSRNA_KEY. 
+ """ + + output_df = ogu_orf_coords_df.copy() + + # calculate the length of each OGU+ORF ssRNA: + # abs(ogu_orf_end - ogu_orf_start) + 1 + # abs because sometimes the start is greater than the end, + # +1 because the length is inclusive + output_df[OGU_ORF_LEN_KEY] = \ + output_df[OGU_ORF_END_KEY] - \ + output_df[OGU_ORF_START_KEY] + output_df[OGU_ORF_LEN_KEY] = \ + output_df[OGU_ORF_LEN_KEY].abs() + output_df[OGU_ORF_LEN_KEY] = \ + output_df[OGU_ORF_LEN_KEY] + 1 + + # calculate the copies per gram of each OGU+ORF ssRNA + ogu_orf_copies_per_g_series = calc_copies_genomic_element_per_g_series( + output_df[OGU_ORF_LEN_KEY], RNA_BASE_G_PER_MOLE) + + output_df[COPIES_PER_G_OGU_ORF_SSRNA_KEY] = \ + ogu_orf_copies_per_g_series + output_df.index = output_df[OGU_ORF_ID_KEY] + + return output_df + + +def _calc_copies_of_ogu_orf_ssrna_per_g_sample( + quant_params_per_sample_df: pandas.DataFrame, + reads_per_ogu_orf_per_sample_biom: biom.Table, + ogu_orf_copies_per_g_ssrna_df: pandas.DataFrame) -> biom.Table: + + """Calculate the copies of each OGU+ORF ssRNA per gram of sample. + + Parameters + ---------- + quant_params_per_sample_df : pandas.DataFrame + A DataFrame containing at least SAMPLE_ID_KEY, + SAMPLE_IN_ALIQUOT_MASS_G_KEY, SSRNA_CONCENTRATION_NG_UL_KEY, + ELUTE_VOL_UL_KEY, and TOTAL_BIOLOGICAL_READS_KEY. + reads_per_ogu_orf_per_sample_biom : biom.Table + A biom.Table with the number of reads per OGU+ORF per sample, such + as that output by woltka. + ogu_orf_copies_per_g_ssrna_df: pandas.DataFrame + A DataFrame with columns for OGU_ORF_ID_KEY and + COPIES_PER_G_OGU_ORF_SSRNA_KEY. + + Returns + ------- + copies_of_ogu_orf_ssrna_per_g_sample : biom.Table + A biom.Table with the copies of each OGU+ORF ssRNA per gram of sample. + """ + + # turn REQUIRED_SAMPLE_INFO_KEYS and REQUIRED_RNA_PREP_INFO_KEYS into sets + # and combine them into a single set, then turn it back into a list + required_cols_list = list( + set(REQUIRED_SAMPLE_INFO_KEYS) | set(REQUIRED_RNA_PREP_INFO_KEYS)) + validate_required_columns_exist( + quant_params_per_sample_df, required_cols_list, + "parameters dataframe is missing required column(s)") + + # validate that the sample ids in the quant_params_per_sample_df match the + # sample ids in the reads_per_ogu_orf_per_sample_biom. Ignore sample ids + # in the quant_params_per_sample_df that are not in the biom table; those + # could just be samples that failed sequencing/etc. + _ = validate_metadata_vs_reads_id_consistency( + quant_params_per_sample_df, reads_per_ogu_orf_per_sample_biom) + + # Set index on quant_params_per_sample_df to be SAMPLE_ID_KEY for easy + # lookup of values by sample id during biom lambda functions + quant_params_per_sample_df.index = \ + quant_params_per_sample_df[SAMPLE_ID_KEY] + + # Calculate the grams of total ssRNA from each sample that are in the elute + # after extraction + g_total_ssrna_per_sample_df = calc_gs_genomic_element_in_aliquot( + quant_params_per_sample_df, SSRNA_CONCENTRATION_NG_UL_KEY, + SSRNA_FROM_ALIQUOT_MASS_G_KEY) + + # step 1 of OGU+ORF quantitation is upstream of this function: + # Run woltka to get the reads_per_ogu_orf_per_sample_biom. + # Calculations below are done directly on biom tables, since they are + # expected to be very large and very sparse. + + # step 2: + # Calculate fraction of total biological reads per OGU+ORF per sample: + # Divide every value in reads_per_ogu_orf_per_sample_biom by the + # value of the TOTAL_BIOLOGICAL_READS_KEY for that value's OGU_ORF_ID_KEY + # in quant_params_per_sample_df. 
+ # See https://biom-format.org/documentation/generated/biom.table.Table.transform.html + # for details of how to write and use a function for biom.transform(). + def get_fraction_of_sample_reads(data, id_, _): + # df.at[] is fast to get a single value by a row/column label pair + return data / quant_params_per_sample_df.at[id_, TOTAL_BIOLOGICAL_READS_KEY] + fraction_of_sample_reads_per_sample_biom = \ + reads_per_ogu_orf_per_sample_biom.transform( + f=get_fraction_of_sample_reads, axis='sample', inplace=False) + + # step 3: + # Calculate grams of ssRNA per OGU+ORF per sample: + # Multiply the fraction of total biological reads per OGU+ORF per sample + # by the total grams of ssRNA from each sample that are in the elute after + # extraction. + def get_ogu_orf_ssrna_g_in_sample(data, id_, _): + return data * g_total_ssrna_per_sample_df.at[id_, SSRNA_FROM_ALIQUOT_MASS_G_KEY] + g_ssrna_per_ogu_orf_per_sample_biom = \ + fraction_of_sample_reads_per_sample_biom.transform( + f=get_ogu_orf_ssrna_g_in_sample, axis='sample', inplace=False) + + # step 4: + # Calculate copies per OGU+ORF per sample + # Multiply the grams of ssRNA of each OGU+ORF per sample by the copies per + # gram of each OGU+ORF ssRNA. + # This gives you the copies of each OGU+ORF ssRNA present in the whole + # extracted sample. + def get_copies_per_ogu_orf_per_sample(data, id_, _): + return data * ogu_orf_copies_per_g_ssrna_df.at[id_, COPIES_PER_G_OGU_ORF_SSRNA_KEY] + copies_per_ogu_orf_per_sample_biom = \ + g_ssrna_per_ogu_orf_per_sample_biom.transform( + f=get_copies_per_ogu_orf_per_sample, axis='observation', inplace=False) + + # Step 5: + # Calculate the copies of each OGU+ORF ssRNA per gram of sample material + # Divide the copies per OGU+ORF in each extracted sample by the grams of + # sample material put into the extraction for the relevant sample + def get_copies_per_g_sample(data, id_, _): + return data / quant_params_per_sample_df.at[id_, SAMPLE_IN_ALIQUOT_MASS_G_KEY] + copies_of_ogu_orf_ssrna_per_g_sample_biom = \ + copies_per_ogu_orf_per_sample_biom.transform( + f=get_copies_per_g_sample, axis='sample', inplace=False) + + return copies_of_ogu_orf_ssrna_per_g_sample_biom + + +def calc_copies_of_ogu_orf_ssrna_per_g_sample( + quant_params_per_sample_df: pandas.DataFrame, + reads_per_ogu_orf_per_sample_biom: biom.Table, + ogu_orf_coords_fp: str) -> biom.Table: + """Calculate the copies of each OGU+ORF ssRNA per gram of sample. + + Parameters + ---------- + quant_params_per_sample_df : pandas.DataFrame + A DataFrame containing at least SAMPLE_ID_KEY, + SAMPLE_IN_ALIQUOT_MASS_G_KEY, SSRNA_CONCENTRATION_NG_UL_KEY, + ELUTE_VOL_UL_KEY, and TOTAL_BIOLOGICAL_READS_KEY. + reads_per_ogu_orf_per_sample_biom : biom.Table + A biom.Table with the number of reads per OGU+ORF per sample, such + as that output by woltka. + ogu_orf_coords_fp : str + Filepath to the OGU+ORF coordinates file, such as the coords.txt + file used by woltka, in the format shown below: + >G000005825 + 1 816 2168 + 2 2348 3490 + 3 3744 3959 + 4 3971 5086 + 5 5098 5373 + 6 5432 7372 + 7 7399 9966 + + + Returns + ------- + copies_of_ogu_orf_ssrna_per_g_sample : biom.Table + A biom.Table with the copies of each OGU+ORF ssRNA per gram of sample. 
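+
+    Example usage (editorial sketch; the file paths and loading steps here
+    are hypothetical, not from the original source):
+    >>> params_df = pandas.read_csv("quant_params.tsv", sep="\t")
+    >>> reads_biom = biom.load_table("ogu_orf_read_counts.biom")
+    >>> result_biom = calc_copies_of_ogu_orf_ssrna_per_g_sample(
+    ...     params_df, reads_biom, "coords.txt")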
+ """ + + # Calculate the copies per gram of each OGU+ORF ssRNA + ogu_orf_coords_df = _read_ogu_orf_coords_to_df(ogu_orf_coords_fp) + ogu_orf_copies_per_g_ssrna_df = _calc_ogu_orf_copies_per_g_from_coords( + ogu_orf_coords_df) + + copies_of_ogu_orf_ssrna_per_g_sample_biom = \ + _calc_copies_of_ogu_orf_ssrna_per_g_sample( + quant_params_per_sample_df, reads_per_ogu_orf_per_sample_biom, + ogu_orf_copies_per_g_ssrna_df) + + return copies_of_ogu_orf_ssrna_per_g_sample_biom + + +def calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita( + sample_info_df: pandas.DataFrame, + prep_info_df: pandas.DataFrame, + reads_per_ogu_orf_per_sample_biom: biom.Table, + ogu_orf_coords_fp: str) -> biom.Table: + + """Calculate the copies of each OGU+ORF ssRNA per gram of sample for Qiita. + + Parameters + ---------- + sample_info_df : pandas.DataFrame + A DataFrame containing sample info for all samples in the prep, + including SAMPLE_ID_KEY and SAMPLE_IN_ALIQUOT_MASS_G_KEY + prep_info_df : pandas.DataFrame + A DataFrame containing prep info for all samples in the prep, + including SAMPLE_ID_KEY, SSRNA_CONCENTRATION_NG_UL_KEY, + ELUTE_VOL_UL_KEY, and TOTAL_BIOLOGICAL_READS_KEY. + reads_per_ogu_orf_per_sample_biom : biom.Table + A biom.Table with the number of reads per OGU+ORF per sample, such + as that output by woltka. + ogu_orf_coords_fp : str + Filepath to the OGU+ORF coordinates file, such as the coords.txt + file used by woltka, in the format shown below: + >G000005825 + 1 816 2168 + 2 2348 3490 + 3 3744 3959 + 4 3971 5086 + 5 5098 5373 + 6 5432 7372 + 7 7399 9966 + + + Returns + ------- + copies_of_ogu_orf_ssrna_per_g_sample : biom.Table + A biom.Table with the copies of each OGU+ORF ssRNA per gram of sample. + """ + + # check if the inputs all have the required columns + validate_required_columns_exist( + sample_info_df, REQUIRED_SAMPLE_INFO_KEYS, + "sample info is missing required column(s)") + + validate_required_columns_exist( + prep_info_df, REQUIRED_RNA_PREP_INFO_KEYS, + "prep info is missing required column(s)") + + # validate that the sample ids in the sample_info_df match the sample ids + # in the prep_info_df. Ignore sample ids in sample_info_df that are not in + # the prep_info_df; these could just not be included in this prep. 
+    _ = validate_metadata_vs_prep_id_consistency(
+        sample_info_df, prep_info_df)
+
+    quant_params_per_sample_df = prep_info_df.merge(
+        sample_info_df, on=SAMPLE_ID_KEY, how="inner")
+
+    copies_of_ogu_orf_ssrna_per_g_sample_biom = \
+        calc_copies_of_ogu_orf_ssrna_per_g_sample(
+            quant_params_per_sample_df, reads_per_ogu_orf_per_sample_biom,
+            ogu_orf_coords_fp)
+
+    return copies_of_ogu_orf_ssrna_per_g_sample_biom
diff --git a/pysyndna/src/util.py b/pysyndna/src/util.py
new file mode 100644
index 0000000..786862f
--- /dev/null
+++ b/pysyndna/src/util.py
@@ -0,0 +1,274 @@
+from typing import Optional, Union, List
+
+import biom
+import pandas as pd
+
+DNA_BASEPAIR_G_PER_MOLE = 650
+RNA_BASE_G_PER_MOLE = 340
+NANOGRAMS_PER_GRAM = 1e9
+
+# NB: sample_name instead of sample_id because that's what qiita uses
+SAMPLE_ID_KEY = 'sample_name'
+SAMPLE_IN_ALIQUOT_MASS_G_KEY = 'calc_mass_sample_aliquot_input_g'
+ELUTE_VOL_UL_KEY = 'vol_extracted_elution_ul'
+REQUIRED_SAMPLE_INFO_KEYS = [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY]
+
+
+def _validate_sample_id_consistency(
+        sample_ids_in_metadata: set,
+        sample_ids_in_data: set,
+        metadata_name: str,
+        data_set_name: str) \
+        -> Union[List[str], None]:
+    """
+    Checks that the sample ids in the metadata and data are consistent.
+
+    Parameters
+    ----------
+    sample_ids_in_metadata: set
+        A set of the sample ids in the metadata
+    sample_ids_in_data: set
+        A set of the sample ids in the data
+    metadata_name: str
+        A string identifying the metadata being checked, for use in error
+        messages.
+    data_set_name: str
+        A string identifying the data set being checked, for use in error
+        messages.
+
+    Raises
+    ------
+    ValueError
+        If there are sample ids in the data that aren't in the metadata
+
+    Returns
+    -------
+    missing_sample_ids : List[str] | None
+        List of sample ids that are in the metadata but not in the data;
+        None if all sample ids in the metadata were in the data.
+    """
+
+    # if there are sample ids in the data that are not in the metadata, raise
+    # an error, since we don't know how to process that
+    data_only_samples = sample_ids_in_data - sample_ids_in_metadata
+    if len(data_only_samples) > 0:
+        raise ValueError(
+            f"Found sample ids in {data_set_name} that were "
+            f"not in {metadata_name}: {data_only_samples}")
+
+    # check if there are sample ids in the metadata that are not in the data
+    # and, if so, capture a list of them. Sometimes a sample just fails
+    # sequencing, and that shouldn't preclude processing the others that did
+    # work, but we want to know about it.
+    missing_sample_ids_set = sample_ids_in_metadata - sample_ids_in_data
+
+    if len(missing_sample_ids_set) > 0:
+        missing_sample_ids = list(missing_sample_ids_set)
+    else:
+        missing_sample_ids = None
+
+    return missing_sample_ids
+
+
+def validate_required_columns_exist(
+        input_df: pd.DataFrame,
+        required_cols_list: List[str],
+        error_msg: str):
+
+    """Checks that the input dataframe has the required columns.
+
+    Parameters
+    ----------
+    input_df: pd.DataFrame
+        A Dataframe to be checked.
+    required_cols_list: list[str]
+        List of column names that must be present in the dataframe.
+    error_msg: str
+        Error message to be raised if any of the required columns are
+        missing.
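+
+    Example (editorial sketch, not from the original source):
+    >>> df = pd.DataFrame({'sample_name': ['s1']})
+    >>> validate_required_columns_exist(
+    ...     df, ['sample_name', 'mass_g'], "info is missing")
+    Traceback (most recent call last):
+    ...
+    ValueError: info is missing: ['mass_g']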
+ """ + + missing_cols = set(required_cols_list) - set(input_df.columns) + if len(missing_cols) > 0: + missing_cols = sorted(missing_cols) + raise ValueError( + f"{error_msg}: {missing_cols}") + + +def validate_metadata_vs_reads_id_consistency( + metadata_df: pd.DataFrame, + reads_df: Union[pd.DataFrame, biom.Table]) \ + -> Union[List[str], None]: + """ + Checks that the sample ids in the sample metadata and data are consistent. + + Parameters + ---------- + metadata_df: pd.DataFrame + A Dataframe containing at least SAMPLE_ID_KEY column + reads_df: pd.DataFrame | biom.Table + Either a Dataframe with a column for each SAMPLE_ID_KEY or a biom.Table + with a column for each SAMPLE_ID_KEY + + Raises + ------ + ValueError + If there are sample ids in the data that aren't in the metadata df + + Returns + ------- + missing_sample_ids : List[str] | None + List of sample ids that are in the sample info but not in the + data. None if all sample ids in the experiment info were in the data. + """ + + sample_ids_in_metadata = set(metadata_df[SAMPLE_ID_KEY]) + if isinstance(reads_df, biom.Table): + sample_ids_in_reads = set(reads_df.ids(axis='sample')) + else: + sample_ids_in_reads = set(reads_df.columns) + missing_reads_ids = _validate_sample_id_consistency( + sample_ids_in_metadata, sample_ids_in_reads, "sample info", + "reads data") + + return missing_reads_ids + + +def validate_metadata_vs_prep_id_consistency( + metadata_df: pd.DataFrame, + prep_df: pd.DataFrame) \ + -> Union[List[str], None]: + """ + Checks that sample ids in the sample metadata and prep info are consistent. + + Parameters + ---------- + metadata_df: pd.DataFrame + A Dataframe of sample metadata containing at least SAMPLE_ID_KEY column + prep_df: pd.DataFrame + A Dataframe of prep info with a column for SAMPLE_ID_KEY + + Raises + ------ + ValueError + If there are sample ids in prep info that aren't in sample metadata + + Returns + ------- + missing_sample_ids : List[str] | None + List of sample ids that are in the sample metadata but not in the + prep info. None if all sample ids in the sample metadata were in the + prep info. + """ + + sample_ids_in_metadata = set(metadata_df[SAMPLE_ID_KEY]) + sample_ids_in_prep = set(prep_df[SAMPLE_ID_KEY]) + missing_prep_ids = _validate_sample_id_consistency( + sample_ids_in_metadata, sample_ids_in_prep, + "sample info", "prep info") + return missing_prep_ids + + +def calc_copies_genomic_element_per_g_series( + genomic_elements_lengths_series: pd.Series, + genomic_element_unit_avg_g_per_mole: float, + is_test: Optional[bool] = False) -> pd.Series: + + """Calculates copies of genomic unit per gram of genomic element's unit. + + For example, get copies of OGU genomes per gram of double-stranded OGU gDNA + or copies of OGU+ORF RNAs per gram of single-stranded OGU+ORF RNA. + + Parameters + ---------- + genomic_elements_lengths_series: pd.Series + A Series with index identifying each genomic element, containing length + of each element in genomic element units. For example, length in DNA + basepairs for OGUs or length in (single-stranded) RNA bases for + OGU+ORF RNAs. + genomic_element_unit_avg_g_per_mole: float + Average mass in grams per mole of a genomic element unit. For example, + 650 g/mole for a DNA basepair or 340 g/mole for an RNA base. + is_test: Optional[bool] + Default is False. 
+        If True, the function will use the less-precise value of Avogadro's
+        number (6.022*(10^23)) used in cell [16] of the
+        https://github.com/lzaramela/SynDNA/blob/main/SynDNA_saliva_samples_analysis.ipynb
+        notebook, rather than the more precise value (6.02214076*(10^23))
+        used if False. Set this to True in testing ONLY.
+
+    Returns
+    -------
+    copies_per_g_series : pd.Series
+        A Series with the same index as genomic_elements_lengths_series,
+        holding the number of copies of each genomic element per gram of
+        genomic element units.
+
+    Terminology:
+    genomic_element: a distinct element measured on a genome, such as an OGU
+        (i.e., the whole genome) or an ORF on an OGU (called "OGU+ORF")
+    genomic_element_unit: the units in which the genomic element is measured;
+        in the case of OGUs, this is DNA basepairs, while in the case of
+        OGU+ORFs, the units are RNA bases (i.e., single-stranded).
+
+    This calculates the total number of copies of genomic element X per gram
+    of genomic element units by the equation:
+
+          Avogadro's number in (copies of genomic element X)/mole
+        = -------------------------------------------------------
+          (length of genomic element X in genomic element units) *
+          (average g/mole per genomic element unit)
+
+    Avogadro's number is 6.02214076 × 10^23, and is the number of
+    molecules--such as OGU genomes or OGU+ORF RNAs--in a mole of the genomic
+    element.
+    """
+
+    # Avogadro's number is famously a constant, so it seems odd to assign it
+    # to a variable, but doing so lets the tests swap in the truncated value
+    # below.
+    avogadros_num = 6.02214076e23
+    # The truncated value exists so we can test against Livia's results,
+    # which use a truncated version of the constant. This should NOT be done
+    # in production. In testing, it makes a difference of, e.g., about 10
+    # cells out of 25K for the first OGU in the first sample in Livia's
+    # dataset.
+    if is_test:
+        avogadros_num = 6.022e23
+
+    # TODO: do we have to worry about integer overflow here?
+    #  Dan H. said, "if you use ints, the length * 650 * 10^9
+    #  can overflow integers with very long genomes". HOWEVER,
+    #  the internet says that in Python *3*, "[o]nly floats have a hard
+    #  limit in python. Integers are implemented as 'long' integer
+    #  objects of arbitrary size" (https://stackoverflow.com/a/52151786).
+    #  HOWEVER HOWEVER, *numpy* integer types are fixed width, and
+    #  "Some pandas and numpy functions, such as sum on arrays or
+    #  Series return an np.int64 so this might be the reason you are
+    #  seeing int overflows in Python3."
+    #  (https://stackoverflow.com/a/58640340)
+    #  What to do?
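+    #  Editorial note (a suggestion, not from the original source): one
+    #  defensive option is to cast the lengths to float64 before
+    #  multiplying, e.g.
+    #      genomic_elements_lengths_series = \
+    #          genomic_elements_lengths_series.astype('float64')
+    #  since float64 easily holds these magnitudes.
+    # Editorial worked example (hypothetical numbers): a 1,000-bp OGU at
+    # 650 g/mole per basepair gives 6.02214076e23 / (1000 * 650)
+    # ≈ 9.265e17 genome copies per gram of double-stranded gDNA.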
+
+    denominator_series = \
+        genomic_elements_lengths_series * genomic_element_unit_avg_g_per_mole
+
+    copies_of_genomic_element_per_g_of_genomic_element_unit = \
+        avogadros_num / denominator_series
+
+    return copies_of_genomic_element_per_g_of_genomic_element_unit
+
+
+def calc_gs_genomic_element_in_aliquot(
+        genomic_elements_df: pd.DataFrame,
+        genomic_element_conc_key: str,
+        genomic_element_mass_key: str) -> pd.DataFrame:
+
+    """Calculates the grams of a genomic element in each sample's elute.
+
+    Parameters
+    ----------
+    genomic_elements_df: pd.DataFrame
+        A Dataframe containing at least genomic_element_conc_key and
+        ELUTE_VOL_UL_KEY columns.
+    genomic_element_conc_key: str
+        Name of the column holding the concentration of the genomic element
+        in ng/uL.
+    genomic_element_mass_key: str
+        Name of the column to add, holding the calculated mass in g.
+
+    Returns
+    -------
+    working_df : pd.DataFrame
+        A copy of genomic_elements_df with genomic_element_mass_key added.
+    """
+
+    working_df = genomic_elements_df.copy()
+
+    # get the total grams of the genomic element that are in the elute after
+    # extraction; this is sample-specific:
+    # concentration of genomic element after extraction in ng/uL times
+    # volume of elute from the extraction in uL, divided by 10^9 ng/g
+    # (which is the same as multiplied by 1/10^9 g/ng)
+    working_df[genomic_element_mass_key] = \
+        working_df[genomic_element_conc_key] * \
+        working_df[ELUTE_VOL_UL_KEY] / NANOGRAMS_PER_GRAM
+
+    return working_df
diff --git a/pysyndna/tests/data/coords.txt b/pysyndna/tests/data/coords.txt
new file mode 100644
index 0000000..2dc6850
--- /dev/null
+++ b/pysyndna/tests/data/coords.txt
@@ -0,0 +1,12 @@
+>G000005825
+1	816	2168
+2	2348	3490
+3	3744	3959
+4	3971	5086
+5	5098	5373
+>G900163845
+3247	3392209	3390413
+3248	3393051	3392206
+3249	3393938	3393048
+3250	3394702	3393935
+3251	3395077	3395721
\ No newline at end of file
diff --git a/pysyndna/tests/test_calc_cell_counts.py b/pysyndna/tests/test_calc_cell_counts.py
index ab0468b..18edf78 100644
--- a/pysyndna/tests/test_calc_cell_counts.py
+++ b/pysyndna/tests/test_calc_cell_counts.py
@@ -9,15 +9,15 @@ from unittest import TestCase
 from pysyndna import calc_ogu_cell_counts_biom, \
     calc_ogu_cell_counts_per_g_of_sample_for_qiita
-from pysyndna.src.fit_syndna_models import SAMPLE_ID_KEY
-from pysyndna.src.calc_cell_counts import OGU_ID_KEY, OGU_READ_COUNT_KEY, \
+from pysyndna.src.calc_cell_counts import SAMPLE_ID_KEY, ELUTE_VOL_UL_KEY, \
+    OGU_ID_KEY, OGU_READ_COUNT_KEY, \
     OGU_LEN_IN_BP_KEY, OGU_GDNA_MASS_NG_KEY, \
     SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY, OGU_GENOMES_PER_G_OF_GDNA_KEY, \
     OGU_CELLS_PER_G_OF_GDNA_KEY, SYNDNA_POOL_MASS_NG_KEY, \
     GDNA_CONCENTRATION_NG_UL_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, \
-    ELUTE_VOL_UL_KEY, GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY, \
+    GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY, \
     OGU_CELLS_PER_G_OF_SAMPLE_KEY, TOTAL_OGU_READS_KEY, OGU_COVERAGE_KEY, \
-    CELL_COUNT_RESULT_KEY, CELL_COUNT_LOG_KEY, \
+    CELL_COUNT_RESULT_KEY, CELL_COUNT_LOG_KEY, SAMPLE_TOTAL_READS_KEY, \
     _calc_long_format_ogu_cell_counts_df, \
     _prepare_cell_counts_calc_df, \
     _calc_ogu_cell_counts_df_for_sample, \
@@ -64,12 +64,15 @@ class TestCalcCellCounts(TestCase):
         SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY: [5, 4.76],
         GDNA_CONCENTRATION_NG_UL_KEY: [2, 1.4],
         ELUTE_VOL_UL_KEY: [100, 100],
-        SYNDNA_POOL_MASS_NG_KEY: [0.25, 0.238]
+        SYNDNA_POOL_MASS_NG_KEY: [0.25, 0.238],
     }
 
-    # Values from "absolute_quant_example.xlsx"
-    mass_ratio_dict = {
+    # Values from "absolute_quant_example.xlsx" EXCEPT for the
+    # SAMPLE_TOTAL_READS_KEY values, which come from summing
+    # the OGU_READ_COUNT_KEY values for each sample
+    mass_and_totals_dict = {
         SAMPLE_ID_KEY: ["example1", "example2"],
+        SAMPLE_TOTAL_READS_KEY: [472140, 611913],
         SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY: [5, 4.76],
         GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY: [7.1867431342E-06,
                                              4.7470988923E-06]
@@ -245,7 +248,7 @@ class TestCalcCellCounts(TestCase):
             "Escherichia coli", "Tyzzerella nexilis", "Prevotella sp. 
oral taxon 299", "Streptococcus mitis", "Leptolyngbya valderiana", - #"Neisseria subflava", + # "Neisseria subflava", "Neisseria flavescens", "Fusobacterium periodonticum", "Streptococcus pneumoniae", @@ -261,7 +264,7 @@ class TestCalcCellCounts(TestCase): 1975, # 0, 22303, 197830, 14478, - #12, + # 12, 14609], # These count values are the same as those in # self.example1_ogu_full_outputs_full_avogadro_dict @@ -323,14 +326,16 @@ class TestCalcCellCounts(TestCase): # self.example2_ogu_filtered_inputs_outputs_full_avogadro_dict. Note # that with reordering, the 4th sub-array is the one for L. gasseri, # the 5th is for L. valderiana, and the 9th is for R. albus. + # The two 0 values are for N. subflava and H. influenzae, which were + # removed from example2 data due to low coverage. OGU_CELLS_PER_G_OF_GDNA_KEY: [ [21897704979729.094, 7101240813289.261], [7100063146106.998, 40527863244164.32], - [5718752608946.0205, np.nan], + [5718752608946.0205, 0], [52695192015949.67, 17086455403978.045], [11223075218306.252, 3613767901730.258], [9289882608698.639, 3004973286163.8184], - [10879422748260.775, np.nan], + [10879422748260.775, 0], [12674159207435.06, 4102264162505.8833], [27710822536547.69, 8987677125515.266], [11582576292095.531, 3747369928484.789], @@ -392,9 +397,6 @@ def _generate_sample_names_list(self, use_filtered_ex2=True): output.extend(curr_names_list) return output - def setUp(self): - self.test_data_dir = os.path.join(os.path.dirname(__file__), 'data') - # The built-in self.assertEqual works fine to compare biom tables that # don't have NaNs, but it doesn't work for tables that do have NaNs # because NaN != NaN so two tables that contain NaNs are by definition @@ -423,18 +425,23 @@ def assert_biom_tables_equal(self, expected_out_biom, output_biom, output_biom.matrix_data.data[obs_an], decimal=decimal_precision) + def setUp(self): + self.test_data_dir = os.path.join(os.path.dirname(__file__), 'data') + def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita(self): # example4 is the same as example2 except that the elute volume is 70; # see "absolute_quant_example.xlsx" for details. example4_elute_vol = 70 sample_ids = ["example1", "example4"] - sample_info_dict = {k: self.sample_and_prep_input_dict[k].copy() for k in - [SAMPLE_IN_ALIQUOT_MASS_G_KEY]} + sample_info_dict = {k: self.sample_and_prep_input_dict[k].copy() for + k in [SAMPLE_IN_ALIQUOT_MASS_G_KEY]} sample_info_dict[SAMPLE_ID_KEY] = sample_ids prep_info_dict = {k: self.sample_and_prep_input_dict[k].copy() for k in [GDNA_CONCENTRATION_NG_UL_KEY, - ELUTE_VOL_UL_KEY, SYNDNA_POOL_MASS_NG_KEY]} + ELUTE_VOL_UL_KEY, SYNDNA_POOL_MASS_NG_KEY]} + prep_info_dict[SAMPLE_TOTAL_READS_KEY] = \ + self.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] prep_info_dict[SAMPLE_ID_KEY] = sample_ids prep_info_dict[ELUTE_VOL_UL_KEY][1] = example4_elute_vol @@ -445,7 +452,7 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita(self): # those in self.example1_ogu_full_outputs_full_avogadro_dict because # the gdna-to-sample mass ratio calculated internally during this # soup-to-nuts function has more digits past the decimal than does the - # example1 entry in the manually-populated self.mass_ratio_dict. + # example1 entry in the manually-populated self.mass_and_totals_dict. 
# Since we are multiplying/dividing by large numbers like e.g., 10^9 # (to change ng to g), this ends up making a slight difference in the # end product: for example, for L.gasseri, @@ -463,11 +470,11 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita(self): ogu_cell_counts_per_g_sample = np.array([ [157373183.3914873, 23597204.3149076], [51026330.8697321, 134672840.2210325], - [41099206.6945521, np.nan], + [41099206.6945521, 0], [378706815.3787082, 56777764.5887874], [80657360.0375914, 12008439.3369959], [66764001.1050239, 9985433.5965833], - [78187617.9691203, np.nan], + [78187617.9691203, 0], [91085928.0975326, 13631697.3528372], [199150566.7379318, 29865774.0278729], [83241001.9519951, 12452394.7533948], @@ -546,6 +553,8 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_prep_err(self): # missing required columns prep_info_dict = {k: self.sample_and_prep_input_dict[k] for k in [SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY]} + prep_info_dict[SAMPLE_TOTAL_READS_KEY] = \ + self.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] counts_vals = self._make_combined_counts_np_array() @@ -569,11 +578,50 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_prep_err(self): sample_info_df, prep_info_df, models_fp, counts_biom, lengths_fp, read_len, min_coverage, min_rsquared) + def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_ids_err(self): + sample_info_dict = {k: self.sample_and_prep_input_dict[k] for k in + [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY]} + + prep_info_dict = {k: self.sample_and_prep_input_dict[k] for k in + [SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, + ELUTE_VOL_UL_KEY, SYNDNA_POOL_MASS_NG_KEY]} + prep_info_dict[SAMPLE_TOTAL_READS_KEY] = \ + self.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] + + counts_vals = self._make_combined_counts_np_array() + + # remove one of the sample ids from the sample info; this will cause + # an error (whereas the reverse--sample id in sample info but not in + # prep info--will NOT) + sample_info_df = pd.DataFrame(sample_info_dict) + sample_info_df.drop(index=0, axis=0, inplace=True) + + prep_info_df = pd.DataFrame(prep_info_dict) + counts_biom = biom.table.Table( + counts_vals, + self.ogu_lengths_dict[OGU_ID_KEY], + prep_info_dict[SAMPLE_ID_KEY]) + models_fp = os.path.join(self.test_data_dir, "models.yml") + lengths_fp = os.path.join(self.test_data_dir, "ogu_lengths.tsv") + + read_len = 150 + min_coverage = 1 + min_rsquared = 0.8 + + err_msg = (r"Found sample ids in prep info that were not in" + r" sample info: \{'example1'\}") + with self.assertRaisesRegex(ValueError, err_msg): + calc_ogu_cell_counts_per_g_of_sample_for_qiita( + sample_info_df, prep_info_df, models_fp, counts_biom, + lengths_fp, read_len, min_coverage, min_rsquared) + def test_calc_ogu_cell_counts_biom(self): params_dict = {k: self.sample_and_prep_input_dict[k] for k in [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, GDNA_CONCENTRATION_NG_UL_KEY, ELUTE_VOL_UL_KEY, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY]} + params_dict[SAMPLE_TOTAL_READS_KEY] = \ + self.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] counts_vals = self._make_combined_counts_np_array() @@ -616,6 +664,69 @@ def test_calc_ogu_cell_counts_biom(self): "'example2;Haemophilus influenzae']"], output_msgs) + def test_calc_ogu_cell_counts_biom_w_col_err(self): + # missing SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY col + params_dict = {k: self.sample_and_prep_input_dict[k] for k in + [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, + GDNA_CONCENTRATION_NG_UL_KEY, ELUTE_VOL_UL_KEY]} + params_dict[SAMPLE_TOTAL_READS_KEY] 
= \ + self.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] + + counts_vals = self._make_combined_counts_np_array() + + params_df = pd.DataFrame(params_dict) + counts_biom = biom.table.Table( + counts_vals, + self.ogu_lengths_dict[OGU_ID_KEY], + params_dict[SAMPLE_ID_KEY]) + lengths_df = pd.DataFrame(self.ogu_lengths_dict) + + read_len = 150 + min_coverage = 1 + min_rsquared = 0.8 + output_metric = OGU_CELLS_PER_G_OF_GDNA_KEY + + err_msg = r"sample info is missing required column\(s\): " \ + r"\['sequenced_sample_gdna_mass_ng'\]" + with self.assertRaisesRegex(ValueError, err_msg): + calc_ogu_cell_counts_biom( + params_df, self.linregresses_dict, counts_biom, lengths_df, + read_len, min_coverage, min_rsquared, output_metric) + + def test_calc_ogu_cell_counts_biom_w_id_err(self): + params_dict = {k: self.sample_and_prep_input_dict[k] for k in + [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, + GDNA_CONCENTRATION_NG_UL_KEY, ELUTE_VOL_UL_KEY, + SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY]} + params_dict[SAMPLE_TOTAL_READS_KEY] = \ + self.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] + + counts_vals = self._make_combined_counts_np_array() + + # remove one of the sample ids from the params info; this will cause + # an error (whereas the reverse--sample id in params info but not in + # reads data--will NOT) + params_df = pd.DataFrame(params_dict) + params_df.drop(index=0, axis=0, inplace=True) + + counts_biom = biom.table.Table( + counts_vals, + self.ogu_lengths_dict[OGU_ID_KEY], + params_dict[SAMPLE_ID_KEY]) + lengths_df = pd.DataFrame(self.ogu_lengths_dict) + + read_len = 150 + min_coverage = 1 + min_rsquared = 0.8 + output_metric = OGU_CELLS_PER_G_OF_GDNA_KEY + + err_msg = (r"Found sample ids in reads data that were not in " + r"sample info: \{'example1'\}") + with self.assertRaisesRegex(ValueError, err_msg): + calc_ogu_cell_counts_biom( + params_df, self.linregresses_dict, counts_biom, lengths_df, + read_len, min_coverage, min_rsquared, output_metric) + def test_calc_ogu_cell_counts_biom_w_cast(self): # these values are the same as those in self.sample_and_prep_input_dict # except that some of them are represented as strings instead of #s @@ -624,7 +735,8 @@ def test_calc_ogu_cell_counts_biom_w_cast(self): GDNA_CONCENTRATION_NG_UL_KEY: ["2", 1.4], SAMPLE_IN_ALIQUOT_MASS_G_KEY: [0.027829017, "0.029491697"], ELUTE_VOL_UL_KEY: ["100", "70"], - SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY: [5, "4.76"] + SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY: [5, "4.76"], + SAMPLE_TOTAL_READS_KEY: self.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] } counts_vals = self._make_combined_counts_np_array() @@ -713,7 +825,7 @@ def test__calc_long_format_ogu_cell_counts_df(self): counts_df = pd.DataFrame(counts_dict) counts_df.set_index(OGU_ID_KEY, inplace=True) - mass_ratio_df = pd.DataFrame(self.mass_ratio_dict) + per_sample_calc_info_df = pd.DataFrame(self.mass_and_totals_dict) lengths_df = pd.DataFrame(self.ogu_lengths_dict) expected_df = pd.DataFrame(expected_dict) @@ -722,8 +834,8 @@ def test__calc_long_format_ogu_cell_counts_df(self): min_rsquared = 0.8 output_df, output_msgs = _calc_long_format_ogu_cell_counts_df( - self.linregresses_dict, counts_df, lengths_df, mass_ratio_df, - read_len, min_coverage, min_rsquared) + self.linregresses_dict, counts_df, lengths_df, + per_sample_calc_info_df, read_len, min_coverage, min_rsquared) pd.testing.assert_frame_equal(expected_df, output_df) self.assertListEqual( @@ -741,7 +853,7 @@ def test__calc_long_format_ogu_cell_counts_df_error(self): self.example2_ogu_full_inputs_dict[OGU_READ_COUNT_KEY]), } - 
mass_ratio_dict = {k: self.mass_ratio_dict[k] for k in + mass_ratio_dict = {k: self.mass_and_totals_dict[k] for k in (SAMPLE_ID_KEY, GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY)} linregresses_dict = { @@ -880,7 +992,7 @@ def test__prepare_cell_counts_calc_df_v_sparse(self): def test__calc_ogu_cell_counts_df_for_sample(self): input_dict = self._combine_inputs() input_df = pd.DataFrame(input_dict) - mass_ratio_df = pd.DataFrame(self.mass_ratio_dict) + per_sample_info_df = pd.DataFrame(self.mass_and_totals_dict) expected_additions_dict = { k: self.example1_ogu_full_outputs_short_avogadro_dict[k] for k in @@ -897,7 +1009,7 @@ def test__calc_ogu_cell_counts_df_for_sample(self): min_rsquared = 0.8 output_df, output_msgs = _calc_ogu_cell_counts_df_for_sample( - sample_id, self.linregresses_dict, mass_ratio_df, input_df, + sample_id, self.linregresses_dict, per_sample_info_df, input_df, min_rsquared, is_test=True) pd.testing.assert_frame_equal(expected_out_df, output_df) @@ -949,7 +1061,7 @@ def test__calc_gdna_mass_to_sample_mass_by_sample_df(self): (SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, ELUTE_VOL_UL_KEY)} - expected_dict = {k: self.mass_ratio_dict[k] for k in + expected_dict = {k: self.mass_and_totals_dict[k] for k in (SAMPLE_ID_KEY, GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY)} inputs_df = pd.DataFrame(inputs_dict) @@ -976,6 +1088,12 @@ def test__calc_ogu_gdna_mass_ng_series_for_sample(self): slope = 1.24487652379132 intercept = -6.77539505390338 + # This number comes from summing all the reads in the input_df. + # This matches what was done for the Zaramela calculations. I + # suspect that this should perhaps be the total reads for the + # whole sample, but for testing this will do. + sample_total_reads = 472140 + input_df = pd.DataFrame(input_dict) expected_series = pd.Series( self.example1_ogu_full_outputs_short_avogadro_dict[ @@ -985,7 +1103,7 @@ def test__calc_ogu_gdna_mass_ng_series_for_sample(self): expected_series.index.name = OGU_ID_KEY output_series = _calc_ogu_gdna_mass_ng_series_for_sample( - input_df, slope, intercept) + input_df, slope, intercept, sample_total_reads) assert_series_equal(expected_series, output_series) diff --git a/pysyndna/tests/test_fit_syndna_models.py b/pysyndna/tests/test_fit_syndna_models.py index fa57b7b..04c5795 100644 --- a/pysyndna/tests/test_fit_syndna_models.py +++ b/pysyndna/tests/test_fit_syndna_models.py @@ -11,7 +11,7 @@ from pysyndna.src.fit_syndna_models import SAMPLE_ID_KEY, SYNDNA_ID_KEY, \ SYNDNA_POOL_MASS_NG_KEY, SYNDNA_INDIV_NG_UL_KEY, \ SYNDNA_FRACTION_OF_POOL_KEY, SYNDNA_INDIV_NG_KEY, \ - SYNDNA_TOTAL_READS_KEY, SYNDNA_POOL_NUM_KEY, \ + SAMPLE_TOTAL_READS_KEY, SYNDNA_POOL_NUM_KEY, \ _validate_syndna_id_consistency, _validate_sample_id_consistency, \ _calc_indiv_syndna_weights, _fit_linear_regression_models @@ -36,7 +36,7 @@ class FitSyndnaModelsTest(TestCase): # system. a_sample_syndna_weights_and_total_reads_dict = { SAMPLE_ID_KEY: [sample_ids[0]], - SYNDNA_TOTAL_READS_KEY: [3216923], + SAMPLE_TOTAL_READS_KEY: [3216923], SYNDNA_POOL_MASS_NG_KEY: [0.25], } @@ -46,7 +46,7 @@ class FitSyndnaModelsTest(TestCase): # Syndna pool masses are plausible values for our experimental system. a_b_sample_syndna_weights_and_total_reads_dict = { SAMPLE_ID_KEY: sample_ids, - SYNDNA_TOTAL_READS_KEY: [3216923, 1723417], + SAMPLE_TOTAL_READS_KEY: [3216923, 1723417], SYNDNA_POOL_MASS_NG_KEY: [0.25, 0.2], } @@ -57,11 +57,12 @@ class FitSyndnaModelsTest(TestCase): # Syndna pool masses are plausible values for our experimental system. 
a_b_c_sample_syndna_weights_and_total_reads_dict = { SAMPLE_ID_KEY: [sample_ids[0], sample_ids[1], "C"], - SYNDNA_TOTAL_READS_KEY: [3216923, 1723417, 2606004], + SAMPLE_TOTAL_READS_KEY: [3216923, 1723417, 2606004], SYNDNA_POOL_MASS_NG_KEY: [0.25, 0.2, 0.3], } - # The below sample values come from the "A1_pool1_S21_L001_R1_001.fastq_output_forward_paired.fq.sam.bam.f13_r1.fq_synDNA" + # The below sample values come from the + # "A1_pool1_S21_L001_R1_001.fastq_output_forward_paired.fq.sam.bam.f13_r1.fq_synDNA" # and "A1_pool2_S22_L001_R1_001.fastq_output_forward_paired.fq.sam.bam.f13_r1.fq_synDNA" # columns of https://github.com/lzaramela/SynDNA/blob/main/data/synDNA_Fwd_Rev_sam.biom.tsv , # while the syndna ids are inferred from the contents of the "OTUID" @@ -85,16 +86,20 @@ class FitSyndnaModelsTest(TestCase): # "A1_pool1_Fwd" *but* we use a different pool mass than Zaramela, # so the same syndna counts are based on different masses. lingress_results = { - 'A': LinregressResult( - slope=1.244876523791319, intercept=-6.7242381884894655, - rvalue=0.9865030975156575, pvalue=1.428443560659758e-07, - stderr=0.07305408550335003, - intercept_stderr=0.2361976278251443), - 'B': LinregressResult( - slope=1.24675913604407, intercept=-7.155318973708384, - rvalue=0.9863241797356326, pvalue=1.505381146809759e-07, - stderr=0.07365795255302438, - intercept_stderr=0.2563956755844754) + 'A': { + "slope": 1.244876523791319, + "intercept": -6.7242381884894655, + "rvalue": 0.9865030975156575, + "pvalue": 1.428443560659758e-07, + "stderr": 0.07305408550335003, + "intercept_stderr": 0.2361976278251443}, + 'B': { + "slope": 1.24675913604407, + "intercept": -7.155318973708384, + "rvalue": 0.9863241797356326, + "pvalue": 1.505381146809759e-07, + "stderr": 0.07365795255302438, + "intercept_stderr": 0.2563956755844754} } prep_info_dict = copy.deepcopy( @@ -111,14 +116,14 @@ class FitSyndnaModelsTest(TestCase): reads_per_syndna_per_sample_dict["B"])]) def assert_lingressresult_dict_almost_equal(self, d1, d2, places=7): - """Assert that two dicts of LinregressResult are almost equal. + """Assert that two dicts of lingress results are almost equal. Parameters ---------- d1 : dict - The first dict of LinregressResult to compare + The first dict to compare d2 : dict - The second dict of LinregressResult to compare + The second dict to compare places : int, optional The number of decimal places to compare to @@ -131,30 +136,10 @@ def assert_lingressresult_dict_almost_equal(self, d1, d2, places=7): self.assertIsInstance(d2, dict) self.assertEqual(d1.keys(), d2.keys()) for k in d1.keys(): - self.assert_linregressresult_almost_equal(d1[k], d2[k], places) - - def assert_linregressresult_almost_equal(self, l1, l2, places=7): - """Assert that two LinregressResult are almost equal. 
-
-        Parameters
-        ----------
-        l1 : dict
-            The first LinregressResult to compare
-        l2 : dict
-            The second LinregressResult to compare
-        places : int, optional
-            The number of decimal places to compare to
-
-        Raises
-        ------
-        AssertionError
-            If the LinregressResults are not almost equal
-        """
-        self.assertIsInstance(l1, LinregressResult)
-        self.assertIsInstance(l2, LinregressResult)
-        self.assertEqual(len(l1), len(l2))
-        for i in range(0, len(l1)):
-            self.assertAlmostEqual(l1[i], l2[i], places=places)
+            for m in d1[k].keys():
+                m1 = d1[k][m]
+                m2 = d2[k][m]
+                self.assertAlmostEqual(m1, m2, places=places)
 
     def setUp(self):
         self.maxDiff = None
@@ -292,7 +277,7 @@ def test_fit_linear_regression_models_for_qiita_w_col_error(self):
         prep_info_dict = {
             SAMPLE_ID_KEY: ["A", "B"],
             "sequencing_type": ["shotgun", "shotgun"],
-            SYNDNA_TOTAL_READS_KEY: [3216923, 1723417],
+            SAMPLE_TOTAL_READS_KEY: [3216923, 1723417],
             SYNDNA_POOL_MASS_NG_KEY: [0.25, 0.2],
             # missing the SYNDNA_POOL_NUM_KEY column
         }
@@ -340,16 +325,20 @@ def test_fit_linear_regression_models_w_log_msgs(self):
         # syndnas with <200 total counts removed on "linear regressions" sheet
         # of "absolute_quant_example.xlsx").
         expected_out_dict = {
-            'A': LinregressResult(
-                slope=1.2561949109446753, intercept=-6.7671601206840855,
-                rvalue=0.982777689569875, pvalue=2.1705143708536327e-06,
-                stderr=0.08927614710714807,
-                intercept_stderr=0.30147987595768355),
-            'B': LinregressResult(
-                slope=1.2568191864801976, intercept=-7.196128673001381,
-                rvalue=0.9825127010266727, pvalue=2.2890733334160456e-06,
-                stderr=0.09002330756867402,
-                intercept_stderr=0.32657986324660143)
+            'A': {
+                "slope": 1.2561949109446753,
+                "intercept": -6.7671601206840855,
+                "rvalue": 0.982777689569875,
+                "pvalue": 2.1705143708536327e-06,
+                "stderr": 0.08927614710714807,
+                "intercept_stderr": 0.30147987595768355},
+            'B': {
+                "slope": 1.2568191864801976,
+                "intercept": -7.196128673001381,
+                "rvalue": 0.9825127010266727,
+                "pvalue": 2.2890733334160456e-06,
+                "stderr": 0.09002330756867402,
+                "intercept_stderr": 0.32657986324660143}
         }
         expected_out_msgs = [
             "The following sample ids were in the experiment info but not in "
@@ -382,8 +371,8 @@ def test_fit_linear_regression_models_w_sample_error(self):
         # which is in the data
         expected_err_msg = \
-            r"Found sample ids in reads_per_syndna_per_sample_df that were " \
-            r"not in sample_syndna_weights_and_total_reads_df: \{'B'\}"
+            (r"Found sample ids in reads data that were not in sample info: "
+             r"\{'B'\}")
 
         syndna_concs_df = pd.DataFrame(self.syndna_concs_dict)
         sample_syndna_weights_and_total_reads_df = pd.DataFrame(
@@ -559,8 +548,8 @@ def test__validate_sample_id_consistency_w_error(self):
         reads_per_syndna_per_sample_df = pd.DataFrame(
             self.reads_per_syndna_per_sample_dict)
 
-        err_msg = "Found sample ids in reads_per_syndna_per_sample_df " \
-                  "that were not in sample_syndna_weights_and_total_reads_df"
+        err_msg = (r"Found sample ids in reads data that were not in sample "
+                   r"info: \{'B'\}")
         with self.assertRaisesRegex(ValueError, err_msg):
             _validate_sample_id_consistency(
                 sample_syndna_weights_and_total_reads_df,
@@ -630,7 +619,7 @@ def test__fit_linear_regression_models(self):
         input_fp = os.path.join(self.data_dir, 'modelling_input.tsv')
         working_df = pd.read_csv(input_fp, sep="\t", comment="#")
 
-        output = _fit_linear_regression_models(working_df)
+        output, out_msgs_list = _fit_linear_regression_models(working_df)
 
         expected_fp = os.path.join(self.data_dir, 'modelling_output.tsv')
         expected_df = pd.read_csv(expected_fp, sep="\t", comment="#")
@@ -645,3 
+634,5 @@ def test__fit_linear_regression_models(self): self.assertAlmostEqual(expected_slope, v.slope) self.assertAlmostEqual(expected_intercept, v.intercept) # next model + + self.assertEqual([], out_msgs_list) diff --git a/pysyndna/tests/test_quant_orfs.py b/pysyndna/tests/test_quant_orfs.py new file mode 100644 index 0000000..b4724a1 --- /dev/null +++ b/pysyndna/tests/test_quant_orfs.py @@ -0,0 +1,316 @@ +import biom.table +import numpy as np +import os +import pandas +from pandas.testing import assert_frame_equal +from unittest import TestCase +from pysyndna import calc_copies_of_ogu_orf_ssrna_per_g_sample, \ + calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita +from pysyndna.src.quant_orfs import _read_ogu_orf_coords_to_df, \ + _calc_ogu_orf_copies_per_g_from_coords, \ + _calc_copies_of_ogu_orf_ssrna_per_g_sample, \ + OGU_ORF_ID_KEY, OGU_ORF_START_KEY, OGU_ORF_END_KEY, OGU_ORF_LEN_KEY, \ + COPIES_PER_G_OGU_ORF_SSRNA_KEY, SAMPLE_ID_KEY, \ + SAMPLE_IN_ALIQUOT_MASS_G_KEY, SSRNA_CONCENTRATION_NG_UL_KEY, \ + ELUTE_VOL_UL_KEY, TOTAL_BIOLOGICAL_READS_KEY + + +class TestQuantOrfs(TestCase): + COORDS_DICT = { + OGU_ORF_ID_KEY: ["G000005825_1", "G000005825_2", "G000005825_3", + "G000005825_4", "G000005825_5", "G900163845_3247", + "G900163845_3248", "G900163845_3249", + "G900163845_3250", "G900163845_3251"], + OGU_ORF_START_KEY: [816, 2348, 3744, 3971, 5098, 3392209, 3393051, + 3393938, 3394702, 3395077], + OGU_ORF_END_KEY: [2168, 3490, 3959, 5086, 5373, 3390413, 3392206, + 3393048, 3393935, 3395721] + } + + LEN_AND_COPIES_DICT = { + OGU_ORF_ID_KEY: ["G000005825_1", "G000005825_2", "G000005825_3", + "G000005825_4", "G000005825_5", "G900163845_3247", + "G900163845_3248", "G900163845_3249", + "G900163845_3250", "G900163845_3251"], + OGU_ORF_LEN_KEY: [1353, 1143, 216, 1116, 276, 1797, 846, 891, + 768, 645], + COPIES_PER_G_OGU_ORF_SSRNA_KEY: [1.3091041E+18, 1.5496219E+18, + 8.2000827E+18, 1.5871128E+18, + 6.4174561E+18, 9.8565268E+17, + 2.0936381E+18, 1.9878988E+18, + 2.3062733E+18, 2.7460742E+18] + } + + SAMPLE_IDS = ["IBSRS3526007", "IQSRS3526010"] + COUNT_VALS = np.array([ + [0, 0], + [2, 0], + [0, 1], + [35, 0], + [0, 694], + [10292, 382], + [0, 0], + [190, 10], + [0, 630], + [34, 1003]]) + + PARAMS_DICT = { + SAMPLE_ID_KEY: SAMPLE_IDS, + SAMPLE_IN_ALIQUOT_MASS_G_KEY: [0.003, 0.00082], + SSRNA_CONCENTRATION_NG_UL_KEY: [0.132714286, 0.0042], + ELUTE_VOL_UL_KEY: [70, 70], + TOTAL_BIOLOGICAL_READS_KEY: [213988, 3028580] + } + + COPIES_PER_G_SAMPLE_VALS = np.array([ + [0, 0], + [4.4849829E+07, 0], + [0, 9.7076176E+05], + [8.0386085E+08, 0], + [0, 5.2725026E+08], + [1.4680090E+11, 4.4574009E+07], + [0, 0], + [5.4657898E+09, 2.3533619E+06], + [0, 1.7200685E+08], + [1.3511272E+09, 3.2606759E+08]]) + + def setUp(self): + self.maxDiff = None + self.data_dir = os.path.join(os.path.dirname(__file__), 'data') + + def test__read_ogu_orf_coords_to_df(self): + expected_df = pandas.DataFrame(self.COORDS_DICT) + + ogu_orf_coords_fp = os.path.join(self.data_dir, "coords.txt") + output_df = _read_ogu_orf_coords_to_df(ogu_orf_coords_fp) + assert_frame_equal(output_df, expected_df) + + def test__calc_ogu_orf_copies_per_g_from_coords(self): + expected_dict = self.COORDS_DICT.copy() + expected_dict.update(self.LEN_AND_COPIES_DICT) + expected_df = pandas.DataFrame( + expected_dict, index=expected_dict[OGU_ORF_ID_KEY]) + expected_df.index.name = OGU_ORF_ID_KEY + + input_df = pandas.DataFrame(self.COORDS_DICT) + output_df = _calc_ogu_orf_copies_per_g_from_coords(input_df) + + assert_frame_equal(expected_df, output_df) 
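+
+    # Editorial worked example of the values above (not from the original
+    # source): for G000005825_1, len = abs(2168 - 816) + 1 = 1353 bases, so
+    # copies/g = 6.02214076e23 / (1353 * 340) ≈ 1.3091041e18, matching the
+    # first entry of COPIES_PER_G_OGU_ORF_SSRNA_KEY in LEN_AND_COPIES_DICT.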
+ + def test__calc_copies_of_ogu_orf_ssrna_per_g_sample(self): + input_quant_params_per_sample_df = pandas.DataFrame(self.PARAMS_DICT) + input_ogu_orf_copies_per_g_ssrna_df = pandas.DataFrame( + self.LEN_AND_COPIES_DICT, + index=self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY]) + + input_reads_per_ogu_orf_per_sample_biom = biom.table.Table( + self.COUNT_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + expected_biom = biom.table.Table( + self.COPIES_PER_G_SAMPLE_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + output_biom = _calc_copies_of_ogu_orf_ssrna_per_g_sample( + input_quant_params_per_sample_df, + input_reads_per_ogu_orf_per_sample_biom, + input_ogu_orf_copies_per_g_ssrna_df) + + # NB: Comparing the bioms as dataframes because the biom equality + # compare does not allow "almost equal" checking for float values, + # whereas rtol and atol are built in to assert_frame_equal + output_df = output_biom.to_dataframe() + expected_df = expected_biom.to_dataframe() + pandas.testing.assert_frame_equal(output_df, expected_df) + + def test__calc_copies_of_ogu_orf_ssrna_per_g_sample_ids_err(self): + # drop the first sample from the params dataframe; now the reads + # will contain a sample that the params dataframe does not + input_quant_params_per_sample_df = pandas.DataFrame(self.PARAMS_DICT) + input_quant_params_per_sample_df.drop(index=0, axis=0, inplace=True) + + input_ogu_orf_copies_per_g_ssrna_df = pandas.DataFrame( + self.LEN_AND_COPIES_DICT, + index=self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY]) + + input_reads_per_ogu_orf_per_sample_biom = biom.table.Table( + self.COUNT_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + expected_msg = r"Found sample ids in reads data that were not in" \ + r" sample info: \{'IBSRS3526007'\}" + with self.assertRaisesRegex(ValueError, expected_msg): + _ = _calc_copies_of_ogu_orf_ssrna_per_g_sample( + input_quant_params_per_sample_df, + input_reads_per_ogu_orf_per_sample_biom, + input_ogu_orf_copies_per_g_ssrna_df) + + def test__calc_copies_of_ogu_orf_ssrna_per_g_sample_col_err(self): + params_dict = self.PARAMS_DICT.copy() + + # drop a necessary column from the params dict + del params_dict[TOTAL_BIOLOGICAL_READS_KEY] + input_quant_params_per_sample_df = pandas.DataFrame(params_dict) + input_quant_params_per_sample_df.drop(index=0, axis=0, inplace=True) + + input_ogu_orf_copies_per_g_ssrna_df = pandas.DataFrame( + self.LEN_AND_COPIES_DICT, + index=self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY]) + + input_reads_per_ogu_orf_per_sample_biom = biom.table.Table( + self.COUNT_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + expected_msg = r"parameters dataframe is missing required " \ + r"column\(s\): \['total_biological_reads_r1r2'\]" + with self.assertRaisesRegex(ValueError, expected_msg): + _ = _calc_copies_of_ogu_orf_ssrna_per_g_sample( + input_quant_params_per_sample_df, + input_reads_per_ogu_orf_per_sample_biom, + input_ogu_orf_copies_per_g_ssrna_df) + + def test_calc_copies_of_ogu_orf_ssrna_per_g_sample(self): + input_quant_params_per_sample_df = pandas.DataFrame(self.PARAMS_DICT) + ogu_orf_coords_fp = os.path.join(self.data_dir, "coords.txt") + + input_reads_per_ogu_orf_per_sample_biom = biom.table.Table( + self.COUNT_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + expected_biom = biom.table.Table( + self.COPIES_PER_G_SAMPLE_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + output_biom = calc_copies_of_ogu_orf_ssrna_per_g_sample( + 
input_quant_params_per_sample_df, + input_reads_per_ogu_orf_per_sample_biom, + ogu_orf_coords_fp) + + # NB: Comparing the bioms as dataframes because the biom equality + # compare does not allow "almost equal" checking for float values, + # whereas rtol and atol are built in to assert_frame_equal + output_df = output_biom.to_dataframe() + expected_df = expected_biom.to_dataframe() + pandas.testing.assert_frame_equal(output_df, expected_df) + + def test_calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita(self): + sample_info_dict = {k: self.PARAMS_DICT[k] for k in + [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY]} + + prep_info_dict = {k: self.PARAMS_DICT[k] for k in + [SAMPLE_ID_KEY, ELUTE_VOL_UL_KEY, + SSRNA_CONCENTRATION_NG_UL_KEY, + TOTAL_BIOLOGICAL_READS_KEY]} + + sample_info_df = pandas.DataFrame(sample_info_dict) + prep_info_df = pandas.DataFrame(prep_info_dict) + ogu_orf_coords_fp = os.path.join(self.data_dir, "coords.txt") + + input_reads_per_ogu_orf_per_sample_biom = biom.table.Table( + self.COUNT_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + expected_biom = biom.table.Table( + self.COPIES_PER_G_SAMPLE_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + output_biom = calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita( + sample_info_df, prep_info_df, + input_reads_per_ogu_orf_per_sample_biom, + ogu_orf_coords_fp) + + # NB: Comparing the bioms as dataframes because the biom equality + # compare does not allow "almost equal" checking for float values, + # whereas rtol and atol are built in to assert_frame_equal + output_df = output_biom.to_dataframe() + expected_df = expected_biom.to_dataframe() + pandas.testing.assert_frame_equal(output_df, expected_df) + + def test_calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita_col_err(self): + sample_info_dict = {k: self.PARAMS_DICT[k] for k in + [SAMPLE_ID_KEY]} + + prep_info_dict = {k: self.PARAMS_DICT[k] for k in + [SAMPLE_ID_KEY, ELUTE_VOL_UL_KEY, + SSRNA_CONCENTRATION_NG_UL_KEY, + TOTAL_BIOLOGICAL_READS_KEY]} + + sample_info_df = pandas.DataFrame(sample_info_dict) + prep_info_df = pandas.DataFrame(prep_info_dict) + ogu_orf_coords_fp = os.path.join(self.data_dir, "coords.txt") + + input_reads_per_ogu_orf_per_sample_biom = biom.table.Table( + self.COUNT_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + expected_msg = r"sample info is missing required " \ + r"column\(s\): \['calc_mass_sample_aliquot_input_g'\]" + with self.assertRaisesRegex(ValueError, expected_msg): + _ = calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita( + sample_info_df, prep_info_df, + input_reads_per_ogu_orf_per_sample_biom, + ogu_orf_coords_fp) + + def test_calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita_col_err2(self): + sample_info_dict = {k: self.PARAMS_DICT[k] for k in + [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY]} + + prep_info_dict = {k: self.PARAMS_DICT[k] for k in + [SAMPLE_ID_KEY, ELUTE_VOL_UL_KEY, + SSRNA_CONCENTRATION_NG_UL_KEY]} + + sample_info_df = pandas.DataFrame(sample_info_dict) + prep_info_df = pandas.DataFrame(prep_info_dict) + ogu_orf_coords_fp = os.path.join(self.data_dir, "coords.txt") + + input_reads_per_ogu_orf_per_sample_biom = biom.table.Table( + self.COUNT_VALS, + self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY], + self.SAMPLE_IDS) + + expected_msg = r"prep info is missing required " \ + r"column\(s\): \['total_biological_reads_r1r2'\]" + with self.assertRaisesRegex(ValueError, expected_msg): + _ = calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita( + sample_info_df, 
+                input_reads_per_ogu_orf_per_sample_biom,
+                ogu_orf_coords_fp)
+
+    def test_calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita_id_err(self):
+        sample_info_dict = {k: self.PARAMS_DICT[k] for k in
+                            [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY]}
+
+        prep_info_dict = {k: self.PARAMS_DICT[k] for k in
+                          [SAMPLE_ID_KEY, ELUTE_VOL_UL_KEY,
+                           SSRNA_CONCENTRATION_NG_UL_KEY,
+                           TOTAL_BIOLOGICAL_READS_KEY]}
+
+        sample_info_df = pandas.DataFrame(sample_info_dict)
+
+        # drop the first sample from the prep dataframe; now the sample info
+        # will contain a sample that the prep dataframe does not.
+        prep_info_df = pandas.DataFrame(prep_info_dict)
+        prep_info_df.drop(index=0, axis=0, inplace=True)
+
+        ogu_orf_coords_fp = os.path.join(self.data_dir, "coords.txt")
+        input_reads_per_ogu_orf_per_sample_biom = biom.table.Table(
+            self.COUNT_VALS,
+            self.LEN_AND_COPIES_DICT[OGU_ORF_ID_KEY],
+            self.SAMPLE_IDS)
+
+        expected_msg = (r"Found sample ids in reads data that were not in "
+                        r"sample info: \{'IBSRS3526007'\}")
+        with self.assertRaisesRegex(ValueError, expected_msg):
+            _ = calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita(
+                sample_info_df, prep_info_df,
+                input_reads_per_ogu_orf_per_sample_biom,
+                ogu_orf_coords_fp)
diff --git a/pysyndna/tests/test_util.py b/pysyndna/tests/test_util.py
new file mode 100644
index 0000000..7289999
--- /dev/null
+++ b/pysyndna/tests/test_util.py
@@ -0,0 +1,235 @@
+import biom
+import numpy as np
+import pandas
+from pandas.testing import assert_series_equal, assert_frame_equal
+from unittest import TestCase
+from pysyndna.src.util import calc_copies_genomic_element_per_g_series, \
+    calc_gs_genomic_element_in_aliquot, \
+    validate_metadata_vs_prep_id_consistency, \
+    validate_metadata_vs_reads_id_consistency, \
+    validate_required_columns_exist, SAMPLE_ID_KEY, ELUTE_VOL_UL_KEY
+
+
+class TestUtil(TestCase):
+    def test_validate_required_columns_exist_true(self):
+        input_dict = {
+            'sample_id': ['sample1'],
+            'prep_id': ['prep1'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+        required_columns = ['sample_id', 'prep_id']
+
+        validate_required_columns_exist(
+            input_df, required_columns, "missing")
+
+        # Pass test if we made it this far
+        self.assertTrue(True)
+
+    def test_validate_required_columns_exist_err(self):
+        input_dict = {
+            'sample_id': ['sample1'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+        required_columns = ['sample_id', 'prep_id']
+
+        expected_err = r"missing: \['prep_id'\]"
+        with self.assertRaisesRegex(ValueError, expected_err):
+            validate_required_columns_exist(
+                input_df, required_columns, "missing")
+
+    def test_validate_metadata_vs_prep_id_consistency_true(self):
+        input_dict = {
+            SAMPLE_ID_KEY: ['sample1'],
+            'color': ['blue'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+
+        prep_dict = {
+            SAMPLE_ID_KEY: ['sample1'],
+            'prep_id': ['prep1'],
+        }
+        prep_df = pandas.DataFrame(prep_dict)
+
+        _ = validate_metadata_vs_prep_id_consistency(input_df, prep_df)
+
+        # Pass test if we made it this far
+        self.assertTrue(True)
+
+    def test_validate_metadata_vs_prep_id_consistency_true_w_msg(self):
+        input_dict = {
+            SAMPLE_ID_KEY: ['sample1', 'sample2'],
+            'color': ['blue', 'aqua'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+
+        prep_dict = {
+            SAMPLE_ID_KEY: ['sample1'],
+            'prep_id': ['prep1'],
+        }
+        prep_df = pandas.DataFrame(prep_dict)
+
+        not_in_prep_ids = validate_metadata_vs_prep_id_consistency(
+            input_df, prep_df)
+
+        expected_not_in_prep_ids = ['sample2']
+        self.assertEqual(not_in_prep_ids, expected_not_in_prep_ids)
+
+    def
test_validate_metadata_vs_prep_id_consistency_err(self):
+        input_dict = {
+            SAMPLE_ID_KEY: ['sample1'],
+            'color': ['blue'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+
+        prep_dict = {
+            SAMPLE_ID_KEY: ['sample1', 'sample2'],
+            'prep_id': ['prep1', 'prep2'],
+        }
+        prep_df = pandas.DataFrame(prep_dict)
+
+        expected_err = (r"Found sample ids in prep info that were not in "
+                        r"sample info: \{'sample2'\}")
+        with self.assertRaisesRegex(ValueError, expected_err):
+            _ = validate_metadata_vs_prep_id_consistency(
+                input_df, prep_df)
+
+    def test_validate_metadata_vs_reads_id_consistency_df_true(self):
+        input_dict = {
+            SAMPLE_ID_KEY: ['sample1', 'sample2'],
+            'color': ['blue', 'aqua'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+
+        reads_dict = {
+            'sample1': [1, 2],
+            'sample2': [3, 4],
+        }
+        reads_df = pandas.DataFrame(reads_dict)
+
+        _ = validate_metadata_vs_reads_id_consistency(input_df, reads_df)
+
+        # Pass test if we made it this far
+        self.assertTrue(True)
+
+    def test_validate_metadata_vs_reads_id_consistency_df_true_w_msg(self):
+        input_dict = {
+            SAMPLE_ID_KEY: ['sample1', 'sample2', 'sample3'],
+            'color': ['blue', 'aqua', 'cerulean'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+
+        reads_dict = {
+            'sample1': [1, 2],
+            'sample2': [3, 4],
+        }
+        reads_df = pandas.DataFrame(reads_dict)
+
+        not_in_reads_ids = validate_metadata_vs_reads_id_consistency(
+            input_df, reads_df)
+
+        expected_not_in_reads_ids = ['sample3']
+        self.assertEqual(not_in_reads_ids, expected_not_in_reads_ids)
+
+    def test_validate_metadata_vs_reads_id_consistency_df_err(self):
+        input_dict = {
+            SAMPLE_ID_KEY: ['sample1'],
+            'color': ['blue'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+
+        reads_dict = {
+            'sample1': [1, 2],
+            'sample2': [3, 4],
+        }
+        reads_df = pandas.DataFrame(reads_dict)
+
+        expected_err = (r"Found sample ids in reads data that were not in "
+                        r"sample info: \{'sample2'\}")
+        with self.assertRaisesRegex(ValueError, expected_err):
+            _ = validate_metadata_vs_reads_id_consistency(
+                input_df, reads_df)
+
+    def test_validate_metadata_vs_reads_id_consistency_biom_true(self):
+        input_dict = {
+            SAMPLE_ID_KEY: ['sample1', 'sample2'],
+            'color': ['blue', 'aqua'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+
+        reads_biom = biom.table.Table(
+            np.array([[1, 2], [3, 4]]),
+            ['obs1', 'obs2'],
+            ['sample1', 'sample2'])
+
+        _ = validate_metadata_vs_reads_id_consistency(input_df, reads_biom)
+
+        # Pass test if we made it this far
+        self.assertTrue(True)
+
+    def test_validate_metadata_vs_reads_id_consistency_biom_true_w_msg(self):
+        input_dict = {
+            SAMPLE_ID_KEY: ['sample1', 'sample2', 'sample3'],
+            'color': ['blue', 'aqua', 'cerulean'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+
+        reads_biom = biom.table.Table(
+            np.array([[1, 2], [3, 4]]),
+            ['obs1', 'obs2'],
+            ['sample1', 'sample2'])
+
+        not_in_reads_ids = validate_metadata_vs_reads_id_consistency(
+            input_df, reads_biom)
+
+        expected_not_in_reads_ids = ['sample3']
+        self.assertEqual(not_in_reads_ids, expected_not_in_reads_ids)
+
+    def test_validate_metadata_vs_reads_id_consistency_biom_err(self):
+        input_dict = {
+            SAMPLE_ID_KEY: ['sample1'],
+            'color': ['blue'],
+        }
+        input_df = pandas.DataFrame(input_dict)
+
+        reads_biom = biom.table.Table(
+            np.array([[1, 2], [3, 4]]),
+            ['obs1', 'obs2'],
+            ['sample1', 'sample2'])
+
+        expected_err = (r"Found sample ids in reads data that were not in "
+                        r"sample info: \{'sample2'\}")
+        with self.assertRaisesRegex(ValueError, expected_err):
+            _ = validate_metadata_vs_reads_id_consistency(
+                input_df, reads_biom)
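[Editor's note: the id-consistency tests above collectively pin down the validators' contract: reads sample ids may come either from a DataFrame's columns or from a biom.Table's sample axis; reads ids absent from the metadata raise a ValueError, while metadata ids absent from the reads are merely returned to the caller. The following is a minimal sketch consistent with that contract, not the package's actual implementation; the helper name, the assumed value of SAMPLE_ID_KEY, and the internals are illustrative only.]

    # sketch of the contract exercised by the tests above (hypothetical code)
    from typing import List, Union
    import biom
    import pandas as pd

    SAMPLE_ID_KEY = 'sample_id'  # assumed value of the shared constant

    def _sketch_validate_ids(metadata_df: pd.DataFrame,
                             reads: Union[pd.DataFrame, biom.Table]) \
            -> List[str]:
        # reads sample ids come from the biom sample axis or the df columns
        if isinstance(reads, biom.Table):
            reads_ids = set(reads.ids(axis='sample'))
        else:
            reads_ids = set(reads.columns)
        metadata_ids = set(metadata_df[SAMPLE_ID_KEY])

        # reads samples missing from the metadata are fatal ...
        extra_in_reads = reads_ids - metadata_ids
        if extra_in_reads:
            raise ValueError(f"Found sample ids in reads data that were "
                             f"not in sample info: {extra_in_reads}")

        # ... while metadata samples missing from the reads are only reported
        return sorted(metadata_ids - reads_ids)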
+
+    def test_calc_copies_genomic_element_per_g_series(self):
+        # example from "rna_copy_quant_example.xlsx" "full_calc" tab,
+        # ogu_orf_calculations table
+        elements_lens = [1353, 1143, 216, 1116, 276, 1797, 846, 891, 768,
+                         645]
+        copies_per_g = [1.309104e+18, 1.549622e+18, 8.200083e+18,
+                        1.587113e+18, 6.417456e+18, 9.856527e+17,
+                        2.093638e+18, 1.987899e+18, 2.306273e+18,
+                        2.746074e+18]
+        expected_series = pandas.Series(copies_per_g)
+        obs_series = calc_copies_genomic_element_per_g_series(
+            pandas.Series(elements_lens), 340)
+        assert_series_equal(expected_series, obs_series)
+
+    def test_calc_gs_genomic_element_in_aliquot(self):
+        # example from "rna_copy_quant_example.xlsx" "full_calc" tab,
+        # quant_params_per_sample table
+        input_dict = {
+            SAMPLE_ID_KEY: ["IBSRS3526007", "IQSRS3526010"],
+            'conc_ng_ul': [0.132714, 0.004200],
+            ELUTE_VOL_UL_KEY: [70, 70]
+        }
+
+        added_dict = {'mass_key': [9.290000e-09, 2.940000e-10]}
+        expected_dict = input_dict.copy()
+        expected_dict.update(added_dict)
+
+        input_df = pandas.DataFrame(input_dict)
+        expected_df = pandas.DataFrame(expected_dict)
+
+        obs_df = calc_gs_genomic_element_in_aliquot(
+            input_df, 'conc_ng_ul', 'mass_key')
+        assert_frame_equal(expected_df, obs_df)
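[Editor's note: the expected figures in these last two tests follow from simple unit arithmetic: copies per gram of a genomic element is Avogadro's number divided by the element's molar mass (length in bases times the per-base g/mole figure passed as the second argument, 340 here), and the mass of element in an aliquot is its concentration in ng/uL times the elution volume in uL, divided by 10^9 ng per g. A quick self-contained check; the constant names are illustrative, and the interpretation of 340 as g/mole per base is inferred from the numbers, not stated in the diff.]

    # standalone check of the test fixtures above (hypothetical names)
    AVOGADRO = 6.02214076e23  # molecules per mole

    # test_calc_copies_genomic_element_per_g_series, first element:
    # a 1353-base element at an assumed ~340 g/mole per base
    copies_per_g = AVOGADRO / (1353 * 340)
    print(f"{copies_per_g:.6e}")  # ~1.309104e+18, matching copies_per_g[0]

    # test_calc_gs_genomic_element_in_aliquot, first sample:
    # 0.132714 ng/uL * 70 uL elution volume, converted from ng to g
    mass_g = 0.132714 * 70 / 1e9
    print(f"{mass_g:.2e}")  # ~9.29e-09, matching the first 'mass_key' value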