diff --git a/docs/absolute_quant_example.xlsx b/docs/absolute_quant_example.xlsx index 07e61dd..d8821a0 100644 Binary files a/docs/absolute_quant_example.xlsx and b/docs/absolute_quant_example.xlsx differ diff --git a/pysyndna/__init__.py b/pysyndna/__init__.py index afe04f9..16b949b 100644 --- a/pysyndna/__init__.py +++ b/pysyndna/__init__.py @@ -3,7 +3,10 @@ FIT_SYNDNA_MODELS_LOG_KEY from pysyndna.src.calc_cell_counts import calc_ogu_cell_counts_biom, \ calc_ogu_cell_counts_per_g_of_sample_for_qiita, \ + calc_ogu_cell_counts_per_cm2_of_sample_for_qiita, \ + calc_ogu_cell_counts_per_ul_of_sample_for_qiita, \ OGU_CELLS_PER_G_OF_GDNA_KEY, OGU_CELLS_PER_G_OF_SAMPLE_KEY, \ + OGU_CELLS_PER_UL_OF_SAMPLE_KEY, OGU_CELLS_PER_CM2_OF_SAMPLE_KEY, \ OGU_ID_KEY, OGU_LEN_IN_BP_KEY, OGU_PERCENT_COVERAGE_KEY, \ CELL_COUNT_RESULT_KEY, CELL_COUNT_LOG_KEY from pysyndna.src.quant_orfs import \ @@ -17,6 +20,8 @@ 'fit_linear_regression_models_for_qiita', 'calc_ogu_cell_counts_biom', 'calc_ogu_cell_counts_per_g_of_sample_for_qiita', + 'calc_ogu_cell_counts_per_cm2_of_sample_for_qiita', + 'calc_ogu_cell_counts_per_ul_of_sample_for_qiita', 'read_ogu_orf_coords_to_df', 'validate_and_cast_ogu_orf_coords_df', 'calc_copies_of_ogu_orf_ssrna_per_g_sample_from_dfs', @@ -24,6 +29,8 @@ 'calc_copies_of_ogu_orf_ssrna_per_g_sample_for_qiita', 'OGU_CELLS_PER_G_OF_GDNA_KEY', 'OGU_CELLS_PER_G_OF_SAMPLE_KEY', + 'OGU_CELLS_PER_UL_OF_SAMPLE_KEY', + 'OGU_CELLS_PER_CM2_OF_SAMPLE_KEY', 'SAMPLE_ID_KEY', 'OGU_ID_KEY', 'OGU_LEN_IN_BP_KEY', 'OGU_ORF_ID_KEY', 'OGU_PERCENT_COVERAGE_KEY', 'LIN_REGRESS_RESULT_KEY', 'FIT_SYNDNA_MODELS_LOG_KEY', diff --git a/pysyndna/src/calc_cell_counts.py b/pysyndna/src/calc_cell_counts.py index 8843d46..0253df0 100644 --- a/pysyndna/src/calc_cell_counts.py +++ b/pysyndna/src/calc_cell_counts.py @@ -9,8 +9,7 @@ validate_metadata_vs_reads_id_consistency, filter_data_by_sample_info, \ validate_metadata_vs_prep_id_consistency, cast_cols, \ DNA_BASEPAIR_G_PER_MOLE, NANOGRAMS_PER_GRAM, \ - 
SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, ELUTE_VOL_UL_KEY, \ - REQUIRED_SAMPLE_INFO_KEYS + SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, ELUTE_VOL_UL_KEY from pysyndna.src.fit_syndna_models import SYNDNA_POOL_MASS_NG_KEY, \ SLOPE_KEY, INTERCEPT_KEY @@ -25,8 +24,10 @@ GDNA_CONCENTRATION_NG_UL_KEY = 'extracted_gdna_concentration_ng_ul' GDNA_FROM_ALIQUOT_MASS_G_KEY = 'extracted_gdna_concentration_g' -# NB: below is NOT the full mass of gDNA extracted from the sample, but -# ONLY the mass of gDNA that was put into sequencing. This mass should +# NB: below is NOT the full mass of gDNA extracted from the sample (which can +# be calculated from GDNA_CONCENTRATION_NG_UL_KEY and ELUTE_VOL_UL_KEY +# and then stored in GDNA_FROM_ALIQUOT_MASS_G_KEY) but +# ONLY the mass of gDNA that was put into sequencing. This mass should # NOT include the additional mass of the syndna pool added to sequencing. SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY = 'sequenced_sample_gdna_mass_ng' OGU_ID_KEY = 'ogu_id' @@ -43,76 +44,138 @@ # (NOT limited to the amount of gDNA that was put into sequencing, unlike # SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY) GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY = 'gdna_mass_to_sample_mass_ratio' +SAMPLE_VOLUME_UL_KEY = "sample_volume_ul" +OGU_CELLS_PER_UL_OF_SAMPLE_KEY = "ogu_cells_per_ul_of_sample" +GDNA_MASS_TO_SAMPLE_VOL_RATIO_KEY = "gdna_mass_to_sample_vol_ratio" +SAMPLE_SURFACE_AREA_CM2_KEY = "sample_surface_area_cm2" +OGU_CELLS_PER_CM2_OF_SAMPLE_KEY = "ogu_cells_per_cm2_of_sample" +GDNA_MASS_TO_SAMPLE_SURFACE_AREA_RATIO_KEY = "gdna_mass_to_sample_surface_area_ratio" REQUIRED_DNA_PREP_INFO_KEYS = [SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, ELUTE_VOL_UL_KEY] +RATIO_NAME_KEY = "ratio_key" +DENOMINATOR_KEY = "denom_key" +SAMPLE_LEVEL_METRICS_DICT = { + OGU_CELLS_PER_G_OF_SAMPLE_KEY: { + DENOMINATOR_KEY: SAMPLE_IN_ALIQUOT_MASS_G_KEY, + RATIO_NAME_KEY: GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY}, + OGU_CELLS_PER_UL_OF_SAMPLE_KEY: { + DENOMINATOR_KEY: SAMPLE_VOLUME_UL_KEY, + RATIO_NAME_KEY: 
GDNA_MASS_TO_SAMPLE_VOL_RATIO_KEY}, + OGU_CELLS_PER_CM2_OF_SAMPLE_KEY: { + DENOMINATOR_KEY: SAMPLE_SURFACE_AREA_CM2_KEY, + RATIO_NAME_KEY: GDNA_MASS_TO_SAMPLE_SURFACE_AREA_RATIO_KEY} +} + + +def _calc_ogu_cell_counts_per_x_of_sample_for_qiita( + sample_info_df: pd.DataFrame, + prep_info_df: pd.DataFrame, + linregress_by_sample_id_fp: str, + ogu_counts_per_sample_biom: biom.Table, + ogu_percent_coverage_df: pd.DataFrame, + ogu_lengths_fp: str, + output_cell_counts_metric: str, + min_coverage: float = DEFAULT_MIN_PERCENT_COVERAGE, + min_rsquared: float = DEFAULT_MIN_RSQUARED, + syndna_mass_fraction_of_sample: float = + DEFAULT_SYNDNA_MASS_FRACTION_OF_SAMPLE) \ + -> Dict[str, Union[str, biom.Table]]: - -def _calc_gdna_mass_to_sample_mass_by_sample_df( - absolute_quant_params_per_sample_df: pd.DataFrame) -> pd.Series: - - """Calculates ratio of extracted gDNA mass to sample mass for each sample. - - Note that the sample mass is the mass of the sample material (only, not - buffer, tube, etc) that went into the extraction, which may be different - from the total mass of sample that was collected. + """Gets # of cells of each OGU per unit of sample for samples from Qiita. Parameters ---------- - absolute_quant_params_per_sample_df: pd.DataFrame - A Dataframe of at least SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, - SAMPLE_IN_ALIQUOT_MASS_G_KEY, and ELUTE_VOL_UL_KEY for - each sample. + sample_info_df: pd.DataFrame + A Dataframe containing sample info for all samples in the prep, + including SAMPLE_ID_KEY and the denominator column for the metric + prep_info_df: pd.DataFrame + A Dataframe containing prep info for all samples in the prep, + including SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, + ELUTE_VOL_UL_KEY, and SYNDNA_POOL_MASS_NG_KEY. + linregress_by_sample_id_fp: str + String containing the filepath to the yaml file holding the + dictionary keyed by sample id, containing for each sample a dictionary + representation of the sample's LinregressResult. 
+ ogu_counts_per_sample_biom: biom.Table + Biom table holding the read counts aligned to each OGU in each sample. + ogu_percent_coverage_df : pd.DataFrame + A Dataframe of OGU_ID_KEY and OGU_PERCENT_COVERAGE_KEY for each OGU. + ogu_lengths_fp : str + String containing the filepath to a tab-separated, two-column, + no-header file in which the first column is the OGU id and the + second is the OGU length in basepairs + min_coverage : float + Minimum allowable % coverage of an OGU in a sample needed to include + that OGU/sample in the output. + min_rsquared: float + Minimum allowable R^2 value for the linear regression model for a + sample; any sample with an R^2 value less than this will be excluded + from the output. + syndna_mass_fraction_of_sample: float + Fraction of the mass of the sample that is added as syndna (usually + 0.05, which is to say 5%). Returns ------- - gdna_mass_to_sample_mass_by_sample_series : pd.Series - A Series with index of sample id and values of the ratio of gDNA mass - units extracted from each mass unit of input sample (only) mass. + output_by_out_type : dict of str or biom.Table + Dictionary of outputs keyed by their type Currently, the following keys + are defined: + CELL_COUNT_RESULT_KEY: biom.Table holding the calculated number of + cells per g, uL, or cm2 of sample material for each OGU in each sample. + CELL_COUNT_LOG_KEY: log of messages from the cell count calc process. 
""" - # get the total grams of gDNA that are in the elute after extraction; - # this is sample-specific - working_df = calc_gs_genomic_element_in_aliquot( - absolute_quant_params_per_sample_df, GDNA_CONCENTRATION_NG_UL_KEY, - GDNA_FROM_ALIQUOT_MASS_G_KEY) + required_prep_cols = list( + {SYNDNA_POOL_MASS_NG_KEY} | set(REQUIRED_DNA_PREP_INFO_KEYS)) + validate_required_columns_exist( + prep_info_df, required_prep_cols, + "prep info is missing required column(s)") - # determine how many mass units of gDNA are produced from the extraction of - # each mass unit of sample material; this is sample-specific: - # grams of gDNA after extraction divided grams of sample material. - gdna_mass_to_sample_mass_ratio = \ - working_df[GDNA_FROM_ALIQUOT_MASS_G_KEY] / \ - working_df[SAMPLE_IN_ALIQUOT_MASS_G_KEY] + # Check if any samples in the prep are missing from the sample info; + # Not bothering to report samples that are in sample info but not the prep + # --maybe those just weren't included in this prep. + _ = validate_metadata_vs_prep_id_consistency( + sample_info_df, prep_info_df) - gdna_mass_to_sample_mass_ratio.name = GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY - gdna_mass_to_sample_mass_ratio.index = working_df[SAMPLE_ID_KEY] - gdna_mass_to_sample_mass_ratio.index.name = SAMPLE_ID_KEY + # cast in case the input comes in as string or something + syndna_mass_fraction_of_sample = float(syndna_mass_fraction_of_sample) - return gdna_mass_to_sample_mass_ratio + # make sure the SYNDNA_POOL_MASS_NG_KEY column of prep_info_df is a float, + # then calculate the mass of gDNA sequenced for each sample. We have the + # mass of syndna pool that was added to each sample, and we know that the + # syndna pool mass is calculated to be a certain percentage of the mass of + # the sample (added into the library prep in addition to the sample mass). + # Therefore, if the syndna fraction is 0.05 or 5%, the mass of the sample + # gDNA put into sequencing is 1/0.05 = 20x the mass of syndna pool added. 
+ prep_info_df = cast_cols(prep_info_df, [SYNDNA_POOL_MASS_NG_KEY], True) + prep_info_df[SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY] = \ + prep_info_df[SYNDNA_POOL_MASS_NG_KEY] * \ + (1 / syndna_mass_fraction_of_sample) + # merge the sample info and prep info dataframes + absolute_quant_params_per_sample_df = \ + sample_info_df.merge(prep_info_df, on=SAMPLE_ID_KEY, how='left') -def _series_to_df(a_series, index_col_name, val_col_name): - """Converts a pd.Series to two-column pd.DataFrame (from index and value) + # read in the linregress_by_sample_id yaml file + with open(linregress_by_sample_id_fp) as f: + linregress_by_sample_id = yaml.load(f, Loader=yaml.FullLoader) - Parameters - ---------- - a_series : pd.Series - A Series to be converted to a dataframe. - index_col_name : str - Name of the index-derived in the resulting dataframe. - val_col_name : str - Name of the values-derived column in the resulting dataframe. + # read in the ogu_lengths file + ogu_lengths_df = pd.read_csv(ogu_lengths_fp, sep='\t', header=None, + names=[OGU_ID_KEY, OGU_LEN_IN_BP_KEY]) - Returns - ------- - a_df : pd.DataFrame - A Dataframe with two columns, one from the index and one containing the - values from the input series. 
- """ + # calculate # cells per x (g, uL, or cm2) of sample material of each OGU + # in each sample + output_biom, log_msgs_list = calc_ogu_cell_counts_biom( + absolute_quant_params_per_sample_df, linregress_by_sample_id, + ogu_counts_per_sample_biom, ogu_percent_coverage_df, ogu_lengths_df, + min_coverage, min_rsquared, output_cell_counts_metric) - a_df = a_series.to_frame().reset_index() - a_df.columns = [index_col_name, val_col_name] + out_txt_by_out_type = { + CELL_COUNT_RESULT_KEY: output_biom, + CELL_COUNT_LOG_KEY: '\n'.join(log_msgs_list)} - return a_df + return out_txt_by_out_type def _calc_long_format_ogu_cell_counts_df( @@ -141,8 +204,10 @@ def _calc_long_format_ogu_cell_counts_df( ogu_lengths_df : pd.DataFrame A Dataframe of OGU_ID_KEY and OGU_LEN_IN_BP_KEY for each OGU. per_sample_calc_info_df : pd.DataFrame - A Dataframe of SAMPLE_ID_KEY, GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY, and - SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY for each sample. + A Dataframe of SAMPLE_ID_KEY, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY, + GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY, GDNA_MASS_TO_SAMPLE_VOL_RATIO_KEY, + and GDNA_MASS_TO_SAMPLE_SURFACE_AREA_RATIO_KEY for each sample. Any or + all of the ratio columns may be NaN for a given sample. min_coverage : float Minimum allowable coverage of an OGU needed to include that OGU in the output. @@ -295,8 +360,10 @@ def _calc_ogu_cell_counts_df_for_sample( (if no model could be trained for that SAMPLE_ID_KEY) or a dictionary representation of the sample's LinregressResult. per_sample_info_df : pd.DataFrame - A Dataframe of SAMPLE_ID_KEY, GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY, and - SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY for each sample. + A Dataframe of SAMPLE_ID_KEY, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY, + GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY, GDNA_MASS_TO_SAMPLE_VOL_RATIO_KEY, + and GDNA_MASS_TO_SAMPLE_SURFACE_AREA_RATIO_KEY for each sample. Any or + all of the ratio columns may be NaN for a given sample. 
working_df : pd.DataFrame Long-format dataframe with columns for OGU_ID_KEY, SAMPLE_ID_KEY, OGU_READ_COUNT_KEY, and OGU_LEN_IN_BP_KEY @@ -316,9 +383,10 @@ def _calc_ogu_cell_counts_df_for_sample( None if the specified sample id has no linear model or has a model with R^2 < min_rsquared. Otherwise, a long-format dataframe with columns for at least OGU_ID_KEY, SAMPLE_ID_KEY, OGU_READ_COUNT_KEY, - OGU_LEN_IN_BP_KEY, OGU_CELLS_PER_G_OF_GDNA_KEY, + OGU_LEN_IN_BP_KEY, OGU_GENOMES_PER_G_OF_GDNA_KEY, OGU_CELLS_PER_G_OF_GDNA_KEY, - and OGU_CELLS_PER_G_OF_SAMPLE_KEY + OGU_CELLS_PER_G_OF_SAMPLE_KEY, OGU_CELLS_PER_UL_OF_SAMPLE_KEY, and + OGU_CELLS_PER_CM2_OF_SAMPLE_KEY log_messages_list : list[str] List of strings containing log messages generated by this function. """ @@ -360,7 +428,8 @@ def _calc_ogu_cell_counts_df_for_sample( per_sample_info_df[SAMPLE_ID_KEY] == sample_id, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY].values[0] - # calc the # of genomes of each OGU per gram of gDNA in this sample + # calc the # of genomes of each OGU per gram of gDNA in the sample + # (normalized by the grams of gDNA in the *sequenced* sample) ogu_genomes_per_gdnas = _calc_ogu_genomes_per_g_of_gdna_series_for_sample( sample_df, sequenced_sample_gdna_mass_ng, is_test=is_test) sample_df[OGU_GENOMES_PER_G_OF_GDNA_KEY] = \ @@ -375,14 +444,24 @@ def _calc_ogu_cell_counts_df_for_sample( sample_df[OGU_CELLS_PER_G_OF_GDNA_KEY] = \ sample_df[OGU_GENOMES_PER_G_OF_GDNA_KEY] - # calc the # of cells of each OGU per gram of actual sample material - # (e.g., per gram of stool if these are fecal samples) for this sample - mass_ratio_for_sample = per_sample_info_df.loc[ - per_sample_info_df[SAMPLE_ID_KEY] == sample_id, - GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY].values[0] - sample_df[OGU_CELLS_PER_G_OF_SAMPLE_KEY] = \ - sample_df[OGU_CELLS_PER_G_OF_GDNA_KEY] * \ - mass_ratio_for_sample + # for each potential output metric: + # 1. 
multiply the ratio by the genomes of each OGU per g of gDNA in the + # sample to get the genomes of each OGU per metric for the sample + # 2. set the # of cells of the microbe represented by each OGU per + # metric for this sample to be the same as the number of genomes + # Don't worry about whether the ratio columns are in the df at this + # point; earlier on, we put them in as NaN if not there so all these + # calculations can be done in one place and just be NaN if not relevant. + for cell_metric_key, metric_info in SAMPLE_LEVEL_METRICS_DICT.items(): + ratio_key = metric_info[RATIO_NAME_KEY] + ratio_for_sample = per_sample_info_df.loc[ + per_sample_info_df[SAMPLE_ID_KEY] == sample_id, + ratio_key].values[0] + + # calculate # of cells (i.e., genomes) of each OGU per metric + # for this sample + sample_df[cell_metric_key] = ( + sample_df[OGU_GENOMES_PER_G_OF_GDNA_KEY] * ratio_for_sample) return sample_df, log_messages_list @@ -560,8 +639,9 @@ def calc_ogu_cell_counts_biom( ---------- absolute_quant_params_per_sample_df: pd.DataFrame A Dataframe of at least SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, - SAMPLE_IN_ALIQUOT_MASS_G_KEY, ELUTE_VOL_UL_KEY, and - SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY for each sample. + ELUTE_VOL_UL_KEY, and SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY for each + sample. It should also have at least one of SAMPLE_VOLUME_UL_KEY, + SAMPLE_SURFACE_AREA_CM2_KEY, and/or SAMPLE_IN_ALIQUOT_MASS_G_KEY. linregress_by_sample_id : dict[str, dict[str: float]] Dictionary keyed by sample id, containing for each sample either None (if no model could be trained for that SAMPLE_ID_KEY) or a dictionary @@ -580,8 +660,9 @@ def calc_ogu_cell_counts_biom( sample; any sample with an R^2 value less than this will be excluded from the output. output_cell_counts_metric : str - Name of the desired output cell count metric; options are - OGU_CELLS_PER_G_OF_GDNA_KEY and OGU_CELLS_PER_G_OF_SAMPLE_KEY. 
+ Name of the desired output cell count metric; options are: + OGU_CELLS_PER_G_OF_GDNA_KEY, OGU_CELLS_PER_G_OF_SAMPLE_KEY, + OGU_CELLS_PER_UL_OF_SAMPLE_KEY, or OGU_CELLS_PER_CM2_OF_SAMPLE_KEY. Returns ------- @@ -594,10 +675,13 @@ def calc_ogu_cell_counts_biom( """ # check if the inputs all have the required columns + extra_required = set() + if output_cell_counts_metric in SAMPLE_LEVEL_METRICS_DICT: + extra_required = {SAMPLE_LEVEL_METRICS_DICT[ + output_cell_counts_metric][DENOMINATOR_KEY]} required_cols_list = list( - {SAMPLE_ID_KEY, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY, - SAMPLE_IN_ALIQUOT_MASS_G_KEY} | - set(REQUIRED_DNA_PREP_INFO_KEYS)) + {SAMPLE_ID_KEY, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY} | + set(REQUIRED_DNA_PREP_INFO_KEYS) | extra_required) validate_required_columns_exist( absolute_quant_params_per_sample_df, required_cols_list, "sample info is missing required column(s)") @@ -641,20 +725,21 @@ def calc_ogu_cell_counts_biom( filter_data_by_sample_info( filter_params_df, ogu_counts_per_sample_biom, cols_to_filter_on) - # calculate the ratio of extracted gDNA mass to sample mass put into - # extraction for each sample - gdna_mass_to_sample_mass_by_sample_series = \ - _calc_gdna_mass_to_sample_mass_by_sample_df(working_params_df) - per_sample_calc_info_df = _series_to_df( - gdna_mass_to_sample_mass_by_sample_series, SAMPLE_ID_KEY, - GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY) - - # merge the SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY - # column of working_params_df into gdna_mass_to_sample_mass_df - # by SAMPLE_ID_KEY - per_sample_calc_info_df = per_sample_calc_info_df.merge( - working_params_df[[SAMPLE_ID_KEY, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY]], - on=SAMPLE_ID_KEY, how='left') + # calc GDNA_FROM_ALIQUOT_MASS_G_KEY, the total grams of gDNA that are in + # the elute after extraction; this is sample-specific + per_sample_calc_info_df = calc_gs_genomic_element_in_aliquot( + working_params_df, GDNA_CONCENTRATION_NG_UL_KEY, + GDNA_FROM_ALIQUOT_MASS_G_KEY) + + for curr_metric_key, 
curr_metric_dict in SAMPLE_LEVEL_METRICS_DICT.items(): + metric_ratio_col_name = curr_metric_dict[RATIO_NAME_KEY] + metric_ratio_denom_key = curr_metric_dict[DENOMINATOR_KEY] + if metric_ratio_denom_key not in per_sample_calc_info_df.columns: + per_sample_calc_info_df[metric_ratio_denom_key] = np.nan + + per_sample_calc_info_df[metric_ratio_col_name] = \ + per_sample_calc_info_df[GDNA_FROM_ALIQUOT_MASS_G_KEY] / \ + per_sample_calc_info_df[metric_ratio_denom_key] # convert input biom table to a dataframe with sparse columns, which # should act basically the same as a dense dataframe but use less memory @@ -701,102 +786,63 @@ def calc_ogu_cell_counts_per_g_of_sample_for_qiita( DEFAULT_SYNDNA_MASS_FRACTION_OF_SAMPLE) \ -> Dict[str, Union[str, biom.Table]]: - """Gets # of cells of each OGU/g of sample for samples from Qiita. - - Parameters - ---------- - sample_info_df: pd.DataFrame - A Dataframe containing sample info for all samples in the prep, - including SAMPLE_ID_KEY and SAMPLE_IN_ALIQUOT_MASS_G_KEY - prep_info_df: pd.DataFrame - A Dataframe containing prep info for all samples in the prep, - including SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, and - ELUTE_VOL_UL_KEY, SYNDNA_POOL_MASS_NG_KEY. - linregress_by_sample_id_fp: str - String containing the filepath to the yaml file holding the - dictionary keyed by sample id, containing for each sample a dictionary - representation of the sample's LinregressResult. - ogu_counts_per_sample_biom: biom.Table - Biom table holding the read counts aligned to each OGU in each sample. - ogu_percent_coverage_df : pd.DataFrame - A Dataframe of OGU_ID_KEY and OGU_PERCENT_COVERAGE_KEY for each OGU. - ogu_lengths_fp : str - String containing the filepath to a tab-separated, two-column, - no-header file in which the first column is the OGU id and the - second is the OGU length in basepairs - min_coverage : float - Minimum allowable % coverage of an OGU in a sample needed to include - that OGU/sample in the output. 
- min_rsquared: float - Minimum allowable R^2 value for the linear regression model for a - sample; any sample with an R^2 value less than this will be excluded - from the output. - syndna_mass_fraction_of_sample: float - Fraction of the mass of the sample that is added as syndna (usually - 0.05, which is to say 5%). - - Returns - ------- - output_by_out_type : dict of str or biom.Table - Dictionary of outputs keyed by their type Currently, the following keys - are defined: - CELL_COUNT_RESULT_KEY: biom.Table holding the calculated number of - cells per gram of sample material for each OGU in each sample. - CELL_COUNT_LOG_KEY: log of messages from the cell count calc process. - """ - # check if the inputs all have the required columns validate_required_columns_exist( - sample_info_df, REQUIRED_SAMPLE_INFO_KEYS, + sample_info_df, [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY], "sample info is missing required column(s)") - required_prep_cols = list( - {SYNDNA_POOL_MASS_NG_KEY} | set(REQUIRED_DNA_PREP_INFO_KEYS)) - validate_required_columns_exist( - prep_info_df, required_prep_cols, - "prep info is missing required column(s)") - - # Check if any samples in the prep are missing from the sample info; - # Not bothering to report samples that are in sample info but not the prep - # --maybe those just weren't included in this prep. - _ = validate_metadata_vs_prep_id_consistency( - sample_info_df, prep_info_df) + return _calc_ogu_cell_counts_per_x_of_sample_for_qiita( + sample_info_df, prep_info_df, linregress_by_sample_id_fp, + ogu_counts_per_sample_biom, ogu_percent_coverage_df, ogu_lengths_fp, + OGU_CELLS_PER_G_OF_SAMPLE_KEY, min_coverage, min_rsquared, + syndna_mass_fraction_of_sample) - # cast in case the input comes in as string or something - syndna_mass_fraction_of_sample = float(syndna_mass_fraction_of_sample) - # make sure the SYNDNA_POOL_MASS_NG_KEY column of prep_info_df is a float, - # then calculate the mass of gDNA sequenced for each sample. 
We have the - # mass of syndna pool that was added to each sample, and we know that the - # syndna pool mass is calculated to be a certain percentage of the mass of - # the sample (added into the library prep in addition to the sample mass). - # Therefore, if the syndna fraction is 0.05 or 5%, the mass of the sample - # gDNA put into sequencing is 1/0.05 = 20x the mass of syndna pool added. - prep_info_df = cast_cols(prep_info_df, [SYNDNA_POOL_MASS_NG_KEY], True) - prep_info_df[SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY] = \ - prep_info_df[SYNDNA_POOL_MASS_NG_KEY] * \ - (1 / syndna_mass_fraction_of_sample) +def calc_ogu_cell_counts_per_cm2_of_sample_for_qiita( + sample_info_df: pd.DataFrame, + prep_info_df: pd.DataFrame, + linregress_by_sample_id_fp: str, + ogu_counts_per_sample_biom: biom.Table, + ogu_percent_coverage_df: pd.DataFrame, + ogu_lengths_fp: str, + min_coverage: float = DEFAULT_MIN_PERCENT_COVERAGE, + min_rsquared: float = DEFAULT_MIN_RSQUARED, + syndna_mass_fraction_of_sample: float = + DEFAULT_SYNDNA_MASS_FRACTION_OF_SAMPLE) \ + -> Dict[str, Union[str, biom.Table]]: - # merge the sample info and prep info dataframes - absolute_quant_params_per_sample_df = \ - sample_info_df.merge(prep_info_df, on=SAMPLE_ID_KEY, how='left') + # check if the inputs all have the required columns + validate_required_columns_exist( + sample_info_df, [SAMPLE_ID_KEY, SAMPLE_SURFACE_AREA_CM2_KEY], + "sample info is missing required column(s)") - # read in the linregress_by_sample_id yaml file - with open(linregress_by_sample_id_fp) as f: - linregress_by_sample_id = yaml.load(f, Loader=yaml.FullLoader) + return _calc_ogu_cell_counts_per_x_of_sample_for_qiita( + sample_info_df, prep_info_df, linregress_by_sample_id_fp, + ogu_counts_per_sample_biom, ogu_percent_coverage_df, ogu_lengths_fp, + OGU_CELLS_PER_CM2_OF_SAMPLE_KEY, min_coverage, min_rsquared, + syndna_mass_fraction_of_sample) - # read in the ogu_lengths file - ogu_lengths_df = pd.read_csv(ogu_lengths_fp, sep='\t', header=None, 
- names=[OGU_ID_KEY, OGU_LEN_IN_BP_KEY]) - # calculate # cells per gram of sample material of each OGU in each sample - output_biom, log_msgs_list = calc_ogu_cell_counts_biom( - absolute_quant_params_per_sample_df, linregress_by_sample_id, - ogu_counts_per_sample_biom, ogu_percent_coverage_df, ogu_lengths_df, - min_coverage, min_rsquared, OGU_CELLS_PER_G_OF_SAMPLE_KEY) +def calc_ogu_cell_counts_per_ul_of_sample_for_qiita( + sample_info_df: pd.DataFrame, + prep_info_df: pd.DataFrame, + linregress_by_sample_id_fp: str, + ogu_counts_per_sample_biom: biom.Table, + ogu_percent_coverage_df: pd.DataFrame, + ogu_lengths_fp: str, + min_coverage: float = DEFAULT_MIN_PERCENT_COVERAGE, + min_rsquared: float = DEFAULT_MIN_RSQUARED, + syndna_mass_fraction_of_sample: float = + DEFAULT_SYNDNA_MASS_FRACTION_OF_SAMPLE) \ + -> Dict[str, Union[str, biom.Table]]: - out_txt_by_out_type = { - CELL_COUNT_RESULT_KEY: output_biom, - CELL_COUNT_LOG_KEY: '\n'.join(log_msgs_list)} + # check if the inputs all have the required columns + validate_required_columns_exist( + sample_info_df, [SAMPLE_ID_KEY, SAMPLE_VOLUME_UL_KEY], + "sample info is missing required column(s)") - return out_txt_by_out_type + return _calc_ogu_cell_counts_per_x_of_sample_for_qiita( + sample_info_df, prep_info_df, linregress_by_sample_id_fp, + ogu_counts_per_sample_biom, ogu_percent_coverage_df, ogu_lengths_fp, + OGU_CELLS_PER_UL_OF_SAMPLE_KEY, min_coverage, min_rsquared, + syndna_mass_fraction_of_sample) diff --git a/pysyndna/src/quant_orfs.py b/pysyndna/src/quant_orfs.py index 69969ed..3360457 100644 --- a/pysyndna/src/quant_orfs.py +++ b/pysyndna/src/quant_orfs.py @@ -6,9 +6,9 @@ validate_required_columns_exist, \ validate_metadata_vs_reads_id_consistency, cast_cols, \ validate_metadata_vs_prep_id_consistency, SAMPLE_ID_KEY, \ - SAMPLE_IN_ALIQUOT_MASS_G_KEY, ELUTE_VOL_UL_KEY, RNA_BASE_G_PER_MOLE, \ - REQUIRED_SAMPLE_INFO_KEYS + SAMPLE_IN_ALIQUOT_MASS_G_KEY, ELUTE_VOL_UL_KEY, RNA_BASE_G_PER_MOLE 
+REQUIRED_SAMPLE_INFO_KEYS = [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY] OGU_ORF_ID_KEY = "ogu_orf_id" OGU_ORF_START_KEY = "ogu_orf_start" OGU_ORF_END_KEY = "ogu_orf_end" diff --git a/pysyndna/src/util.py b/pysyndna/src/util.py index e11ebdb..c8d5db0 100644 --- a/pysyndna/src/util.py +++ b/pysyndna/src/util.py @@ -13,7 +13,6 @@ SAMPLE_ID_KEY = 'sample_name' SAMPLE_IN_ALIQUOT_MASS_G_KEY = 'calc_mass_sample_aliquot_input_g' ELUTE_VOL_UL_KEY = 'vol_extracted_elution_ul' -REQUIRED_SAMPLE_INFO_KEYS = [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY] def _validate_sample_id_consistency( diff --git a/pysyndna/tests/test_calc_cell_counts.py b/pysyndna/tests/test_calc_cell_counts.py index 87900e3..d2800d9 100644 --- a/pysyndna/tests/test_calc_cell_counts.py +++ b/pysyndna/tests/test_calc_cell_counts.py @@ -7,7 +7,9 @@ import os from unittest import TestCase from pysyndna import calc_ogu_cell_counts_biom, \ - calc_ogu_cell_counts_per_g_of_sample_for_qiita + calc_ogu_cell_counts_per_g_of_sample_for_qiita, \ + calc_ogu_cell_counts_per_cm2_of_sample_for_qiita, \ + calc_ogu_cell_counts_per_ul_of_sample_for_qiita from pysyndna.src.fit_syndna_models import SAMPLE_TOTAL_READS_KEY from pysyndna.src.calc_cell_counts import SAMPLE_ID_KEY, ELUTE_VOL_UL_KEY, \ OGU_ID_KEY, OGU_READ_COUNT_KEY, \ @@ -15,14 +17,16 @@ SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY, OGU_GENOMES_PER_G_OF_GDNA_KEY, \ OGU_CELLS_PER_G_OF_GDNA_KEY, SYNDNA_POOL_MASS_NG_KEY, \ GDNA_CONCENTRATION_NG_UL_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, \ - GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY, \ - OGU_CELLS_PER_G_OF_SAMPLE_KEY, \ + GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY, OGU_CELLS_PER_G_OF_SAMPLE_KEY, \ + SAMPLE_VOLUME_UL_KEY, \ + GDNA_MASS_TO_SAMPLE_VOL_RATIO_KEY, OGU_CELLS_PER_UL_OF_SAMPLE_KEY, \ + SAMPLE_SURFACE_AREA_CM2_KEY, GDNA_MASS_TO_SAMPLE_SURFACE_AREA_RATIO_KEY, \ + OGU_CELLS_PER_CM2_OF_SAMPLE_KEY, \ OGU_PERCENT_COVERAGE_KEY, \ CELL_COUNT_RESULT_KEY, CELL_COUNT_LOG_KEY, \ _calc_long_format_ogu_cell_counts_df, \ _prepare_cell_counts_calc_df, \ 
_calc_ogu_cell_counts_df_for_sample, \ - _calc_gdna_mass_to_sample_mass_by_sample_df, \ _calc_ogu_gdna_mass_ng_series_for_sample, \ _calc_ogu_genomes_per_g_of_gdna_series_for_sample, \ _calc_ogu_genomes_series_for_sample @@ -74,6 +78,8 @@ class TestCalcCellCountsData: SAMPLE_ID_KEY: ["example1", "example2"], SAMPLE_IN_ALIQUOT_MASS_G_KEY: [0.027829017, 0.029491697], SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY: [5, 4.76], + SAMPLE_VOLUME_UL_KEY: [200, 150], + SAMPLE_SURFACE_AREA_CM2_KEY: [4, 6], GDNA_CONCENTRATION_NG_UL_KEY: [2, 1.4], ELUTE_VOL_UL_KEY: [100, 100], SYNDNA_POOL_MASS_NG_KEY: [0.25, 0.238], @@ -89,7 +95,11 @@ class TestCalcCellCountsData: SAMPLE_TOTAL_READS_KEY: [3216923, 611913], SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY: [5, 4.76], GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY: [7.1867431342E-06, - 4.7470988923E-06] + 4.7470988923E-06], + GDNA_MASS_TO_SAMPLE_VOL_RATIO_KEY: [1.0000000000E-09, + 9.3333333333E-10], + GDNA_MASS_TO_SAMPLE_SURFACE_AREA_RATIO_KEY: [5.0000000000E-08, + 2.3333333333E-08] } # These values are taken from cell directly under the @@ -288,7 +298,33 @@ class TestCalcCellCountsData: 5266336.58, 4309425.29, 4241775.81, - 4239032.64] + 4239032.64], + OGU_CELLS_PER_UL_OF_SAMPLE_KEY: [5438.576852, + 2859.984606, + 2260.023104, + 1327.927321, + 1308.077383, + 1195.417056, + 1158.313591, + 1122.845832, + 958.792227, + 732.784863, + 599.635358, + 590.222265, + 589.840566], + OGU_CELLS_PER_CM2_OF_SAMPLE_KEY: [271928.842592, + 142999.230316, + 113001.155186, + 66396.366056, + 65403.869162, + 59770.852802, + 57915.679546, + 56142.291599, + 47939.611356, + 36639.243160, + 29981.767892, + 29511.113225, + 29492.028296] }) # NB: the reason there is no "example2_ogu_full_" is that @@ -320,7 +356,15 @@ class TestCalcCellCountsData: OGU_CELLS_PER_G_OF_SAMPLE_KEY: _remove_filtered_entries( example1_ogu_full_outputs_full_avogadro_dict[ - OGU_CELLS_PER_G_OF_SAMPLE_KEY]) + OGU_CELLS_PER_G_OF_SAMPLE_KEY]), + OGU_CELLS_PER_UL_OF_SAMPLE_KEY: + _remove_filtered_entries( + 
example1_ogu_full_outputs_full_avogadro_dict[ + OGU_CELLS_PER_UL_OF_SAMPLE_KEY]), + OGU_CELLS_PER_CM2_OF_SAMPLE_KEY: + _remove_filtered_entries( + example1_ogu_full_outputs_full_avogadro_dict[ + OGU_CELLS_PER_CM2_OF_SAMPLE_KEY]) } # This dict contains the results of calculations done on *filtered* @@ -384,7 +428,29 @@ class TestCalcCellCountsData: 3.922839e+06, 5.290705e+07, 2.451376e+06, - 2.411375e+06] + 2.411375e+06], + OGU_CELLS_PER_UL_OF_SAMPLE_KEY: [4385.512032, + 2306.831068, + 1822.647020, + 1069.647383, + 1052.911702, + 961.822421, + 927.531325, + 771.274450, + 10402.124237, + 481.968209, + 474.103500], + OGU_CELLS_PER_CM2_OF_SAMPLE_KEY: [109637.800794, + 57670.776705, + 45566.175504, + 26741.184585, + 26322.792552, + 24045.560534, + 23188.283114, + 19281.861261, + 260053.105920, + 12049.205223, + 11852.587503] } # This dict contains the results of calculations done on *filtered* @@ -431,36 +497,69 @@ class TestCalcCellCountsData: ] } - # NB: The test values for example1 here are *slightly* different than - # those in self.example1_ogu_filtered_outputs_full_avogadro_dict because - # the gdna-to-sample mass ratio calculated internally during this - # soup-to-nuts function has more digits past the decimal than does the - # example1 entry in the manually-populated self.mass_and_totals_dict. - # Since we are multiplying/dividing by large numbers like e.g., 10^9 - # (to change ng to g), this ends up making a slight difference in the - # end product: for example, for L.gasseri, - # 3908565*5.46* cells instead of 3908565*4.85* cells, - # 8324502.*38* instead of 8324502.*25* for L. valderiana, - # and 2055397*5.06* instead of 2055397*4.73* for R. albus. - # Remember, with reordering, the 4th sub-array is for L. gasseri, - # the 5th is for L. valderiana, and the 9th is for R. albus. - # The commented out values are for N. subflava and H. influenzae, - # which are removed due to low coverage. 
- example1_example4_cells_per_g_sample = [ + example1_example4_results_dict = { + OGU_ID_KEY: reordered_results_dict[OGU_ID_KEY], + # NB: The test values for example1 here are *slightly* different than + # those in self.example1_ogu_filtered_outputs_full_avogadro_dict bc + # the gdna-to-sample mass ratio calculated internally during this + # soup-to-nuts function has more digits past the decimal than does the + # example1 entry in the manually-populated self.mass_and_totals_dict. + # Since we are multiplying/dividing by large numbers like e.g., 10^9 + # (to change ng to g), this ends up making a slight difference in the + # end product: for example, for L.gasseri, + # 3908565*5.46* cells instead of 3908565*4.85* cells, + # 8324502.*38* instead of 8324502.*25* for L. valderiana, + # and 2055397*5.06* instead of 2055397*4.73* for R. albus. + # Remember, with reordering, the 4th sub-array is for L. gasseri, + # the 5th is for L. valderiana, and the 9th is for R. albus. + # The commented out values are for N. subflava and H. influenzae, + # which are removed due to low coverage. 
+ OGU_CELLS_PER_G_OF_SAMPLE_KEY: [ [16242205.78, 6489214.14], [5266336.67, 37034933.76], - #[4241775.87, 0], + # [4241775.87, 0], [39085655.46, 15613844.24], [8324502.38, 3302312.14], [6890593.56, 2745987.02], - #[8069604.7, 0], + # [8069604.7, 0], [9400816.3, 3748706.92], [20553975.06, 8213066.28], [8591155.45, 3424399.56], [4309425.36, 1715963.04], [9543472.71, 3808291.37], [4239032.7, 1687962.12] - ] + ], + OGU_CELLS_PER_CM2_OF_SAMPLE_KEY: [ + [113001.155186, 31896.322853], + [36639.24316, 182037.174147], + # + [271928.842592, 76746.460557], + [57915.679546, 16231.79818], + [47939.611356, 13497.302883], + # + [65403.869162, 18425.954787], + [142999.230316, 40369.543694], + [59770.852802, 16831.892374], + [29981.767892, 8434.443656], + [66396.366056, 18718.82921], + [29492.028296, 8296.811252], + ], + OGU_CELLS_PER_UL_OF_SAMPLE_KEY: [ + [2260.023104, 1275.852914], + [732.784863, 7281.486966], + # + [5438.576852, 3069.858422], + [1158.313591, 649.271927], + [958.792227, 539.892115], + # + [1308.077383, 737.038191], + [2859.984606, 1614.781748], + [1195.417056, 673.275695], + [599.635358, 337.377746], + [1327.927321, 748.753168], + [589.840566, 331.87245] + ]} + @classmethod def combine_inputs(cls): @@ -539,7 +638,7 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita(self): [GDNA_CONCENTRATION_NG_UL_KEY, ELUTE_VOL_UL_KEY, SYNDNA_POOL_MASS_NG_KEY]} - # NOTE: this column is not needed anymore. It is left in this tests + # NOTE: this column is not needed anymore. It is left in this test # just to show that the code can deal with extra columns (it just # ignores them). 
prep_info_dict[SAMPLE_TOTAL_READS_KEY] = \ @@ -550,9 +649,6 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita(self): # example4 has the same counts as example2 counts_vals = TestCalcCellCountsData.make_combined_counts_np_array() - ogu_cell_counts_per_g_sample = np.array( - TestCalcCellCountsData.example1_example4_cells_per_g_sample) - sample_info_df = pd.DataFrame(sample_info_dict) prep_info_df = pd.DataFrame(prep_info_dict) counts_biom = biom.table.Table( @@ -566,8 +662,9 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita(self): # Note that, in the output, the ogu_ids are apparently sorted # alphabetically--different than the input order expected_out_biom = biom.table.Table( - ogu_cell_counts_per_g_sample, - TestCalcCellCountsData.reordered_results_dict[OGU_ID_KEY], + np.array(TestCalcCellCountsData.example1_example4_results_dict[ + OGU_CELLS_PER_G_OF_SAMPLE_KEY]), + TestCalcCellCountsData.example1_example4_results_dict[OGU_ID_KEY], sample_ids) min_coverage = 10 @@ -615,9 +712,6 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_casts(self): # example4 has the same counts as example2 counts_vals = TestCalcCellCountsData.make_combined_counts_np_array() - ogu_cell_counts_per_g_sample = np.array( - TestCalcCellCountsData.example1_example4_cells_per_g_sample) - sample_info_df = pd.DataFrame(sample_info_dict) prep_info_df = pd.DataFrame(prep_info_dict) counts_biom = biom.table.Table( @@ -631,8 +725,9 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_casts(self): # Note that, in the output, the ogu_ids are apparently sorted # alphabetically--different than the input order expected_out_biom = biom.table.Table( - ogu_cell_counts_per_g_sample, - TestCalcCellCountsData.reordered_results_dict[OGU_ID_KEY], + np.array(TestCalcCellCountsData.example1_example4_results_dict[ + OGU_CELLS_PER_G_OF_SAMPLE_KEY]), + TestCalcCellCountsData.example1_example4_results_dict[OGU_ID_KEY], sample_ids) # pass in strings for the numeric values to ensure 
they get cast @@ -687,7 +782,8 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_negs(self): # negative aliquot mass ogu_cell_counts_per_g_sample = np.array( [[x[0]] for x in - TestCalcCellCountsData.example1_example4_cells_per_g_sample] + TestCalcCellCountsData.example1_example4_results_dict[ + OGU_CELLS_PER_G_OF_SAMPLE_KEY]] ) sample_info_df = pd.DataFrame(sample_info_dict) @@ -827,9 +923,138 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_ids_err(self): sample_info_df, prep_info_df, models_fp, counts_biom, coverages_df, lengths_fp, min_coverage, min_rsquared) + def test_calc_ogu_cell_counts_per_cm2_of_sample_for_qiita(self): + # example4 is the same as example2 except that the elute volume is 70; + # see "absolute_quant_example.xlsx" for details. + example4_elute_vol = 70 + sample_ids = ["example1", "example4"] + sample_info_dict = { + k: TestCalcCellCountsData.sample_and_prep_input_dict[k].copy() for + k in [SAMPLE_SURFACE_AREA_CM2_KEY]} + sample_info_dict[SAMPLE_ID_KEY] = sample_ids + + prep_info_dict = { + k: TestCalcCellCountsData.sample_and_prep_input_dict[k].copy() for k in + [GDNA_CONCENTRATION_NG_UL_KEY, + ELUTE_VOL_UL_KEY, SYNDNA_POOL_MASS_NG_KEY]} + + # NOTE: this column is not needed anymore. It is left in this test + # just to show that the code can deal with extra columns (it just + # ignores them). 
+ prep_info_dict[SAMPLE_TOTAL_READS_KEY] = \ + TestCalcCellCountsData.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] + prep_info_dict[SAMPLE_ID_KEY] = sample_ids + prep_info_dict[ELUTE_VOL_UL_KEY][1] = example4_elute_vol + + # example4 has the same counts as example2 + counts_vals = TestCalcCellCountsData.make_combined_counts_np_array() + + sample_info_df = pd.DataFrame(sample_info_dict) + prep_info_df = pd.DataFrame(prep_info_dict) + counts_biom = biom.table.Table( + counts_vals, + TestCalcCellCountsData.ogu_lengths_dict[OGU_ID_KEY], + sample_ids) + coverages_df = pd.DataFrame( + TestCalcCellCountsData.ogu_percent_coverage_dict) + models_fp = os.path.join(self.test_data_dir, "models.yml") + lengths_fp = os.path.join(self.test_data_dir, "ogu_lengths.tsv") + # Note that, in the output, the ogu_ids are apparently sorted + # alphabetically--different than the input order + expected_out_biom = biom.table.Table( + np.array(TestCalcCellCountsData.example1_example4_results_dict[ + OGU_CELLS_PER_CM2_OF_SAMPLE_KEY]), + TestCalcCellCountsData.reordered_results_dict[OGU_ID_KEY], + sample_ids) + + min_coverage = 10 + min_rsquared = 0.8 + + output_dict = calc_ogu_cell_counts_per_cm2_of_sample_for_qiita( + sample_info_df, prep_info_df, models_fp, counts_biom, + coverages_df, lengths_fp, min_coverage, min_rsquared) + + self.assertSetEqual( + set(output_dict.keys()), + {CELL_COUNT_RESULT_KEY, CELL_COUNT_LOG_KEY}) + + a_tester = Testers() + a_tester.assert_biom_tables_equal( + expected_out_biom, output_dict[CELL_COUNT_RESULT_KEY], + decimal_precision=1) + self.assertEqual( + "The following items have % coverage lower than the minimum of " + "10.0: ['Neisseria subflava', 'Haemophilus influenzae']", + output_dict[CELL_COUNT_LOG_KEY]) + + def test_calc_ogu_cell_counts_per_ul_of_sample_for_qiita(self): + # example4 is the same as example2 except that the elute volume is 70; + # see "absolute_quant_example.xlsx" for details. 
+ example4_elute_vol = 70 + sample_ids = ["example1", "example4"] + sample_info_dict = { + k: TestCalcCellCountsData.sample_and_prep_input_dict[k].copy() for + k in [SAMPLE_VOLUME_UL_KEY]} + sample_info_dict[SAMPLE_ID_KEY] = sample_ids + + prep_info_dict = { + k: TestCalcCellCountsData.sample_and_prep_input_dict[k].copy() for k in + [GDNA_CONCENTRATION_NG_UL_KEY, + ELUTE_VOL_UL_KEY, SYNDNA_POOL_MASS_NG_KEY]} + + # NOTE: this column is not needed anymore. It is left in this test + # just to show that the code can deal with extra columns (it just + # ignores them). + prep_info_dict[SAMPLE_TOTAL_READS_KEY] = \ + TestCalcCellCountsData.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] + prep_info_dict[SAMPLE_ID_KEY] = sample_ids + prep_info_dict[ELUTE_VOL_UL_KEY][1] = example4_elute_vol + + # example4 has the same counts as example2 + counts_vals = TestCalcCellCountsData.make_combined_counts_np_array() + + sample_info_df = pd.DataFrame(sample_info_dict) + prep_info_df = pd.DataFrame(prep_info_dict) + counts_biom = biom.table.Table( + counts_vals, + TestCalcCellCountsData.ogu_lengths_dict[OGU_ID_KEY], + sample_ids) + coverages_df = pd.DataFrame( + TestCalcCellCountsData.ogu_percent_coverage_dict) + models_fp = os.path.join(self.test_data_dir, "models.yml") + lengths_fp = os.path.join(self.test_data_dir, "ogu_lengths.tsv") + # Note that, in the output, the ogu_ids are apparently sorted + # alphabetically--different than the input order + expected_out_biom = biom.table.Table( + np.array(TestCalcCellCountsData.example1_example4_results_dict[ + OGU_CELLS_PER_UL_OF_SAMPLE_KEY]), + TestCalcCellCountsData.reordered_results_dict[OGU_ID_KEY], + sample_ids) + + min_coverage = 10 + min_rsquared = 0.8 + + output_dict = calc_ogu_cell_counts_per_ul_of_sample_for_qiita( + sample_info_df, prep_info_df, models_fp, counts_biom, + coverages_df, lengths_fp, min_coverage, min_rsquared) + + self.assertSetEqual( + set(output_dict.keys()), + {CELL_COUNT_RESULT_KEY, CELL_COUNT_LOG_KEY}) + + a_tester 
= Testers() + a_tester.assert_biom_tables_equal( + expected_out_biom, output_dict[CELL_COUNT_RESULT_KEY], + decimal_precision=1) + self.assertEqual( + "The following items have % coverage lower than the minimum of " + "10.0: ['Neisseria subflava', 'Haemophilus influenzae']", + output_dict[CELL_COUNT_LOG_KEY]) + def test_calc_ogu_cell_counts_biom(self): params_dict = {k: TestCalcCellCountsData.sample_and_prep_input_dict[k] for k in [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, + SAMPLE_VOLUME_UL_KEY, SAMPLE_SURFACE_AREA_CM2_KEY, GDNA_CONCENTRATION_NG_UL_KEY, ELUTE_VOL_UL_KEY, SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY]} @@ -1026,8 +1251,9 @@ def test__calc_long_format_ogu_cell_counts_df(self): # NB: this test is NOT using the truncated version of Avogadro's # that # was used in the notebook, so the results are slightly different # (but more realistic) + ogu_ids = TestCalcCellCountsData.combine_filtered_out(OGU_ID_KEY) expected_dict = { - OGU_ID_KEY: TestCalcCellCountsData.combine_filtered_out(OGU_ID_KEY), + OGU_ID_KEY: ogu_ids, SAMPLE_ID_KEY: TestCalcCellCountsData.generate_sample_names_list(), OGU_READ_COUNT_KEY: SparseArray( TestCalcCellCountsData.combine_filtered_out(OGU_READ_COUNT_KEY)), @@ -1042,11 +1268,16 @@ def test__calc_long_format_ogu_cell_counts_df(self): OGU_CELLS_PER_G_OF_GDNA_KEY: SparseArray( TestCalcCellCountsData.combine_filtered_out(OGU_CELLS_PER_G_OF_GDNA_KEY)), OGU_CELLS_PER_G_OF_SAMPLE_KEY: SparseArray( - TestCalcCellCountsData.combine_filtered_out(OGU_CELLS_PER_G_OF_SAMPLE_KEY)) + TestCalcCellCountsData.combine_filtered_out(OGU_CELLS_PER_G_OF_SAMPLE_KEY)), + OGU_CELLS_PER_UL_OF_SAMPLE_KEY: SparseArray( + TestCalcCellCountsData.combine_filtered_out(OGU_CELLS_PER_UL_OF_SAMPLE_KEY)), + OGU_CELLS_PER_CM2_OF_SAMPLE_KEY: SparseArray( + TestCalcCellCountsData.combine_filtered_out(OGU_CELLS_PER_CM2_OF_SAMPLE_KEY)) } counts_df = pd.DataFrame(counts_dict) counts_df.set_index(OGU_ID_KEY, inplace=True) + per_sample_calc_info_df = 
pd.DataFrame(TestCalcCellCountsData.mass_and_totals_dict) coverages_df = pd.DataFrame( TestCalcCellCountsData.ogu_percent_coverage_dict) @@ -1200,12 +1431,46 @@ def test__calc_ogu_cell_counts_df_for_sample(self): input_df = pd.DataFrame(input_dict) per_sample_info_df = pd.DataFrame(TestCalcCellCountsData.mass_and_totals_dict) + expected_additions_dict = { + k: TestCalcCellCountsData.example1_ogu_full_outputs_full_avogadro_dict[k] for k in + (OGU_ID_KEY, OGU_GDNA_MASS_NG_KEY, + OGU_GENOMES_PER_G_OF_GDNA_KEY, + OGU_CELLS_PER_G_OF_GDNA_KEY, + OGU_CELLS_PER_G_OF_SAMPLE_KEY, + OGU_CELLS_PER_UL_OF_SAMPLE_KEY, + OGU_CELLS_PER_CM2_OF_SAMPLE_KEY)} + + sample_a_df = input_df[input_df[SAMPLE_ID_KEY] == "example1"] + expected_add_df = pd.DataFrame(expected_additions_dict) + expected_out_df = sample_a_df.merge(expected_add_df, on=OGU_ID_KEY) + + sample_id = "example1" + min_rsquared = 0.8 + + output_df, output_msgs = _calc_ogu_cell_counts_df_for_sample( + sample_id, TestCalcCellCountsData.linregresses_dict, + per_sample_info_df, input_df, min_rsquared, is_test=False) + + pd.testing.assert_frame_equal(expected_out_df, output_df) + self.assertListEqual([], output_msgs) + + def test__calc_ogu_cell_counts_df_for_sample_w_nan_ratios(self): + input_dict = TestCalcCellCountsData.combine_inputs() + input_df = pd.DataFrame(input_dict) + per_sample_info_df = pd.DataFrame(TestCalcCellCountsData.mass_and_totals_dict) + # set two of the ratios to nans--meaning we didn't get the inputs + # necessary to calculate them. 
Outputs for these will exist but be nan + per_sample_info_df[GDNA_MASS_TO_SAMPLE_VOL_RATIO_KEY] = np.nan + per_sample_info_df[GDNA_MASS_TO_SAMPLE_SURFACE_AREA_RATIO_KEY] = np.nan + expected_additions_dict = { k: TestCalcCellCountsData.example1_ogu_full_outputs_full_avogadro_dict[k] for k in (OGU_ID_KEY, OGU_GDNA_MASS_NG_KEY, OGU_GENOMES_PER_G_OF_GDNA_KEY, OGU_CELLS_PER_G_OF_GDNA_KEY, OGU_CELLS_PER_G_OF_SAMPLE_KEY)} + expected_additions_dict[OGU_CELLS_PER_UL_OF_SAMPLE_KEY] = np.nan + expected_additions_dict[OGU_CELLS_PER_CM2_OF_SAMPLE_KEY] = np.nan sample_a_df = input_df[input_df[SAMPLE_ID_KEY] == "example1"] expected_add_df = pd.DataFrame(expected_additions_dict) @@ -1224,7 +1489,7 @@ def test__calc_ogu_cell_counts_df_for_sample(self): def test__calc_ogu_cell_counts_df_for_sample_w_log_msgs_no_model(self): input_dict = TestCalcCellCountsData.combine_inputs() input_df = pd.DataFrame(input_dict) - mass_ratio_df = pd.DataFrame(TestCalcCellCountsData.sample_and_prep_input_dict) + mass_ratio_df = pd.DataFrame(TestCalcCellCountsData.mass_and_totals_dict) # No entry for example 1, which should trigger a log message. 
linregresses_dict = copy.deepcopy(TestCalcCellCountsData.linregresses_dict) @@ -1244,7 +1509,7 @@ def test__calc_ogu_cell_counts_df_for_sample_w_log_msgs_no_model(self): def test__calc_ogu_cell_counts_df_for_sample_w_log_msgs_low_rsquared(self): input_dict = TestCalcCellCountsData.combine_inputs() input_df = pd.DataFrame(input_dict) - mass_ratio_df = pd.DataFrame(TestCalcCellCountsData.sample_and_prep_input_dict) + mass_ratio_df = pd.DataFrame(TestCalcCellCountsData.mass_and_totals_dict) sample_id = "example1" high_min_rsquared = 0.99 @@ -1262,24 +1527,6 @@ def test__calc_ogu_cell_counts_df_for_sample_w_log_msgs_low_rsquared(self): 'minimum allowed value of 0.99.'], output_msgs) - def test__calc_gdna_mass_to_sample_mass_by_sample_df(self): - inputs_dict = {k: TestCalcCellCountsData.sample_and_prep_input_dict[k] for k in - (SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, - SAMPLE_IN_ALIQUOT_MASS_G_KEY, ELUTE_VOL_UL_KEY)} - - expected_dict = {k: TestCalcCellCountsData.mass_and_totals_dict[k] for k in - (SAMPLE_ID_KEY, GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY)} - - inputs_df = pd.DataFrame(inputs_dict) - - expected_series = pd.Series( - expected_dict[GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY], - index=expected_dict[SAMPLE_ID_KEY], - name=GDNA_MASS_TO_SAMPLE_MASS_RATIO_KEY) - expected_series.index.name = SAMPLE_ID_KEY - - output_series = _calc_gdna_mass_to_sample_mass_by_sample_df(inputs_df) - pd.testing.assert_series_equal(expected_series, output_series) def test__calc_ogu_genomes_per_g_of_gdna_series_for_sample(self): # this is the default value for our experimental system