From 1a95140d90d62e215ebff8cbada2a803f1d93687 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 13 Dec 2024 19:50:50 -0600 Subject: [PATCH 01/12] refactor: extracted output writer factory --- mokapot/confidence.py | 129 +++++++++++++++++++++++++++--------------- 1 file changed, 82 insertions(+), 47 deletions(-) diff --git a/mokapot/confidence.py b/mokapot/confidence.py index a59119f..979ea93 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -332,8 +332,13 @@ def assign_confidence( # just take the first one for info (and make sure the other are the same) curr_dataset = datasets[0] file_ext = curr_dataset.get_default_extension() - for dataset in datasets[1:]: - assert dataset.columns == curr_dataset.columns + for di, dataset in enumerate(datasets[1:]): + if dataset.columns != curr_dataset.columns: + raise ValueError( + "Datasets must have the same columns. " + f"Dataset 1 has columns {curr_dataset.columns} " + f"and dataset {di + 2} has columns {dataset.columns}" + ) # Level data for psm level level = "psms" @@ -361,49 +366,10 @@ def assign_confidence( level_data_path[level] = dest_dir / f"{file_root}{level}{file_ext}" level_hash_columns[level] = curr_dataset.protein_column - output_column_names = [ - "PSMId", - "peptide", - *extra_output_columns, - "score", - "q-value", - "posterior_error_prob", - "proteinIds", - ] - - output_column_names_proteins = [ - "mokapot protein group", - "best peptide", - "stripped sequence", - "score", - "q-value", - "posterior_error_prob", - ] - - @typechecked - def create_output_writer(path: Path, level: str, initialize: bool): - # Note: This method does not create a writer, it writes the data. 
- if level == "proteins": - output_columns = output_column_names_proteins - else: - output_columns = output_column_names - - # Create the writers - if is_sqlite: - writer = ConfidenceSqliteWriter( - sqlite_path, - columns=output_columns, - column_types=[], - level=level, - qvalue_column="q-value", - pep_column="posterior_error_prob", - ) - else: - writer = TabularDataWriter.from_suffix(path, output_columns, []) - - if initialize: - writer.initialize() - return writer + output_writers_factory = OutputWriterFactory( + extra_output_columns, + is_sqlite=is_sqlite, + ) if prefixes is None: prefixes = [None] * len(datasets) @@ -454,7 +420,7 @@ def create_output_writer(path: Path, level: str, initialize: bool): ) output_writers[level].append( - create_output_writer( + output_writers_factory.create_writer( outfile_targets, level, not append_to_output_file ) ) @@ -464,7 +430,7 @@ def create_output_writer(path: Path, level: str, initialize: bool): dest_dir / f"{file_prefix}decoys.{level}{file_ext}" ) output_writers[level].append( - create_output_writer( + output_writers_factory.create_output_writer( outfile_decoys, level, not append_to_output_file ) ) @@ -577,6 +543,75 @@ def hash_data_row(data_row): return out +class OutputWriterFactory: + """Factory class for creating output writers based on configuration.""" + + def __init__(self, extra_output_columns: list[str], is_sqlite: bool): + self.is_sqlite = is_sqlite + self.extra_output_columns = extra_output_columns + self.output_column_names = [ + "PSMId", + "peptide", + *extra_output_columns, + "score", + "q-value", + "posterior_error_prob", + "proteinIds", + ] + + self.output_column_names_proteins = [ + "mokapot protein group", + "best peptide", + "stripped sequence", + "score", + "q-value", + "posterior_error_prob", + ] + + def __str__(self) -> str: + out = "OutputWriterFactory(" + out += f"extra_output_columns={self.extra_output_columns}, " + out += f"is_sqlite={self.is_sqlite})" + return out + + def __repr__(self) -> str: 
+ out = "OutputWriterFactory:" + out += f"\textra_output_columns={self.extra_output_columns}, " + out += f"\tis_sqlite={self.is_sqlite}" + out += f"\toutput_column_names={self.output_column_names}, " + out += "\toutput_column_names_proteins=" + out += f"{self.output_column_names_proteins}" + return out + + def create_writer( + self, + path: Path, + level: str, + initialize: bool, + ) -> TabularDataWriter | ConfidenceSqliteWriter: + """Create appropriate writer based on output type and level.""" + output_columns = ( + self.output_column_names_proteins + if level == "proteins" + else self.output_column_names + ) + + if self.is_sqlite: + return ConfidenceSqliteWriter( + path, + columns=output_columns, + column_types=[], + level=level, + qvalue_column="q-value", + pep_column="posterior_error_prob", + ) + + writer = TabularDataWriter.from_suffix(path, output_columns, []) + if initialize: + writer.initialize() + return writer + + @contextmanager @typechecked def create_sorted_file_reader( From 0466b71cd647e6f5a71cc812701c142353626975 Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Fri, 13 Dec 2024 21:27:08 -0600 Subject: [PATCH 02/12] refactor: extracted level manager in confidence --- mokapot/confidence.py | 267 ++++++++++++++++++++++++++++++------------ 1 file changed, 191 insertions(+), 76 deletions(-) diff --git a/mokapot/confidence.py b/mokapot/confidence.py index 979ea93..d24bc02 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -18,6 +18,7 @@ import logging from contextlib import contextmanager from pathlib import Path +from pprint import pformat from typing import Sequence, Iterator import numpy as np @@ -340,34 +341,16 @@ def assign_confidence( f"and dataset {di + 2} has columns {dataset.columns}" ) - # Level data for psm level - level = "psms" - levels = [level] - level_data_path = {level: dest_dir / f"{file_root}{level}{file_ext}"} - level_hash_columns = {level: curr_dataset.spectrum_columns} - - # Level data for higher rollup levels - extra_output_columns = [] - if do_rollup: - level_columns = curr_dataset.level_columns - - for level_column in level_columns: - level = level_column.lower() + "s" # e.g. 
Peptide to peptides - levels.append(level) - level_data_path[level] = dest_dir / f"{file_root}{level}{file_ext}" - level_hash_columns[level] = [level_column] - if level not in ["psms", "peptides", "proteins"]: - extra_output_columns.append(level_column) - - levels_or_proteins = levels - if proteins: - level = "proteins" - levels_or_proteins = [*levels, level] - level_data_path[level] = dest_dir / f"{file_root}{level}{file_ext}" - level_hash_columns[level] = curr_dataset.protein_column + level_manager = LevelManager.from_dataset( + dataset=curr_dataset, + do_rollup=do_rollup, + use_proteins=True if proteins else False, + dest_dir=dest_dir, + file_root=file_root, + ) output_writers_factory = OutputWriterFactory( - extra_output_columns, + level_manager.extra_output_columns, is_sqlite=is_sqlite, ) @@ -381,38 +364,18 @@ def assign_confidence( # column defs module, and further, have standardized columns # directly from the pin reader (applying the renaming itself) - level_column_names = [ - "PSMId", - dataset.target_column, - "peptide", - *extra_output_columns, - "proteinIds", - "score", - ] - level_input_column_names = [ - dataset.specId_column, - dataset.target_column, - dataset.peptide_column, - *extra_output_columns, - dataset.protein_column, - "score", - ] - - level_input_output_column_mapping = { - in_col: out_col - for in_col, out_col in strictzip( - level_input_column_names, - level_column_names, - ) - if in_col is not None - } + # Q: why is this done here? it seems constant, since all + # datasets have the same columns. + level_input_output_column_mapping = ( + level_manager.build_output_col_mapping(dataset) + ) file_prefix = file_root if prefix: file_prefix = f"{file_prefix}{prefix}." 
output_writers = {} - for level in levels_or_proteins: + for level in level_manager.levels_or_proteins: output_writers[level] = [] outfile_targets = ( @@ -441,7 +404,9 @@ def assign_confidence( score_reader, dest_dir, file_prefix, - level_hash_columns["psms"] if deduplication else None, + level_manager.level_hash_columns["psms"] + if deduplication + else None, max_workers, level_input_output_column_mapping, ) as sorted_file_reader: @@ -464,29 +429,32 @@ def assign_confidence( level_writers = { level: TabularDataWriter.from_suffix( - level_data_path[level], + level_manager.level_data_paths[level], columns=list(level_input_output_column_mapping.values()), column_types=level_column_types, buffer_size=CONFIDENCE_CHUNK_SIZE, buffer_type=BufferType.Dicts, ) - for level in levels + for level in level_manager.levels } - for writer in level_writers.values(): + for level, writer in level_writers.items(): + LOGGER.info(f"Initializing writer for level {level}: {writer}") writer.initialize() def hash_data_row(data_row): return str([ data_row[level_input_output_column_mapping.get(col, col)] - for col in level_hash_columns[level] + for col in level_manager.level_hash_columns[level] ]) - seen_level_entities = {level: set() for level in levels} + seen_level_entities = { + level: set() for level in level_manager.levels + } score_stats = OnlineStatistics() psm_count = 0 for data_row in sorted_file_iterator: psm_count += 1 - for level in levels: + for level in level_manager.levels: if level != "psms" or deduplication: psm_hash = hash_data_row(data_row) if psm_hash in seen_level_entities[level]: @@ -503,9 +471,13 @@ def hash_data_row(data_row): level_writers[level].append_data(out_row) score_stats.update_single(data_row["score"]) - for level in levels: + for level in level_manager.levels: count = len(seen_level_entities[level]) - level_writers[level].finalize() + curr_writer = level_writers[level] + LOGGER.info( + f"Finalizing writer for level {level}: {curr_writer}" + ) + 
curr_writer.finalize() if level == "psms": if deduplication: LOGGER.info( @@ -522,8 +494,8 @@ def hash_data_row(data_row): con = Confidence( dataset=dataset, - levels=levels_or_proteins, - level_paths=level_data_path, + levels=level_manager.levels_or_proteins, + level_paths=level_manager.level_data_paths, out_writers=output_writers, eval_fdr=eval_fdr, write_decoys=write_decoys, @@ -543,6 +515,160 @@ def hash_data_row(data_row): return out +# class MultiLevelWriter: + + +class LevelManager: + """Manages level-specific data and operations.""" + + def __init__( + self, + *, + level_columns: list[str], + default_extension: str, + spectrum_columns: list[str], + do_rollup: bool, + use_proteins: bool, + dest_dir: Path, + file_root: str, + ): + self.level_columns = level_columns + self.default_extension = default_extension + self.spectrum_columns = spectrum_columns + self.use_proteins = use_proteins + self.dest_dir = dest_dir + self.file_root = file_root + self.do_rollup = do_rollup + + self._initialize_levels() + self._setup_level_paths() + self._setup_hash_columns() + self._setup_protein_levels() + self._setup_extra_output_columns() + + # self.level_data_paths = {} + # self.level_hash_columns = {} + + @staticmethod + def from_dataset( + *, + dataset: PsmDataset, + do_rollup: bool, + use_proteins: bool, + dest_dir: Path, + file_root: str, + ): + level_columns = dataset.level_columns + default_extension = dataset.get_default_extension() + spectrum_columns = dataset.spectrum_columns + return LevelManager( + level_columns=level_columns, + default_extension=default_extension, + spectrum_columns=spectrum_columns, + do_rollup=do_rollup, + use_proteins=use_proteins, + dest_dir=dest_dir, + file_root=file_root, + ) + + def __repr__(self) -> str: + formatted_dict = pformat(self.__dict__) + return f"{self.__class__!s}({formatted_dict})" + + def _initialize_levels(self) -> list[str]: + """Initialize processing levels based on configuration.""" + levels = ["psms"] + if self.do_rollup: 
+ level_columns = self.level_columns + levels.extend(col.lower() + "s" for col in level_columns) + + self.levels = levels + + def _setup_level_paths( + self, + ) -> None: + """Setup paths for each processing level.""" + self.level_data_paths = {} + file_ext = self.default_extension + for level in self.levels: + self.level_data_paths[level] = ( + self.dest_dir / f"{self.file_root}{level}{file_ext}" + ) + + def _setup_hash_columns(self) -> None: + """Setup hash columns for each level.""" + self.level_hash_columns = {"psms": self.spectrum_columns} + for level in self.levels[1:]: + if level != "proteins": + self.level_hash_columns[level] = [ + level.rstrip("s").capitalize() + ] + + def _setup_protein_levels(self) -> None: + levels_or_proteins = self.levels + if self.use_proteins: + levels_or_proteins = [*levels_or_proteins, "proteins"] + self.level_data_paths["proteins"] = ( + self.dest_dir / f"{self.file_root}proteins{self.file_ext}" + ) + self.level_hash_columns["proteins"] = self.protein_column + + self.levels_or_proteins = levels_or_proteins + + def _setup_extra_output_columns(self) -> None: + extra_output_columns = [] + if self.do_rollup: + level_columns = self.level_columns + + for level_column in level_columns: + level = level_column.lower() + "s" # e.g. Peptide to peptides + if level not in self.levels: + self.levels.append(level) + + self.level_data_paths[level] = ( + self.dest_dir + / f"{self.file_root}{level}{self.default_extension}" + ) + + # I am not sure why but over-writing some of the levels here is + # important, I think it has to do with with how the rollup + # levels are handled (columns are renamed). 
+ self.level_hash_columns[level] = [level_column] + if level not in ["psms", "peptides", "proteins"]: + extra_output_columns.append(level_column) + + self.extra_output_columns = extra_output_columns + + def build_output_col_mapping(self, dataset: PsmDataset) -> dict: + level_column_names = [ + "PSMId", + dataset.target_column, + "peptide", + *self.extra_output_columns, + "proteinIds", + "score", + ] + level_input_column_names = [ + dataset.specId_column, + dataset.target_column, + dataset.peptide_column, + *self.extra_output_columns, + dataset.protein_column, + "score", + ] + + level_input_output_column_mapping = { + in_col: out_col + for in_col, out_col in strictzip( + level_input_column_names, + level_column_names, + ) + if in_col is not None + } + + return level_input_output_column_mapping + + class OutputWriterFactory: """Factory class for creating output writers based on configuration.""" @@ -568,20 +694,9 @@ def __init__(self, extra_output_columns: list[str], is_sqlite: bool): "posterior_error_prob", ] - def __str__(self) -> str: - out = "OutputWriterFactory(" - out += f"extra_output_columns={self.extra_output_columns}, " - out += f"is_sqlite={self.is_sqlite})" - return out - def __repr__(self) -> str: - out = "OutputWriterFactory:" - out += f"\textra_output_columns={self.extra_output_columns}, " - out += f"\tis_sqlite={self.is_sqlite}" - out += f"\toutput_column_names={self.output_column_names}, " - out += "\toutput_column_names_proteins=" - out += f"{self.output_column_names_proteins}" - return out + formatted_dict = pformat(self.__dict__) + return f"{self.__class__!s}({formatted_dict})" def create_writer( self, From 692a9f2b844ce7d562069c9b5e61864444c101c1 Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Fri, 13 Dec 2024 22:11:38 -0600 Subject: [PATCH 03/12] refactor: extracted level writer group --- mokapot/confidence.py | 194 ++++++++++++++++++++++++++---------------- 1 file changed, 122 insertions(+), 72 deletions(-) diff --git a/mokapot/confidence.py b/mokapot/confidence.py index d24bc02..a93b2c3 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -15,6 +15,8 @@ confidence estimates, rather than initializing the classes below directly. """ +from __future__ import annotations + import logging from contextlib import contextmanager from pathlib import Path @@ -266,7 +268,7 @@ def assign_confidence( datasets: list[PsmDataset], scores_list: list[np.ndarray[float]], max_workers: int = 1, - eval_fdr=0.01, + eval_fdr: float = 0.01, dest_dir: Path | None = None, file_root: str = "", prefixes: list[str | None] | None = None, @@ -422,75 +424,15 @@ def assign_confidence( row_type=BufferType.Dicts ) type_map = sorted_file_reader.get_schema(as_dict=True) - level_column_types = [ - type_map[name] - for name in level_input_output_column_mapping.values() - ] - - level_writers = { - level: TabularDataWriter.from_suffix( - level_manager.level_data_paths[level], - columns=list(level_input_output_column_mapping.values()), - column_types=level_column_types, - buffer_size=CONFIDENCE_CHUNK_SIZE, - buffer_type=BufferType.Dicts, - ) - for level in level_manager.levels - } - for level, writer in level_writers.items(): - LOGGER.info(f"Initializing writer for level {level}: {writer}") - writer.initialize() - - def hash_data_row(data_row): - return str([ - data_row[level_input_output_column_mapping.get(col, col)] - for col in level_manager.level_hash_columns[level] - ]) - - seen_level_entities = { - level: set() for level in level_manager.levels - } - score_stats = OnlineStatistics() - psm_count = 0 - for data_row in sorted_file_iterator: - psm_count += 1 - for level in level_manager.levels: - if level != "psms" or deduplication: - psm_hash = 
hash_data_row(data_row) - if psm_hash in seen_level_entities[level]: - if level == "psms": - # If we are on the psms level, we can skip - # checking the other levels - break - continue - seen_level_entities[level].add(psm_hash) - out_row = { - col: data_row[col] - for col in level_input_output_column_mapping.values() - } - level_writers[level].append_data(out_row) - score_stats.update_single(data_row["score"]) - - for level in level_manager.levels: - count = len(seen_level_entities[level]) - curr_writer = level_writers[level] - LOGGER.info( - f"Finalizing writer for level {level}: {curr_writer}" - ) - curr_writer.finalize() - if level == "psms": - if deduplication: - LOGGER.info( - f"\t- Found {count} PSMs from unique spectra." - ) - else: - LOGGER.info(f"\t- Found {psm_count} PSMs.") - LOGGER.info( - f"\t- The average score was {score_stats.mean:.3f} " - f"with standard deviation {score_stats.sd:.3f}." - ) - else: - LOGGER.info(f"\t- Found {count} unique {level}.") + level_writers = LevelWriterCollection.from_manager( + level_manager=level_manager, + type_map=type_map, + level_input_output_column_mapping=level_input_output_column_mapping, + deduplication=deduplication, + ) + + level_writers.sink_iterator(sorted_file_iterator) + level_writers.finalize() con = Confidence( dataset=dataset, @@ -506,7 +448,7 @@ def hash_data_row(data_row): peps_algorithm=peps_algorithm, qvalue_algorithm=qvalue_algorithm, stream_confidence=stream_confidence, - score_stats=score_stats, + score_stats=level_writers.score_stats, ) out.append(con) if not prefix: @@ -515,7 +457,112 @@ def hash_data_row(data_row): return out -# class MultiLevelWriter: +class LevelWriterCollection: + def __init__( + self, + levels: list[str], + level_data_paths: dict[str, Path], + schema_dict: dict[str, np.dtype], + level_input_output_column_mapping: dict[str, str], + level_hash_columns: dict[str, list[str]], + deduplication: bool, + ): + # Do I need to pass the levels? cant I use the keys of the data paths? 
+ self.levels = levels + self.deduplication = deduplication + self.level_input_output_column_mapping = ( + level_input_output_column_mapping + ) + self.level_hash_columns = level_hash_columns + level_column_types = [ + schema_dict[name] + for name in level_input_output_column_mapping.values() + ] + + self.level_writers = { + level: TabularDataWriter.from_suffix( + level_data_paths[level], + columns=list(level_input_output_column_mapping.values()), + column_types=level_column_types, + buffer_size=CONFIDENCE_CHUNK_SIZE, + buffer_type=BufferType.Dicts, + ) + for level in levels + } + self.seen_level_entities = {level: set() for level in levels} + for level, writer in self.level_writers.items(): + LOGGER.info(f"Initializing writer for level {level}: {writer}") + writer.initialize() + + self.score_stats = OnlineStatistics() + self.psm_count = 0 + + def __repr__(self): + pretty_dict = pformat(self.__dict__) + return f"{self.__class__!s}({pretty_dict})" + + @staticmethod + def from_manager( + level_manager: LevelManager, + type_map: dict[str, np.dtype], + level_input_output_column_mapping: dict[str, str], + deduplication: bool, + ) -> LevelWriterCollection: + level_data_paths = level_manager.level_data_paths + levels = level_manager.levels + hash_columns = level_manager.level_hash_columns + return LevelWriterCollection( + levels=levels, + level_data_paths=level_data_paths, + schema_dict=type_map, + level_input_output_column_mapping=level_input_output_column_mapping, + level_hash_columns=hash_columns, + deduplication=deduplication, + ) + + def hash_data_row(self, data_row, level): + return str([ + data_row[self.level_input_output_column_mapping.get(col, col)] + for col in self.level_hash_columns[level] + ]) + + def sink_iterator(self, sorted_file_iterator): + for data_row in sorted_file_iterator: + self.psm_count += 1 + for level in self.levels: + if level != "psms" or self.deduplication: + psm_hash = self.hash_data_row(data_row, level=level) + if psm_hash in 
self.seen_level_entities[level]: + if level == "psms": + # If we are on the psms level, we can skip + # checking the other levels + break + continue + self.seen_level_entities[level].add(psm_hash) + out_row = { + col: data_row[col] + for col in self.level_input_output_column_mapping.values() + } + self.level_writers[level].append_data(out_row) + self.score_stats.update_single(data_row["score"]) + + def finalize(self): + for level in self.levels: + count = len(self.seen_level_entities[level]) + curr_writer = self.level_writers[level] + LOGGER.info(f"Finalizing writer for level {level}: {curr_writer}") + curr_writer.finalize() + if level == "psms": + if self.deduplication: + LOGGER.info(f"\t- Found {count} PSMs from unique spectra.") + else: + LOGGER.info(f"\t- Found {self.psm_count} PSMs.") + LOGGER.info( + f"\t- The average score was {self.score_stats.mean:.3f} " + f"with standard deviation {self.score_stats.sd:.3f}." + ) + else: + LOGGER.info(f"\t- Found {count} unique {level}.") class LevelManager: @@ -640,6 +687,9 @@ def _setup_extra_output_columns(self) -> None: self.extra_output_columns = extra_output_columns def build_output_col_mapping(self, dataset: PsmDataset) -> dict: + # Q: what would be the requirement here? + # Could we use the spectrum columns? since multiple + # columns can be used to identify a spectrum. level_column_names = [ "PSMId", dataset.target_column, From d4d3e8c32a895350994b2387a8a9271d6dd01046 Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Sun, 15 Dec 2024 12:08:28 -0600 Subject: [PATCH 04/12] refactor: extracted more writer builder work to class --- mokapot/brew.py | 5 +- mokapot/confidence.py | 133 +++++++++++++++++------ mokapot/dataset.py | 16 ++- mokapot/model.py | 9 +- mokapot/parsers/pin.py | 3 +- tests/conftest.py | 5 - tests/unit_tests/test_confidence.py | 4 - tests/unit_tests/test_writer_flashlfq.py | 12 +- 8 files changed, 129 insertions(+), 58 deletions(-) diff --git a/mokapot/brew.py b/mokapot/brew.py index 245072c..2585acf 100644 --- a/mokapot/brew.py +++ b/mokapot/brew.py @@ -103,12 +103,15 @@ def brew( model = PercolatorModel() try: + # Q: what is this doing? Why does the randon number + # generater get set only if the model has an estimator? + # Shouldn't it assign it to all the models if they are passed? model.estimator model.rng = rng except AttributeError: pass - # Check that all of the datasets have the same features: + # Check that all of the datasets have the same features: feat_set = set(datasets[0].feature_columns) if not all([ set(dataset.feature_columns) == feat_set for dataset in datasets diff --git a/mokapot/confidence.py b/mokapot/confidence.py index a93b2c3..b6bb44e 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -352,13 +352,19 @@ def assign_confidence( ) output_writers_factory = OutputWriterFactory( - level_manager.extra_output_columns, + extra_output_columns=level_manager.extra_output_columns, is_sqlite=is_sqlite, + append_to_output_file=append_to_output_file, + write_decoys=write_decoys, ) if prefixes is None: prefixes = [None] * len(datasets) + level_input_output_column_mapping = level_manager.build_output_col_mapping( + curr_dataset + ) + out = [] for dataset, score, prefix in strictzip(datasets, scores_list, prefixes): @@ -366,40 +372,10 @@ def assign_confidence( # column defs module, and further, have standardized columns # directly from the pin reader (applying the renaming itself) - # Q: why is this done here? 
it seems constant, since all - # datasets have the same columns. - level_input_output_column_mapping = ( - level_manager.build_output_col_mapping(dataset) + output_writers, file_prefix = output_writers_factory.build_writers( + level_manager ) - file_prefix = file_root - if prefix: - file_prefix = f"{file_prefix}{prefix}." - - output_writers = {} - for level in level_manager.levels_or_proteins: - output_writers[level] = [] - - outfile_targets = ( - dest_dir / f"{file_prefix}targets.{level}{file_ext}" - ) - - output_writers[level].append( - output_writers_factory.create_writer( - outfile_targets, level, not append_to_output_file - ) - ) - - if write_decoys and not is_sqlite: - outfile_decoys = ( - dest_dir / f"{file_prefix}decoys.{level}{file_ext}" - ) - output_writers[level].append( - output_writers_factory.create_output_writer( - outfile_decoys, level, not append_to_output_file - ) - ) - score_reader = TabularDataReader.from_array(score, "score") with create_sorted_file_reader( dataset, @@ -566,7 +542,36 @@ def finalize(self): class LevelManager: - """Manages level-specific data and operations.""" + """Manages level-specific data and operations. + + This class is meant to be used internally by the `Confidence` class. + + Parameters + ---------- + level_columns : list of str + The columns that can be used to aggregate PSMs. + For example, peptides, modified peptides, precursors. + would generate "rollups" of the PSMs at the PSM (default) + and in addition to that, the peptide and modified peptide + columns would generate "peptide groups" of PSMs (each). + default_extension : str + The default extension to use for the output files. + The extension will be used to determine the output format + when initializing the `LevelWriterCollection` which internally + uses the `TabularDataWriter.from_suffix` method. + spectrum_columns : list of str + The columns that uniquely identify a mass spectrum. + do_rollup : bool + Do we apply rollup on peptides, modified peptides etc.? 
+ use_proteins : bool + Whether to roll up protein-level confidence estimates. + dest_dir : Path + The directory in which to save the files. + file_root : str + The prefix added to all output file names. + The final file names will be: + `dest_dir / file_root+level+default_extension` + """ def __init__( self, @@ -644,6 +649,8 @@ def _setup_level_paths( def _setup_hash_columns(self) -> None: """Setup hash columns for each level.""" + + # Q: wouldnt the right thing here be to use spectrum_cols + peptide? self.level_hash_columns = {"psms": self.spectrum_columns} for level in self.levels[1:]: if level != "proteins": @@ -722,13 +729,24 @@ def build_output_col_mapping(self, dataset: PsmDataset) -> dict: class OutputWriterFactory: """Factory class for creating output writers based on configuration.""" - def __init__(self, extra_output_columns: list[str], is_sqlite: bool): + def __init__( + self, + *, + extra_output_columns: list[str], + is_sqlite: bool, + append_to_output_file: bool, + write_decoys: bool, + ): + # Q: are we deleting the sqlite ops? self.is_sqlite = is_sqlite + self.write_decoys = write_decoys self.extra_output_columns = extra_output_columns + self.append_to_output_file = append_to_output_file self.output_column_names = [ "PSMId", "peptide", *extra_output_columns, + # Q: should we prefix these with "mokapot"? "score", "q-value", "posterior_error_prob", @@ -750,6 +768,7 @@ def __repr__(self) -> str: def create_writer( self, + *, path: Path, level: str, initialize: bool, @@ -776,6 +795,50 @@ def create_writer( writer.initialize() return writer + def build_writers( + self, level_manager: LevelManager, prefix: str | None = None + ): + output_writers = {} + + file_prefix = level_manager.file_root + if prefix: + file_prefix = f"{file_prefix}{prefix}." 
+ + for level in level_manager.levels_or_proteins: + output_writers[level] = [] + + name = [ + str(file_prefix), + "targets.", + str(level), + str(level_manager.default_extension), + ] + + outfile_targets = level_manager.dest_dir / "".join(name) + + output_writers[level].append( + self.create_writer( + path=outfile_targets, + level=level, + initialize=not self.append_to_output_file, + ) + ) + + if self.write_decoys and not self.is_sqlite: + outfile_decoys = ( + self.dest_dir + / f"{self.file_prefix}decoys.{level}{self.file_ext}" + ) + output_writers[level].append( + self.create_writer( + path=outfile_decoys, + level=level, + initialize=not self.append_to_output_file, + ) + ) + + return output_writers, file_prefix + @contextmanager @typechecked diff --git a/mokapot/dataset.py b/mokapot/dataset.py index b82c11f..0e70993 100644 --- a/mokapot/dataset.py +++ b/mokapot/dataset.py @@ -748,7 +748,7 @@ class OnDiskPsmDataset(PsmDataset): def __init__( self, filename_or_reader: Path | TabularDataReader, - columns, + *, target_column, spectrum_columns, peptide_column, @@ -756,7 +756,7 @@ def __init__( feature_columns, metadata_columns, metadata_column_types, # the columns+types could be a dict. - level_columns, + level_columns, # What is this supposed to be? filename_column, scan_column, specId_column, # Why does this have different capitalization? @@ -773,7 +773,10 @@ def __init__( else: self._reader = TabularDataReader.from_path(filename_or_reader) + columns = self.reader.get_column_names() self.columns = columns + # Q: Why ae columns asked for in the constructor? + # . Since we can read them from the reader ... 
self._target_column = target_column self._peptide_column = peptide_column self._protein_column = protein_column @@ -791,7 +794,6 @@ def __init__( self._specId_column = specId_column self._spectra_dataframe = spectra_dataframe - columns = self.reader.get_column_names() opt_cols = OptionalColumns( filename=filename_column, scan=scan_column, @@ -832,7 +834,7 @@ def check_columns(columns): check_column(self.expmass_column) check_column(self.rt_column) check_column(self.charge_column) - check_column(self.specId_column) + # check_column(self.specId_column) def get_default_extension(self) -> str: return self.reader.get_default_extension() @@ -852,6 +854,10 @@ def peptides(self) -> pd.Series: @property def specId_column(self) -> str: + # breakpoint() + # I am thinking on removing this ... since the "key" + # of a spectrum is all the columns that identify it uniquely. + # ... not this column that might or might not be present. return self._specId_column @property @@ -919,7 +925,7 @@ def __repr__(self) -> str: rep += f"Expmass column: {self.expmass_column}\n" rep += f"Rt column: {self.rt_column}\n" rep += f"Charge column: {self.charge_column}\n" - rep += f"SpecId column: {self.specId_column}\n" + # rep += f"SpecId column: {self.specId_column}\n" rep += f"Spectra DF: \n{spec_sec}\n" return rep diff --git a/mokapot/model.py b/mokapot/model.py index f76eb52..ac9fd20 100644 --- a/mokapot/model.py +++ b/mokapot/model.py @@ -224,6 +224,8 @@ def decision_function(self, dataset: LinearPsmDataset): numpy.ndarray A :py:class:`numpy.ndarray` containing the score for each PSM. """ + # Q: we should rename this methid to "score_dataset" ... + # ... or just remove it ... since it is redundant with `predict` if not self.is_trained: raise NotFittedError("This model is untrained. Run fit() first.") @@ -558,6 +560,10 @@ def _get_starting_labels(dataset: LinearPsmDataset, model): feat_pass : int The number of passing PSMs with the best feature. 
""" + + # Note: This function does sooo much more than getting the starting + # labels, we should at least rename it to something more descriptive. + # JSPP 2024-12-14 LOGGER.debug("Finding initial direction...") if model.direction is None and not model.is_trained: feat_res = dataset._find_best_feature(model.train_fdr) @@ -656,7 +662,7 @@ def _find_hyperparameters(model, features, labels): return new_est -def _get_weights(model, features): +def _get_weights(model, features) -> list[str] | None: """ If the model is a linear model, parse the weights to a list of strings. @@ -680,6 +686,7 @@ def _get_weights(model, features): assert len(intercept) == 1 weights = list(weights.flatten()) except (AttributeError, AssertionError): + LOGGER.debug("No coefficients in the current model.") return None col_width = max([len(f) for f in features]) + 2 diff --git a/mokapot/parsers/pin.py b/mokapot/parsers/pin.py index ae45213..06d16c1 100644 --- a/mokapot/parsers/pin.py +++ b/mokapot/parsers/pin.py @@ -221,6 +221,8 @@ def read_percolator( chunk_size=CHUNK_SIZE_COLUMNS_FOR_DROP_COLUMNS, ) df_spectra_list = [] + # Q: this really feels like a bad idea ... concurrent mutation of a list + # . where the elements are concrruently mutated datafames in-place. features_to_drop = Parallel(n_jobs=max_workers, require="sharedmem")( delayed(drop_missing_values_and_fill_spectra_dataframe)( reader=reader, @@ -252,7 +254,6 @@ def read_percolator( return OnDiskPsmDataset( perc_file, - columns=columns, target_column=labels, spectrum_columns=spectra, peptide_column=peptides, diff --git a/tests/conftest.py b/tests/conftest.py index 9e665f3..e516cf5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -208,8 +208,6 @@ def psms_ondisk() -> OnDiskPsmDataset: usecols=["ScanNr", "ExpMass", "Label"], ) # Q: why is the exp mass in the spectra dataframe? 
- with open(filename) as perc: - columns = perc.readline().rstrip().split("\t") psms = OnDiskPsmDataset( filename, target_column="Label", @@ -251,7 +249,6 @@ def psms_ondisk() -> OnDiskPsmDataset: filename_column=None, specId_column="SpecId", spectra_dataframe=df_spectra, - columns=columns, ) return psms @@ -264,7 +261,6 @@ def psms_ondisk_from_parquet() -> OnDiskPsmDataset: filename, columns=["ScanNr", "ExpMass", "Label"] ).to_pandas() df_spectra = convert_targets_column(df_spectra, "Label") - columns = pq.ParquetFile(filename).schema.names psms = OnDiskPsmDataset( filename, target_column="Label", @@ -309,7 +305,6 @@ def psms_ondisk_from_parquet() -> OnDiskPsmDataset: filename_column=None, specId_column="SpecId", spectra_dataframe=df_spectra, - columns=columns, ) return psms diff --git a/tests/unit_tests/test_confidence.py b/tests/unit_tests/test_confidence.py index b5548bb..98beb72 100644 --- a/tests/unit_tests/test_confidence.py +++ b/tests/unit_tests/test_confidence.py @@ -34,7 +34,6 @@ def test_chunked_assign_confidence(psm_df_1000, tmp_path): # incorrectly (namely the last and before last) pin_file, df, _, score_cols = psm_df_1000 - columns = list(pd.read_csv(pin_file, sep="\t").columns) df_spectra = pd.read_csv( pin_file, sep="\t", usecols=["scannr", "expmass", "target"] ) @@ -51,7 +50,6 @@ def test_chunked_assign_confidence(psm_df_1000, tmp_path): expmass_column="expmass", rt_column="ret_time", charge_column="charge", - columns=columns, protein_column="proteins", metadata_columns=[ "specid", @@ -143,7 +141,6 @@ def test_assign_confidence_parquet(psm_df_1000_parquet, tmp_path): """Test that assign_confidence() works with parquet files.""" parquet_file, df, _ = psm_df_1000_parquet - columns = pq.ParquetFile(parquet_file).schema.names df_spectra = pq.read_table( parquet_file, columns=["scannr", "expmass", "target"] ).to_pandas() @@ -160,7 +157,6 @@ def test_assign_confidence_parquet(psm_df_1000_parquet, tmp_path): expmass_column="expmass", 
rt_column="ret_time", charge_column="charge", - columns=columns, protein_column="proteins", metadata_columns=[ "specid", diff --git a/tests/unit_tests/test_writer_flashlfq.py b/tests/unit_tests/test_writer_flashlfq.py index a417ba8..b68b702 100644 --- a/tests/unit_tests/test_writer_flashlfq.py +++ b/tests/unit_tests/test_writer_flashlfq.py @@ -52,14 +52,14 @@ def is_flashlfq_df(df): } for col, coltype in EXPECTED_COLS.items(): assert col in df.columns, f"Column {col} not found in input" - assert isinstance( - df[col].iloc[0], coltype - ), f"Column {col} is not {coltype}" + assert isinstance(df[col].iloc[0], coltype), ( + f"Column {col} is not {coltype}" + ) # Check that the base sequence matches the pattern [A-Z]+ - assert ( - df["Base Sequence"].str.match("[A-Z]+").all() - ), "Base sequence must only contain amino acids" + assert df["Base Sequence"].str.match("[A-Z]+").all(), ( + "Base sequence must only contain amino acids" + ) return True From 789f0b55f2dedc71b5ca6023f950bb6f787066ee Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Sun, 15 Dec 2024 19:11:17 -0600 Subject: [PATCH 05/12] feat: score propagation and unscored confidence --- mokapot/brew.py | 11 ++- mokapot/confidence.py | 87 +++++++++++++++++++++++- mokapot/dataset.py | 77 +++++++++++++++++++-- mokapot/model.py | 9 ++- mokapot/qvalues.py | 12 ++-- mokapot/statistics.py | 13 ++-- pyproject.toml | 2 +- tests/unit_tests/test_confidence.py | 50 +++++++++++++- tests/unit_tests/test_writer_flashlfq.py | 8 ++- uv.lock | 12 ++-- 10 files changed, 246 insertions(+), 35 deletions(-) diff --git a/mokapot/brew.py b/mokapot/brew.py index 2585acf..c2758a1 100644 --- a/mokapot/brew.py +++ b/mokapot/brew.py @@ -25,6 +25,7 @@ ) from mokapot.model import PercolatorModel, Model from mokapot.parsers.pin import parse_in_chunks +from mokapot.utils import strictzip LOGGER = logging.getLogger(__name__) @@ -295,6 +296,8 @@ def brew( # Reverse all scores for which desc is False (this way, we don't have to # return `descs` from this function + # Q: why dont we just return a class that denotes if its descending? 
+ # JSPP 2024-12-15 for idx, desc in enumerate(descs): if not desc: scores[idx] = -scores[idx] @@ -302,6 +305,10 @@ def brew( # Coherces the tuple to a list models = list(models) + + for score, dataset in strictzip(scores, datasets): + dataset.scores = score + return list(models), scores @@ -501,7 +508,9 @@ def _predict( @typechecked def _predict_with_ensemble( - dataset: PsmDataset, models: Iterable[Model], max_workers + dataset: PsmDataset, + models: Iterable[Model], + max_workers: int, ): """ Return the new scores for the dataset using ensemble of all trained models diff --git a/mokapot/confidence.py b/mokapot/confidence.py index b6bb44e..49faa0b 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -28,6 +28,7 @@ from joblib import Parallel, delayed from typeguard import typechecked + from mokapot.column_defs import get_standard_column_name from mokapot.constants import CONFIDENCE_CHUNK_SIZE from mokapot.dataset import PsmDataset, OptionalColumns @@ -58,6 +59,14 @@ LOGGER = logging.getLogger(__name__) +try: + import matplotlib.pyplot as plt +except ImportError: + LOGGER.warning( + "Matplotlib is not installed. Confidence plots will not be available." 
+ ) + plt = None + # Classes --------------------------------------------------------------------- @typechecked @@ -266,7 +275,7 @@ def to_flashlfq(self, out_file="mokapot.flashlfq.txt"): @typechecked def assign_confidence( datasets: list[PsmDataset], - scores_list: list[np.ndarray[float]], + scores_list: list[np.ndarray[float]] | None = None, max_workers: int = 1, eval_fdr: float = 0.01, dest_dir: Path | None = None, @@ -334,7 +343,6 @@ def assign_confidence( # just take the first one for info (and make sure the other are the same) curr_dataset = datasets[0] - file_ext = curr_dataset.get_default_extension() for di, dataset in enumerate(datasets[1:]): if dataset.columns != curr_dataset.columns: raise ValueError( @@ -365,9 +373,23 @@ def assign_confidence( curr_dataset ) + scores_use = scores_list + if scores_use is None: + if any(dataset.scores is None for dataset in datasets): + feature = datasets[0].find_best_feature(eval_fdr).feature + scores_use = [ + dataset.read_data(columns=[feature.name])[ + feature.name + ].to_numpy() + for dataset in datasets + ] + # TODO: warn that no scores are present and will fall back + else: + scores_use = [dataset.scores for dataset in datasets] + out = [] - for dataset, score, prefix in strictzip(datasets, scores_list, prefixes): + for dataset, score, prefix in strictzip(datasets, scores_use, prefixes): # todo: nice to have: move this column renaming stuff into the # column defs module, and further, have standardized columns # directly from the pin reader (applying the renaming itself) @@ -1045,3 +1067,62 @@ def create_score_target_iterator(chunked_iterator: Iterator): scores = df_chunk["score"].values targets = ~df_chunk["is_decoy"].values yield scores, targets + + +def plot_qvalues(qvalues, threshold=0.1, ax=None, **kwargs): + """ + Plot the cumulative number of discoveries over range of q-values. + + Parameters + ---------- + qvalues : numpy.ndarray + The q-values to plot. 
+ threshold : float, optional + Indicates the maximum q-value to plot. + ax : matplotlib.pyplot.Axes, optional + The matplotlib Axes on which to plot. If `None` the current + Axes instance is used. + **kwargs : dict, optional + Arguments passed to :py:func:`matplotlib.axes.Axes.plot`. + + Returns + ------- + matplotlib.pyplot.Axes + An :py:class:`matplotlib.axes.Axes` with the cumulative + number of accepted target PSMs or peptides. + """ + if ax is None: + if plt is None: + raise RuntimeError( + "Matplotlib is not installed. Confidence plots will not be " + "available." + ) + ax = plt.gca() + + # Calculate cumulative targets at each q-value + qvals = pd.Series(qvalues, name="qvalue") + qvals = qvals.sort_values(ascending=True).to_frame() + qvals["target"] = 1 + qvals["num"] = qvals["target"].cumsum() + qvals = qvals.groupby(["qvalue"]).max().reset_index() + qvals = qvals[["qvalue", "num"]] + + zero = pd.DataFrame({"qvalue": qvals["qvalue"][0], "num": 0}, index=[-1]) + qvals = pd.concat([zero, qvals], sort=True).reset_index(drop=True) + + xmargin = threshold * 0.05 + ymax = qvals.num[qvals["qvalue"] <= (threshold + xmargin)].max() + ymargin = ymax * 0.05 + + # Set margins + curr_ylims = ax.get_ylim() + if curr_ylims[1] < ymax + ymargin: + ax.set_ylim(0 - ymargin, ymax + ymargin) + + ax.set_xlim(0 - xmargin, threshold + xmargin) + ax.set_xlabel("q-value") + ax.set_ylabel("Discoveries") + + ax.step(qvals["qvalue"].values, qvals.num.values, where="post", **kwargs) + + return ax diff --git a/mokapot/dataset.py b/mokapot/dataset.py index 0e70993..e0b7bef 100644 --- a/mokapot/dataset.py +++ b/mokapot/dataset.py @@ -67,6 +67,20 @@ def as_dict(self): } +@dataclass +class BestFeatureProperties: + name: str + positives: int + fdr: float + descending: bool + + +@dataclass +class LabeledBestFeature: + feature: BestFeatureProperties + new_labels: np.ndarray + + class PsmDataset(ABC): """Store a collection of PSMs and their features. 
@@ -261,6 +275,21 @@ def read_data( ) -> pd.DataFrame | Generator[pd.DataFrame, None, None]: raise NotImplementedError + @abstractmethod + def find_best_feature(self, eval_fdr: float) -> LabeledBestFeature: + raise NotImplementedError + + @property + @abstractmethod + def scores(self) -> np.ndarray | None: + # q: should i rename this to mokapot_scores? + raise NotImplementedError + + @scores.setter + @abstractmethod + def scores(self, scores: np.ndarray | None): + raise NotImplementedError + class LinearPsmDataset(PsmDataset): """Store and analyze a collection of PSMs. @@ -659,7 +688,7 @@ def _targets_count_by_feature(self, desc, eval_fdr): index=self._feature_columns, ) - def _find_best_feature(self, eval_fdr): + def find_best_feature(self, eval_fdr: float) -> LabeledBestFeature: """ Find the best feature to separate targets from decoys at the specified false-discovery rate threshold. @@ -704,7 +733,16 @@ def _find_best_feature(self, eval_fdr): f"No PSMs found below the 'eval_fdr' {eval_fdr}." 
) - return best_feat, best_positives, new_labels, best_desc + out = LabeledBestFeature( + feature=BestFeatureProperties( + name=best_feat, + positives=best_positives, + descending=best_desc, + fdr=eval_fdr, + ), + new_labels=new_labels, + ) + return out def _calibrate_scores(self, scores, eval_fdr, desc=True): calibrate_scores( @@ -741,6 +779,16 @@ def read_data( def get_default_extension(self) -> str: return ".csv" + @property + def scores(self) -> np.ndarray | None: + if not hasattr(self, "_scores"): + return None + return self._scores + + @scores.setter + def scores(self, scores: np.ndarray | None): + self._scores = scores + @typechecked class OnDiskPsmDataset(PsmDataset): @@ -981,7 +1029,7 @@ def _targets_count_by_feature(self, column, eval_fdr, desc): == 1 ).sum() - def find_best_feature(self, eval_fdr): + def find_best_feature(self, eval_fdr: float) -> LabeledBestFeature: best_feat = None best_positives = 0 new_labels = None @@ -1021,7 +1069,16 @@ def find_best_feature(self, eval_fdr): f"No PSMs found below the 'eval_fdr' {eval_fdr}." ) - return best_feat, best_positives, new_labels, best_desc + out = LabeledBestFeature( + feature=BestFeatureProperties( + name=best_feat, + positives=best_positives, + descending=best_desc, + fdr=eval_fdr, + ), + new_labels=new_labels, + ) + return out def update_labels(self, scores, target_column, eval_fdr=0.01, desc=True): df = self.read_data(columns=target_column) @@ -1123,6 +1180,16 @@ def read_data( else: return self.reader.read(columns=columns) + @property + def scores(self) -> np.ndarray | None: + if not hasattr(self, "_scores"): + return None + return self._scores + + @scores.setter + def scores(self, scores: np.ndarray | None): + self._scores = scores + @typechecked def _update_labels( @@ -1130,7 +1197,7 @@ def _update_labels( targets: np.ndarray[bool] | pd.Series, eval_fdr: float = 0.01, desc: bool = True, -) -> np.ndarray[bool] | pd.Series: +) -> np.ndarray[bool]: """Return the label for each PSM, given it's score. 
This method is used during model training to define positive examples, diff --git a/mokapot/model.py b/mokapot/model.py index ac9fd20..59772b1 100644 --- a/mokapot/model.py +++ b/mokapot/model.py @@ -566,8 +566,13 @@ def _get_starting_labels(dataset: LinearPsmDataset, model): # JSPP 2024-12-14 LOGGER.debug("Finding initial direction...") if model.direction is None and not model.is_trained: - feat_res = dataset._find_best_feature(model.train_fdr) - best_feat, feat_pass, start_labels, desc = feat_res + feat_res = dataset.find_best_feature(model.train_fdr) + best_feat, feat_pass, start_labels, desc = ( + feat_res.feature.name, + feat_res.feature.positives, + feat_res.new_labels, + feat_res.feature.descending, + ) LOGGER.info( "\t- Selected feature %s with %i PSMs at q<=%g.", best_feat, diff --git a/mokapot/qvalues.py b/mokapot/qvalues.py index e7e3589..b5a1bff 100644 --- a/mokapot/qvalues.py +++ b/mokapot/qvalues.py @@ -136,7 +136,7 @@ def tdc( unique_metric = np.flip(unique_metric) indices = np.flip(indices) - qvals = _fdr2qvalue(fdr, num_total, unique_metric, indices) + qvals = _fdr2qvalue(fdr, num_total, indices) qvals = np.flip(qvals) qvals = qvals[np.argsort(srt_idx)] @@ -144,7 +144,7 @@ def tdc( @nb.njit -def _fdr2qvalue(fdr, num_total, met, indices): +def _fdr2qvalue(fdr, num_total, indices): """Quickly turn a list of FDRs to q-values. All of the inputs are assumed to be sorted. @@ -155,8 +155,6 @@ def _fdr2qvalue(fdr, num_total, met, indices): A vector of all unique FDR values. num_total : numpy.ndarray A vector of the cumulative number of PSMs at each score. - met : numpy.ndarray - A vector of the scores for each PSM. indices : tuple of numpy.ndarray Tuple where the vector at index i indicates the PSMs that shared the unique FDR value in `fdr`. @@ -167,10 +165,9 @@ def _fdr2qvalue(fdr, num_total, met, indices): A vector of q-values. 
""" min_q = 1 - qvals = np.ones(len(fdr)) - group_fdr = np.ones(len(fdr)) + qvals = np.ones_like(fdr) prev_idx = 0 - for idx in range(met.shape[0]): + for idx in range(indices.shape[0]): next_idx = prev_idx + indices[idx] group = slice(prev_idx, next_idx) prev_idx = next_idx @@ -181,7 +178,6 @@ def _fdr2qvalue(fdr, num_total, met, indices): if curr_fdr < min_q: min_q = curr_fdr - group_fdr[group] = curr_fdr qvals[group] = min_q return qvals diff --git a/mokapot/statistics.py b/mokapot/statistics.py index 42c17b3..93764c9 100644 --- a/mokapot/statistics.py +++ b/mokapot/statistics.py @@ -10,7 +10,6 @@ ) -@typechecked @dataclass(slots=True) class OnlineStatistics: """A class for performing basic statistical calculations. @@ -26,14 +25,18 @@ class OnlineStatistics: sum : float The sum of all values encountered so far. Initialized to 0.0. mean : float - The mean value calculated based on the encountered values. Initialized to 0.0. + The mean value calculated based on the encountered values. + Initialized to 0.0. var : float - The variance value calculated based on the encountered values. Initialized to 0.0. + The variance value calculated based on the encountered values. + Initialized to 0.0. sd : float - The standard deviation value calculated based on the encountered values. + The standard deviation value calculated based on the + encountered values. Initialized to 0.0. M2n : float - The intermediate value used in calculating variance. Initialized to 0.0. + The intermediate value used in calculating variance. + Initialized to 0.0. 
""" diff --git a/pyproject.toml b/pyproject.toml index 31a6fde..47e81a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,6 @@ dependencies = [ "pandas>=1.0.3", "scikit-learn>=0.22.1", "numba>=0.48.0", - "matplotlib>=3.1.3", "lxml>=4.6.2", "triqler>=0.6.2", "joblib>=1.1.0", @@ -45,6 +44,7 @@ description = "Fast and flexible semi-supervised learning for peptide detection" dynamic = ["version"] license = {text = "Apache 2.0"} name = "mokapot" +optional-dependencies = {plot = ["matplotlib>=3.1.3"]} requires-python = ">=3.9,<3.13" # Upper threshold is required bc of numba [project.readme] diff --git a/tests/unit_tests/test_confidence.py b/tests/unit_tests/test_confidence.py index 98beb72..153a790 100644 --- a/tests/unit_tests/test_confidence.py +++ b/tests/unit_tests/test_confidence.py @@ -9,7 +9,8 @@ from pandas.testing import assert_frame_equal import mokapot -from mokapot import OnDiskPsmDataset, assign_confidence +from mokapot import OnDiskPsmDataset, assign_confidence, LinearPsmDataset +import pytest @contextlib.contextmanager @@ -23,7 +24,44 @@ def run_with_chunk_size(chunk_size): mokapot.confidence.CONFIDENCE_CHUNK_SIZE = old_chunk_size -def test_chunked_assign_confidence(psm_df_1000, tmp_path): +@pytest.fixture +def inmem_psms_ds(psm_df_builder): + """A small-ish PSM dataset""" + data = psm_df_builder(1000, 1000, score_diffs=[5.0]) + psms = LinearPsmDataset( + psms=data.df, + target_column="target", + spectrum_columns="specid", + peptide_column="peptide", + feature_columns=list(data.score_cols), + filename_column="filename", + scan_column="specid", + calcmass_column="calcmass", + expmass_column="expmass", + rt_column="ret_time", + charge_column="charge", + copy_data=True, + ) + return psms + + +@pytest.mark.parametrize("deduplication", [True, False]) +def test_assign_unscored_confidence(inmem_psms_ds, tmp_path, deduplication): + if deduplication: + pytest.skip("Deduplication is not working") + _foo = assign_confidence( + [inmem_psms_ds], + 
scores_list=None, + eval_fdr=0.01, + dest_dir=tmp_path, + max_workers=4, + deduplication=False, + ) + # TODO actually add assertions here ... + + +@pytest.mark.parametrize("deduplication", [True, False]) +def test_chunked_assign_confidence(psm_df_1000, tmp_path, deduplication): """Test that assign_confidence() works correctly with small chunks""" # After correcting the targets column stuff and @@ -79,6 +117,7 @@ def test_chunked_assign_confidence(psm_df_1000, tmp_path): dest_dir=tmp_path, max_workers=4, eval_fdr=0.02, + deduplication=deduplication, ) df_results_group = pd.read_csv(tmp_path / "targets.peptides.csv", sep="\t") @@ -137,7 +176,10 @@ def test_chunked_assign_confidence(psm_df_1000, tmp_path): ) -def test_assign_confidence_parquet(psm_df_1000_parquet, tmp_path): +@pytest.mark.parametrize("deduplication", [True, False]) +def test_assign_confidence_parquet( + psm_df_1000_parquet, tmp_path, deduplication +): """Test that assign_confidence() works with parquet files.""" parquet_file, df, _ = psm_df_1000_parquet @@ -187,6 +229,7 @@ def test_assign_confidence_parquet(psm_df_1000_parquet, tmp_path): dest_dir=tmp_path, max_workers=4, eval_fdr=0.02, + deduplication=deduplication, ) df_results_group1 = pd.read_parquet( tmp_path / "targets.peptides.parquet" @@ -201,6 +244,7 @@ def test_assign_confidence_parquet(psm_df_1000_parquet, tmp_path): dest_dir=tmp_path, max_workers=4, eval_fdr=0.02, + deduplication=deduplication, ) df_results_group2 = pd.read_parquet( tmp_path / "targets.peptides.parquet" diff --git a/tests/unit_tests/test_writer_flashlfq.py b/tests/unit_tests/test_writer_flashlfq.py index b68b702..4a961d4 100644 --- a/tests/unit_tests/test_writer_flashlfq.py +++ b/tests/unit_tests/test_writer_flashlfq.py @@ -108,13 +108,17 @@ def flashlfq_psms_ds_ondisk(psm_df_builder, tmp_path): return psms -def test_internal_flashlfq_ondisk(flashlfq_psms_ds_ondisk): +@pytest.parametrize("deduplication", [True, False]) +def 
test_internal_flashlfq_ondisk(flashlfq_psms_ds_ondisk, deduplication): + if deduplication: + pytest.skip("Deduplication is not working") + mods, scores = mokapot.brew([flashlfq_psms_ds_ondisk], test_fdr=0.1) conf = mokapot.assign_confidence( [flashlfq_psms_ds_ondisk], scores_list=scores, eval_fdr=0.1, - deduplication=False, # RN fails with deduplication = True + deduplication=deduplication, # RN fails with deduplication = True ) _tmp = _format_flashlfq(conf[0]) for col in EXPECTED_COLS: diff --git a/uv.lock b/uv.lock index fa6e884..24996b0 100644 --- a/uv.lock +++ b/uv.lock @@ -388,7 +388,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/98/65/13d9e76ca19b0ba5603d71ac8424b5694415b348e719db277b5edc985ff5/cryptography-44.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:761817a3377ef15ac23cd7834715081791d4ec77f9297ee694ca1ee9c2c7e5eb", size = 3915420 }, { url = "https://files.pythonhosted.org/packages/b1/07/40fe09ce96b91fc9276a9ad272832ead0fddedcba87f1190372af8e3039c/cryptography-44.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3c672a53c0fb4725a29c303be906d3c1fa99c32f58abe008a82705f9ee96f40b", size = 4154498 }, { url = "https://files.pythonhosted.org/packages/75/ea/af65619c800ec0a7e4034207aec543acdf248d9bffba0533342d1bd435e1/cryptography-44.0.0-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:4ac4c9f37eba52cb6fbeaf5b59c152ea976726b865bd4cf87883a7e7006cc543", size = 3932569 }, - { url = "https://files.pythonhosted.org/packages/4e/d5/9cc182bf24c86f542129565976c21301d4ac397e74bf5a16e48241aab8a6/cryptography-44.0.0-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:60eb32934076fa07e4316b7b2742fa52cbb190b42c2df2863dbc4230a0a9b385", size = 4164756 }, { url = "https://files.pythonhosted.org/packages/c7/af/d1deb0c04d59612e3d5e54203159e284d3e7a6921e565bb0eeb6269bdd8a/cryptography-44.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ed3534eb1090483c96178fcb0f8893719d96d5274dfde98aa6add34614e97c8e", size = 4016721 }, { url = 
"https://files.pythonhosted.org/packages/bd/69/7ca326c55698d0688db867795134bdfac87136b80ef373aaa42b225d6dd5/cryptography-44.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f3f6fdfa89ee2d9d496e2c087cebef9d4fcbb0ad63c40e821b39f74bf48d9c5e", size = 4240915 }, { url = "https://files.pythonhosted.org/packages/1a/07/5f165b6c65696ef75601b781a280fc3b33f1e0cd6aa5a92d9fb96c410e97/cryptography-44.0.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1923cb251c04be85eec9fda837661c67c1049063305d6be5721643c22dd4e2b7", size = 3922613 }, @@ -396,7 +395,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/c7/c656eb08fd22255d21bc3129625ed9cd5ee305f33752ef2278711b3fa98b/cryptography-44.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c5eb858beed7835e5ad1faba59e865109f3e52b3783b9ac21e7e47dc5554e289", size = 3915417 }, { url = "https://files.pythonhosted.org/packages/ef/82/72403624f197af0db6bac4e58153bc9ac0e6020e57234115db9596eee85d/cryptography-44.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f53c2c87e0fb4b0c00fa9571082a057e37690a8f12233306161c8f4b819960b7", size = 4155160 }, { url = "https://files.pythonhosted.org/packages/a2/cd/2f3c440913d4329ade49b146d74f2e9766422e1732613f57097fea61f344/cryptography-44.0.0-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:9e6fc8a08e116fb7c7dd1f040074c9d7b51d74a8ea40d4df2fc7aa08b76b9e6c", size = 3932331 }, - { url = "https://files.pythonhosted.org/packages/31/d9/90409720277f88eb3ab72f9a32bfa54acdd97e94225df699e7713e850bd4/cryptography-44.0.0-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:9abcc2e083cbe8dde89124a47e5e53ec38751f0d7dfd36801008f316a127d7ba", size = 4165207 }, { url = "https://files.pythonhosted.org/packages/7f/df/8be88797f0a1cca6e255189a57bb49237402b1880d6e8721690c5603ac23/cryptography-44.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:d2436114e46b36d00f8b72ff57e598978b37399d2786fd39793c36c6d5cb1c64", size = 4017372 }, { url = 
"https://files.pythonhosted.org/packages/af/36/5ccc376f025a834e72b8e52e18746b927f34e4520487098e283a719c205e/cryptography-44.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a01956ddfa0a6790d594f5b34fc1bfa6098aca434696a03cfdbe469b8ed79285", size = 4239657 }, { url = "https://files.pythonhosted.org/packages/1a/aa/ba8a7467c206cb7b62f09b4168da541b5109838627f582843bbbe0235e8e/cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:f677e1268c4e23420c3acade68fac427fffcb8d19d7df95ed7ad17cdef8404f4", size = 3850615 }, @@ -1161,13 +1159,12 @@ wheels = [ [[package]] name = "mokapot" -version = "0.7.2.dev66+gd6ef287.d20241205" +version = "0.10.1.dev37+gd4d3e8c.d20241215" source = { editable = "." } dependencies = [ { name = "importlib-metadata" }, { name = "joblib" }, { name = "lxml" }, - { name = "matplotlib" }, { name = "numba" }, { name = "numpy" }, { name = "pandas" }, @@ -1178,6 +1175,11 @@ dependencies = [ { name = "typeguard" }, ] +[package.optional-dependencies] +plot = [ + { name = "matplotlib" }, +] + [package.dev-dependencies] dev = [ { name = "pre-commit" }, @@ -1200,7 +1202,7 @@ requires-dist = [ { name = "importlib-metadata", specifier = ">=5.1.0" }, { name = "joblib", specifier = ">=1.1.0" }, { name = "lxml", specifier = ">=4.6.2" }, - { name = "matplotlib", specifier = ">=3.1.3" }, + { name = "matplotlib", marker = "extra == 'plot'", specifier = ">=3.1.3" }, { name = "numba", specifier = ">=0.48.0" }, { name = "numpy", specifier = ">=1.4.0,<2.0.0" }, { name = "pandas", specifier = ">=1.0.3" }, From 59e649db95868ee7ebd1c8ccf99196ad3b20c05b Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Mon, 16 Dec 2024 15:35:27 -0600 Subject: [PATCH 06/12] feat(confidence): add data reading api --- mokapot/brew.py | 1 + mokapot/column_defs.py | 3 +- mokapot/confidence.py | 153 ++++++++++++++++++----- mokapot/tabular_data/base.py | 5 + mokapot/tabular_data/csv.py | 8 ++ mokapot/tabular_data/sqlite.py | 4 + mokapot/tabular_data/streaming.py | 8 ++ tests/system_tests/test_brew_rollup.py | 2 +- tests/system_tests/test_cli.py | 2 +- tests/unit_tests/test_writer_flashlfq.py | 2 +- 10 files changed, 151 insertions(+), 37 deletions(-) diff --git a/mokapot/brew.py b/mokapot/brew.py index c2758a1..7641fa9 100644 --- a/mokapot/brew.py +++ b/mokapot/brew.py @@ -306,6 +306,7 @@ def brew( # Coherces the tuple to a list models = list(models) + LOGGER.info("Assigning scores to PSMs...") for score, dataset in strictzip(scores, datasets): dataset.scores = score diff --git a/mokapot/column_defs.py b/mokapot/column_defs.py index de057f6..e801ec6 100644 --- a/mokapot/column_defs.py +++ b/mokapot/column_defs.py @@ -1,3 +1,4 @@ +Q_VALUE_COL_NAME = "mokapot_qvalue" STANDARD_COLUMN_NAME_MAP = { "SpecId": "psm_id", "PSMId": "psm_id", @@ -10,7 +11,7 @@ "ModifiedPeptide": "modified_peptide", "modifiedpeptide": "modified_peptide", # "q-value": "q_value", - "q-value": "q-value", + "q-value": Q_VALUE_COL_NAME, } diff --git a/mokapot/confidence.py b/mokapot/confidence.py index 49faa0b..13da83e 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -29,7 +29,7 @@ from typeguard import typechecked -from mokapot.column_defs import get_standard_column_name +from mokapot.column_defs import get_standard_column_name, Q_VALUE_COL_NAME from mokapot.constants import CONFIDENCE_CHUNK_SIZE from mokapot.dataset import PsmDataset, OptionalColumns from mokapot.peps import ( @@ -82,7 +82,7 @@ def __init__( eval_fdr: float = 0.01, write_decoys: bool = False, do_rollup: bool = True, - proteins=None, + proteins: pd.DataFrame | None = None, peps_error: bool = False, rng=0, 
peps_algorithm: str = "qvality", @@ -131,7 +131,7 @@ def __init__( self.write_decoys = write_decoys self.levels = levels self.do_rollup = do_rollup - self.proteins = proteins + self._proteins = proteins self.peps_error = peps_error self.rng = rng self.score_stats = score_stats @@ -165,7 +165,6 @@ def __repr__(self) -> str: rep += f"Eval FDR: {self.eval_fdr}\n" rep += f"Write decoys: {self.write_decoys}\n" rep += f"Do rollup: {self.do_rollup}\n" - rep += f"Proteins: {self.proteins}\n" rep += f"Peps error: {self.peps_error}\n" rep += f"Rng: {self.rng}\n" rep += f"Score stats: {self.score_stats}\n" @@ -181,7 +180,7 @@ def _assign_confidence( peps_algorithm: str = "qvality", qvalue_algorithm: str = "tdc", stream_confidence: bool = False, - score_stats=None, + score_stats: OnlineStatistics | None = None, eval_fdr: float = 0.01, ): """ @@ -262,14 +261,59 @@ def _write_protein_level_data(self, level_paths, proteins, rng): def get_optional_columns(self) -> OptionalColumns: return self.dataset.get_optional_columns() + def read(self, level: str) -> pd.DataFrame: + """Read the results for a given level.""" + if level not in self.levels: + raise ValueError( + f"Level {level} not found. Available levels are: {self.levels}" + ) + tmp = [x.read() for x in self.out_writers[level]] + return pd.concat(tmp) + + @property + def peptides(self) -> pd.DataFrame: + return self.read("peptides") + + @property + def psms(self) -> pd.DataFrame: + return self.read("psms") + @property - def peptides(self) -> pd.Series: - return self.dataset.peptides + def proteins(self) -> pd.DataFrame: + return self.read("proteins") def to_flashlfq(self, out_file="mokapot.flashlfq.txt"): """Save confidenct peptides for quantification with FlashLFQ.""" return to_flashlfq(self, out_file) + def plot_qvalues( + self, level: str, threshold: float = 0.1, ax=None, **kwargs + ): + """Plot the q-values for a given level. + + Parameters + ---------- + level : str + The level to plot. 
+ threshold : float, optional + Indicates the maximum q-value to plot. + ax : matplotlib.pyplot.Axes, optional + The matplotlib Axes on which to plot. If `None` the current + Axes instance is used. + **kwargs : dict, optional + Arguments passed to :py:func:`matplotlib.axes.Axes.plot`. + + Returns + ------- + matplotlib.pyplot.Axes + A `matplotlib.axes.Axes` with the cumulative + number of accepted target PSMs or peptides. + """ + + all_read = [x.read() for x in self.out_writers[level]] + qvals = pd.concat(all_read)[Q_VALUE_COL_NAME] + return plot_qvalues(qvals, threshold=threshold, ax=ax, **kwargs) + # Functions ------------------------------------------------------------------- @typechecked @@ -315,8 +359,9 @@ def assign_confidence( dest_dir : Path or None, optional The directory in which to save the files. :code:`None` will use the current working directory. - prefixes : [str] + prefixes : [str] or None The prefixes added to all output file names. + If None, a single concatenated file will be created. write_decoys : bool, optional Save decoys confidence estimates as well? 
deduplication: bool @@ -375,8 +420,11 @@ def assign_confidence( scores_use = scores_list if scores_use is None: + LOGGER.info("No scores passed, attempting to find them.") if any(dataset.scores is None for dataset in datasets): + LOGGER.info("No scores found, attempting to find best feature.") feature = datasets[0].find_best_feature(eval_fdr).feature + LOGGER.info("Best feature found: %s", feature) scores_use = [ dataset.read_data(columns=[feature.name])[ feature.name @@ -385,6 +433,7 @@ def assign_confidence( ] # TODO: warn that no scores are present and will fall back else: + LOGGER.info("Scores found in psms, using them.") scores_use = [dataset.scores for dataset in datasets] out = [] @@ -395,20 +444,23 @@ def assign_confidence( # directly from the pin reader (applying the renaming itself) output_writers, file_prefix = output_writers_factory.build_writers( - level_manager + level_manager, + prefix=prefix, ) score_reader = TabularDataReader.from_array(score, "score") with create_sorted_file_reader( - dataset, - score_reader, - dest_dir, - file_prefix, - level_manager.level_hash_columns["psms"] - if deduplication - else None, - max_workers, - level_input_output_column_mapping, + dataset=dataset, + score_reader=score_reader, + dest_dir=dest_dir, + file_prefix=file_prefix, + deduplication_columns=( + level_manager.level_hash_columns["psms"] + if deduplication + else None + ), + max_workers=max_workers, + input_output_column_mapping=level_input_output_column_mapping, ) as sorted_file_reader: LOGGER.info("Assigning confidence...") LOGGER.info("Performing target-decoy competition...") @@ -450,12 +502,18 @@ def assign_confidence( ) out.append(con) if not prefix: - append_to_output_file = True + # Having None as a prefix means that all outputs will be + # written to a single file, thus after the first iteration + # we stop initializing the writers (bc that generates over-writing + # the files instead of appending to them). 
+ output_writers_factory.append_to_output_file = True return out class LevelWriterCollection: + """ """ + def __init__( self, levels: list[str], @@ -519,6 +577,9 @@ def from_manager( ) def hash_data_row(self, data_row, level): + # TODO: benchmark if actually hashing here would be better. + # . It feels inefficient to keep large numbers of large strings + # in memory. return str([ data_row[self.level_input_output_column_mapping.get(col, col)] for col in self.level_hash_columns[level] @@ -620,9 +681,6 @@ def __init__( self._setup_protein_levels() self._setup_extra_output_columns() - # self.level_data_paths = {} - # self.level_hash_columns = {} - @staticmethod def from_dataset( *, @@ -770,7 +828,7 @@ def __init__( *extra_output_columns, # Q: should we prefix these with "mokapot"? "score", - "q-value", + Q_VALUE_COL_NAME, "posterior_error_prob", "proteinIds", ] @@ -780,7 +838,7 @@ def __init__( "best peptide", "stripped sequence", "score", - "q-value", + Q_VALUE_COL_NAME, "posterior_error_prob", ] @@ -808,7 +866,7 @@ def create_writer( columns=output_columns, column_types=[], level=level, - qvalue_column="q-value", + qvalue_column=Q_VALUE_COL_NAME, pep_column="posterior_error_prob", ) @@ -819,7 +877,32 @@ def create_writer( def build_writers( self, level_manager: LevelManager, prefix: str | None = None - ): + ) -> tuple[ + dict[str, list[TabularDataWriter] | list[ConfidenceSqliteWriter]], str + ]: + """Build output writers for each level. + + Parameters + ---------- + level_manager : LevelManager + The level manager. + prefix : str, optional + The prefix to use for the output files, by default None. + It will be used to create the file names whose pattern is + "{self.dest_dir}/{file_root}{prefix}.{level}{file_ext}". + + Returns + ------- + tuple[ + dict[ + str, + list[TabularDataWriter] | list[ConfidenceSqliteWriter] + ], + str + ] + A tuple containing the output writers and the file prefix. 
+ + """ output_writers = {} file_prefix = level_manager.file_root @@ -847,10 +930,13 @@ def build_writers( ) if self.write_decoys and not self.is_sqlite: - outfile_decoys = ( - self.dest_dir - / f"{self.file_prefix}decoys.{level}{self.file_ext}" - ) + decoy_name = [ + str(file_prefix), + "decoys.", + str(level), + str(level_manager.default_extension), + ] + outfile_decoys = level_manager.dest_dir / "".join(decoy_name) output_writers[level].append( self.create_writer( path=outfile_decoys, @@ -886,7 +972,8 @@ def create_sorted_file_reader( input_output_column_mapping.get(name, name) for name in input_columns ] file_iterator = reader.get_chunked_data_iterator( - CONFIDENCE_CHUNK_SIZE, output_columns + chunk_size=CONFIDENCE_CHUNK_SIZE, + columns=output_columns, ) # Write those chunks in parallel, where the columns are given @@ -911,7 +998,7 @@ def create_sorted_file_reader( ] sorted_file_reader = MergedTabularDataReader( - readers, + readers=readers, priority_column="score", reader_chunk_size=CONFIDENCE_CHUNK_SIZE, ) @@ -950,7 +1037,7 @@ def _save_sorted_metadata_chunks( try: chunk_metadata.drop_duplicates(deduplication_columns, inplace=True) except KeyError as e: - msg = "Duplication error in the following columns: " + msg = "Duplication error trying to use the following columns: " msg += str(deduplication_columns) msg += f". Found: {chunk_metadata.columns} " msg += ". Please check the input data." diff --git a/mokapot/tabular_data/base.py b/mokapot/tabular_data/base.py index 8ade0e4..5fac3fc 100644 --- a/mokapot/tabular_data/base.py +++ b/mokapot/tabular_data/base.py @@ -310,6 +310,11 @@ def initialize(self): def finalize(self): pass + @abstractmethod + def read(self) -> pd.DataFrame: + # TODO: Evaluate if this method should allow lazier reading. 
+ raise NotImplementedError + def __enter__(self): self.initialize() return self diff --git a/mokapot/tabular_data/csv.py b/mokapot/tabular_data/csv.py index 0e81771..e4113ba 100644 --- a/mokapot/tabular_data/csv.py +++ b/mokapot/tabular_data/csv.py @@ -1,5 +1,6 @@ from pathlib import Path from typing import Generator +import warnings import numpy as np import pandas as pd @@ -97,6 +98,10 @@ def __repr__(self): def initialize(self): # Just write header information + if Path(self.file_name).exists(): + warnings.warn( + f"CSV file {self.file_name} exists, but will be overwritten." + ) df = pd.DataFrame(columns=self.columns) df.to_csv(self.file_name, **self.stdargs) @@ -110,3 +115,6 @@ def append_data(self, data: pd.DataFrame): def get_associated_reader(self): return CSVFileReader(self.file_name, sep=self.stdargs["sep"]) + + def read(self): + return pd.read_csv(self.file_name, sep=self.stdargs["sep"]) diff --git a/mokapot/tabular_data/sqlite.py b/mokapot/tabular_data/sqlite.py index fc715f5..a9b814a 100644 --- a/mokapot/tabular_data/sqlite.py +++ b/mokapot/tabular_data/sqlite.py @@ -101,3 +101,7 @@ def append_data(self, data): row["q_value"] = row[self.qvalue_column] row["posterior_error_prob"] = row[self.pep_column] self.connection.executemany(query, data) + + def read(self, level: str = "psms"): + table_name, table_id_col, mokapot_id_col = self.level_cols[level] + return pd.read_sql_table(table_name, self.connection) diff --git a/mokapot/tabular_data/streaming.py b/mokapot/tabular_data/streaming.py index 3b5a2dd..5e67bc1 100644 --- a/mokapot/tabular_data/streaming.py +++ b/mokapot/tabular_data/streaming.py @@ -5,6 +5,7 @@ from __future__ import annotations import warnings +from pprint import pformat from typing import Generator, Callable, Iterator import numpy as np @@ -398,6 +399,13 @@ def __init__( self.finalized = False self.initialized = False + def __repr__(self): + IGNORE_KEYS = {"buffer"} + dict_repr = pformat({ + k: v for k, v in self.__dict__.items() if k 
not in IGNORE_KEYS + }) + return f"{self.__class__!s}({dict_repr})" + def __del__(self): if self.initialized and not self.finalized: warnings.warn( diff --git a/tests/system_tests/test_brew_rollup.py b/tests/system_tests/test_brew_rollup.py index 85bbf7b..aa34b69 100644 --- a/tests/system_tests/test_brew_rollup.py +++ b/tests/system_tests/test_brew_rollup.py @@ -149,7 +149,7 @@ def test_rollup_10000(rollup_src_dirs, suffix, tmp_path): df0 = TabularDataReader.from_path(file0).read() df1 = TabularDataReader.from_path(file1).read() - qval_column = "q-value" + qval_column = "mokapot_qvalue" assert_series_equal(df0[qval_column], df1[qval_column], atol=0.02) assert ( estimate_abs_int(df0.score, df1[qval_column] - df0[qval_column]) diff --git a/tests/system_tests/test_cli.py b/tests/system_tests/test_cli.py index 7b23234..f3dbb56 100644 --- a/tests/system_tests/test_cli.py +++ b/tests/system_tests/test_cli.py @@ -43,7 +43,7 @@ def test_basic_cli(tmp_path, scope_files): "PSMId", "peptide", "score", - "q-value", + "mokapot_qvalue", "posterior_error_prob", "proteinIds", ] diff --git a/tests/unit_tests/test_writer_flashlfq.py b/tests/unit_tests/test_writer_flashlfq.py index 4a961d4..d0d9a83 100644 --- a/tests/unit_tests/test_writer_flashlfq.py +++ b/tests/unit_tests/test_writer_flashlfq.py @@ -108,7 +108,7 @@ def flashlfq_psms_ds_ondisk(psm_df_builder, tmp_path): return psms -@pytest.parametrize("deduplication", [True, False]) +@pytest.mark.parametrize("deduplication", [True, False]) def test_internal_flashlfq_ondisk(flashlfq_psms_ds_ondisk, deduplication): if deduplication: pytest.skip("Deduplication is not working") From 2e43ce2e442a84e951555830d262fc0c60013189 Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Mon, 16 Dec 2024 17:41:06 -0600 Subject: [PATCH 07/12] feat,experiment: Experimental qvalue-fdr estimation --- mokapot/confidence.py | 7 +++- mokapot/qvalues.py | 50 ++++++++++++++++++++++++++ mokapot/writers/flashlfq.py | 2 +- tests/system_tests/test_brew_rollup.py | 5 +++ tests/system_tests/test_parquet.py | 2 +- tests/system_tests/test_rollup.py | 2 +- tests/unit_tests/test_confidence.py | 8 +++-- 7 files changed, 69 insertions(+), 7 deletions(-) diff --git a/mokapot/confidence.py b/mokapot/confidence.py index 13da83e..f2f2371 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -666,6 +666,7 @@ def __init__( use_proteins: bool, dest_dir: Path, file_root: str, + protein_column: str | None, ): self.level_columns = level_columns self.default_extension = default_extension @@ -674,6 +675,7 @@ def __init__( self.dest_dir = dest_dir self.file_root = file_root self.do_rollup = do_rollup + self.protein_column = protein_column self._initialize_levels() self._setup_level_paths() @@ -693,6 +695,7 @@ def from_dataset( level_columns = dataset.level_columns default_extension = dataset.get_default_extension() spectrum_columns = dataset.spectrum_columns + protein_column = dataset.protein_column return LevelManager( level_columns=level_columns, default_extension=default_extension, @@ -701,6 +704,7 @@ def from_dataset( use_proteins=use_proteins, dest_dir=dest_dir, file_root=file_root, + protein_column=protein_column, ) def __repr__(self) -> str: @@ -740,10 +744,11 @@ def _setup_hash_columns(self) -> None: def _setup_protein_levels(self) -> None: levels_or_proteins = self.levels + file_ext = self.default_extension if self.use_proteins: levels_or_proteins = [*levels_or_proteins, "proteins"] self.level_data_paths["proteins"] = ( - self.dest_dir / f"{self.file_root}proteins{self.file_ext}" + self.dest_dir / f"{self.file_root}proteins{file_ext}" ) self.level_hash_columns["proteins"] = self.protein_column diff --git a/mokapot/qvalues.py 
b/mokapot/qvalues.py index b5a1bff..0eac457 100644 --- a/mokapot/qvalues.py +++ b/mokapot/qvalues.py @@ -6,6 +6,7 @@ import numba as nb from typeguard import typechecked from typing import Callable +import os from mokapot.peps import ( peps_from_scores_hist_nnls, @@ -126,6 +127,10 @@ def tdc( ) # Calculate q-values + # Note: I really feel like unique values from floats is not the best idea + # ... + # a sane alternative would be to use a specific precision and make it + # an integer. unique_metric, indices = np.unique(scores, return_counts=True) # Some arrays need to be flipped so that we can loop through from @@ -136,7 +141,35 @@ def tdc( unique_metric = np.flip(unique_metric) indices = np.flip(indices) + # import time + + # t0 = time.time() qvals = _fdr2qvalue(fdr, num_total, indices) + # et = (time.time() - t0) * 1000 + # print(f"Base Time: {et}") + # t0 = time.time() + # qvals_np = np.minimum.accumulate(fdr) + # et = (time.time() - t0) * 1000 + # print(f"Numpy Time: {et}") + + # CARE = False + # if CARE and not np.allclose(qvals, qvals_np): + # rmse = np.sqrt(np.mean((qvals - qvals_np) ** 2)) + # print(f"RMSE: {rmse}") + # from matplotlib import pyplot as plt + + # diff_window = (qvals > 0.6) & (qvals < 0.75) + # print(f"Diff Qvals: {qvals[diff_window]}") + # print(f"Diff Qvals_np: {qvals[diff_window]}") + # print(f"Diff Qvals_fdr: {qvals[diff_window]}") + + # plt.scatter(x=qvals, y=qvals_np, alpha=0.3) + # plt.xlabel("Qvals") + # plt.ylabel("Qvals Numpy") + # plt.show() + # # if rmse > 1e-3: + # # raise RuntimeError("Numpy implementation is not close.") + qvals = np.flip(qvals) qvals = qvals[np.argsort(srt_idx)] @@ -173,6 +206,9 @@ def _fdr2qvalue(fdr, num_total, indices): prev_idx = next_idx fdr_group = fdr[group] + # Q: Why isnt this a constant? + # Shouldnt all the elements in the group be the same? 
+ # JSPP 2024-12-16 n_group = num_total[group] curr_fdr = fdr_group[np.argmax(n_group)] if curr_fdr < min_q: @@ -183,6 +219,20 @@ def _fdr2qvalue(fdr, num_total, indices): return qvals +# Experimental for now ... will remove from the PR if needed. +if os.environ.get("MOKAPOT_QVALUES_USE_NUMPY", False): + import warnings + + warnings.warn( + "Using numpy implementation of q-value computation. " + "This is not recommended for production use." + "Set the environment variable MOKAPOT_QVALUES_USE_NUMPY=0 to disable." + ) + + def _fdr2qvalue(fdr, num_total, indices): + return np.minimum.accumulate(fdr) + + @typechecked def qvalues_from_scores( scores: np.ndarray[float], diff --git a/mokapot/writers/flashlfq.py b/mokapot/writers/flashlfq.py index 70157c9..d99a236 100644 --- a/mokapot/writers/flashlfq.py +++ b/mokapot/writers/flashlfq.py @@ -101,7 +101,7 @@ def _format_flashlfq(conf): # OLD: passing = peptides["mokapot q-value"] <= eval_fdr eval_fdr = conf.eval_fdr passing = pd.read_csv(conf.out_writers["peptides"][0].file_name, sep="\t") - passing = passing[passing["q-value"] < eval_fdr] + passing = passing[passing["mokapot_qvalue"] <= eval_fdr] cols_pull = opt_cols cols_pull["PSMId"] = conf.dataset.specId_column diff --git a/tests/system_tests/test_brew_rollup.py b/tests/system_tests/test_brew_rollup.py index aa34b69..48ae75a 100644 --- a/tests/system_tests/test_brew_rollup.py +++ b/tests/system_tests/test_brew_rollup.py @@ -151,6 +151,11 @@ def test_rollup_10000(rollup_src_dirs, suffix, tmp_path): qval_column = "mokapot_qvalue" assert_series_equal(df0[qval_column], df1[qval_column], atol=0.02) + + # Q: What is this meant to test? + # Why is the score expected to be "correlated" with the difference + # in q-values between the streaming and non-streaming implementations? 
+ # JSPP 2024-12-16 assert ( estimate_abs_int(df0.score, df1[qval_column] - df0[qval_column]) < 0.002 diff --git a/tests/system_tests/test_parquet.py b/tests/system_tests/test_parquet.py index dca4407..2bfbb0e 100644 --- a/tests/system_tests/test_parquet.py +++ b/tests/system_tests/test_parquet.py @@ -25,7 +25,7 @@ def test_parquet_output(tmp_path): "PSMId", "peptide", "score", - "q-value", + "mokapot_qvalue", "posterior_error_prob", "proteinIds", ] diff --git a/tests/system_tests/test_rollup.py b/tests/system_tests/test_rollup.py index 267bc1b..25bab5a 100644 --- a/tests/system_tests/test_rollup.py +++ b/tests/system_tests/test_rollup.py @@ -235,7 +235,7 @@ def read_tsv(filename): df_base = read_tsv("base.targets.psms.csv") - qvc = "q-value" + qvc = "mokapot_qvalue" pvc = "posterior_error_prob" pd.testing.assert_frame_equal( df_streamed.drop(columns=[qvc, pvc]), df_base.drop(columns=[qvc, pvc]) diff --git a/tests/unit_tests/test_confidence.py b/tests/unit_tests/test_confidence.py index 153a790..bf88bf5 100644 --- a/tests/unit_tests/test_confidence.py +++ b/tests/unit_tests/test_confidence.py @@ -130,7 +130,7 @@ def test_chunked_assign_confidence(psm_df_1000, tmp_path, deduplication): "PSMId", "peptide", "score", - "q-value", + "mokapot_qvalue", "posterior_error_prob", "proteinIds", ] @@ -158,10 +158,12 @@ def test_chunked_assign_confidence(psm_df_1000, tmp_path, deduplication): # 0.0103092780336737, # 0.0103092780336737, # ]) - assert np.all(df_head["q-value"] < 0.015), ( + assert np.all(df_head["mokapot_qvalue"] < 0.015), ( "Good q-values should be lt 0.015" ) - assert np.all(df_tail["q-value"] > 0.9), "Bad q-values should be gt 0.9" + assert np.all(df_tail["mokapot_qvalue"] > 0.9), ( + "Bad q-values should be gt 0.9" + ) # assert df["posterior_error_prob"].tolist() == approx([ # 3.315389846699129e-05, From be91528e7e82db91e7fb9b2ef841c176cc4a2329 Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Tue, 17 Dec 2024 13:56:00 -0600 Subject: [PATCH 08/12] chore,docs: updated basic docs to curr api and updated typing --- mokapot/confidence.py | 132 ++++++++++++++++++++++---------------- mokapot/dataset.py | 30 --------- mokapot/parsers/fasta.py | 18 +++--- mokapot/picked_protein.py | 14 +++- pyproject.toml | 2 +- uv.lock | 2 +- 6 files changed, 98 insertions(+), 100 deletions(-) diff --git a/mokapot/confidence.py b/mokapot/confidence.py index f2f2371..950aae6 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -13,6 +13,8 @@ We recommend using the :py:func:`~mokapot.brew()` function to obtain these confidence estimates, rather than initializing the classes below directly. + +TODO: update this docstring. """ from __future__ import annotations @@ -38,6 +40,7 @@ peps_func_from_hist_nnls, PepsConvergenceError, ) +from mokapot.proteins import Proteins from mokapot.picked_protein import picked_protein from mokapot.qvalues import qvalues_from_scores, qvalues_func_from_hist from mokapot.statistics import OnlineStatistics, HistData @@ -70,8 +73,45 @@ # Classes --------------------------------------------------------------------- @typechecked -class Confidence(object): - """Estimate the statistical confidence for a collection of PSMs.""" +class Confidence: + """Calculate, Store and provide access to confidence estimates. + + This class stores confidence estimates (q-values and PEPs) computed for + different levels (PSMs, peptides, proteins) and provides methods + to access, save and visualize these estimates. + + Parameters + ---------- + dataset : PsmDataset + The dataset containing PSMs and metadata + levels : list[str] + Levels at which confidence estimation was performed + (e.g. 
['psms','peptides']) + level_paths : dict[str, Path] + Paths to intermediate files for each confidence level + out_writers : dict[str, Sequence[TabularDataWriter]] + Writers for outputting results at each level + eval_fdr : float, optional + FDR threshold for evaluation metrics, by default 0.01 + write_decoys : bool, optional + Whether to write decoy results, by default False + do_rollup : bool, optional + Whether to perform protein inference, by default True + proteins : pd.DataFrame, optional + Protein annotations if protein inference is used + peps_error : bool, optional + Whether to raise error on PEP calculation failure, by default False + rng : int or np.random.Generator, optional + Random number generator for reproducibility + peps_algorithm : str, optional + Algorithm for PEP calculation, by default "qvality" + qvalue_algorithm : str, optional + Algorithm for q-value calculation, by default "tdc" + stream_confidence : bool, optional + Whether to stream confidence calculations, by default False + score_stats : OnlineStatistics, optional + Pre-computed score statistics if streaming + """ def __init__( self, @@ -82,7 +122,7 @@ def __init__( eval_fdr: float = 0.01, write_decoys: bool = False, do_rollup: bool = True, - proteins: pd.DataFrame | None = None, + proteins: Proteins | None = None, peps_error: bool = False, rng=0, peps_algorithm: str = "qvality", @@ -90,34 +130,6 @@ def __init__( stream_confidence: bool = False, score_stats=None, ): - """Initialize a Confidence object. - - Assign confidence estimates to a set of PSMs - - Estimate q-values and posterior error probabilities (PEPs) for PSMs and - peptides when ranked by the provided scores. - - Parameters - ---------- - dataset : OnDiskPsmDataset - An OnDiskPsmDataset. - rng : int or np.random.Generator, optional - A seed or generator used for cross-validation split creation and to - break ties, or ``None`` to use the default random number generator - state. 
- levels : list[str] - Levels at which confidence estimation was performed - level_paths : list[Path] - Files with unique psms and unique peptides. - out_paths : list[list[Path]] - The output files where the results will be written - eval_fdr : float - The FDR threshold at which to report performance. This parameter - has no affect on the analysis itself, only logging messages. - write_decoys : bool - Save decoys confidence estimates as well? - """ - self.dataset = dataset self._score_column = "score" self._target_column = dataset.target_column @@ -172,7 +184,7 @@ def __repr__(self) -> str: def _assign_confidence( self, - levels: list[str], + levels: list[str], # why is this passed if its a property of self? level_path_map: dict[str, Path], out_writers_map: dict[str, Sequence[TabularDataWriter]], write_decoys: bool = False, @@ -237,26 +249,28 @@ def _assign_confidence( level_path.unlink(missing_ok=True) - def _write_protein_level_data(self, level_paths, proteins, rng): + def _write_protein_level_data( + self, level_paths: dict[str, Path], proteins: Proteins, rng + ): psms = TabularDataReader.from_path(level_paths["psms"]).read() - proteins = picked_protein( - psms, - self._target_column, - self._peptide_column, - self._score_column, - proteins, - rng, + proteins_df = picked_protein( + peptides=psms, + target_column=self._target_column, + peptide_column=self._peptide_column, + score_column=self._score_column, + proteins=proteins, + rng=rng, ) - proteins = proteins.sort_values( + proteins_df = proteins_df.sort_values( by=self._score_column, ascending=False ).reset_index(drop=True) protein_writer = TabularDataWriter.from_suffix( file_name=level_paths["proteins"], - columns=proteins.columns.tolist(), - column_types=proteins.dtypes.tolist(), + columns=proteins_df.columns.tolist(), + column_types=proteins_df.dtypes.tolist(), ) - protein_writer.write(proteins) - LOGGER.info("\t- Found %i unique protein groups.", len(proteins)) + protein_writer.write(proteins_df) + 
LOGGER.info("\t- Found %i unique protein groups.", len(proteins_df)) def get_optional_columns(self) -> OptionalColumns: return self.dataset.get_optional_columns() @@ -326,16 +340,16 @@ def assign_confidence( file_root: str = "", prefixes: list[str | None] | None = None, write_decoys: bool = False, - deduplication=True, - do_rollup=True, - proteins=None, - append_to_output_file=False, - rng=0, - peps_error=False, - peps_algorithm="qvality", - qvalue_algorithm="tdc", - sqlite_path=None, - stream_confidence=False, + deduplication: bool = True, + do_rollup: bool = True, + proteins: Proteins | None = None, + append_to_output_file: bool = False, + rng: int | np.random.Generator = 0, + peps_error: bool = False, + peps_algorithm="qvality", # TODO make this an enum (2024-12-17) + qvalue_algorithm="tdc", # TODO make this an enum (2024-12-17) + sqlite_path: Path | None = None, + stream_confidence: bool = False, ) -> list[Confidence]: """Assign confidence to PSMs peptides, and optionally, proteins. @@ -512,8 +526,6 @@ def assign_confidence( class LevelWriterCollection: - """ """ - def __init__( self, levels: list[str], @@ -628,6 +640,8 @@ class LevelManager: """Manages level-specific data and operations. This class is meant to be used internally by the `Confidence` class. + But it basically bundles the output path creation logic and the column + +level mapping logic. Parameters ---------- @@ -654,6 +668,10 @@ class LevelManager: The prefix added to all output file names. The final file names will be: `dest_dir / file_root+level+default_extension` + protein_column : str, optional + The column that specifies which protein(s) the detected peptide might + have originated from. This column is not used to compute protein-level + confidence estimates, it is just propagated to the output. 
""" def __init__( diff --git a/mokapot/dataset.py b/mokapot/dataset.py index e0b7bef..286ed79 100644 --- a/mokapot/dataset.py +++ b/mokapot/dataset.py @@ -30,8 +30,6 @@ from mokapot import qvalues from mokapot import utils -from mokapot.parsers.fasta import read_fasta -from mokapot.proteins import Proteins from .tabular_data import TabularDataReader, DataFrameReader LOGGER = logging.getLogger(__name__) @@ -107,29 +105,6 @@ def rng(self, rng): """Set the random number generator""" self._rng = np.random.default_rng(rng) - def add_proteins(self, proteins, **kwargs): - """Add protein information to the dataset. - - Protein sequence information is required to compute protein-level - confidence estimates using the picked-protein approach. - - Parameters - ---------- - proteins : a Proteins object or str - The :py:class:`~mokapot.proteins.Proteins` object defines the - mapping of peptides to proteins and the mapping of decoy proteins - to their corresponding target proteins. Alternatively, a string - specifying a FASTA file can be specified which will be parsed to - define these mappings. - **kwargs : dict - Keyword arguments to be passed to the - :py:class:`mokapot.read_fasta()` function. - """ - if not isinstance(proteins, Proteins): - proteins = read_fasta(proteins, **kwargs) - - self._proteins = proteins - @abstractmethod def get_optional_columns(self) -> OptionalColumns: """Return a dictionary of optional columns and their names. 
@@ -657,11 +632,6 @@ def columns(self): """The columns of the dataset.""" return self.data.columns.tolist() - @property - def has_proteins(self): - """Has a FASTA file been added?""" - return self._proteins is not None - def _targets_count_by_feature(self, desc, eval_fdr): """ iterate over features and count the number of positive examples diff --git a/mokapot/parsers/fasta.py b/mokapot/parsers/fasta.py index ceb6140..98f7b10 100644 --- a/mokapot/parsers/fasta.py +++ b/mokapot/parsers/fasta.py @@ -14,14 +14,14 @@ def read_fasta( - fasta_files, - enzyme="[KR]", - missed_cleavages=2, - clip_nterm_methionine=False, - min_length=6, - max_length=50, - semi=False, - decoy_prefix="decoy_", + fasta_files: str | list[str] | tuple[str], + enzyme: str | re.Pattern = "[KR]", + missed_cleavages: int = 2, + clip_nterm_methionine: bool = False, + min_length: int = 6, + max_length: int = 50, + semi: bool = False, + decoy_prefix: str = "decoy_", ): """Parse a FASTA file, storing a mapping of peptides and proteins. 
@@ -310,7 +310,7 @@ def digest( # Private Functions ----------------------------------------------------------- -def _parse_fasta_files(fasta_files): +def _parse_fasta_files(fasta_files: str | list[str] | tuple[str]) -> list[str]: """Read a fasta file and divide into proteins Parameters diff --git a/mokapot/picked_protein.py b/mokapot/picked_protein.py index 10e1413..15a7a3a 100644 --- a/mokapot/picked_protein.py +++ b/mokapot/picked_protein.py @@ -5,15 +5,22 @@ import logging import pandas as pd +import numpy as np from mokapot import utils from mokapot.peptides import match_decoy +from mokapot.proteins import Proteins LOGGER = logging.getLogger(__name__) def picked_protein( - peptides, target_column, peptide_column, score_column, proteins, rng + peptides: pd.DataFrame, + target_column: str, + peptide_column: str, + score_column: str, + proteins: Proteins, + rng: int | np.random.Generator, ): """Perform the picked-protein approach @@ -25,6 +32,8 @@ def picked_protein( The column in `peptides` indicating if the peptide is a target. peptide_column : str The column in `peptides` containing the peptide sequence. + score_column : str + The column in `peptides` containing the score. proteins : Proteins object A Proteins object. rng : int or numpy.random.Generator @@ -142,6 +151,7 @@ def strip_peptides(sequences): sequences.str.replace(r"[\[\(].*?[\]\)]", "", regex=True) .str.replace(r"^.*?\.", "", regex=True) .str.replace(r"\..*?$", "", regex=True) + .str.strip("-") ) # Sometimes folks use lowercase letters for the termini or mods: @@ -153,7 +163,7 @@ def strip_peptides(sequences): return sequences -def group_with_decoys(peptides, proteins): +def group_with_decoys(peptides: pd.DataFrame, proteins: Proteins): """Retrieve the protein group in the case where the FASTA has decoys. 
Parameters diff --git a/pyproject.toml b/pyproject.toml index 47e81a1..3810c93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,7 +79,7 @@ select = ["E", "F", "T20"] # T20 is for print() statements. [tool.ruff.lint.per-file-ignores] "__init__.py" = ["F401"] -"docs/*.ipynb" = ["T20"] +"docs/**/*.ipynb" = ["T20"] "test_parser_pepxml.py" = ["E501"] [tool.setuptools] diff --git a/uv.lock b/uv.lock index 24996b0..79c1ea8 100644 --- a/uv.lock +++ b/uv.lock @@ -1159,7 +1159,7 @@ wheels = [ [[package]] name = "mokapot" -version = "0.10.1.dev37+gd4d3e8c.d20241215" +version = "0.10.1.dev40+g2e43ce2.d20241217" source = { editable = "." } dependencies = [ { name = "importlib-metadata" }, From 680fc5b540e8729f3549abc98e2287aca464bbc4 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Tue, 17 Dec 2024 16:02:38 -0600 Subject: [PATCH 09/12] chore: updated basic n joint model docs code (md in progress) --- docs/source/vignettes/.gitignore | 3 + docs/source/vignettes/basic_python_api.ipynb | 551 +++++++++---------- docs/source/vignettes/joint_models.ipynb | 410 +++++++++----- 3 files changed, 542 insertions(+), 422 deletions(-) create mode 100644 docs/source/vignettes/.gitignore diff --git a/docs/source/vignettes/.gitignore b/docs/source/vignettes/.gitignore new file mode 100644 index 0000000..2983688 --- /dev/null +++ b/docs/source/vignettes/.gitignore @@ -0,0 +1,3 @@ + +joint_models/ +basic_python_api_output/ diff --git a/docs/source/vignettes/basic_python_api.ipynb b/docs/source/vignettes/basic_python_api.ipynb index 73548dc..ad0dc6e 100644 --- a/docs/source/vignettes/basic_python_api.ipynb +++ b/docs/source/vignettes/basic_python_api.ipynb @@ -23,7 +23,8 @@ "metadata": {}, "outputs": [], "source": [ - "pin_file = \"../../../data/phospho_rep1.pin\"" + "from pathlib import Path\n", + "pin_file = Path(\"../../../data/phospho_rep1.pin\")" ] }, { @@ -39,7 +40,7 @@ "metadata": {}, "outputs": [], "source": [ - "fasta_file = \"../../../data/human_sp_td.fasta\"" + 
"fasta_file = Path(\"../../../data/human_sp_td.fasta\")" ] }, { @@ -57,7 +58,6 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", "import mokapot\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", @@ -66,8 +66,12 @@ "np.random.seed(42)\n", "\n", "# Create an output directory\n", - "out_dir = \"basic_python_api_output\"\n", - "os.makedirs(out_dir, exist_ok=True)" + "out_dir = Path(\"basic_python_api_output\")\n", + "out_dir.mkdir(exist_ok=True)\n", + "mokapot_outs = (out_dir / \"mokapot\")\n", + "mokapot_outs.mkdir(exist_ok=True)\n", + "tide_outs = (out_dir / \"tide\")\n", + "tide_outs.mkdir(exist_ok=True)" ] }, { @@ -100,16 +104,57 @@ "source": [ "## Step 2: Read the PSMs\n", "\n", - "We'll now use mokapot to read the PSMs from the provided input file. The [read_pin()](https://mokapot.readthedocs.io/en/latest/api/functions.html#mokapot.read_pin) function returns [LinearPsmDataset](https://mokapot.readthedocs.io/en/latest/api/dataset.html#mokapot.dataset.LinearPsmDataset) object, which stores the PSMs and their associated features for analysis." + "We'll now use mokapot to read the PSMs from the provided input file. The [read_pin()](https://mokapot.readthedocs.io/en/latest/api/functions.html#mokapot.read_pin) function returns a list of [LinearPsmDataset](https://mokapot.readthedocs.io/en/latest/api/dataset.html#mokapot.dataset.LinearPsmDataset) objects, which store the PSMs and their associated features for analysis."
] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[OnDiskPsmDataset object\n", + " Reader: CSVFileReader(self.file_name=PosixPath('../../../data/phospho_rep1.pin'))\n", + " Spectrum columns: ['ScanNr', 'ExpMass']\n", + " Peptide column: Peptide\n", + " Protein column: Proteins\n", + " Feature columns: ['lnrSp', 'deltLCn', 'deltCn', 'Sp', 'IonFrac', 'RefactoredXCorr', 'NegLog10PValue', 'NegLog10ResEvPValue', 'NegLog10CombinePValue', 'PepLen', 'Charge1', 'Charge2', 'Charge3', 'Charge4', 'Charge5', 'enzN', 'enzC', 'enzInt', 'lnNumDSP', 'dM', 'absdM']\n", + " Metadata columns: ['SpecId', 'ScanNr', 'Peptide', 'Proteins', 'Label', 'CalcMass', 'ExpMass']\n", + " Metadata columns types: [dtype('O'), dtype('int64'), dtype('O'), dtype('O'), dtype('int64'), dtype('float64'), dtype('float64')]\n", + " Level columns: ['Peptide']\n", + " Filename column: None\n", + " Scan column: ScanNr\n", + " Calcmass column: CalcMass\n", + " Expmass column: ExpMass\n", + " Rt column: None\n", + " Charge column: None\n", + " Spectra DF: \n", + " ScanNr ExpMass Label\n", + " 0 16619 750.4149 True\n", + " 1 2025 751.4212 True\n", + " 2 1598 751.4223 True\n", + " 3 9595 750.4153 True\n", + " 4 8281 749.4421 True\n", + " ... ... ... 
...\n", + " 55393 59499 6392.0034 False\n", + " 55394 62362 6455.8178 False\n", + " 55395 61067 6603.0532 False\n", + " 55396 62418 6684.0410 False\n", + " 55397 45960 6734.9956 False\n", + " \n", + " [55398 rows x 3 columns]]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "psms = mokapot.read_pin(pin_file)" + "psms = mokapot.read_pin(pin_file, max_workers=1)\n", + "psms" ] }, { @@ -126,7 +171,9 @@ "metadata": {}, "outputs": [], "source": [ - "psms.add_proteins(fasta_file)" + "from mokapot.parsers.fasta import read_fasta\n", + "\n", + "proteins = read_fasta(fasta_file)" ] }, { @@ -142,146 +189,85 @@ "cell_type": "code", "execution_count": 7, "metadata": {}, + "outputs": [], + "source": [ + "# moka_conf, models = mokapot.brew(psms)\n", + "models, scores = mokapot.brew(psms)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sebastianpaez/git/mokapot_worktree/auto_pin_handling/mokapot/tabular_data/csv.py:102: UserWarning: CSV file basic_python_api_output/mokapot/targets.psms.csv exists, but will be overwritten.\n", + " warnings.warn(\n", + "/Users/sebastianpaez/git/mokapot_worktree/auto_pin_handling/mokapot/tabular_data/csv.py:102: UserWarning: CSV file basic_python_api_output/mokapot/targets.peptides.csv exists, but will be overwritten.\n", + " warnings.warn(\n", + "/Users/sebastianpaez/git/mokapot_worktree/auto_pin_handling/mokapot/tabular_data/csv.py:102: UserWarning: CSV file basic_python_api_output/mokapot/targets.proteins.csv exists, but will be overwritten.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "moka_conf = mokapot.assign_confidence(psms, dest_dir=mokapot_outs, proteins=proteins)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SpecIdLabelScanNrExpMassCalcMassPeptidemokapot scoremokapot q-valuemokapot PEPProteins
0target_0_48845_5_-1True488455269.57565269.5728R.RGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHT...11.2154980.0000536.305117e-16sp|P68104|EF1A1_HUMAN
1target_0_45243_4_-1True452433945.87593945.8706R.CSDAAGYPHATHDLEGPPLDAYSIQGQHTISPLDLAK.L10.6010630.0000536.305117e-16sp|Q15365|PCBP1_HUMAN
2target_0_51371_4_-1True513714051.12234051.1086K.KLHEEEIQELQAQIQEQHVQIDVDVSKPDLTAALR.D10.5508550.0000536.305117e-16sp|P08670|VIME_HUMAN
3target_0_41715_3_-1True417154473.83594473.8286K.ALGKYGPADVEDTTGSGATDSKDDDDIDLFGS[79.97]DDEEE...9.9646990.0000536.305117e-16sp|P24534|EF1B_HUMAN
4target_0_48913_5_-1True489135269.57375269.5728R.RGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHT...9.8743740.0000536.305117e-16sp|P68104|EF1A1_HUMAN
\n", - "
" - ], "text/plain": [ - " SpecId Label ScanNr ExpMass CalcMass \\\n", - "0 target_0_48845_5_-1 True 48845 5269.5756 5269.5728 \n", - "1 target_0_45243_4_-1 True 45243 3945.8759 3945.8706 \n", - "2 target_0_51371_4_-1 True 51371 4051.1223 4051.1086 \n", - "3 target_0_41715_3_-1 True 41715 4473.8359 4473.8286 \n", - "4 target_0_48913_5_-1 True 48913 5269.5737 5269.5728 \n", - "\n", - " Peptide mokapot score \\\n", - "0 R.RGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHT... 11.215498 \n", - "1 R.CSDAAGYPHATHDLEGPPLDAYSIQGQHTISPLDLAK.L 10.601063 \n", - "2 K.KLHEEEIQELQAQIQEQHVQIDVDVSKPDLTAALR.D 10.550855 \n", - "3 K.ALGKYGPADVEDTTGSGATDSKDDDDIDLFGS[79.97]DDEEE... 9.964699 \n", - "4 R.RGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHT... 9.874374 \n", - "\n", - " mokapot q-value mokapot PEP Proteins \n", - "0 0.000053 6.305117e-16 sp|P68104|EF1A1_HUMAN \n", - "1 0.000053 6.305117e-16 sp|Q15365|PCBP1_HUMAN \n", - "2 0.000053 6.305117e-16 sp|P08670|VIME_HUMAN \n", - "3 0.000053 6.305117e-16 sp|P24534|EF1B_HUMAN \n", - "4 0.000053 6.305117e-16 sp|P68104|EF1A1_HUMAN " + "[Confidence object\n", + " Dataset: \n", + " \tOnDiskPsmDataset object\n", + " \tReader: CSVFileReader(self.file_name=PosixPath('../../../data/phospho_rep1.pin'))\n", + " \tSpectrum columns: ['ScanNr', 'ExpMass']\n", + " \tPeptide column: Peptide\n", + " \tProtein column: Proteins\n", + " \tFeature columns: ['lnrSp', 'deltLCn', 'deltCn', 'Sp', 'IonFrac', 'RefactoredXCorr', 'NegLog10PValue', 'NegLog10ResEvPValue', 'NegLog10CombinePValue', 'PepLen', 'Charge1', 'Charge2', 'Charge3', 'Charge4', 'Charge5', 'enzN', 'enzC', 'enzInt', 'lnNumDSP', 'dM', 'absdM']\n", + " \tMetadata columns: ['SpecId', 'ScanNr', 'Peptide', 'Proteins', 'Label', 'CalcMass', 'ExpMass']\n", + " \tMetadata columns types: [dtype('O'), dtype('int64'), dtype('O'), dtype('O'), dtype('int64'), dtype('float64'), dtype('float64')]\n", + " \tLevel columns: ['Peptide']\n", + " \tFilename column: None\n", + " \tScan column: ScanNr\n", + " \tCalcmass column: 
CalcMass\n", + " \tExpmass column: ExpMass\n", + " \tRt column: None\n", + " \tCharge column: None\n", + " \tSpectra DF: \n", + " \t\tUnset\n", + " \t\n", + " Levels: ['psms', 'peptides', 'proteins']\n", + " Level paths: {'psms': PosixPath('basic_python_api_output/mokapot/psms.csv'), 'peptides': PosixPath('basic_python_api_output/mokapot/peptides.csv'), 'proteins': PosixPath('basic_python_api_output/mokapot/proteins.csv')}\n", + " Out writers: {'psms': [CSVFileWriter(self.file_name=PosixPath('basic_python_api_output/mokapot/targets.psms.csv'),self.columns=['PSMId', 'peptide', 'score', 'mokapot_qvalue', 'posterior_error_prob', 'proteinIds'],self.stdargs={'sep': '\\t', 'index': False})], 'peptides': [CSVFileWriter(self.file_name=PosixPath('basic_python_api_output/mokapot/targets.peptides.csv'),self.columns=['PSMId', 'peptide', 'score', 'mokapot_qvalue', 'posterior_error_prob', 'proteinIds'],self.stdargs={'sep': '\\t', 'index': False})], 'proteins': [CSVFileWriter(self.file_name=PosixPath('basic_python_api_output/mokapot/targets.proteins.csv'),self.columns=['mokapot protein group', 'best peptide', 'stripped sequence', 'score', 'mokapot_qvalue', 'posterior_error_prob'],self.stdargs={'sep': '\\t', 'index': False})]}\n", + " Eval FDR: 0.01\n", + " Write decoys: False\n", + " Do rollup: True\n", + " Peps error: False\n", + " Rng: 0\n", + " Score stats: OnlineStatistics(min=-4.48121575197243, max=11.092598293745384, n=101599, sum=45383.74404487357, mean=0.4466947907447275, M2n=390155.2088566294, ddof=1, unbiased=True)]" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "moka_conf, models = mokapot.brew(psms)\n", - "moka_conf.psms.head()" + "moka_conf" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -305,123 +291,92 @@ " \n", " \n", " \n", - " SpecId\n", - " Label\n", - " ScanNr\n", - " ExpMass\n", - " CalcMass\n", - " Peptide\n", - " mokapot 
score\n", - " mokapot q-value\n", - " mokapot PEP\n", - " Proteins\n", + " PSMId\n", + " peptide\n", + " score\n", + " mokapot_qvalue\n", + " posterior_error_prob\n", + " proteinIds\n", " \n", " \n", " \n", " \n", " 0\n", - " target_0_48845_5_-1\n", - " True\n", - " 48845\n", - " 5269.5756\n", - " 5269.5728\n", - " R.RGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHT...\n", - " 11.215498\n", - " 0.000073\n", + " target_0_51371_4_-1\n", + " K.KLHEEEIQELQAQIQEQHVQIDVDVSKPDLTAALR.D\n", + " 11.092598\n", + " 0.000079\n", " 6.305117e-16\n", - " sp|P68104|EF1A1_HUMAN\n", + " sp|P08670|VIME_HUMAN\n", " \n", " \n", " 1\n", - " target_0_45243_4_-1\n", - " True\n", - " 45243\n", - " 3945.8759\n", - " 3945.8706\n", - " R.CSDAAGYPHATHDLEGPPLDAYSIQGQHTISPLDLAK.L\n", - " 10.601063\n", - " 0.000073\n", + " target_0_48845_5_-1\n", + " R.RGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHT...\n", + " 11.061756\n", + " 0.000079\n", " 6.305117e-16\n", - " sp|Q15365|PCBP1_HUMAN\n", + " sp|P68104|EF1A1_HUMAN\n", " \n", " \n", " 2\n", - " target_0_51371_4_-1\n", - " True\n", - " 51371\n", - " 4051.1223\n", - " 4051.1086\n", - " K.KLHEEEIQELQAQIQEQHVQIDVDVSKPDLTAALR.D\n", - " 10.550855\n", - " 0.000073\n", + " target_0_41715_3_-1\n", + " K.ALGKYGPADVEDTTGSGATDSKDDDDIDLFGS[79.97]DDEEE...\n", + " 10.039874\n", + " 0.000079\n", " 6.305117e-16\n", - " sp|P08670|VIME_HUMAN\n", + " sp|P24534|EF1B_HUMAN\n", " \n", " \n", " 3\n", - " target_0_41715_3_-1\n", - " True\n", - " 41715\n", - " 4473.8359\n", - " 4473.8286\n", - " K.ALGKYGPADVEDTTGSGATDSKDDDDIDLFGS[79.97]DDEEE...\n", - " 9.964699\n", - " 0.000073\n", + " target_0_54061_3_-1\n", + " K.YQFVREPEDEEEEEEEEEEDEDEDLEELEVLER.K\n", + " 9.843314\n", + " 0.000079\n", " 6.305117e-16\n", - " sp|P24534|EF1B_HUMAN\n", + " sp|Q9NQC3|RTN4_HUMAN\n", " \n", " \n", " 4\n", - " target_0_31886_3_-1\n", - " True\n", - " 31886\n", - " 3450.7612\n", - " 3450.7544\n", - " R.AAAAVAAAASSCRPLGSGAGPGPTGAAPVSAPAPGPGPAGK.G\n", - " 9.778672\n", - " 0.000073\n", + " 
target_0_40746_5_-1\n", + " K.KLVHNALANLDGHPEDKPTHIIFGS[79.97]DS[79.97]ECE...\n", + " 9.830909\n", + " 0.000079\n", " 6.305117e-16\n", - " sp|Q9NRL3|STRN4_HUMAN\n", + " sp|Q76FK4|NOL8_HUMAN\n", " \n", " \n", "\n", "" ], "text/plain": [ - " SpecId Label ScanNr ExpMass CalcMass \\\n", - "0 target_0_48845_5_-1 True 48845 5269.5756 5269.5728 \n", - "1 target_0_45243_4_-1 True 45243 3945.8759 3945.8706 \n", - "2 target_0_51371_4_-1 True 51371 4051.1223 4051.1086 \n", - "3 target_0_41715_3_-1 True 41715 4473.8359 4473.8286 \n", - "4 target_0_31886_3_-1 True 31886 3450.7612 3450.7544 \n", - "\n", - " Peptide mokapot score \\\n", - "0 R.RGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHT... 11.215498 \n", - "1 R.CSDAAGYPHATHDLEGPPLDAYSIQGQHTISPLDLAK.L 10.601063 \n", - "2 K.KLHEEEIQELQAQIQEQHVQIDVDVSKPDLTAALR.D 10.550855 \n", - "3 K.ALGKYGPADVEDTTGSGATDSKDDDDIDLFGS[79.97]DDEEE... 9.964699 \n", - "4 R.AAAAVAAAASSCRPLGSGAGPGPTGAAPVSAPAPGPGPAGK.G 9.778672 \n", + " PSMId peptide \\\n", + "0 target_0_51371_4_-1 K.KLHEEEIQELQAQIQEQHVQIDVDVSKPDLTAALR.D \n", + "1 target_0_48845_5_-1 R.RGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHT... \n", + "2 target_0_41715_3_-1 K.ALGKYGPADVEDTTGSGATDSKDDDDIDLFGS[79.97]DDEEE... \n", + "3 target_0_54061_3_-1 K.YQFVREPEDEEEEEEEEEEDEDEDLEELEVLER.K \n", + "4 target_0_40746_5_-1 K.KLVHNALANLDGHPEDKPTHIIFGS[79.97]DS[79.97]ECE... 
\n", "\n", - " mokapot q-value mokapot PEP Proteins \n", - "0 0.000073 6.305117e-16 sp|P68104|EF1A1_HUMAN \n", - "1 0.000073 6.305117e-16 sp|Q15365|PCBP1_HUMAN \n", - "2 0.000073 6.305117e-16 sp|P08670|VIME_HUMAN \n", - "3 0.000073 6.305117e-16 sp|P24534|EF1B_HUMAN \n", - "4 0.000073 6.305117e-16 sp|Q9NRL3|STRN4_HUMAN " + " score mokapot_qvalue posterior_error_prob proteinIds \n", + "0 11.092598 0.000079 6.305117e-16 sp|P08670|VIME_HUMAN \n", + "1 11.061756 0.000079 6.305117e-16 sp|P68104|EF1A1_HUMAN \n", + "2 10.039874 0.000079 6.305117e-16 sp|P24534|EF1B_HUMAN \n", + "3 9.843314 0.000079 6.305117e-16 sp|Q9NQC3|RTN4_HUMAN \n", + "4 9.830909 0.000079 6.305117e-16 sp|Q76FK4|NOL8_HUMAN " ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "moka_conf.peptides.head()" + "moka_conf[0].peptides.head()" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -448,55 +403,55 @@ " mokapot protein group\n", " best peptide\n", " stripped sequence\n", - " mokapot score\n", - " mokapot q-value\n", - " mokapot PEP\n", + " score\n", + " mokapot_qvalue\n", + " posterior_error_prob\n", " \n", " \n", " \n", " \n", " 0\n", - " sp|P68104|EF1A1_HUMAN\n", - " R.RGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHT...\n", - " RGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHTAH...\n", - " 11.215498\n", - " 0.000291\n", + " sp|P08670|VIME_HUMAN\n", + " K.KLHEEEIQELQAQIQEQHVQIDVDVSKPDLTAALR.D\n", + " KLHEEEIQELQAQIQEQHVQIDVDVSKPDLTAALR\n", + " 11.092598\n", + " 0.000292\n", " 6.305117e-16\n", " \n", " \n", " 1\n", - " sp|Q15365|PCBP1_HUMAN\n", - " R.CSDAAGYPHATHDLEGPPLDAYSIQGQHTISPLDLAK.L\n", - " CSDAAGYPHATHDLEGPPLDAYSIQGQHTISPLDLAK\n", - " 10.601063\n", - " 0.000291\n", + " sp|P68104|EF1A1_HUMAN\n", + " R.RGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHT...\n", + " RGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHTAH...\n", + " 11.061756\n", + " 0.000292\n", " 6.305117e-16\n", " \n", " \n", 
" 2\n", - " sp|P08670|VIME_HUMAN\n", - " K.KLHEEEIQELQAQIQEQHVQIDVDVSKPDLTAALR.D\n", - " KLHEEEIQELQAQIQEQHVQIDVDVSKPDLTAALR\n", - " 10.550855\n", - " 0.000291\n", + " sp|P24534|EF1B_HUMAN\n", + " K.ALGKYGPADVEDTTGSGATDSKDDDDIDLFGS[79.97]DDEEE...\n", + " ALGKYGPADVEDTTGSGATDSKDDDDIDLFGSDDEEESEEAK\n", + " 10.039874\n", + " 0.000292\n", " 6.305117e-16\n", " \n", " \n", " 3\n", - " sp|P24534|EF1B_HUMAN\n", - " K.ALGKYGPADVEDTTGSGATDSKDDDDIDLFGS[79.97]DDEEE...\n", - " ALGKYGPADVEDTTGSGATDSKDDDDIDLFGSDDEEESEEAK\n", - " 9.964699\n", - " 0.000291\n", + " sp|Q9NQC3|RTN4_HUMAN\n", + " K.YQFVREPEDEEEEEEEEEEDEDEDLEELEVLER.K\n", + " YQFVREPEDEEEEEEEEEEDEDEDLEELEVLER\n", + " 9.843314\n", + " 0.000292\n", " 6.305117e-16\n", " \n", " \n", " 4\n", - " sp|Q9NRL3|STRN4_HUMAN\n", - " R.AAAAVAAAASSCRPLGSGAGPGPTGAAPVSAPAPGPGPAGK.G\n", - " AAAAVAAAASSCRPLGSGAGPGPTGAAPVSAPAPGPGPAGK\n", - " 9.778672\n", - " 0.000291\n", + " sp|Q76FK4|NOL8_HUMAN\n", + " K.KLVHNALANLDGHPEDKPTHIIFGS[79.97]DS[79.97]ECE...\n", + " KLVHNALANLDGHPEDKPTHIIFGSDSECETEETSTQEQSHPGEEWVK\n", + " 9.830909\n", + " 0.000292\n", " 6.305117e-16\n", " \n", " \n", @@ -505,34 +460,34 @@ ], "text/plain": [ " mokapot protein group best peptide \\\n", - "0 sp|P68104|EF1A1_HUMAN R.RGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHT... \n", - "1 sp|Q15365|PCBP1_HUMAN R.CSDAAGYPHATHDLEGPPLDAYSIQGQHTISPLDLAK.L \n", - "2 sp|P08670|VIME_HUMAN K.KLHEEEIQELQAQIQEQHVQIDVDVSKPDLTAALR.D \n", - "3 sp|P24534|EF1B_HUMAN K.ALGKYGPADVEDTTGSGATDSKDDDDIDLFGS[79.97]DDEEE... \n", - "4 sp|Q9NRL3|STRN4_HUMAN R.AAAAVAAAASSCRPLGSGAGPGPTGAAPVSAPAPGPGPAGK.G \n", + "0 sp|P08670|VIME_HUMAN K.KLHEEEIQELQAQIQEQHVQIDVDVSKPDLTAALR.D \n", + "1 sp|P68104|EF1A1_HUMAN R.RGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHT... \n", + "2 sp|P24534|EF1B_HUMAN K.ALGKYGPADVEDTTGSGATDSKDDDDIDLFGS[79.97]DDEEE... \n", + "3 sp|Q9NQC3|RTN4_HUMAN K.YQFVREPEDEEEEEEEEEEDEDEDLEELEVLER.K \n", + "4 sp|Q76FK4|NOL8_HUMAN K.KLVHNALANLDGHPEDKPTHIIFGS[79.97]DS[79.97]ECE... 
\n", "\n", - " stripped sequence mokapot score \\\n", - "0 RGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHTAH... 11.215498 \n", - "1 CSDAAGYPHATHDLEGPPLDAYSIQGQHTISPLDLAK 10.601063 \n", - "2 KLHEEEIQELQAQIQEQHVQIDVDVSKPDLTAALR 10.550855 \n", - "3 ALGKYGPADVEDTTGSGATDSKDDDDIDLFGSDDEEESEEAK 9.964699 \n", - "4 AAAAVAAAASSCRPLGSGAGPGPTGAAPVSAPAPGPGPAGK 9.778672 \n", + " stripped sequence score \\\n", + "0 KLHEEEIQELQAQIQEQHVQIDVDVSKPDLTAALR 11.092598 \n", + "1 RGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHTAH... 11.061756 \n", + "2 ALGKYGPADVEDTTGSGATDSKDDDDIDLFGSDDEEESEEAK 10.039874 \n", + "3 YQFVREPEDEEEEEEEEEEDEDEDLEELEVLER 9.843314 \n", + "4 KLVHNALANLDGHPEDKPTHIIFGSDSECETEETSTQEQSHPGEEWVK 9.830909 \n", "\n", - " mokapot q-value mokapot PEP \n", - "0 0.000291 6.305117e-16 \n", - "1 0.000291 6.305117e-16 \n", - "2 0.000291 6.305117e-16 \n", - "3 0.000291 6.305117e-16 \n", - "4 0.000291 6.305117e-16 " + " mokapot_qvalue posterior_error_prob \n", + "0 0.000292 6.305117e-16 \n", + "1 0.000292 6.305117e-16 \n", + "2 0.000292 6.305117e-16 \n", + "3 0.000292 6.305117e-16 \n", + "4 0.000292 6.305117e-16 " ] }, - "execution_count": 9, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "moka_conf.proteins.head()" + "moka_conf[0].proteins.head()" ] }, { @@ -544,11 +499,27 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sebastianpaez/git/mokapot_worktree/auto_pin_handling/mokapot/tabular_data/csv.py:102: UserWarning: CSV file basic_python_api_output/tide/targets.psms.csv exists, but will be overwritten.\n", + " warnings.warn(\n", + "/Users/sebastianpaez/git/mokapot_worktree/auto_pin_handling/mokapot/tabular_data/csv.py:102: UserWarning: CSV file basic_python_api_output/tide/targets.peptides.csv exists, but will be overwritten.\n", + " warnings.warn(\n", + 
"/Users/sebastianpaez/git/mokapot_worktree/auto_pin_handling/mokapot/tabular_data/csv.py:102: UserWarning: CSV file basic_python_api_output/tide/targets.proteins.csv exists, but will be overwritten.\n", + " warnings.warn(\n" + ] + } + ], "source": [ - "tide_conf = psms.assign_confidence()" + "# Brewing the data modifies the PSMs, so we need to make a new\n", + "# clean copy of our data.\n", + "psms = mokapot.read_pin(pin_file, max_workers=1)\n", + "tide_conf = mokapot.assign_confidence(psms, dest_dir=tide_outs, proteins=proteins)" ] }, { @@ -562,23 +533,27 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "
" + "
" ] }, - "metadata": { - "needs_background": "light" - }, + "metadata": {}, "output_type": "display_data" } ], "source": [ + "# Since we have lists but a single input file, we can just take the first\n", + "# element\n", + "tide_conf = tide_conf[0]\n", + "moka_conf = moka_conf[0]\n", + "\n", + "\n", "fig, axs = plt.subplots(1, 3, figsize=(12, 4))\n", "colors = (\"#343131\", \"#24B8A0\")\n", "\n", @@ -603,33 +578,33 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "PSMs gained by mokapot: 1149\n", - "Peptides gained by mokapot: 872\n", - "Proteins gained by mokapot: 89\n" + "PSMs gained by mokapot: 1128\n", + "Peptides gained by mokapot: 888\n", + "Proteins gained by mokapot: 86\n" ] } ], "source": [ "# PSMs\n", - "moka_psms = (moka_conf.psms[\"mokapot q-value\"] <= 0.01).sum()\n", - "tide_psms = (tide_conf.psms[\"mokapot q-value\"] <= 0.01).sum()\n", + "moka_psms = (moka_conf.psms[\"mokapot_qvalue\"] <= 0.01).sum()\n", + "tide_psms = (tide_conf.psms[\"mokapot_qvalue\"] <= 0.01).sum()\n", "print(f\"PSMs gained by mokapot: {moka_psms - tide_psms}\")\n", "\n", "# Peptides\n", - "moka_peps = (moka_conf.peptides[\"mokapot q-value\"] <= 0.01).sum()\n", - "tide_peps = (tide_conf.peptides[\"mokapot q-value\"] <= 0.01).sum()\n", + "moka_peps = (moka_conf.peptides[\"mokapot_qvalue\"] <= 0.01).sum()\n", + "tide_peps = (tide_conf.peptides[\"mokapot_qvalue\"] <= 0.01).sum()\n", "print(f\"Peptides gained by mokapot: {moka_peps - tide_peps}\")\n", "\n", "# Proteins\n", - "moka_prots = (moka_conf.proteins[\"mokapot q-value\"] <= 0.01).sum()\n", - "tide_prots = (tide_conf.proteins[\"mokapot q-value\"] <= 0.01).sum()\n", + "moka_prots = (moka_conf.proteins[\"mokapot_qvalue\"] <= 0.01).sum()\n", + "tide_prots = (tide_conf.proteins[\"mokapot_qvalue\"] <= 0.01).sum()\n", "print(f\"Proteins gained by mokapot: {moka_prots - tide_prots}\")" ] }, @@ -637,30 +612,32 @@ 
"cell_type": "markdown", "metadata": {}, "source": [ - "Finally, we will save the results as tab-delimited text files:" + "In prior versions of mokapot we had to save the results; now those get saved automatically\n", + "during the confidence assignment.\n", + "\n", + "We can see where they are stored using the `level_paths` attribute of the confidence object." ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['basic_python_api_output/mokapot.psms.txt',\n", - " 'basic_python_api_output/mokapot.peptides.txt',\n", - " 'basic_python_api_output/mokapot.proteins.txt']" + "{'psms': PosixPath('basic_python_api_output/mokapot/psms.csv'),\n", + " 'peptides': PosixPath('basic_python_api_output/mokapot/peptides.csv'),\n", + " 'proteins': PosixPath('basic_python_api_output/mokapot/proteins.csv')}" ] }, - "execution_count": 13, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "result_files = moka_conf.to_txt(dest_dir=out_dir)\n", - "result_files" + "moka_conf.level_paths" ] }, { @@ -675,7 +652,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -689,7 +666,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.11.11" } }, "nbformat": 4, diff --git a/docs/source/vignettes/joint_models.ipynb b/docs/source/vignettes/joint_models.ipynb index 958e931..d1ce912 100644 --- a/docs/source/vignettes/joint_models.ipynb +++ b/docs/source/vignettes/joint_models.ipynb @@ -19,16 +19,18 @@ "\n", "To run this notebook, you'll need to have [mokapot](../index.rst#installation) installed and have the input files saved on your computer in the same directory.
You can find these files here: [scope2_FP97AA.pin](https://github.com/wfondrie/mokapot/raw/master/data/scope2_FP97AA.pin), [scope2_FP97AB.pin](https://github.com/wfondrie/mokapot/raw/master/data/scope2_FP97AB.pin), [scope2_FP97AC.pin](https://github.com/wfondrie/mokapot/raw/master/data/scope2_FP97AC.pin)\n", "\n", - "We can set the path to the input files here:" + "We can set the path to the input files here and the location we will use to save the results:" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "pin_dir = \"../../../data\"" + "pin_dir = \"../../../data\"\n", + "out_dir_separate = \"./joint_models/separate\"\n", + "out_dir_joint = \"./joint_models/joint\"" ] }, { @@ -42,24 +44,24 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['../../../data/scope2_FP97AA.pin',\n", - " '../../../data/scope2_FP97AB.pin',\n", - " '../../../data/scope2_FP97AC.pin']" + "[PosixPath('../../../data/scope2_FP97AA.pin'),\n", + " PosixPath('../../../data/scope2_FP97AB.pin'),\n", + " PosixPath('../../../data/scope2_FP97AC.pin')]" ] }, - "execution_count": 2, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import os\n", + "from pathlib import Path\n", "import mokapot\n", "import numpy as np\n", "import pandas as pd\n", @@ -72,11 +74,7 @@ "colors = (\"#343131\", \"#24B8A0\")\n", "\n", "# Find the input files:\n", - "pin_files = [\n", - " os.path.join(pin_dir, f)\n", - " for f in os.listdir(pin_dir)\n", - " if f.startswith(\"scope2_FP97A\") and f.endswith(\".pin\")\n", - "]\n", + "pin_files = list(Path(pin_dir).rglob(\"scope2_FP97A*.pin\"))\n", "\n", "pin_files" ] @@ -90,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -116,27 +114,14 @@ }, { "cell_type": "code", - "execution_count": 4, + 
"execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Analyzing ../../../data/scope2_FP97AA.pin\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:root:Learned model did not improve over the best feature. Now scoring by the best feature for each collection of PSMs.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "Analyzing ../../../data/scope2_FP97AA.pin\n", "Analyzing ../../../data/scope2_FP97AB.pin\n", "Analyzing ../../../data/scope2_FP97AC.pin\n" ] @@ -145,17 +130,24 @@ "source": [ "# A dictionary to store the results:\n", "sep_results = {}\n", + "base_location = Path(out_dir_separate)\n", "\n", "# Loop through the input files, analyzing each with mokapot:\n", "for pin in pin_files:\n", " # Read PSMs and run mokapot\n", " print(f\"Analyzing {pin}\")\n", - " psms = mokapot.read_pin(pin)\n", - " results, models = mokapot.brew(psms)\n", + " out_loc = base_location / pin.stem\n", + " out_loc.mkdir(exist_ok=True, parents=True) # We make sure the output directory exists\n", + " psms = mokapot.read_pin(pin, max_workers=1)\n", + " \n", + " models, scores = mokapot.brew(psms, max_workers=1)\n", + " conf = mokapot.assign_confidence(psms, dest_dir=base_location / pin.stem, max_workers=1)\n", "\n", " # Add results to our result dictionary:\n", - " rep = os.path.split(pin)[-1].replace(\".pin\", \"\")\n", - " sep_results[rep] = results" + " # We are selecting the first element \n", + " sep_results[pin.stem] = conf[0]\n", + "\n", + "# TODO: Check if the models third model is meant to throw a warning" ] }, { @@ -169,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -193,112 +185,81 @@ " \n", " \n", " \n", - " SpecId\n", - " Label\n", - " ScanNr\n", - " ExpMass\n", - " CalcMass\n", - " Peptide\n", - " mokapot score\n", - " mokapot q-value\n", - " mokapot PEP\n", - " Proteins\n", + " 
PSMId\n", + " peptide\n", + " score\n", + " mokapot_qvalue\n", + " posterior_error_prob\n", + " proteinIds\n", " \n", " \n", " \n", " \n", " 0\n", - " target_0_11040_3_-1\n", - " True\n", - " 11040\n", - " 2789.4179\n", - " 2789.4084\n", - " K.LVQDVANNTNEEAGDGTTTATVLAR.S\n", - " 23.598709\n", - " 0.000912\n", - " 2.917290e-19\n", - " sp|P10809|CH60_HUMAN\n", - " \n", - " \n", - " 1\n", " target_0_8060_3_-1\n", - " True\n", - " 8060\n", - " 2154.1931\n", - " 2154.1860\n", " K.GAEAANVTGPGGVPVQGSK.Y\n", - " 21.533703\n", - " 0.000912\n", - " 2.563543e-17\n", + " 3.850000\n", + " 0.000462\n", + " 4.050871e-13\n", " sp|P67809|YBOX1_HUMAN\n", " \n", " \n", + " 1\n", + " target_0_12043_3_-1\n", + " K.GVVPLAGTDGETTTQGLDGLSER.C\n", + " 3.468646\n", + " 0.000462\n", + " 4.927086e-12\n", + " sp|P09972|ALDOC_HUMAN\n", + " \n", + " \n", " 2\n", - " target_0_11114_3_-1\n", - " True\n", - " 11114\n", - " 2618.3554\n", - " 2617.3450\n", - " K.QTTVSNSQQAYQEAFEISK.K\n", - " 20.623930\n", - " 0.000912\n", - " 1.841793e-16\n", - " sp|P31946|1433B_HUMAN\n", + " target_0_11040_3_-1\n", + " K.LVQDVANNTNEEAGDGTTTATVLAR.S\n", + " 3.139778\n", + " 0.000462\n", + " 4.249085e-11\n", + " sp|P10809|CH60_HUMAN\n", " \n", " \n", " 3\n", - " target_0_12043_3_-1\n", - " True\n", - " 12043\n", - " 2502.2946\n", - " 2502.2815\n", - " K.GVVPLAGTN[0.98]GETTTQGLDGLSER.C\n", - " 19.626017\n", - " 0.000912\n", - " 1.601810e-15\n", - " sp|P04075|ALDOA_HUMAN\n", + " target_0_10722_3_-1\n", + " R.GSTAPVGGGAFPTIVER.E\n", + " 3.112971\n", + " 0.000462\n", + " 5.064860e-11\n", + " sp|Q15084|PDIA6_HUMAN\n", " \n", " \n", " 4\n", - " target_0_10221_3_-1\n", - " True\n", - " 10221\n", - " 2424.1862\n", - " 2424.1812\n", - " K.EQQEAIEHIDEVQNEIDR.L\n", - " 18.178486\n", - " 0.000912\n", - " 3.691655e-14\n", - " sp|P0DME0|SETLP_HUMAN\\tsp|Q01105|SET_HUMAN\n", + " target_0_11114_3_-1\n", + " K.QTTVSNSQQAYQEAFEISK.K\n", + " 3.099323\n", + " 0.000462\n", + " 5.538586e-11\n", + " sp|P31946|1433B_HUMAN\n", " \n", " \n", 
"\n", "" ], "text/plain": [ - " SpecId Label ScanNr ExpMass CalcMass \\\n", - "0 target_0_11040_3_-1 True 11040 2789.4179 2789.4084 \n", - "1 target_0_8060_3_-1 True 8060 2154.1931 2154.1860 \n", - "2 target_0_11114_3_-1 True 11114 2618.3554 2617.3450 \n", - "3 target_0_12043_3_-1 True 12043 2502.2946 2502.2815 \n", - "4 target_0_10221_3_-1 True 10221 2424.1862 2424.1812 \n", - "\n", - " Peptide mokapot score mokapot q-value \\\n", - "0 K.LVQDVANNTNEEAGDGTTTATVLAR.S 23.598709 0.000912 \n", - "1 K.GAEAANVTGPGGVPVQGSK.Y 21.533703 0.000912 \n", - "2 K.QTTVSNSQQAYQEAFEISK.K 20.623930 0.000912 \n", - "3 K.GVVPLAGTN[0.98]GETTTQGLDGLSER.C 19.626017 0.000912 \n", - "4 K.EQQEAIEHIDEVQNEIDR.L 18.178486 0.000912 \n", + " PSMId peptide score \\\n", + "0 target_0_8060_3_-1 K.GAEAANVTGPGGVPVQGSK.Y 3.850000 \n", + "1 target_0_12043_3_-1 K.GVVPLAGTDGETTTQGLDGLSER.C 3.468646 \n", + "2 target_0_11040_3_-1 K.LVQDVANNTNEEAGDGTTTATVLAR.S 3.139778 \n", + "3 target_0_10722_3_-1 R.GSTAPVGGGAFPTIVER.E 3.112971 \n", + "4 target_0_11114_3_-1 K.QTTVSNSQQAYQEAFEISK.K 3.099323 \n", "\n", - " mokapot PEP Proteins \n", - "0 2.917290e-19 sp|P10809|CH60_HUMAN \n", - "1 2.563543e-17 sp|P67809|YBOX1_HUMAN \n", - "2 1.841793e-16 sp|P31946|1433B_HUMAN \n", - "3 1.601810e-15 sp|P04075|ALDOA_HUMAN \n", - "4 3.691655e-14 sp|P0DME0|SETLP_HUMAN\\tsp|Q01105|SET_HUMAN " + " mokapot_qvalue posterior_error_prob proteinIds \n", + "0 0.000462 4.050871e-13 sp|P67809|YBOX1_HUMAN \n", + "1 0.000462 4.927086e-12 sp|P09972|ALDOC_HUMAN \n", + "2 0.000462 4.249085e-11 sp|P10809|CH60_HUMAN \n", + "3 0.000462 5.064860e-11 sp|Q15084|PDIA6_HUMAN \n", + "4 0.000462 5.538586e-11 sp|P31946|1433B_HUMAN " ] }, - "execution_count": 5, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -318,23 +279,28 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "# A dictionary to store the results:\n", "joint_results = {}\n", + "out_dir_combined 
= Path(out_dir_joint)\n", + "output_prefixes = [ x.stem for x in pin_files ]\n", + "\n", + "# Again we make sure that the output directory exists:\n", + "out_dir_combined.mkdir(exist_ok=True)\n", "\n", "# Read each input file:\n", - "psms_list = [mokapot.read_pin(f) for f in pin_files]\n", + "psms_list = mokapot.read_pin(pin_files, max_workers=1)\n", "\n", "# Run mokapot on all of the files:\n", - "results, brew = mokapot.brew(psms_list)\n", + "models, scores = mokapot.brew(psms_list)\n", + "confs = mokapot.assign_confidence(psms_list, dest_dir=out_dir_combined, prefixes=output_prefixes)\n", "\n", "# Add results to our result dictionary:\n", - "for pin, result in zip(pin_files, results):\n", - " rep = os.path.split(pin)[-1].replace(\".pin\", \"\")\n", - " joint_results[rep] = result" + "for pin, conf in zip(pin_files, confs):\n", + " joint_results[pin.stem] = conf" ] }, { @@ -348,19 +314,195 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 34, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PSMIdpeptidescoremokapot_qvalueposterior_error_probproteinIds
3546target_0_4697_2_-1-.MAEAGK.V-0.5487900.1020580.645097sp|Q7Z4M0|RE114_HUMAN
3979target_0_7282_2_-1-.MAGGMK.V-0.6778720.1917090.875404sp|O14524|NEMP1_HUMAN
5175target_0_12287_3_-1-.MALGLLIAVPLLLQAAPRGAAHYEMMGTCR.M-1.3963360.3740611.000000sp|Q7Z5L3|C1QL2_HUMAN
5055target_0_12268_3_-1-.MATEN[0.98]GAVELGIQN[0.98]PSTDKAPK.G-1.1037170.3577221.000000sp|Q9H1R3|MYLK2_HUMAN
3323target_0_3673_2_-1-.MDNQAER.E-0.4573170.0595490.470527sp|Q9BWW8|APOL6_HUMAN
.....................
452target_0_3700_2_-1R.YYGGGSEGGR.A1.0556410.0005220.000062sp|P14866|HNRPL_HUMAN
3368target_0_6172_3_-1R.YYGINDPVADK.L-0.4769680.0664890.507022sp|Q9NW64|RBM22_HUMAN
638target_0_7942_2_-1R.YYPTEDVPR.K0.8263210.0005220.000256sp|Q02878|RL6_HUMAN
4985target_0_3659_2_-1R.YYQVAR.D-1.0121550.3476341.000000sp|Q969U6|FBXW5_HUMAN
1734target_0_9733_2_-1R.YYTVFDR.D0.2041120.0005220.011981sp|P07339|CATD_HUMAN
\n", + "

5221 rows × 6 columns

\n", + "
" + ], "text/plain": [ - "
" + " PSMId peptide score \\\n", + "3546 target_0_4697_2_-1 -.MAEAGK.V -0.548790 \n", + "3979 target_0_7282_2_-1 -.MAGGMK.V -0.677872 \n", + "5175 target_0_12287_3_-1 -.MALGLLIAVPLLLQAAPRGAAHYEMMGTCR.M -1.396336 \n", + "5055 target_0_12268_3_-1 -.MATEN[0.98]GAVELGIQN[0.98]PSTDKAPK.G -1.103717 \n", + "3323 target_0_3673_2_-1 -.MDNQAER.E -0.457317 \n", + "... ... ... ... \n", + "452 target_0_3700_2_-1 R.YYGGGSEGGR.A 1.055641 \n", + "3368 target_0_6172_3_-1 R.YYGINDPVADK.L -0.476968 \n", + "638 target_0_7942_2_-1 R.YYPTEDVPR.K 0.826321 \n", + "4985 target_0_3659_2_-1 R.YYQVAR.D -1.012155 \n", + "1734 target_0_9733_2_-1 R.YYTVFDR.D 0.204112 \n", + "\n", + " mokapot_qvalue posterior_error_prob proteinIds \n", + "3546 0.102058 0.645097 sp|Q7Z4M0|RE114_HUMAN \n", + "3979 0.191709 0.875404 sp|O14524|NEMP1_HUMAN \n", + "5175 0.374061 1.000000 sp|Q7Z5L3|C1QL2_HUMAN \n", + "5055 0.357722 1.000000 sp|Q9H1R3|MYLK2_HUMAN \n", + "3323 0.059549 0.470527 sp|Q9BWW8|APOL6_HUMAN \n", + "... ... ... ... \n", + "452 0.000522 0.000062 sp|P14866|HNRPL_HUMAN \n", + "3368 0.066489 0.507022 sp|Q9NW64|RBM22_HUMAN \n", + "638 0.000522 0.000256 sp|Q02878|RL6_HUMAN \n", + "4985 0.347634 1.000000 sp|Q969U6|FBXW5_HUMAN \n", + "1734 0.000522 0.011981 sp|P07339|CATD_HUMAN \n", + "\n", + "[5221 rows x 6 columns]" ] }, - "metadata": { - "needs_background": "light" + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "joint_results[\"scope2_FP97AA\"].peptides.sort_values(by=\"peptide\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] }, + "metadata": {}, "output_type": "display_data" } ], @@ -400,19 +542,17 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 36, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "
" + "
" ] }, - "metadata": { - "needs_background": "light" - }, + "metadata": {}, "output_type": "display_data" } ], @@ -435,7 +575,7 @@ " detected in a number of replicates.\n", " \"\"\"\n", " peps = {\n", - " k: set(p.peptides[\"Peptide\"][p.peptides[\"mokapot q-value\"] <= 0.01])\n", + " k: set(p.peptides[\"peptide\"][p.peptides[\"mokapot_qvalue\"] <= 0.01])\n", " for k, p in res_dict.items()\n", " }\n", "\n", @@ -499,7 +639,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -513,7 +653,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.11.11" } }, "nbformat": 4, From 4ece54863b49a7899eec1eb6324a034d581aff0b Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Tue, 17 Dec 2024 17:22:29 -0600 Subject: [PATCH 10/12] chore: updated notebook --- docs/source/vignettes/joint_models.ipynb | 311 ++++++++++++++--------- 1 file changed, 189 insertions(+), 122 deletions(-) diff --git a/docs/source/vignettes/joint_models.ipynb b/docs/source/vignettes/joint_models.ipynb index d1ce912..0dc8a1e 100644 --- a/docs/source/vignettes/joint_models.ipynb +++ b/docs/source/vignettes/joint_models.ipynb @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -55,7 +55,7 @@ " PosixPath('../../../data/scope2_FP97AC.pin')]" ] }, - "execution_count": 13, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -88,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -114,17 +114,59 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": 
"stream", "text": [ - "Analyzing ../../../data/scope2_FP97AA.pin\n", - "Analyzing ../../../data/scope2_FP97AB.pin\n", + "Analyzing ../../../data/scope2_FP97AA.pin\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sebastianpaez/git/mokapot_worktree/auto_pin_handling/mokapot/tabular_data/csv.py:102: UserWarning: CSV file joint_models/separate/scope2_FP97AA/targets.psms.csv exists, but will be overwritten.\n", + " warnings.warn(\n", + "/Users/sebastianpaez/git/mokapot_worktree/auto_pin_handling/mokapot/tabular_data/csv.py:102: UserWarning: CSV file joint_models/separate/scope2_FP97AA/targets.peptides.csv exists, but will be overwritten.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Analyzing ../../../data/scope2_FP97AB.pin\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sebastianpaez/git/mokapot_worktree/auto_pin_handling/mokapot/tabular_data/csv.py:102: UserWarning: CSV file joint_models/separate/scope2_FP97AB/targets.psms.csv exists, but will be overwritten.\n", + " warnings.warn(\n", + "/Users/sebastianpaez/git/mokapot_worktree/auto_pin_handling/mokapot/tabular_data/csv.py:102: UserWarning: CSV file joint_models/separate/scope2_FP97AB/targets.peptides.csv exists, but will be overwritten.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "Analyzing ../../../data/scope2_FP97AC.pin\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sebastianpaez/git/mokapot_worktree/auto_pin_handling/mokapot/tabular_data/csv.py:102: UserWarning: CSV file joint_models/separate/scope2_FP97AC/targets.psms.csv exists, but will be overwritten.\n", + " warnings.warn(\n", + "/Users/sebastianpaez/git/mokapot_worktree/auto_pin_handling/mokapot/tabular_data/csv.py:102: UserWarning: CSV file joint_models/separate/scope2_FP97AC/targets.peptides.csv exists, but will be overwritten.\n", + " 
warnings.warn(\n" + ] } ], "source": [ @@ -136,15 +178,21 @@ "for pin in pin_files:\n", " # Read PSMs and run mokapot\n", " print(f\"Analyzing {pin}\")\n", + "\n", + " # For each of them, we create an output location.\n", " out_loc = base_location / pin.stem\n", - " out_loc.mkdir(exist_ok=True, parents=True) # We make sure the output directory exists\n", + " # We make sure the output directory exists\n", + " out_loc.mkdir(exist_ok=True, parents=True)\n", + "\n", + " # Read PSMs and run mokapot.\n", " psms = mokapot.read_pin(pin, max_workers=1)\n", - " \n", " models, scores = mokapot.brew(psms, max_workers=1)\n", " conf = mokapot.assign_confidence(psms, dest_dir=base_location / pin.stem, max_workers=1)\n", "\n", " # Add results to our result dictionary:\n", - " # We are selecting the first element \n", + " # We are selecting the first element because both brew and assign_confidence\n", + " # return a list, where 1 element is given per input file (the combined example\n", + " # will show how to handle multiple files).\n", " sep_results[pin.stem] = conf[0]\n", "\n", "# TODO: Check if the models third model is meant to throw a warning" @@ -161,7 +209,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -196,48 +244,48 @@ " \n", " \n", " 0\n", - " target_0_8060_3_-1\n", - " K.GAEAANVTGPGGVPVQGSK.Y\n", - " 3.850000\n", - " 0.000462\n", - " 4.050871e-13\n", - " sp|P67809|YBOX1_HUMAN\n", - " \n", - " \n", - " 1\n", " target_0_12043_3_-1\n", " K.GVVPLAGTDGETTTQGLDGLSER.C\n", - " 3.468646\n", - " 0.000462\n", - " 4.927086e-12\n", + " 3.142521\n", + " 0.000453\n", + " 1.191094e-11\n", " sp|P09972|ALDOC_HUMAN\n", " \n", " \n", - " 2\n", + " 1\n", " target_0_11040_3_-1\n", " K.LVQDVANNTNEEAGDGTTTATVLAR.S\n", - " 3.139778\n", - " 0.000462\n", - " 4.249085e-11\n", + " 3.125541\n", + " 0.000453\n", + " 1.340957e-11\n", " sp|P10809|CH60_HUMAN\n", " \n", " \n", + " 2\n", + " target_0_11114_3_-1\n", + " 
K.QTTVSNSQQAYQEAFEISK.K\n", + " 3.116248\n", + " 0.000453\n", + " 1.430812e-11\n", + " sp|P31946|1433B_HUMAN\n", + " \n", + " \n", " 3\n", - " target_0_10722_3_-1\n", - " R.GSTAPVGGGAFPTIVER.E\n", - " 3.112971\n", - " 0.000462\n", - " 5.064860e-11\n", - " sp|Q15084|PDIA6_HUMAN\n", + " target_0_12180_3_-1\n", + " K.TVTNAVVTVPAYFNDSQR.Q\n", + " 2.525352\n", + " 0.000453\n", + " 8.845228e-10\n", + " sp|P11142|HSP7C_HUMAN\n", " \n", " \n", " 4\n", - " target_0_11114_3_-1\n", - " K.QTTVSNSQQAYQEAFEISK.K\n", - " 3.099323\n", - " 0.000462\n", - " 5.538586e-11\n", - " sp|P31946|1433B_HUMAN\n", + " target_0_11284_3_-1\n", + " K.SQIFSTASDN[0.98]QPTVTIK.V\n", + " 2.523156\n", + " 0.000453\n", + " 8.981859e-10\n", + " sp|P11021|BIP_HUMAN\n", " \n", " \n", "\n", @@ -245,21 +293,21 @@ ], "text/plain": [ " PSMId peptide score \\\n", - "0 target_0_8060_3_-1 K.GAEAANVTGPGGVPVQGSK.Y 3.850000 \n", - "1 target_0_12043_3_-1 K.GVVPLAGTDGETTTQGLDGLSER.C 3.468646 \n", - "2 target_0_11040_3_-1 K.LVQDVANNTNEEAGDGTTTATVLAR.S 3.139778 \n", - "3 target_0_10722_3_-1 R.GSTAPVGGGAFPTIVER.E 3.112971 \n", - "4 target_0_11114_3_-1 K.QTTVSNSQQAYQEAFEISK.K 3.099323 \n", + "0 target_0_12043_3_-1 K.GVVPLAGTDGETTTQGLDGLSER.C 3.142521 \n", + "1 target_0_11040_3_-1 K.LVQDVANNTNEEAGDGTTTATVLAR.S 3.125541 \n", + "2 target_0_11114_3_-1 K.QTTVSNSQQAYQEAFEISK.K 3.116248 \n", + "3 target_0_12180_3_-1 K.TVTNAVVTVPAYFNDSQR.Q 2.525352 \n", + "4 target_0_11284_3_-1 K.SQIFSTASDN[0.98]QPTVTIK.V 2.523156 \n", "\n", " mokapot_qvalue posterior_error_prob proteinIds \n", - "0 0.000462 4.050871e-13 sp|P67809|YBOX1_HUMAN \n", - "1 0.000462 4.927086e-12 sp|P09972|ALDOC_HUMAN \n", - "2 0.000462 4.249085e-11 sp|P10809|CH60_HUMAN \n", - "3 0.000462 5.064860e-11 sp|Q15084|PDIA6_HUMAN \n", - "4 0.000462 5.538586e-11 sp|P31946|1433B_HUMAN " + "0 0.000453 1.191094e-11 sp|P09972|ALDOC_HUMAN \n", + "1 0.000453 1.340957e-11 sp|P10809|CH60_HUMAN \n", + "2 0.000453 1.430812e-11 sp|P31946|1433B_HUMAN \n", + "3 0.000453 8.845228e-10 
sp|P11142|HSP7C_HUMAN \n", + "4 0.000453 8.981859e-10 sp|P11021|BIP_HUMAN " ] }, - "execution_count": 18, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -279,9 +327,28 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sebastianpaez/git/mokapot_worktree/auto_pin_handling/mokapot/tabular_data/csv.py:102: UserWarning: CSV file joint_models/joint/scope2_FP97AA.targets.psms.csv exists, but will be overwritten.\n", + " warnings.warn(\n", + "/Users/sebastianpaez/git/mokapot_worktree/auto_pin_handling/mokapot/tabular_data/csv.py:102: UserWarning: CSV file joint_models/joint/scope2_FP97AA.targets.peptides.csv exists, but will be overwritten.\n", + " warnings.warn(\n", + "/Users/sebastianpaez/git/mokapot_worktree/auto_pin_handling/mokapot/tabular_data/csv.py:102: UserWarning: CSV file joint_models/joint/scope2_FP97AB.targets.psms.csv exists, but will be overwritten.\n", + " warnings.warn(\n", + "/Users/sebastianpaez/git/mokapot_worktree/auto_pin_handling/mokapot/tabular_data/csv.py:102: UserWarning: CSV file joint_models/joint/scope2_FP97AB.targets.peptides.csv exists, but will be overwritten.\n", + " warnings.warn(\n", + "/Users/sebastianpaez/git/mokapot_worktree/auto_pin_handling/mokapot/tabular_data/csv.py:102: UserWarning: CSV file joint_models/joint/scope2_FP97AC.targets.psms.csv exists, but will be overwritten.\n", + " warnings.warn(\n", + "/Users/sebastianpaez/git/mokapot_worktree/auto_pin_handling/mokapot/tabular_data/csv.py:102: UserWarning: CSV file joint_models/joint/scope2_FP97AC.targets.peptides.csv exists, but will be overwritten.\n", + " warnings.warn(\n" + ] + } + ], "source": [ "# A dictionary to store the results:\n", "joint_results = {}\n", @@ -314,7 +381,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -348,48 
+415,48 @@ " \n", " \n", " \n", - " 3546\n", - " target_0_4697_2_-1\n", - " -.MAEAGK.V\n", - " -0.548790\n", - " 0.102058\n", - " 0.645097\n", - " sp|Q7Z4M0|RE114_HUMAN\n", - " \n", - " \n", - " 3979\n", + " 3995\n", " target_0_7282_2_-1\n", " -.MAGGMK.V\n", - " -0.677872\n", - " 0.191709\n", - " 0.875404\n", + " -0.659660\n", + " 0.193944\n", + " 0.860169\n", " sp|O14524|NEMP1_HUMAN\n", " \n", " \n", - " 5175\n", + " 5172\n", " target_0_12287_3_-1\n", " -.MALGLLIAVPLLLQAAPRGAAHYEMMGTCR.M\n", - " -1.396336\n", - " 0.374061\n", + " -1.397640\n", + " 0.372711\n", " 1.000000\n", " sp|Q7Z5L3|C1QL2_HUMAN\n", " \n", " \n", - " 5055\n", + " 5056\n", " target_0_12268_3_-1\n", " -.MATEN[0.98]GAVELGIQN[0.98]PSTDKAPK.G\n", - " -1.103717\n", - " 0.357722\n", + " -1.100226\n", + " 0.355547\n", " 1.000000\n", " sp|Q9H1R3|MYLK2_HUMAN\n", " \n", " \n", - " 3323\n", + " 3677\n", + " target_0_11417_2_-1\n", + " -.MCLLPR.G\n", + " -0.565965\n", + " 0.122316\n", + " 0.694290\n", + " sp|Q9BZD3|GCOM2_HUMAN\n", + " \n", + " \n", + " 3372\n", " target_0_3673_2_-1\n", " -.MDNQAER.E\n", - " -0.457317\n", - " 0.059549\n", - " 0.470527\n", + " -0.458577\n", + " 0.069037\n", + " 0.491961\n", " sp|Q9BWW8|APOL6_HUMAN\n", " \n", " \n", @@ -402,86 +469,86 @@ " ...\n", " \n", " \n", - " 452\n", + " 409\n", " target_0_3700_2_-1\n", " R.YYGGGSEGGR.A\n", - " 1.055641\n", - " 0.000522\n", - " 0.000062\n", + " 1.054732\n", + " 0.000514\n", + " 0.000087\n", " sp|P14866|HNRPL_HUMAN\n", " \n", " \n", - " 3368\n", + " 3423\n", " target_0_6172_3_-1\n", " R.YYGINDPVADK.L\n", - " -0.476968\n", - " 0.066489\n", - " 0.507022\n", + " -0.477437\n", + " 0.077372\n", + " 0.526640\n", " sp|Q9NW64|RBM22_HUMAN\n", " \n", " \n", - " 638\n", + " 589\n", " target_0_7942_2_-1\n", " R.YYPTEDVPR.K\n", - " 0.826321\n", - " 0.000522\n", - " 0.000256\n", + " 0.823093\n", + " 0.000514\n", + " 0.000349\n", " sp|Q02878|RL6_HUMAN\n", " \n", " \n", - " 4985\n", + " 4986\n", " target_0_3659_2_-1\n", " R.YYQVAR.D\n", - " -1.012155\n", 
- " 0.347634\n", + " -1.011639\n", + " 0.346231\n", " 1.000000\n", " sp|Q969U6|FBXW5_HUMAN\n", " \n", " \n", - " 1734\n", + " 1695\n", " target_0_9733_2_-1\n", " R.YYTVFDR.D\n", - " 0.204112\n", - " 0.000522\n", - " 0.011981\n", + " 0.204137\n", + " 0.000514\n", + " 0.014292\n", " sp|P07339|CATD_HUMAN\n", " \n", " \n", "\n", - "

5221 rows × 6 columns

\n", + "

5217 rows × 6 columns

\n", "" ], "text/plain": [ " PSMId peptide score \\\n", - "3546 target_0_4697_2_-1 -.MAEAGK.V -0.548790 \n", - "3979 target_0_7282_2_-1 -.MAGGMK.V -0.677872 \n", - "5175 target_0_12287_3_-1 -.MALGLLIAVPLLLQAAPRGAAHYEMMGTCR.M -1.396336 \n", - "5055 target_0_12268_3_-1 -.MATEN[0.98]GAVELGIQN[0.98]PSTDKAPK.G -1.103717 \n", - "3323 target_0_3673_2_-1 -.MDNQAER.E -0.457317 \n", + "3995 target_0_7282_2_-1 -.MAGGMK.V -0.659660 \n", + "5172 target_0_12287_3_-1 -.MALGLLIAVPLLLQAAPRGAAHYEMMGTCR.M -1.397640 \n", + "5056 target_0_12268_3_-1 -.MATEN[0.98]GAVELGIQN[0.98]PSTDKAPK.G -1.100226 \n", + "3677 target_0_11417_2_-1 -.MCLLPR.G -0.565965 \n", + "3372 target_0_3673_2_-1 -.MDNQAER.E -0.458577 \n", "... ... ... ... \n", - "452 target_0_3700_2_-1 R.YYGGGSEGGR.A 1.055641 \n", - "3368 target_0_6172_3_-1 R.YYGINDPVADK.L -0.476968 \n", - "638 target_0_7942_2_-1 R.YYPTEDVPR.K 0.826321 \n", - "4985 target_0_3659_2_-1 R.YYQVAR.D -1.012155 \n", - "1734 target_0_9733_2_-1 R.YYTVFDR.D 0.204112 \n", + "409 target_0_3700_2_-1 R.YYGGGSEGGR.A 1.054732 \n", + "3423 target_0_6172_3_-1 R.YYGINDPVADK.L -0.477437 \n", + "589 target_0_7942_2_-1 R.YYPTEDVPR.K 0.823093 \n", + "4986 target_0_3659_2_-1 R.YYQVAR.D -1.011639 \n", + "1695 target_0_9733_2_-1 R.YYTVFDR.D 0.204137 \n", "\n", " mokapot_qvalue posterior_error_prob proteinIds \n", - "3546 0.102058 0.645097 sp|Q7Z4M0|RE114_HUMAN \n", - "3979 0.191709 0.875404 sp|O14524|NEMP1_HUMAN \n", - "5175 0.374061 1.000000 sp|Q7Z5L3|C1QL2_HUMAN \n", - "5055 0.357722 1.000000 sp|Q9H1R3|MYLK2_HUMAN \n", - "3323 0.059549 0.470527 sp|Q9BWW8|APOL6_HUMAN \n", + "3995 0.193944 0.860169 sp|O14524|NEMP1_HUMAN \n", + "5172 0.372711 1.000000 sp|Q7Z5L3|C1QL2_HUMAN \n", + "5056 0.355547 1.000000 sp|Q9H1R3|MYLK2_HUMAN \n", + "3677 0.122316 0.694290 sp|Q9BZD3|GCOM2_HUMAN \n", + "3372 0.069037 0.491961 sp|Q9BWW8|APOL6_HUMAN \n", "... ... ... ... 
\n", - "452 0.000522 0.000062 sp|P14866|HNRPL_HUMAN \n", - "3368 0.066489 0.507022 sp|Q9NW64|RBM22_HUMAN \n", - "638 0.000522 0.000256 sp|Q02878|RL6_HUMAN \n", - "4985 0.347634 1.000000 sp|Q969U6|FBXW5_HUMAN \n", - "1734 0.000522 0.011981 sp|P07339|CATD_HUMAN \n", + "409 0.000514 0.000087 sp|P14866|HNRPL_HUMAN \n", + "3423 0.077372 0.526640 sp|Q9NW64|RBM22_HUMAN \n", + "589 0.000514 0.000349 sp|Q02878|RL6_HUMAN \n", + "4986 0.346231 1.000000 sp|Q969U6|FBXW5_HUMAN \n", + "1695 0.000514 0.014292 sp|P07339|CATD_HUMAN \n", "\n", - "[5221 rows x 6 columns]" + "[5217 rows x 6 columns]" ] }, - "execution_count": 34, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -492,12 +559,12 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -542,12 +609,12 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] From 409d98d83e137409df08ff1a966ad98b9b4a9cc0 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Thu, 19 Dec 2024 14:56:51 -0600 Subject: [PATCH 11/12] chore,confidence: update docstrings --- mokapot/confidence.py | 134 ++++++++++++++++++++++++++---------------- 1 file changed, 84 insertions(+), 50 deletions(-) diff --git a/mokapot/confidence.py b/mokapot/confidence.py index 950aae6..371bbab 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -8,13 +8,18 @@ The following classes store the confidence estimates for a dataset based on the provided score. They provide utilities to access, save, and plot these estimates for the various relevant levels (i.e. PSMs, peptides, and proteins). -The :py:func:`Confidence` class is appropriate for most data-dependent -acquisition proteomics datasets. - -We recommend using the :py:func:`~mokapot.brew()` function to obtain these -confidence estimates, rather than initializing the classes below directly. - -TODO: update this docstring. +The :py:class:`Confidence` class is the primary interface for handling confidence +estimates in data-dependent acquisition proteomics datasets. + +The module provides: +- Confidence estimation at PSM, peptide, and protein levels +- Streaming processing for large datasets +- Visualization utilities for confidence estimates +- Export capabilities in various formats + +We recommend using the :py:func:`~mokapot.brew()` followed by the `assign_confidence` +function to obtain these confidence estimates, rather than initializing the +classes below directly. """ from __future__ import annotations @@ -184,7 +189,7 @@ def __repr__(self) -> str: def _assign_confidence( self, - levels: list[str], # why is this passed if its a property of self? 
+ levels: list[str], level_path_map: dict[str, Path], out_writers_map: dict[str, Sequence[TabularDataWriter]], write_decoys: bool = False, @@ -195,19 +200,36 @@ def _assign_confidence( score_stats: OnlineStatistics | None = None, eval_fdr: float = 0.01, ): - """ - Assign confidence to PSMs and peptides. + """Assign confidence estimates to PSMs and peptides. + + This method processes the dataset to assign confidence estimates (q-values and PEPs) + at different levels (PSMs, peptides) using the specified algorithms. It can optionally + stream the confidence calculations for large datasets. Parameters ---------- - level_path_map : List(Path) - Files with unique psms and unique peptides. - levels : List(str) - the levels at which confidence estimation was performed - out_paths : List(Path) - The output files where the results will be written + levels : list[str] + The levels at which to compute confidence estimates (e.g., ['psms', 'peptides']). + level_path_map : dict[str, Path] + Mapping of confidence levels to their corresponding file paths for intermediate data. + out_writers_map : dict[str, Sequence[TabularDataWriter]] + Mapping of confidence levels to their output writers for results. write_decoys : bool, optional - Save decoys confidence estimates as well? + Whether to include decoy results in the output, by default False. + peps_error : bool, optional + Whether to raise an error if PEP calculation fails, by default False. + peps_algorithm : str, optional + Algorithm to use for posterior error probability calculation. + Currently supports "qvality" (default). + qvalue_algorithm : str, optional + Algorithm to use for q-value calculation. + Currently supports "tdc" (default). + stream_confidence : bool, optional + Whether to stream confidence calculations for large datasets, by default False. + score_stats : OnlineStatistics | None, optional + Pre-computed score statistics if streaming is enabled, by default None. 
+ eval_fdr : float, optional + FDR threshold for evaluation metrics, by default 0.01. """ if stream_confidence: if score_stats is None: @@ -250,7 +272,10 @@ def _assign_confidence( level_path.unlink(missing_ok=True) def _write_protein_level_data( - self, level_paths: dict[str, Path], proteins: Proteins, rng + self, + level_paths: dict[str, Path], + proteins: Proteins, + rng: int | np.random.Generator, ): psms = TabularDataReader.from_path(level_paths["psms"]).read() proteins_df = picked_protein( @@ -346,51 +371,60 @@ def assign_confidence( append_to_output_file: bool = False, rng: int | np.random.Generator = 0, peps_error: bool = False, - peps_algorithm="qvality", # TODO make this an enum (2024-12-17) - qvalue_algorithm="tdc", # TODO make this an enum (2024-12-17) + peps_algorithm="qvality", + qvalue_algorithm="tdc", sqlite_path: Path | None = None, stream_confidence: bool = False, -) -> list[Confidence]: - """Assign confidence to PSMs peptides, and optionally, proteins. +): + """Assign confidence to PSMs, peptides, and optionally proteins. Parameters ---------- - max_workers - datasets : list[OnDiskPsmDataset] + datasets : list[PsmDataset] A collection of PSMs. - scores_list : list[numpy.ndarray] + scores_list : list[numpy.ndarray[float]] | None, optional The scores by which to rank the PSMs. Usually derived from - `mokapot.brew` - rng : int or np.random.Generator, optional - A seed or generator used for cross-validation split creation and to - break ties, or ``None`` to use the default random number generator - state. - eval_fdr : float - The FDR threshold at which to report and evaluate performance. If - `scores` is not :code:`None`, this parameter has no affect on the - analysis itself, but does affect logging messages and the FDR - threshold applied for some output formats, such as FlashLFQ. - dest_dir : Path or None, optional - The directory in which to save the files. :code:`None` will use the - current working directory. 
- prefixes : [str] or None + `mokapot.brew`, by default None. + max_workers : int, optional + Number of parallel workers to use for processing, by default 1. + eval_fdr : float, optional + The FDR threshold at which to report and evaluate performance, by default 0.01. + This affects logging messages and the FDR threshold applied for some output formats. + dest_dir : Path | None, optional + The directory in which to save the files. None will use the + current working directory, by default None. + file_root : str, optional + Base name prefix for output files, by default "". + prefixes : list[str | None] | None, optional The prefixes added to all output file names. - If None, a single concatenated file will be created. + If None, a single concatenated file will be created, by default None. write_decoys : bool, optional - Save decoys confidence estimates as well? - deduplication: bool - Are we performing deduplication on the psm level? - do_rollup: bool - do we apply rollup on peptides, modified peptides etc.? - proteins: Proteins, optional - collection of proteins - append_to_output_file: bool - do we append results to file ? - sqlite_path: Path to the sqlite database to write mokapot results + Save decoy confidence estimates as well?, by default False. + deduplication : bool, optional + Whether to perform deduplication on the PSM level, by default True. + do_rollup : bool, optional + Whether to apply rollup on peptides, modified peptides etc., by default True. + proteins : Proteins | None, optional + Collection of proteins for protein inference, by default None. + append_to_output_file : bool, optional + Whether to append results to existing file, by default False. + rng : int | np.random.Generator, optional + Random number generator or seed for reproducibility, by default 0. + peps_error : bool, optional + Whether to raise error on PEP calculation failure, by default False. 
+ peps_algorithm : {'qvality', 'qvality_bin', 'kde_nnls', 'hist_nnls'}, optional + Algorithm for posterior error probability calculation, by default "qvality". + qvalue_algorithm : {'tdc', 'hist'}, optional + Algorithm for q-value calculation, by default "tdc". + sqlite_path : Path | None, optional + Path to the SQLite database to write mokapot results, by default None. + stream_confidence : bool, optional + Whether to stream confidence calculations for large datasets, by default False. Returns ------- list[Confidence] + A list of Confidence objects containing the confidence estimates for each dataset. """ # Note: I am really not a big fan of how large this function is ... From 100ec5852c1d6a898d31e122edfb14006148ea5f Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Thu, 19 Dec 2024 14:59:26 -0600 Subject: [PATCH 12/12] chore,qvalue: removed commented out code --- mokapot/qvalues.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/mokapot/qvalues.py b/mokapot/qvalues.py index 0eac457..2d22c58 100644 --- a/mokapot/qvalues.py +++ b/mokapot/qvalues.py @@ -141,35 +141,7 @@ def tdc( unique_metric = np.flip(unique_metric) indices = np.flip(indices) - # import time - - # t0 = time.time() qvals = _fdr2qvalue(fdr, num_total, indices) - # et = (time.time() - t0) * 1000 - # print(f"Base Time: {et}") - # t0 = time.time() - # qvals_np = np.minimum.accumulate(fdr) - # et = (time.time() - t0) * 1000 - # print(f"Numpy Time: {et}") - - # CARE = False - # if CARE and not np.allclose(qvals, qvals_np): - # rmse = np.sqrt(np.mean((qvals - qvals_np) ** 2)) - # print(f"RMSE: {rmse}") - # from matplotlib import pyplot as plt - - # diff_window = (qvals > 0.6) & (qvals < 0.75) - # print(f"Diff Qvals: {qvals[diff_window]}") - # print(f"Diff Qvals_np: {qvals[diff_window]}") - # print(f"Diff Qvals_fdr: {qvals[diff_window]}") - - # plt.scatter(x=qvals, y=qvals_np, alpha=0.3) - # plt.xlabel("Qvals") - # plt.ylabel("Qvals Numpy") - # plt.show() - # # if rmse 
> 1e-3: - # # raise RuntimeError("Numpy implementation is not close.") - qvals = np.flip(qvals) qvals = qvals[np.argsort(srt_idx)]