Merge pull request #77 from bigbio/spectrumAI

spectrumAI into py-pgatk
bigbio · Apr 19, 2024 · 6faf53c · 6faf53c
2 parents bbbbb34 + fd2af1c
commit 6faf53c
Show file tree

Hide file tree

Showing 29 changed files with 1,385 additions and 120 deletions.
diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml
@@ -5,17 +5,17 @@ name: Python application
 
 on:
   push:
-    branches: [ master ]
+    branches: [ spectrumAI ]
   pull_request:
-    branches: [ master ]
+    branches: [ spectrumAI ]
 
 jobs:
   build:
 
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8]
+        python-version: [3.7, 3.8]
 
     steps:
     - uses: actions/checkout@v2
@@ -70,6 +70,6 @@ jobs:
         pip install pytest-cov
         python setup.py install
         cd pypgatk
-        pytest --cov=./ --cov-report=xml tests/*
+        pytest -s --cov=./ --cov-report=xml tests/*
 
 
diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
@@ -5,17 +5,17 @@ name: Python package
 
 on:
   push:
-    branches: [ master ]
+    branches: [ spectrumAI ]
   pull_request:
-    branches: [ master ]
+    branches: [ spectrumAI ]
 
 jobs:
   build:
 
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8]
+        python-version: [3.7, 3.8]
 
     steps:
     - uses: actions/checkout@v2
@@ -47,4 +47,4 @@ jobs:
       run: |
         python setup.py install
         cd pypgatk
-        python tests/pypgatk_tests.py
+        python -s tests/pypgatk_tests.py
diff --git a/.gitignore b/.gitignore
@@ -113,3 +113,5 @@ pypgatk/database_cbioportal/
 pypgatk/database_cosmic/
 pypgatk/config/private/
 .pypirc
+/pypgatk/test_all.bed
+/pypgatk/test_annotated.vcf
diff --git a/conda-enviroment.yaml b/conda-enviroment.yaml
@@ -7,13 +7,18 @@ channels:
   - bioconda
 dependencies:
   - biopython
-  - Click=7.0
-  - gffutils=0.10.1
-  - numpy=1.16.3
-  - PyYAML=5.1.2
-  - requests=2.21.0
-  - simplejson=3.16.0
-  - ratelimit=2.2.1
+  - Click
+  - gffutils
+  - numpy
+  - PyYAML
+  - requests
+  - simplejson
+  - ratelimit
+  - pathos
   - bioconda::pyteomics
-  - pybedtools=0.8.2
+  - pybedtools
+  - matplotlib
   - bioconda::pyopenms
+  - pytest
+  - tqdm
+  - pyahocorasick
diff --git a/pypgatk/cgenomes/cbioportal_downloader.py b/pypgatk/cgenomes/cbioportal_downloader.py
@@ -34,8 +34,8 @@ def __init__(self, config_data, pipeline_arguments):
         self._list_studies = []
         self._multithreading = True
 
-        self._cbioportal_base_url = 'https://www.cbioportal.org/webservice.do'
-        self._cancer_studies_command = 'cmd=getCancerStudies'
+        self._cbioportal_base_url = 'https://www.cbioportal.org/api'
+        self._cancer_studies_command = 'studies'
 
         self._cbioportal_download_url = 'https://cbioportal-datahub.s3.amazonaws.com'
 
@@ -111,13 +111,13 @@ def get_cancer_studies(self):
         """
         server = self._cbioportal_base_url
         endpoint = self._cancer_studies_command
-        self._cbioportal_studies = call_api_raw(server + "?" + endpoint).text
+        self._cbioportal_studies = call_api_raw(server + "/" + endpoint).text
         return self._cbioportal_studies
 
     def download_study(self, download_study, url_file_name=None):
         """
         This function will download a study from cBioPortal using the study ID
-        :param download_study: Study to be download, if the study is empty or None, all the studies will be
+        :param download_study: Study to be downloaded, if the study is empty or None, all the studies will be
         downloaded.
         :param url_file_name: file tsv containing the urls to be downloaded.
         :return: None

diff --git a/pypgatk/commands/blast_get_position.py b/pypgatk/commands/blast_get_position.py
@@ -0,0 +1,33 @@
+import logging
+
+import click
+
+from pypgatk.toolbox.general import read_yaml_from_file
+from pypgatk.commands.utils import print_help
+from pypgatk.proteogenomics.blast_get_position import BlastGetPositionService
+
+log = logging.getLogger(__name__)
+
+@click.command('blast_get_position', short_help='Blast peptide and reference protein database to find variation sites.')
+@click.option('-c', '--config_file', help='Configuration file for the blast_get_position pipeline.')
+@click.option('-i', '--input_psm_to_blast', help='The file name of the input PSM table to blast.')
+@click.option('-o', '--output_psm', help='The file name of the output PSM table.')
+@click.option('-r', '--input_reference_database', help='The file name of the reference protein database to blast. The reference database includes Uniprot Proteomes with isoforms, ENSEMBL, RefSeq, etc.')
+@click.option('-n', '--number_of_processes', help='Used to specify the number of processes. Default is 40.')
+
+@click.pass_context
+def blast_get_position(ctx, config_file, input_psm_to_blast, output_psm, input_reference_database, number_of_processes):
+    config_data = None
+    if config_file is not None:
+        config_data = read_yaml_from_file(config_file)
+
+    if input_psm_to_blast is None or input_reference_database is None or output_psm is None:
+        print_help()
+    pipeline_arguments = {}
+    if input_reference_database is not None:
+        pipeline_arguments[BlastGetPositionService.CONFIG_INPUT_REFERENCE_DATABASE] = input_reference_database
+    if number_of_processes is not None:
+        pipeline_arguments[BlastGetPositionService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes
+
+    blast_get_position_service = BlastGetPositionService(config_data, pipeline_arguments)
+    blast_get_position_service.blast(input_psm_to_blast, output_psm)
diff --git a/pypgatk/commands/deeplc.py b/pypgatk/commands/deeplc.py
diff --git a/pypgatk/commands/msrescore.py b/pypgatk/commands/msrescore.py
diff --git a/pypgatk/commands/mztab_class_fdr.py b/pypgatk/commands/mztab_class_fdr.py
@@ -0,0 +1,39 @@
+import logging
+
+import click
+
+from pypgatk.toolbox.general import read_yaml_from_file
+from pypgatk.commands.utils import print_help
+from pypgatk.proteogenomics.mztab_class_fdr import MzTabClassFdr
+
+log = logging.getLogger(__name__)
+
+@click.command('mztab_class_fdr', short_help='Extract psms from mzTab for global-fdr and class-fdr filtering')
+@click.option('-c', '--config_file', help='Configuration file for the fdr peptides pipeline')
+@click.option('-i', '--input_mztab', help='The file name of the input mzTab')
+@click.option('-o', '--outfile_name', help='The file name of the psm table filtered by global-fdr and class-fdr')
+@click.option('-d', '--decoy_prefix', help='Default is "decoy"')
+@click.option('-gf', '--global_fdr_cutoff', help='PSM peptide global-fdr cutoff or threshold. Default is 0.01')
+@click.option('-cf', '--class_fdr_cutoff', help='PSM peptide class-fdr cutoff or threshold. Default is 0.01')
+@click.option('-g', '--peptide_groups_prefix', help="Peptide class "
+                                              "groups e.g. \"{non_canonical:[altorf,pseudo,ncRNA];mutations:[COSMIC,cbiomut];variants:[var_mut,var_rs]}\"")
+@click.pass_context
+def mztab_class_fdr(ctx, config_file, input_mztab, outfile_name, decoy_prefix, global_fdr_cutoff, class_fdr_cutoff, peptide_groups_prefix):
+    config_data = None
+    if config_file is not None:
+        config_data = read_yaml_from_file(config_file)
+
+    if input_mztab is None or outfile_name is None:
+        print_help()
+    pipeline_arguments = {}
+    if decoy_prefix is not None:
+        pipeline_arguments[MzTabClassFdr.CONFIG_DECOY_PREFIX] = decoy_prefix
+    if global_fdr_cutoff is not None:
+        pipeline_arguments[MzTabClassFdr.CONFIG_GLOBAL_FDR_CUTOFF] = global_fdr_cutoff
+    if class_fdr_cutoff is not None:
+        pipeline_arguments[MzTabClassFdr.CONFIG_CLASS_FDR_CUTOFF] = class_fdr_cutoff
+    if peptide_groups_prefix is not None:
+        pipeline_arguments[MzTabClassFdr.CONFIG_PEPTIDE_GROUPS_PREFIX] = peptide_groups_prefix
+
+    mzTab_class_fdr = MzTabClassFdr(config_data, pipeline_arguments)
+    mzTab_class_fdr.form_mztab_class_fdr(input_mztab, outfile_name)
diff --git a/pypgatk/commands/peptide_class_fdr.py b/pypgatk/commands/peptide_class_fdr.py
@@ -14,7 +14,7 @@
 @click.option('-c', '--config_file', help='Configuration to perform Peptide Class FDR')
 @click.option('-in', '--input-file', help='input file with the peptides and proteins')
 @click.option('-out', '--output-file', help='idxml from openms with filtered peptides and proteins')
-@click.option("--file-type")
+@click.option("--file-type", help="File types supported by the tool (TSV (.tsv), IDXML (.idxml), MZTAB (.mztab))")
 @click.option('--min-peptide-length', help='minimum peptide length')
 @click.option('--psm-pep-fdr-cutoff', help="PSM peptide FDR cutoff or threshold")
 @click.option('--psm-pep-class-fdr-cutoff', help="PSM class peptide FDR cutoff or threshold")
@@ -48,7 +48,7 @@ def peptide_class_fdr(ctx, config_file, input_file, output_file, file_type, min_
   :param psm_pep_class_fdr_cutoff: Peptide class FDR cutoff
   :param peptide_groups_prefix: Peptide groups prefix for the Peptide classes FDR
   :param peptide_classes_prefix: Peptide classes
-  :param file_type: File type to compute the FDR and class FDR.
+  :param file_type: File type to compute the FDR and class FDR (TSV, IDXML or MZTAB)
   :param disable_class_fdr: Do not compute class FDR and not filtering the PSMs
   :return:
   """

diff --git a/pypgatk/commands/validate_peptides.py b/pypgatk/commands/validate_peptides.py
@@ -0,0 +1,55 @@
+import logging
+
+import click
+
+from pypgatk.toolbox.general import read_yaml_from_file
+from pypgatk.proteogenomics.validate_peptides import ValidatePeptidesService
+from pypgatk.commands.utils import print_help
+
+log = logging.getLogger(__name__)
+
+
+@click.command('validate_peptides',
+               short_help='Command to inspect MS2 spectra of single-substitution peptide identifications')
+@click.option('-c', '--config_file', help='Configuration file for the validate peptides pipeline')
+@click.option('-p', '--mzml_path', help='The mzml file path. You only need to use either mzml_path or mzml_files')
+@click.option('-f', '--mzml_files',
+              help='The mzml files. Different files are separated by ",". You only need to use either mzml_path or mzml_files')
+@click.option('-i', '--infile_name', help='Variant peptide PSMs table')
+@click.option('-o', '--outfile_name', help='Output file for the results')
+@click.option('-ion', '--ions_tolerance', help='MS2 fragment ions mass accuracy')
+@click.option('-n', '--number_of_processes', help='Used to specify the number of processes. Default is 40.')
+@click.option('-r', '--relative', help='When using ppm as ions_tolerance (not Da), it needs to be turned on',
+              is_flag=True)
+@click.option('-msgf', '--msgf',
+              help='If it is the standard format of MSGF output, please turn on this switch, otherwise it defaults to mzTab format',
+              is_flag=True)
+@click.pass_context
+def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance,
+                      number_of_processes, relative, msgf):
+    config_data = None
+    if config_file is not None:
+        config_data = read_yaml_from_file(config_file)
+
+    validate_flag = bool(infile_name and (mzml_path or mzml_files) and outfile_name)
+    if not validate_flag:
+        print_help()
+
+    pipeline_arguments = {}
+
+    if mzml_path is not None:
+        pipeline_arguments[ValidatePeptidesService.CONFIG_MZML_PATH] = mzml_path
+    if mzml_files is not None:
+        pipeline_arguments[ValidatePeptidesService.CONFIG_MZML_FILES] = mzml_files
+    if ions_tolerance is not None:
+        pipeline_arguments[ValidatePeptidesService.CONFIG_IONS_TOLERANCE] = ions_tolerance
+    if number_of_processes is not None:
+        pipeline_arguments[ValidatePeptidesService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes
+    if relative is not None:
+        pipeline_arguments[ValidatePeptidesService.CONFIG_RELATIVE] = relative
+    if msgf is not None:
+        pipeline_arguments[ValidatePeptidesService.CONFIG_MSGF] = msgf
+
+    validate_peptides_service = ValidatePeptidesService(config_data, pipeline_arguments)
+    if validate_flag:
+        validate_peptides_service.validate(infile_name, outfile_name)
diff --git a/pypgatk/config/cbioportal_config.yaml b/pypgatk/config/cbioportal_config.yaml
@@ -2,8 +2,8 @@ cbioportal_data_downloader:
   output_directory: database_cbioportal
   list_studies: []
   cbioportal_api:
-    base_url: https://www.cbioportal.org/webservice.do
-    cancer_studies: cmd=getCancerStudies
+    base_url: https://www.cbioportal.org/api
+    cancer_studies: studies
   cbioportal_download_url: https://cbioportal-datahub.s3.amazonaws.com
   logger:
     formatters:

diff --git a/pypgatk/config/ensembl_downloader_config.yaml b/pypgatk/config/ensembl_downloader_config.yaml
@@ -26,7 +26,7 @@ ensembl_data_downloader:
       - chr_patch_hapl_scaff.
       file_extension: gtf
   ensembl_api:
-    server: http://rest.ensembl.org
+    server: https://rest.ensembl.org
     species: /info/species
   logger:
     formatters:

diff --git a/pypgatk/ensembl/data_downloader.py b/pypgatk/ensembl/data_downloader.py
@@ -56,7 +56,7 @@ def __init__(self, config_file, pipeline_arguments):
         super(EnsemblDataDownloadService, self).__init__(self.CONFIG_KEY_DATA_DOWNLOADER, config_file,
                                                          pipeline_arguments)
 
-        self._rest_api = 'http://rest.ensembl.org'
+        self._rest_api = 'https://rest.ensembl.org'
         self._rest_endpoint = '/info/species'
 
         self._skip_protein_database = self.get_data_download_parameters(variable=self.CONFIG_KEY_SKIP_PROTEIN,

diff --git a/pypgatk/ensembl/ensembl.py b/pypgatk/ensembl/ensembl.py
@@ -252,7 +252,6 @@ def get_features(db, feature_id, feature_types=None):
     also genomic positions for all its elements (exons/cds&start_codon)
     :param db:
     :param feature_id:
-    :param biotype_str:
     :param feature_types:
     :return:
     """

diff --git a/pypgatk/proteogenomics/__init__.py b/pypgatk/proteogenomics/__init__.py