Skip to content

Commit

Permalink
Merge pull request #77 from bigbio/spectrumAI
Browse files Browse the repository at this point in the history
spectrumAI into py-pgatk
  • Loading branch information
ypriverol authored Apr 19, 2024
2 parents bbbbb34 + fd2af1c commit 6faf53c
Show file tree
Hide file tree
Showing 29 changed files with 1,385 additions and 120 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/pythonapp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@ name: Python application

on:
push:
branches: [ master ]
branches: [ spectrumAI ]
pull_request:
branches: [ master ]
branches: [ spectrumAI ]

jobs:
build:

runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
python-version: [3.7, 3.8]

steps:
- uses: actions/checkout@v2
Expand Down Expand Up @@ -70,6 +70,6 @@ jobs:
pip install pytest-cov
python setup.py install
cd pypgatk
pytest --cov=./ --cov-report=xml tests/*
pytest -s --cov=./ --cov-report=xml tests/*
8 changes: 4 additions & 4 deletions .github/workflows/pythonpackage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@ name: Python package

on:
push:
branches: [ master ]
branches: [ spectrumAI ]
pull_request:
branches: [ master ]
branches: [ spectrumAI ]

jobs:
build:

runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
python-version: [3.7, 3.8]

steps:
- uses: actions/checkout@v2
Expand Down Expand Up @@ -47,4 +47,4 @@ jobs:
run: |
python setup.py install
cd pypgatk
python tests/pypgatk_tests.py
python -s tests/pypgatk_tests.py
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,5 @@ pypgatk/database_cbioportal/
pypgatk/database_cosmic/
pypgatk/config/private/
.pypirc
/pypgatk/test_all.bed
/pypgatk/test_annotated.vcf
21 changes: 13 additions & 8 deletions conda-enviroment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,18 @@ channels:
- bioconda
dependencies:
- biopython
- Click=7.0
- gffutils=0.10.1
- numpy=1.16.3
- PyYAML=5.1.2
- requests=2.21.0
- simplejson=3.16.0
- ratelimit=2.2.1
- Click
- gffutils
- numpy
- PyYAML
- requests
- simplejson
- ratelimit
- pathos
- bioconda::pyteomics
- pybedtools=0.8.2
- pybedtools
- matplotlib
- bioconda::pyopenms
- pytest
- tqdm
- pyahocorasick
8 changes: 4 additions & 4 deletions pypgatk/cgenomes/cbioportal_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ def __init__(self, config_data, pipeline_arguments):
self._list_studies = []
self._multithreading = True

self._cbioportal_base_url = 'https://www.cbioportal.org/webservice.do'
self._cancer_studies_command = 'cmd=getCancerStudies'
self._cbioportal_base_url = 'https://www.cbioportal.org/api'
self._cancer_studies_command = 'studies'

self._cbioportal_download_url = 'https://cbioportal-datahub.s3.amazonaws.com'

Expand Down Expand Up @@ -111,13 +111,13 @@ def get_cancer_studies(self):
"""
server = self._cbioportal_base_url
endpoint = self._cancer_studies_command
self._cbioportal_studies = call_api_raw(server + "?" + endpoint).text
self._cbioportal_studies = call_api_raw(server + "/" + endpoint).text
return self._cbioportal_studies

def download_study(self, download_study, url_file_name=None):
"""
This function will download a study from cBioPortal using the study ID
:param download_study: Study to be download, if the study is empty or None, all the studies will be
:param download_study: Study to be downloaded, if the study is empty or None, all the studies will be
downloaded.
:param url_file_name: file tsv containing the urls to be downloaded.
:return: None
Expand Down
33 changes: 33 additions & 0 deletions pypgatk/commands/blast_get_position.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import logging

import click

from pypgatk.toolbox.general import read_yaml_from_file
from pypgatk.commands.utils import print_help
from pypgatk.proteogenomics.blast_get_position import BlastGetPositionService

log = logging.getLogger(__name__)

# CLI entry point for the ``blast_get_position`` command.
#
# Reads an optional YAML configuration file, collects the command-line
# overrides into a ``pipeline_arguments`` dict keyed by the service's CONFIG_*
# constants, and hands both to ``BlastGetPositionService`` which blasts the
# input PSM table against the reference protein database.
#
# Parameters (all supplied by Click, every one may be None):
#   ctx                       Click context (injected by ``pass_context``).
#   config_file               Path to a YAML configuration file.
#   input_psm_to_blast        Input PSM table; required.
#   output_psm                Output PSM table; required.
#   input_reference_database  Reference protein database; required.
#   number_of_processes       Passed through exactly as received from Click
#                             (no ``type=`` is declared, so presumably a
#                             string -- TODO confirm the service converts it).
@click.command('blast_get_position',
               short_help='Blast peptide and reference protein database to find variation sites.')
@click.option('-c', '--config_file', help='Configuration file for the blast_get_position pipeline.')
@click.option('-i', '--input_psm_to_blast', help='The file name of the input PSM table to blast.')
@click.option('-o', '--output_psm', help='The file name of the output PSM table.')
@click.option('-r', '--input_reference_database',
              help='The file name of the reference protein database to blast. '
                   'The reference database includes Uniprot Proteomes with isoforms, ENSEMBL, RefSeq, etc.')
@click.option('-n', '--number_of_processes', help='Used to specify the number of processes. Default is 40.')
@click.pass_context
def blast_get_position(ctx, config_file, input_psm_to_blast, output_psm, input_reference_database,
                       number_of_processes):
    config_data = None
    if config_file is not None:
        config_data = read_yaml_from_file(config_file)

    required = (input_psm_to_blast, input_reference_database, output_psm)
    if any(value is None for value in required):
        print_help()
        # print_help() is defined elsewhere; if it returns instead of exiting,
        # stop here rather than running the pipeline with missing arguments.
        return

    # Only options the user actually supplied override the configuration file.
    # The reference database is guaranteed to be set by the check above.
    pipeline_arguments = {
        BlastGetPositionService.CONFIG_INPUT_REFERENCE_DATABASE: input_reference_database,
    }
    if number_of_processes is not None:
        pipeline_arguments[BlastGetPositionService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes

    blast_get_position_service = BlastGetPositionService(config_data, pipeline_arguments)
    blast_get_position_service.blast(input_psm_to_blast, output_psm)
37 changes: 0 additions & 37 deletions pypgatk/commands/deeplc.py

This file was deleted.

33 changes: 0 additions & 33 deletions pypgatk/commands/msrescore.py

This file was deleted.

39 changes: 39 additions & 0 deletions pypgatk/commands/mztab_class_fdr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import logging

import click

from pypgatk.toolbox.general import read_yaml_from_file
from pypgatk.commands.utils import print_help
from pypgatk.proteogenomics.mztab_class_fdr import MzTabClassFdr

log = logging.getLogger(__name__)

# CLI entry point for the ``mztab_class_fdr`` command.
#
# Reads an optional YAML configuration file, collects the command-line
# overrides into a ``pipeline_arguments`` dict keyed by the MzTabClassFdr
# CONFIG_* constants, and asks ``MzTabClassFdr`` to produce the filtered PSM
# table from the input mzTab.
#
# Parameters (all supplied by Click, every one may be None):
#   ctx                    Click context (injected by ``pass_context``).
#   config_file            Path to a YAML configuration file.
#   input_mztab            Input mzTab file; required.
#   outfile_name           Output PSM table; required.
#   decoy_prefix           Override for the decoy prefix.
#   global_fdr_cutoff      Override for the global-FDR threshold.
#   class_fdr_cutoff       Override for the class-FDR threshold.
#   peptide_groups_prefix  Override for the peptide class groups.
# The cutoffs are passed through exactly as received from Click (no ``type=``
# is declared, so presumably strings -- TODO confirm the service converts them).
@click.command('mztab_class_fdr', short_help='Extract psms from mzTab for global-fdr and class-fdr filtering')
@click.option('-c', '--config_file', help='Configuration file for the fdr peptides pipeline')
@click.option('-i', '--input_mztab', help='The file name of the input mzTab')
@click.option('-o', '--outfile_name', help='The file name of the psm table filtered by global-fdr and class-fdr')
@click.option('-d', '--decoy_prefix', help='Default is "decoy"')
@click.option('-gf', '--global_fdr_cutoff', help='PSM peptide global-fdr cutoff or threshold. Default is 0.01')
@click.option('-cf', '--class_fdr_cutoff', help='PSM peptide class-fdr cutoff or threshold. Default is 0.01')
@click.option('-g', '--peptide_groups_prefix', help="Peptide class "
              "groups e.g. \"{non_canonical:[altorf,pseudo,ncRNA];mutations:[COSMIC,cbiomut];variants:[var_mut,var_rs]}\"")
@click.pass_context
def mztab_class_fdr(ctx, config_file, input_mztab, outfile_name, decoy_prefix, global_fdr_cutoff,
                    class_fdr_cutoff, peptide_groups_prefix):
    config_data = None
    if config_file is not None:
        config_data = read_yaml_from_file(config_file)

    if input_mztab is None or outfile_name is None:
        print_help()
        # print_help() is defined elsewhere; if it returns instead of exiting,
        # stop here rather than running the pipeline with missing arguments.
        return

    # Only options the user actually supplied override the configuration file.
    overrides = (
        (MzTabClassFdr.CONFIG_DECOY_PREFIX, decoy_prefix),
        (MzTabClassFdr.CONFIG_GLOBAL_FDR_CUTOFF, global_fdr_cutoff),
        (MzTabClassFdr.CONFIG_CLASS_FDR_CUTOFF, class_fdr_cutoff),
        (MzTabClassFdr.CONFIG_PEPTIDE_GROUPS_PREFIX, peptide_groups_prefix),
    )
    pipeline_arguments = {key: value for key, value in overrides if value is not None}

    class_fdr_service = MzTabClassFdr(config_data, pipeline_arguments)
    class_fdr_service.form_mztab_class_fdr(input_mztab, outfile_name)
4 changes: 2 additions & 2 deletions pypgatk/commands/peptide_class_fdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
@click.option('-c', '--config_file', help='Configuration to perform Peptide Class FDR')
@click.option('-in', '--input-file', help='input file with the peptides and proteins')
@click.option('-out', '--output-file', help='idxml from openms with filtered peptides and proteins')
@click.option("--file-type")
@click.option("--file-type", help="File types supported by the tool (TSV (.tsv), IDXML (.idxml), MZTAB (.mztab))")
@click.option('--min-peptide-length', help='minimum peptide length')
@click.option('--psm-pep-fdr-cutoff', help="PSM peptide FDR cutoff or threshold")
@click.option('--psm-pep-class-fdr-cutoff', help="PSM class peptide FDR cutoff or threshold")
Expand Down Expand Up @@ -48,7 +48,7 @@ def peptide_class_fdr(ctx, config_file, input_file, output_file, file_type, min_
:param psm_pep_class_fdr_cutoff: Peptide class FDR cutoff
:param peptide_groups_prefix: Peptide groups prefix for the Peptide classes FDR
:param peptide_classes_prefix: Peptide classes
:param file_type: File type to compute the FDR and class FDR.
:param file_type: File type to compute the FDR and class FDR ()
:param disable_class_fdr: Do not compute class FDR and not filtering the PSMs
:return:
"""
Expand Down
55 changes: 55 additions & 0 deletions pypgatk/commands/validate_peptides.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import logging

import click

from pypgatk.toolbox.general import read_yaml_from_file
from pypgatk.proteogenomics.validate_peptides import ValidatePeptidesService
from pypgatk.commands.utils import print_help

log = logging.getLogger(__name__)


# CLI entry point for the ``validate_peptides`` command.
#
# Reads an optional YAML configuration file, collects the command-line
# overrides into a ``pipeline_arguments`` dict keyed by the
# ValidatePeptidesService CONFIG_* constants, builds the service, and runs the
# validation only when an input table, an output file, and at least one mzML
# source (path or file list) were given.
#
# Parameters (all supplied by Click):
#   ctx                  Click context (injected by ``pass_context``).
#   config_file          Path to a YAML configuration file.
#   mzml_path            mzML path; alternative to ``mzml_files``.
#   mzml_files           Comma-separated mzML files; alternative to ``mzml_path``.
#   infile_name          Variant peptide PSM table; required.
#   outfile_name         Output file; required.
#   ions_tolerance       MS2 fragment ion mass accuracy.
#   number_of_processes  Passed through exactly as received from Click (no
#                        ``type=`` declared -- TODO confirm the service
#                        converts it).
#   relative             Flag: the tolerance is given in ppm rather than Da.
#   msgf                 Flag: the input is in MSGF format rather than mzTab.
@click.command('validate_peptides',
               short_help='Command to inspect MS2 spectra of single-substitution peptide identifications')
@click.option('-c', '--config_file', help='Configuration file for the validate peptides pipeline')
@click.option('-p', '--mzml_path',
              help='The mzml file path. You only need to use either mzml_path or mzml_files')
@click.option('-f', '--mzml_files',
              help='The mzml files. Different files are separated by ",". '
                   'You only need to use either mzml_path or mzml_files')
@click.option('-i', '--infile_name', help='Variant peptide PSMs table')
@click.option('-o', '--outfile_name', help='Output file for the results')
@click.option('-ion', '--ions_tolerance', help='MS2 fragment ions mass accuracy')
@click.option('-n', '--number_of_processes', help='Used to specify the number of processes. Default is 40.')
@click.option('-r', '--relative', help='When using ppm as ions_tolerance (not Da), it needs to be turned on',
              is_flag=True)
@click.option('-msgf', '--msgf',
              help='If it is the standard format of MSGF output, please turn on this switch, otherwise it defaults to mzTab format',
              is_flag=True)
@click.pass_context
def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance,
                      number_of_processes, relative, msgf):
    config_data = None
    if config_file is not None:
        config_data = read_yaml_from_file(config_file)

    # The run needs an input table, an output file and one source of mzML data.
    validate_flag = bool(infile_name and (mzml_path or mzml_files) and outfile_name)
    if not validate_flag:
        print_help()

    # Only values that are not None override the configuration file.
    # NOTE(review): --relative and --msgf are declared with is_flag=True, so
    # they presumably always arrive as booleans and would then always be
    # forwarded (overriding any value from the config file) -- confirm this is
    # intended for the Click version in use.
    overrides = (
        (ValidatePeptidesService.CONFIG_MZML_PATH, mzml_path),
        (ValidatePeptidesService.CONFIG_MZML_FILES, mzml_files),
        (ValidatePeptidesService.CONFIG_IONS_TOLERANCE, ions_tolerance),
        (ValidatePeptidesService.CONFIG_NUMBER_OF_PROCESSES, number_of_processes),
        (ValidatePeptidesService.CONFIG_RELATIVE, relative),
        (ValidatePeptidesService.CONFIG_MSGF, msgf),
    )
    pipeline_arguments = {}
    for key, value in overrides:
        if value is not None:
            pipeline_arguments[key] = value

    validate_peptides_service = ValidatePeptidesService(config_data, pipeline_arguments)
    # Guarded so that nothing runs with missing arguments should print_help()
    # return instead of exiting.
    if validate_flag:
        validate_peptides_service.validate(infile_name, outfile_name)
4 changes: 2 additions & 2 deletions pypgatk/config/cbioportal_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ cbioportal_data_downloader:
output_directory: database_cbioportal
list_studies: []
cbioportal_api:
base_url: https://www.cbioportal.org/webservice.do
cancer_studies: cmd=getCancerStudies
base_url: https://www.cbioportal.org/api
cancer_studies: studies
cbioportal_download_url: https://cbioportal-datahub.s3.amazonaws.com
logger:
formatters:
Expand Down
2 changes: 1 addition & 1 deletion pypgatk/config/ensembl_downloader_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ ensembl_data_downloader:
- chr_patch_hapl_scaff.
file_extension: gtf
ensembl_api:
server: http://rest.ensembl.org
server: https://rest.ensembl.org
species: /info/species
logger:
formatters:
Expand Down
2 changes: 1 addition & 1 deletion pypgatk/ensembl/data_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def __init__(self, config_file, pipeline_arguments):
super(EnsemblDataDownloadService, self).__init__(self.CONFIG_KEY_DATA_DOWNLOADER, config_file,
pipeline_arguments)

self._rest_api = 'http://rest.ensembl.org'
self._rest_api = 'https://rest.ensembl.org'
self._rest_endpoint = '/info/species'

self._skip_protein_database = self.get_data_download_parameters(variable=self.CONFIG_KEY_SKIP_PROTEIN,
Expand Down
1 change: 0 additions & 1 deletion pypgatk/ensembl/ensembl.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,6 @@ def get_features(db, feature_id, feature_types=None):
also genomic positions for all its elements (exons/cds&start_codon)
:param db:
:param feature_id:
:param biotype_str:
:param feature_types:
:return:
"""
Expand Down
Empty file.
Loading

0 comments on commit 6faf53c

Please sign in to comment.