Merge branch 'release-0.4.2' into 'master'

Release 0.4.2 to master See merge request tron/addannot!132
TRON-Bioinformatics · Mar 27, 2021 · b6f8f6a · b6f8f6a
2 parents 6917966 + 723d6c9
commit b6f8f6a
Show file tree

Hide file tree

Showing 90 changed files with 64,307 additions and 337 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,84 @@
+FROM python:3.7-slim
+
+# Set environment variables
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1
+
+WORKDIR /app
+
+# install all system packages in a single layer: "apt-get update" shares the RUN with
+# "apt-get install" so that a stale cached package index is never reused, and the index
+# is deleted in the same layer so that it does not end up in the image
+#   build-essential, g++: compilers (g++ builds MixMHCpred below; build-essential is in place before neofox-configure runs)
+#   r-base: provides /usr/bin/Rscript
+#   tcsh: was installed alongside netMHCIIpan; presumably needed by the netMHCpan/netMHCIIpan launchers (confirm)
+#   wget: download of the third-party tools below
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        build-essential \
+        g++ \
+        r-base \
+        tcsh \
+        wget \
+    && rm -rf /var/lib/apt/lists/*
+ENV NEOFOX_RSCRIPT=/usr/bin/Rscript
+
+# copy over Neofox package
+COPY neofox neofox
+COPY setup.py MANIFEST.in setup.cfg requirements.txt LICENSE README.md ./
+
+# build and install neofox package
+RUN python3 setup.py bdist_wheel \
+    && pip3 install --no-cache-dir dist/*.whl
+
+# install BLASTP (the archive is deleted in the same layer that downloads it)
+RUN wget https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.10.1/ncbi-blast-2.10.1+-x64-linux.tar.gz \
+    && tar -xf ncbi-blast-2.10.1+-x64-linux.tar.gz \
+    && rm ncbi-blast-2.10.1+-x64-linux.tar.gz
+ENV NEOFOX_BLASTP=/app/ncbi-blast-2.10.1+/bin/blastp \
+    NEOFOX_MAKEBLASTDB=/app/ncbi-blast-2.10.1+/bin/makeblastdb
+
+# install netmhcpan
+# this archive will need to be downloaded from the owner's site after agreeing their license;
+# ADD unpacks a local tar archive into the target folder, so the tarball itself is not stored in the image
+ADD netMHCpan-4.0a.Linux.tar.gz /app/
+RUN sed -i \
+        -e 's/\/usr\/cbs\/packages\/netMHCpan\/4.0\/netMHCpan-4.0/\/app\/netMHCpan-4.0/g' \
+        -e 's/\/scratch/\/app\/netMHCpan-4.0\/tmp/g' \
+        /app/netMHCpan-4.0/netMHCpan \
+    && mkdir -p /app/netMHCpan-4.0/tmp \
+    && wget http://www.cbs.dtu.dk/services/NetMHCpan-4.0/data.Linux.tar.gz -O /app/netMHCpan-4.0/data.Linux.tar.gz \
+    && tar -xf /app/netMHCpan-4.0/data.Linux.tar.gz -C /app/netMHCpan-4.0 \
+    && rm /app/netMHCpan-4.0/data.Linux.tar.gz
+ENV NEOFOX_NETMHCPAN=/app/netMHCpan-4.0/netMHCpan
+
+# install netmhc2pan
+# this archive will need to be downloaded from the owner's site after agreeing their license
+ADD netMHCIIpan-3.2.Linux.tar.gz /app/
+RUN sed -i \
+        -e 's/\/usr\/cbs\/bio\/src\/netMHCIIpan-3.2/\/app\/netMHCIIpan-3.2/g' \
+        -e 's/\/scratch/\/app\/netMHCIIpan-3.2\/tmp/g' \
+        /app/netMHCIIpan-3.2/netMHCIIpan \
+    && mkdir -p /app/netMHCIIpan-3.2/tmp \
+    && wget http://www.cbs.dtu.dk/services/NetMHCIIpan-3.2/data.Linux.tar.gz -O /app/netMHCIIpan-3.2/data.Linux.tar.gz \
+    && tar -xf /app/netMHCIIpan-3.2/data.Linux.tar.gz -C /app/netMHCIIpan-3.2 \
+    && rm /app/netMHCIIpan-3.2/data.Linux.tar.gz
+ENV NEOFOX_NETMHC2PAN=/app/netMHCIIpan-3.2/netMHCIIpan
+
+# install mixmhcpred (the lib path placeholder in the launcher is replaced before compiling)
+RUN wget https://github.com/GfellerLab/MixMHCpred/archive/v2.1.tar.gz \
+    && tar -xf v2.1.tar.gz \
+    && rm v2.1.tar.gz \
+    && sed -i 's/"YOUR PATH TO MixMHCpred\/lib FOLDER"/\/app\/MixMHCpred-2.1\/lib/g' /app/MixMHCpred-2.1/MixMHCpred \
+    && g++ -O3 /app/MixMHCpred-2.1/lib/MixMHCpred.cc -o /app/MixMHCpred-2.1/lib/MixMHCpred.x
+ENV NEOFOX_MIXMHCPRED=/app/MixMHCpred-2.1/MixMHCpred
+
+# install mixmhc2pred
+RUN wget https://github.com/GfellerLab/MixMHC2pred/archive/v1.2.tar.gz \
+    && tar -xf v1.2.tar.gz \
+    && rm v1.2.tar.gz
+ENV NEOFOX_MIXMHC2PRED=/app/MixMHC2pred-1.2/MixMHC2pred_unix
+
+# configure references (runs last: every NEOFOX_* tool path above is already set at this point)
+RUN neofox-configure --reference-folder /app/neofox-reference --install-r-dependencies
+ENV NEOFOX_REFERENCE_FOLDER=/app/neofox-reference
diff --git a/README.md b/README.md
@@ -40,12 +40,11 @@ NeoFox annotates neoantigen candidate sequences with published neo-epitope descr
 ## Usage from the command line
 
 ````commandline
-neofox --model-file/--candidate-file/--json-file neoantigens_candidates.tab/neoantigens_candidates.json --patient-data patient_data.txt --output-folder /path/to/out --output-prefix out_prefix [--with-short-wide-table] [--with-tall-skinny-table] [--with-json] [--num_cpus]
+neofox --candidate-file/--json-file neoantigens_candidates.tab/neoantigens_candidates.json --patient-data patient_data.txt --output-folder /path/to/out --output-prefix out_prefix [--with-short-wide-table] [--with-tall-skinny-table] [--with-json] [--num_cpus]
 ````
 
 where:
 - `--candidate-file`: tab-separated values table with neoantigen candidates represented by long mutated peptide sequences
-- `--model-file`: tab-separated values table with neoantigens in NeoFox model format
 - `--json-file`: JSON file with neoantigens in NeoFox model format
 - `--patient-id`: patient identifier (*optional*, this will be used if the patient id column `patient` is missing from the candidate input file)
 - `--patient-data`: a table of tab separated values containing metadata on the patient
@@ -58,9 +57,8 @@ where:
 
 ### Input data
 
-#### model-file format  
-
-This is an dummy example of a table with neoantigen candidates in `model-file` format:  
+#### Neoantigen candidates in tabular format
+This is a dummy example of a table with neoantigen candidates:  
 
 | gene  | mutation.wildTypeXmer       | mutation.mutatedXmer        | patientIdentifier | rnaExpression | rnaVariantAlleleFrequency | dnaVariantAlleleFrequency | external_annotation_1 | external_annotation_2 |
 |-------|-----------------------------|-----------------------------|-------------------|---------------|---------------------------|---------------------------|-----------------------|-----------------------|
@@ -75,30 +73,13 @@ where:
 - `mutation.mutatedXmer`: the neoantigen candidate sequence, i.e. the mutated amino acid sequence. The mutation should be located in the middle, flanked by 13 amino acids on both sides (IUPAC 1 respecting casing, eg: A)
 - `mutation.wildTypeXmer`: the equivalent non-mutated amino acid sequence (IUPAC 1 respecting casing, eg: A)
 - `patientIdentifier`: the patient identifier
-- `rnaExpression`: the transcript expression. Should be empty if no value available
+- `rnaExpression`: RNA expression. (**optional**) (see *NOTE*) This value can be in any format chosen by the user (e.g. TPM, RPKM) but it is recommended to be consistent for data that should be compared.
 - `rnaVariantAlleleFrequency`: the variant allele frequency calculated from the RNA (**optional**, this will be estimated using the `dnaVariantAlleleFrequency` if not available)
 - `dnaVariantAlleleFrequency`: the variant allele frequency calculated from the DNA (**optional**)  
 
-#### candidate-file format  
-
-Alternatively, neoantigen candidates can be provided in `candidate-file` format. In principle the columns are the same as in the `model-file`. Of note, `candidate-file` allows for an optional patient id in the data table. This is an dummy example:  
-
-|     patient |     gene  | substitution |     transcript_expression |     +-13_AA_(SNV)_/_-15_AA_to_STOP_(INDEL) |     [WT]_+-13_AA_(SNV)_/_-15_AA_to_STOP_(INDEL) |     VAF_in_tumor |     VAF_in_RNA    |
-|-------------|-----------|--------------|---------------------------|--------------------------------------------|-------------------------------------------------|------------------|-------------------|
-|     Ptx     |     BRCA2 | I547T        |     0.51950689            |     AAAAAAAAAAAAAFAAAAAAAAAAAAA            |     AAAAAAAAAAAAALAAAAAAAAAAAAA                 |     0.294        |     0.857         |
-|     Ptx     |     BRCA2 | E135S        |     0.71575659            |     AAAAAAAAAAAAAMAAAAAAAAAAAAA            |     AAAAAAAAAAAAARAAAAAAAAAAAAA                 |     0.173        |     0.556         |
-
-where:
-- `patient` is the patient id (**optional**). If this column is not provided, `--patient-id` must be given as input when starting NeoFox. Of note, providing this column allows to put the neoantigen candidates of several patients into one table.
-- `gene` is the HGNC gene symbol
-- `substitution`  represents a single amino acid substitution with single letter amino acids (eg: I547T). This column allows the detection of INDEL sequences which are removed from the dataset and not processed.  
-- `+-13_AA_(SNV)_/_-15_AA_to_STOP_(INDEL)` the neoantigen candidate sequence, i.e. the mutated amino acid sequence. The mutation should be located in the middle, flanked by 13 amino acid on both sites (IUPAC 1 respecting casing, eg: A)
-- `[WT]_+-13_AA_(SNV)_/_-15_AA_to_STOP_(INDEL)` the equivalent non-mutated amino acid sequence (IUPAC 1 respecting casing, eg: A)
-- `transcript_expression` the transcript expression. Should be empty if no value available
-- `VAF_in_tumor` variant allele frequency in the DNA (**optional**)
-- `VAF_in_RNA` variant allele frequency in the RNA (**optional**, this will be estimated using the `VAF_in_tumor` if not available)
+**NOTE:** If `rnaExpression` is not provided, the expression will be estimated from the gene expression in the TCGA cohort indicated by `tumorType` in the patient data (see below). 
 
-### JSON format 
+### Neoantigen candidates in JSON format 
 
 Besides tabular format, neoantigen candidates can be provided as a list of neoantigen models in JSON format as shown below. To simplify, only one full neoantigen model is shown:  
 
@@ -118,20 +99,47 @@ Besides tabular format, neoantigen candidates can be provided as a list of neoan
 
 This is a dummy example of a patient file:  
 
-| identifier | mhcIAlleles                                                                  | mhcIIAlleles                                                                                                                                                   | isRnaAvailable | tumorType |
-|------------|------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------|-----------|
-| Ptx        | HLA-A\*03:01,HLA-A\*29:02,HLA-B\*07:02,HLA-B\*44:03,HLA-C\*07:02,HLA-C*16:01 | HLA-DRB1\*03:01,HLA-DRB1\*08:01,HLA-DQA1\*03:01,HLA-DQA1\*05:01,HLA-DQB1\*01:01,HLA-DQB1\*04:02,HLA-DPA1\*01:03,HLA-DPA1\*03:01,HLA-DPB1\*13:01,HLA-DPB1*04:02 | TRUE           | HNSC      |
-| Pty        | HLA-A\*02:01,HLA-A\*30:01,HLA-B\*07:34,HLA-B\*44:03,HLA-C\*07:02,HLA-C*07:02 | HLA-DRB1\*04:02,HLA-DRB1\*08:01,HLA-DQA1\*03:01,HLA-DQA1\*04:01,HLA-DQB1\*03:02,HLA-DQB1\*14:01,HLA-DPA1\*01:03,HLA-DPA1\*02:01,HLA-DPB1\*02:01,HLA-DPB1*04:01 | FALSE          | HNSC      |
+| identifier | mhcIAlleles                                                                  | mhcIIAlleles                                                                                                                                                   | tumorType |
+|------------|------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------|
+| Ptx        | HLA-A\*03:01,HLA-A\*29:02,HLA-B\*07:02,HLA-B\*44:03,HLA-C\*07:02,HLA-C*16:01 | HLA-DRB1\*03:01,HLA-DRB1\*08:01,HLA-DQA1\*03:01,HLA-DQA1\*05:01,HLA-DQB1\*01:01,HLA-DQB1\*04:02,HLA-DPA1\*01:03,HLA-DPA1\*03:01,HLA-DPB1\*13:01,HLA-DPB1*04:02 | HNSC      |
+| Pty        | HLA-A\*02:01,HLA-A\*30:01,HLA-B\*07:34,HLA-B\*44:03,HLA-C\*07:02,HLA-C*07:02 | HLA-DRB1\*04:02,HLA-DRB1\*08:01,HLA-DQA1\*03:01,HLA-DQA1\*04:01,HLA-DQB1\*03:02,HLA-DQB1\*14:01,HLA-DPA1\*01:03,HLA-DPA1\*02:01,HLA-DPB1\*02:01,HLA-DPB1*04:01 | HNSC      |
 
 where:
 - `identifier`: the patient identifier
 - `mhcIAlleles`: comma separated MHC I alleles of the patient for HLA-A, HLA-B and HLA-C. If homozygous, the allele should be added twice.
 - `mhcIIAlleles`: comma separated  MHC II alleles of the patient for HLA-DRB1, HLA-DQA1, HLA-DQB1, HLA-DPA1 and HLA-DPB1. If homozygous, the allele should be added twice.
-- `isRnaAvailable`: whether RNA was available for the analysis. ***If  false, then expression value will be imputed from TCGA gene expression data.*** If true, then the `VAF_in_RNA` field will be used when available, else `VAF_in_DNA` will be used.
 - `tumorType`: tumour entity in TCGA study abbreviation format (https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations). This field is required for expression imputation and at the moment the following tumor types are supported:
 
+
+| Study Name                                                         | Abbreviation |
+|--------------------------------------------------------------------|-------------------|
+| Adrenocortical carcinoma                                           | ACC               |
+| Bladder Urothelial Carcinoma                                       | BLCA              |
+| Breast invasive carcinoma                                          | BRCA              |
+| Cervical squamous cell carcinoma and endocervical adenocarcinoma   | CESC              |
+| Cholangiocarcinoma                                                 | CHOL              |
+| Colon adenocarcinoma                                               | COAD              |
+| Esophageal carcinoma                                               | ESCA              |
+| Glioblastoma multiforme                                            | GBM               |
+| Head and Neck squamous cell carcinoma                              | HNSC              |
+| Kidney Chromophobe                                                 | KICH              |
+| Kidney renal papillary cell carcinoma                              | KIRP              |
+| Liver hepatocellular carcinoma                                     | LIHC              |
+| Lung adenocarcinoma                                                | LUAD              |
+| Lung squamous cell carcinoma                                       | LUSC              |
+| Ovarian serous cystadenocarcinoma                                  | OV                |
+| Pancreatic adenocarcinoma                                          | PAAD              |
+| Prostate adenocarcinoma                                            | PRAD              |
+| Rectum adenocarcinoma                                              | READ              |
+| Sarcoma                                                            | SARC              |
+| Skin Cutaneous Melanoma                                            | SKCM              |
+| Testicular Germ Cell Tumors                                        | TGCT              |
+| Uterine Corpus Endometrial Carcinoma                               | UCEC              |
+
+
+
 ### Output data
 
-The output data is returned in a short wide tab separated values file (`--with-short-wide-table`). Optionally, it can be provided in a tall skinny tab separated values file (`--with-tall-skinny-table`) or in JSON (`--with-json`).  
+The output data is returned in a short wide tab separated values file (`--with-short-wide-table`). Optionally, it can be provided in a tall skinny tab separated values file (`--with-tall-skinny-table`) or in JSON format (`--with-json`).  
 
 For more information, please check out our documentation at [https://neofox.readthedocs.io](https://neofox.readthedocs.io/)