From 8e305732a2197cdbee643bb9a489d64120ad2f76 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Fri, 19 Jan 2024 14:38:56 -0500 Subject: [PATCH 1/3] style: run pre-commit on all files --- .gitignore | 2 +- README.md | 131 ++-- config/genomes/biowulf/hg19_19.json | 2 +- config/genomes/biowulf/hg38_30.json | 2 +- config/genomes/biowulf/hg38_34.json | 2 +- config/genomes/biowulf/hg38_36.json | 2 +- config/genomes/biowulf/hg38_38.json | 2 +- config/genomes/biowulf/mm10_M23.json | 2 +- config/templates/tools.json | 2 +- docker/multiqc/Dockerfile | 16 +- docs/RNA-seq/Resources.md | 120 ++-- docs/RNA-seq/TLDR-RNA-seq.md | 45 +- docs/RNA-seq/Theory.md | 138 +++-- docs/RNA-seq/build.md | 237 ++++---- docs/RNA-seq/cache.md | 62 +- docs/RNA-seq/images/RENEE_Pipeline.svg | 2 +- docs/RNA-seq/run.md | 246 ++++---- docs/RNA-seq/unlock.md | 44 +- docs/assets/icons/analytics-black-48dp.svg | 2 +- docs/assets/icons/analytics-white-48dp.svg | 2 +- docs/dev/coming-soon.md | 2 +- docs/dev/lorem_ipsum.md | 76 ++- docs/general-questions.md | 7 +- docs/index.md | 58 +- docs/license.md | 2 +- docs/troubleshooting.md | 9 +- mkdocs.yml | 20 +- renee | 78 +-- resources/RENEE_Pipeline.svg | 2 +- resources/biowulf/fastq_screen.conf | 38 +- resources/biowulf/fastq_screen_2.conf | 38 +- resources/builder | 18 +- resources/cacher | 28 +- resources/clean_gtf.py | 160 ++--- resources/download_dme_files | 84 +-- resources/frce/fastq_screen.conf | 34 +- resources/frce/fastq_screen_2.conf | 32 +- resources/gff3togtf.py | 175 ++++-- resources/jobby | 559 +++++++++--------- resources/multiqc_config.yaml | 259 ++++---- resources/overview.svg | 2 +- resources/upload_to_nidap | 90 +-- workflow/rules/build.smk | 59 +- workflow/rules/nidap.smk | 4 +- workflow/rules/single-end.smk | 20 +- workflow/scripts/PcaReport.Rmd | 539 +++++++++-------- workflow/scripts/bam_count_concord_stats.py | 4 +- .../scripts/builder/create_rRNA_intervals.py | 73 ++- .../builder/gene2transcripts_add_length.py | 54 +- .../scripts/builder/generate_qualimap_ref.py | 70 ++- workflow/scripts/builder/get_gene_annotate.py | 54 +- .../scripts/builder/get_isoform_annotate.py | 65 +- .../scripts/builder/get_karyoplot_beds.py | 82 +-- .../builder/get_karyoplot_gene_coordinates.py | 49 +- .../builder/gtf2protein_coding_genes.py | 95 +-- workflow/scripts/builder/jsonmaker.py | 75 ++- workflow/scripts/builder/make_geneinfo.py | 54 +- workflow/scripts/builder/make_refFlat.py | 75 +-- workflow/scripts/common.py | 71 ++- workflow/scripts/create_tin_matrix.py | 89 ++- workflow/scripts/do_run_rMATS | 97 ++- workflow/scripts/files2spreadsheet.py | 267 +++++---- workflow/scripts/get_flowcell_lanes.py | 76 ++- workflow/scripts/get_read_length.py | 41 +- workflow/scripts/merge_rsem_results.py | 92 +-- workflow/scripts/optimal_read_length.py | 78 ++- workflow/scripts/pcacall.R | 19 +- workflow/scripts/phred_encoding.py | 111 +++- workflow/scripts/pyparser.py | 416 +++++++------ workflow/scripts/rNA.R | 41 +- workflow/scripts/rNA_flowcells.Rmd | 399 +++++++------ workflow/scripts/rNA_groups.Rmd | 407 +++++++------ workflow/scripts/rsemcounts.R | 134 +++-- 73 files changed, 3582 insertions(+), 2960 deletions(-) diff --git a/.gitignore b/.gitignore index 817fbf7..9cce561 100644 --- a/.gitignore +++ b/.gitignore @@ -159,4 +159,4 @@ site/ # other -**/.koparde* \ No newline at end of file +**/.koparde* diff --git a/README.md b/README.md index 8c7bb9c..a84a34e 100644 --- a/README.md +++ b/README.md @@ -3,79 +3,82 @@ An open-source, reproducible, and scalable solution for analyzing 
RNA-seq data. ### Table of Contents + - [RENEE - **R**na s**E**quencing a**N**alysis pip**E**lin**E**](#renee---rna-sequencing-analysis-pipeline) - - [Table of Contents](#table-of-contents) - - [1. Introduction](#1-introduction) - - [2. Overview](#2-overview) - - [2.1 RENEE Pipeline](#21-renee-pipeline) - - [2.2 Reference Genomes](#22-reference-genomes) - - [2.3 Dependencies](#23-dependencies) - - [3. Run RENEE pipeline](#3-run-renee-pipeline) - - [3.1 Biowulf](#31-biowulf) - - [3.2 FRCE](#32-frce) - - [4. References](#4-references) - - [5. Version Notes](https://github.com/CCBR/RENEE/blob/main/CHANGELOG.md) - -### 1. Introduction -RNA-sequencing (*RNA-seq*) has a wide variety of applications. This popular transcriptome profiling technique can be used to quantify gene and isoform expression, detect alternative splicing events, predict gene-fusions, call variants and much more. - -**RENEE** is a comprehensive, open-source RNA-seq pipeline that relies on technologies like [Docker20](https://www.docker.com/why-docker) and [Singularity21... now called Apptainer](https://apptainer.org/docs/) to maintain the highest-level of reproducibility. The pipeline consists of a series of data processing and quality-control steps orchestrated by [Snakemake19](https://snakemake.readthedocs.io/en/stable/), a flexible and scalable workflow management system, to submit jobs to a cluster or cloud provider. + - [Table of Contents](#table-of-contents) + - [1. Introduction](#1-introduction) + - [2. Overview](#2-overview) + - [2.1 RENEE Pipeline](#21-renee-pipeline) + - [2.2 Reference Genomes](#22-reference-genomes) + - [2.3 Dependencies](#23-dependencies) + - [3. Run RENEE pipeline](#3-run-renee-pipeline) + - [3.1 Biowulf](#31-biowulf) + - [3.2 FRCE](#32-frce) + - [4. References](#4-references) + - [5. Version Notes](https://github.com/CCBR/RENEE/blob/main/CHANGELOG.md) -![RENEE_overview_diagram](./resources/overview.svg) -**Fig 1. Run locally on a compute instance, on-premise using a cluster, or on the cloud using AWS.** A user can define the method or mode of execution. The pipeline can submit jobs to a cluster using a job scheduler like SLURM, or run on AWS using Tibanna (feature coming soon!). A hybrid approach ensures the pipeline is accessible to all users. As an optional step, relevelant output files and metadata can be stored in object storage using HPC DME (NIH users) or Amazon S3 for archival purposes (coming soon!). +### 1. Introduction + +RNA-sequencing (_RNA-seq_) has a wide variety of applications. This popular transcriptome profiling technique can be used to quantify gene and isoform expression, detect alternative splicing events, predict gene-fusions, call variants and much more. +**RENEE** is a comprehensive, open-source RNA-seq pipeline that relies on technologies like [Docker20](https://www.docker.com/why-docker) and [Singularity21... now called Apptainer](https://apptainer.org/docs/) to maintain the highest-level of reproducibility. The pipeline consists of a series of data processing and quality-control steps orchestrated by [Snakemake19](https://snakemake.readthedocs.io/en/stable/), a flexible and scalable workflow management system, to submit jobs to a cluster or cloud provider. + +![RENEE_overview_diagram](./resources/overview.svg) +**Fig 1. Run locally on a compute instance, on-premise using a cluster, or on the cloud using AWS.** A user can define the method or mode of execution. 
The pipeline can submit jobs to a cluster using a job scheduler like SLURM, or run on AWS using Tibanna (feature coming soon!). A hybrid approach ensures the pipeline is accessible to all users. As an optional step, relevelant output files and metadata can be stored in object storage using HPC DME (NIH users) or Amazon S3 for archival purposes (coming soon!). ### 2. Overview #### 2.1 RENEE Pipeline -A bioinformatics pipeline is more than the sum of its data processing steps. A pipeline without quality-control steps provides a myopic view of the potential sources of variation within your data (i.e., biological verses technical sources of variation). RENEE pipeline is composed of a series of quality-control and data processing steps. + +A bioinformatics pipeline is more than the sum of its data processing steps. A pipeline without quality-control steps provides a myopic view of the potential sources of variation within your data (i.e., biological verses technical sources of variation). RENEE pipeline is composed of a series of quality-control and data processing steps. The accuracy of the downstream interpretations made from transcriptomic data are highly dependent on initial sample library. Unwanted sources of technical variation, which if not accounted for properly, can influence the results. RENEE's comprehensive quality-control helps ensure your results are reliable and _reproducible across experiments_. In the data processing steps, RENEE quantifies gene and isoform expression and predicts gene fusions. Please note that the detection of alternative splicing events and variant calling will be incorporated in a later release. +![RNA-seq quantification pipeline](./resources/RENEE_Pipeline.svg) **Fig 2. An Overview of RENEE Pipeline.** Gene and isoform counts are quantified and a series of QC-checks are performed to assess the quality of the data. This pipeline stops at the generation of a raw counts matrix and gene-fusion calling. To run the pipeline, a user must select their raw data, a reference genome, and output directory (i.e., the location where the pipeline performs the analysis). Quality-control information is summarized across all samples in a MultiQC report. -![RNA-seq quantification pipeline](./resources/RENEE_Pipeline.svg) **Fig 2. An Overview of RENEE Pipeline.** Gene and isoform counts are quantified and a series of QC-checks are performed to assess the quality of the data. This pipeline stops at the generation of a raw counts matrix and gene-fusion calling. To run the pipeline, a user must select their raw data, a reference genome, and output directory (i.e., the location where the pipeline performs the analysis). Quality-control information is summarized across all samples in a MultiQC report. +**Quality Control** +[_FastQC_2](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) is used to assess the sequencing quality. FastQC is run twice, before and after adapter trimming. It generates a set of basic statistics to identify problems that can arise during sequencing or library preparation. FastQC will summarize per base and per read QC metrics such as quality scores and GC content. It will also summarize the distribution of sequence lengths and will report the presence of adapter sequences. -**Quality Control** -[*FastQC*2](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) is used to assess the sequencing quality. FastQC is run twice, before and after adapter trimming. 
It generates a set of basic statistics to identify problems that can arise during sequencing or library preparation. FastQC will summarize per base and per read QC metrics such as quality scores and GC content. It will also summarize the distribution of sequence lengths and will report the presence of adapter sequences.

[_Kraken2_14](http://ccb.jhu.edu/software/kraken2/) and [_FastQ Screen_17](https://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/) are used to screen for various sources of contamination. During the process of sample collection to library preparation, there is a risk of introducing unwanted sources of DNA. FastQ Screen compares your sequencing data to a set of different reference genomes to determine if there is contamination. It allows a user to see if the composition of your library matches what you expect. Also, if there are high levels of microbial contamination, Kraken can provide an estimation of the taxonomic composition. Kraken can be used in conjunction with [_Krona_15](https://github.com/marbl/Krona/wiki/KronaTools) to produce interactive reports.

[_Preseq_1](http://smithlabresearch.org/software/preseq/) is used to estimate the complexity of the library for each sample. If the duplication rate is very high, the overall library complexity will be low. Low library complexity could signal an issue with library preparation where very little input RNA was over-amplified or the sample may be degraded.

[_Picard_10](https://broadinstitute.github.io/picard/) can be used to estimate the duplication rate, and it has another particularly useful sub-command called CollectRNAseqMetrics which reports the number and percentage of reads that align to various regions, such as coding, intronic, UTR, intergenic and ribosomal regions. This is particularly useful as you would expect a library constructed with poly(A)-selection to have a high percentage of reads that map to coding regions. Picard CollectRNAseqMetrics will also report the uniformity of coverage across all genes, which is useful for determining whether a sample has a 3' bias (observed in poly(A)-selection libraries containing degraded RNA).
+[_Picard_10](https://broadinstitute.github.io/picard/) can be used to estimate the duplication rate, and it has another particularly useful sub-command called CollectRNAseqMetrics which reports the number and percentage of reads that align to various regions: such as coding, intronic, UTR, intergenic and ribosomal regions. This is particularly useful as you would expect a library constructed with ploy(A)-selection to have a high percentage of reads that map to coding regions. Picard CollectRNAseqMetrics will also report the uniformity of coverage across all genes, which is useful for determining whether a sample has a 3' bias (observed in ploy(A)-selection libraries containing degraded RNA). -[*RSeQC*9](http://rseqc.sourceforge.net/) is another particularity useful package that is tailored for RNA-seq data. It is used to calculate the inner distance between paired-end reads and calculate TIN values for a set of canonical protein-coding transcripts. A median TIN value is calucated for each sample, which analogous to a computationally derived RIN. +[_RSeQC_9](http://rseqc.sourceforge.net/) is another particularity useful package that is tailored for RNA-seq data. It is used to calculate the inner distance between paired-end reads and calculate TIN values for a set of canonical protein-coding transcripts. A median TIN value is calucated for each sample, which analogous to a computationally derived RIN. -[MultiQC11](https://multiqc.info/) is used to aggreate the results of each tool into a single interactive report. +[MultiQC11](https://multiqc.info/) is used to aggreate the results of each tool into a single interactive report. **Quantification** -[*Cutadapt*3](https://cutadapt.readthedocs.io/en/stable/) is used to remove adapter sequences, perform quality trimming, and remove very short sequences that would otherwise multi-map all over the genome prior to alignment. +[_Cutadapt_3](https://cutadapt.readthedocs.io/en/stable/) is used to remove adapter sequences, perform quality trimming, and remove very short sequences that would otherwise multi-map all over the genome prior to alignment. -[*STAR*4](https://github.com/alexdobin/STAR) is used to align reads to the reference genome. The RENEE pipeline runs STAR in a two-passes where splice-junctions are collected and aggregated across all samples and provided to the second-pass of STAR. In the second pass of STAR, the splice-junctions detected in the first pass are inserted into the genome indices prior to alignment. +[_STAR_4](https://github.com/alexdobin/STAR) is used to align reads to the reference genome. The RENEE pipeline runs STAR in a two-passes where splice-junctions are collected and aggregated across all samples and provided to the second-pass of STAR. In the second pass of STAR, the splice-junctions detected in the first pass are inserted into the genome indices prior to alignment. -[*RSEM*5](https://github.com/deweylab/RSEM) is used to quantify gene and isoform expression. The expected counts from RSEM are merged across samples to create a two counts matrices for gene counts and isoform counts. +[_RSEM_5](https://github.com/deweylab/RSEM) is used to quantify gene and isoform expression. The expected counts from RSEM are merged across samples to create a two counts matrices for gene counts and isoform counts. -[*Arriba*22](https://arriba.readthedocs.io/en/latest/) is used to predict gene-fusion events. The pre-built human and mouse reference genomes use Arriba blacklists to reduce the false-positive rate. 
+[_Arriba_22](https://arriba.readthedocs.io/en/latest/) is used to predict gene-fusion events. The pre-built human and mouse reference genomes use Arriba blacklists to reduce the false-positive rate. #### 2.2 Reference Genomes + Reference files are pulled from an S3 bucket to the compute instance or local filesystem prior to execution. RENEE comes bundled with pre-built reference files for the following genomes: -| Name | Species | Genome | Annotation | -| -------- | ------- | ------------------ | -------- | -| hg38_30 | Homo sapiens (human) | [GRCh38 or hg38](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_28/GRCh38.primary_assembly.genome.fa.gz) | [Gencode6 Release 30](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/gencode.v30.annotation.gtf.gz) | -| hg38_34 | Homo sapiens (human) | [GRCh38 or hg38](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_28/GRCh38.primary_assembly.genome.fa.gz) | [Gencode6 Release 34](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_34/gencode.v34.annotation.gtf.gz) | -| hg38_38 | Homo sapiens (human) | [GRCh38 or hg38](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_28/GRCh38.primary_assembly.genome.fa.gz) | [Gencode6 Release 38](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.annotation.gtf.gz) | -| hg38_41 | Homo sapiens (human) | [GRCh38 or hg38](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_28/GRCh38.primary_assembly.genome.fa.gz) | [Gencode6 Release 41](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz) | +| Name | Species | Genome | Annotation | +| -------- | ------- | ------------------ | -------- | +| hg38_30 | Homo sapiens (human) | [GRCh38 or hg38](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_28/GRCh38.primary_assembly.genome.fa.gz) | [Gencode6 Release 30](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/gencode.v30.annotation.gtf.gz) | +| hg38_34 | Homo sapiens (human) | [GRCh38 or hg38](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_28/GRCh38.primary_assembly.genome.fa.gz) | [Gencode6 Release 34](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_34/gencode.v34.annotation.gtf.gz) | +| hg38_38 | Homo sapiens (human) | [GRCh38 or hg38](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_28/GRCh38.primary_assembly.genome.fa.gz) | [Gencode6 Release 38](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.annotation.gtf.gz) | +| hg38_41 | Homo sapiens (human) | [GRCh38 or hg38](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_28/GRCh38.primary_assembly.genome.fa.gz) | [Gencode6 Release 41](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz) | | mm10_M21 | Mus musculus (mouse) | [GRCm38 or mm10](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M18/GRCm38.primary_assembly.genome.fa.gz) | [Gencode6 Release M21](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M21/gencode.vM21.annotation.gtf.gz) | | mm10_M23 | Mus musculus (mouse) | [GRCm38 or mm10](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M18/GRCm38.primary_assembly.genome.fa.gz) | [Gencode6 Release M23](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M23/gencode.vM23.annotation.gtf.gz) | | mm10_M25 | Mus musculus (mouse) | [GRCm38 or 
mm10](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M18/GRCm38.primary_assembly.genome.fa.gz) | [Gencode6 Release M25](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gtf.gz) | -> **Warning:** This section contains FTP links for downloading each reference file. Open the link in a new tab to start a download. DO NOT download if you are running on Biowulf or FRCE. Pre-built indices are already available for these genome+annotation combinations. This can be done by using the values under the _Name_ column above as the `--genome` renee argument. +> **Warning:** This section contains FTP links for downloading each reference file. Open the link in a new tab to start a download. DO NOT download if you are running on Biowulf or FRCE. Pre-built indices are already available for these genome+annotation combinations. This can be done by using the values under the _Name_ column above as the `--genome` renee argument. > **Note:** These were the only annotation versions available at the time of writing this documentation. Newer annotations versions may be added upon request and may be already available. Please contact [Vishal Koparde](mailto:vishal.koparde@nih.gov) for details. #### 2.3 Dependencies -**Requires:** `singularity>=3.5` `snakemake>=6.0` + +**Requires:** `singularity>=3.5` `snakemake>=6.0` > **NOTE:** > Biowulf users: @@ -92,6 +95,7 @@ RENEE comes bundled with pre-built reference files for the following genomes: ### 3. Run RENEE pipeline #### 3.1 Biowulf + ```bash # RENEE is configured to use different execution backends: local or slurm # view the help page for more information @@ -119,6 +123,7 @@ renee run --input .tests/*.R?.fastq.gz --output /data/$USER/RNA_hg38 --genome hg ``` #### 3.2 FRCE + ```bash # grab an interactive node srun --export all --pty --x11 bash @@ -129,37 +134,37 @@ srun --export all --pty --x11 bash # run renee renee --help ``` +
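
> **Note:** The FRCE snippet above stops at `renee --help`. The block below is only an illustrative sketch that mirrors the Biowulf run command in section 3.1: the output path is a placeholder, the genome should be one of the pre-built names from section 2.2 (e.g. `hg38_30`), and flag names should be confirmed against `renee run --help` on your system.

```bash
# Sketch only: mirrors the Biowulf example in section 3.1.
# Dry-run first to preview the jobs, then drop --dry-run to submit for real.
renee run \
    --input .tests/*.R?.fastq.gz \
    --output /path/to/output/RNA_hg38 \
    --genome hg38_30 \
    --mode slurm \
    --dry-run
```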

Back to Top


-### 4. References +### 4. References -**1.** Daley, T. and A.D. Smith, Predicting the molecular complexity of sequencing libraries. Nat Methods, 2013. 10(4): p. 325-7. +**1.** Daley, T. and A.D. Smith, Predicting the molecular complexity of sequencing libraries. Nat Methods, 2013. 10(4): p. 325-7. **2.** Andrews, S. (2010). FastQC: a quality control tool for high throughput sequence data. -**3.** Martin, M. (2011). "Cutadapt removes adapter sequences from high-throughput sequencing reads." EMBnet 17(1): 10-12. -**4.** Dobin, A., et al., STAR: ultrafast universal RNA-seq aligner. Bioinformatics, 2013. 29(1): p. 15-21. -**5.** Li, B. and C.N. Dewey, RSEM: accurate transcript quantification from RNA-Seq data with or without a reference genome. BMC Bioinformatics, 2011. 12: p. 323. -**6.** Harrow, J., et al., GENCODE: the reference human genome annotation for The ENCODE Project. Genome Res, 2012. 22(9): p. 1760-74. -**7.** Law, C.W., et al., voom: Precision weights unlock linear model analysis tools for RNA-seq read counts. Genome Biol, 2014. 15(2): p. R29. -**8.** Smyth, G.K., Linear models and empirical bayes methods for assessing differential expression in microarray experiments. Stat Appl Genet Mol Biol, 2004. 3: p. Article3. -**9.** Wang, L., et al. (2012). "RSeQC: quality control of RNA-seq experiments." Bioinformatics 28(16): 2184-2185. -**10.** The Picard toolkit. https://broadinstitute.github.io/picard/. -**11.** Ewels, P., et al. (2016). "MultiQC: summarize analysis results for multiple tools and samples in a single report." Bioinformatics 32(19): 3047-3048. -**12.** R Core Team (2018). R: A Language and Environment for Statistical Computing. Vienna, Austria, R Foundation for Statistical Computing. -**13.** Li, H., et al. (2009). "The Sequence Alignment/Map format and SAMtools." Bioinformatics 25(16): 2078-2079. -**14.** Wood, D. E. and S. L. Salzberg (2014). "Kraken: ultrafast metagenomic sequence classification using exact alignments." Genome Biol 15(3): R46. -**15.** Ondov, B. D., et al. (2011). "Interactive metagenomic visualization in a Web browser." BMC Bioinformatics 12(1): 385. -**16.** Okonechnikov, K., et al. (2015). "Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data." Bioinformatics 32(2): 292-294. -**17.** Wingett, S. and S. Andrews (2018). "FastQ Screen: A tool for multi-genome mapping and quality control." F1000Research 7(2): 1338. -**18.** Robinson, M. D., et al. (2009). "edgeR: a Bioconductor package for differential expression analysis of digital gene expression data." Bioinformatics 26(1): 139-140. -**19.** Koster, J. and S. Rahmann (2018). "Snakemake-a scalable bioinformatics workflow engine." Bioinformatics 34(20): 3600. -**20.** Merkel, D. (2014). Docker: lightweight linux containers for consistent development and deployment. Linux Journal, 2014(239), 2. -**21.** Kurtzer GM, Sochat V, Bauer MW (2017). Singularity: Scientific containers for mobility of compute. PLoS ONE 12(5): e0177459. -**22.** Haas, B. J., et al. (2019). "Accuracy assessment of fusion transcript detection via read-mapping and de novo fusion transcript assembly-based methods." Genome Biology 20(1): 213. - +**3.** Martin, M. (2011). "Cutadapt removes adapter sequences from high-throughput sequencing reads." EMBnet 17(1): 10-12. +**4.** Dobin, A., et al., STAR: ultrafast universal RNA-seq aligner. Bioinformatics, 2013. 29(1): p. 15-21. +**5.** Li, B. and C.N. Dewey, RSEM: accurate transcript quantification from RNA-Seq data with or without a reference genome. 
BMC Bioinformatics, 2011. 12: p. 323. +**6.** Harrow, J., et al., GENCODE: the reference human genome annotation for The ENCODE Project. Genome Res, 2012. 22(9): p. 1760-74. +**7.** Law, C.W., et al., voom: Precision weights unlock linear model analysis tools for RNA-seq read counts. Genome Biol, 2014. 15(2): p. R29. +**8.** Smyth, G.K., Linear models and empirical bayes methods for assessing differential expression in microarray experiments. Stat Appl Genet Mol Biol, 2004. 3: p. Article3. +**9.** Wang, L., et al. (2012). "RSeQC: quality control of RNA-seq experiments." Bioinformatics 28(16): 2184-2185. +**10.** The Picard toolkit. https://broadinstitute.github.io/picard/. +**11.** Ewels, P., et al. (2016). "MultiQC: summarize analysis results for multiple tools and samples in a single report." Bioinformatics 32(19): 3047-3048. +**12.** R Core Team (2018). R: A Language and Environment for Statistical Computing. Vienna, Austria, R Foundation for Statistical Computing. +**13.** Li, H., et al. (2009). "The Sequence Alignment/Map format and SAMtools." Bioinformatics 25(16): 2078-2079. +**14.** Wood, D. E. and S. L. Salzberg (2014). "Kraken: ultrafast metagenomic sequence classification using exact alignments." Genome Biol 15(3): R46. +**15.** Ondov, B. D., et al. (2011). "Interactive metagenomic visualization in a Web browser." BMC Bioinformatics 12(1): 385. +**16.** Okonechnikov, K., et al. (2015). "Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data." Bioinformatics 32(2): 292-294. +**17.** Wingett, S. and S. Andrews (2018). "FastQ Screen: A tool for multi-genome mapping and quality control." F1000Research 7(2): 1338. +**18.** Robinson, M. D., et al. (2009). "edgeR: a Bioconductor package for differential expression analysis of digital gene expression data." Bioinformatics 26(1): 139-140. +**19.** Koster, J. and S. Rahmann (2018). "Snakemake-a scalable bioinformatics workflow engine." Bioinformatics 34(20): 3600. +**20.** Merkel, D. (2014). Docker: lightweight linux containers for consistent development and deployment. Linux Journal, 2014(239), 2. +**21.** Kurtzer GM, Sochat V, Bauer MW (2017). Singularity: Scientific containers for mobility of compute. PLoS ONE 12(5): e0177459. +**22.** Haas, B. J., et al. (2019). "Accuracy assessment of fusion transcript detection via read-mapping and de novo fusion transcript assembly-based methods." Genome Biology 20(1): 213.

diff --git a/config/genomes/biowulf/hg19_19.json b/config/genomes/biowulf/hg19_19.json index c3b711e..0e94324 100644 --- a/config/genomes/biowulf/hg19_19.json +++ b/config/genomes/biowulf/hg19_19.json @@ -22,4 +22,4 @@ "FUSIONPROTDOMAIN": "s3://nciccbr/Resources/RNA-seq/arriba/protein_domains_hg19_hs37d5_GRCh37_v2.0.0.gff3" } } -} \ No newline at end of file +} diff --git a/config/genomes/biowulf/hg38_30.json b/config/genomes/biowulf/hg38_30.json index 3395429..6ccb20e 100644 --- a/config/genomes/biowulf/hg38_30.json +++ b/config/genomes/biowulf/hg38_30.json @@ -22,4 +22,4 @@ "FUSIONPROTDOMAIN": "s3://nciccbr/Resources/RNA-seq/arriba/protein_domains_hg38_GRCh38_v2.0.0.gff3" } } -} \ No newline at end of file +} diff --git a/config/genomes/biowulf/hg38_34.json b/config/genomes/biowulf/hg38_34.json index 2d078bd..958a13b 100644 --- a/config/genomes/biowulf/hg38_34.json +++ b/config/genomes/biowulf/hg38_34.json @@ -22,4 +22,4 @@ "FUSIONPROTDOMAIN": "s3://nciccbr/Resources/RNA-seq/arriba/protein_domains_hg38_GRCh38_v2.0.0.gff3" } } -} \ No newline at end of file +} diff --git a/config/genomes/biowulf/hg38_36.json b/config/genomes/biowulf/hg38_36.json index 286d3f2..e921d53 100644 --- a/config/genomes/biowulf/hg38_36.json +++ b/config/genomes/biowulf/hg38_36.json @@ -22,4 +22,4 @@ "FUSIONPROTDOMAIN": "s3://nciccbr/Resources/RNA-seq/arriba/protein_domains_hg38_GRCh38_v2.0.0.gff3" } } -} \ No newline at end of file +} diff --git a/config/genomes/biowulf/hg38_38.json b/config/genomes/biowulf/hg38_38.json index 00d78be..4c7859d 100644 --- a/config/genomes/biowulf/hg38_38.json +++ b/config/genomes/biowulf/hg38_38.json @@ -22,4 +22,4 @@ "FUSIONPROTDOMAIN": "s3://nciccbr/Resources/RNA-seq/arriba/protein_domains_hg38_GRCh38_v2.0.0.gff3" } } -} \ No newline at end of file +} diff --git a/config/genomes/biowulf/mm10_M23.json b/config/genomes/biowulf/mm10_M23.json index 052a4e5..40c0e07 100644 --- a/config/genomes/biowulf/mm10_M23.json +++ b/config/genomes/biowulf/mm10_M23.json @@ -22,4 +22,4 @@ "FUSIONPROTDOMAIN": "s3://nciccbr/Resources/RNA-seq/arriba/protein_domains_mm10_GRCm38_v2.0.0.gff3" } } -} \ No newline at end of file +} diff --git a/config/templates/tools.json b/config/templates/tools.json index d30f2f2..05fe5b7 100644 --- a/config/templates/tools.json +++ b/config/templates/tools.json @@ -61,4 +61,4 @@ "WIGTYPE": "None" } } -} \ No newline at end of file +} diff --git a/docker/multiqc/Dockerfile b/docker/multiqc/Dockerfile index 1e6dbf0..0ad9c26 100644 --- a/docker/multiqc/Dockerfile +++ b/docker/multiqc/Dockerfile @@ -13,19 +13,19 @@ LABEL maintainer=kuhnsa@nih.gov # - matplotlib (pypi) # - XlsxWriter (pypi) -# Create Container filesystem specific +# Create Container filesystem specific # working directory and opt directories -# to avoid collisions with host filesyetem +# to avoid collisions with host filesyetem RUN mkdir -p /opt2 && mkdir -p /data2 -WORKDIR /opt2 +WORKDIR /opt2 -# Set time zone to US east coast +# Set time zone to US east coast ENV TZ=America/New_York RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime \ && echo $TZ > /etc/timezone -# This section installs system packages -# required for your project. If you need +# This section installs system packages +# required for your project. If you need # extra system packages add them here. 
# Installs python/3.8.10 RUN apt-get update \ @@ -55,6 +55,6 @@ RUN pip3 install --upgrade pip \ # Add Dockerfile and export env variables ADD Dockerfile /opt2/Dockerfile -RUN chmod -R a+rX /opt2 +RUN chmod -R a+rX /opt2 ENV PATH="/opt2:$PATH" -WORKDIR /data2 \ No newline at end of file +WORKDIR /data2 diff --git a/docs/RNA-seq/Resources.md b/docs/RNA-seq/Resources.md index d579542..a00d4b9 100644 --- a/docs/RNA-seq/Resources.md +++ b/docs/RNA-seq/Resources.md @@ -1,24 +1,24 @@ -## 1. Reference genomes +## 1. Reference genomes -On [Biowulf](https://hpc.nih.gov/), RENEE comes bundled with the following pre-built [GENCODE](https://www.gencodegenes.org/)1 reference genomes: +On [Biowulf](https://hpc.nih.gov/), RENEE comes bundled with the following pre-built [GENCODE](https://www.gencodegenes.org/)1 reference genomes: - -| **Genome** | **Species** | **Annotation Version** | **Notes** | -| ------------ |-------------- | --------------------------| --------------------------------------------------------| -| hg38_30 | Homo sapiens (human) | [Gencode Release v30](https://www.gencodegenes.org/human/release_30.html) | [GRCh38](https://www.gencodegenes.org/human/release_30.html), Annotation Release date: 11/2018 | -| hg38_34 | Homo sapiens (human) | [Gencode Release v34](https://www.gencodegenes.org/human/release_34.html) | [GRCh38](https://www.gencodegenes.org/human/release_34.html), Annotation Release date: 04/2020 | -| hg38_38 | Homo sapiens (human) | [Gencode Release v38](https://www.gencodegenes.org/human/release_38.html) | [GRCh38](https://www.gencodegenes.org/human/release_38.html), Annotation Release date: 05/2021 | -| hg38_41 | Homo sapiens (human) | [Gencode Release v41](https://www.gencodegenes.org/human/release_41.html) | [GRCh38](https://www.gencodegenes.org/human/release_41.html), Annotation Release date: 07/2022 | -| mm10_M21 | Mus musculus (mouse) | [Gencode Release M21](https://www.gencodegenes.org/mouse/release_M21.html) | [GRCm38](https://www.gencodegenes.org/mouse/release_M21.html), Annotation Release date: 04/2019 | -| mm10_M23 | Mus musculus (mouse) | [Gencode Release M23](https://www.gencodegenes.org/mouse/release_M23.html) | [GRCm38](https://www.gencodegenes.org/mouse/release_M23.html), Annotation Release date: 09/2019 | -| mm10_M25 | Mus musculus (mouse) | [Gencode Release M25](https://www.gencodegenes.org/mouse/release_M25.html) | [GRCm38](https://www.gencodegenes.org/mouse/release_M25.html), Annotation Release date: 04/2020 | - -However, building new reference genomes is easy! 
+Bundled Biowulf Reference Genomes +**Human** `hg38_30` +**Mouse** `mm10_M21` +---> + +| **Genome** | **Species** | **Annotation Version** | **Notes** | +| ---------- | -------------------- | -------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | +| hg38_30 | Homo sapiens (human) | [Gencode Release v30](https://www.gencodegenes.org/human/release_30.html) | [GRCh38](https://www.gencodegenes.org/human/release_30.html), Annotation Release date: 11/2018 | +| hg38_34 | Homo sapiens (human) | [Gencode Release v34](https://www.gencodegenes.org/human/release_34.html) | [GRCh38](https://www.gencodegenes.org/human/release_34.html), Annotation Release date: 04/2020 | +| hg38_38 | Homo sapiens (human) | [Gencode Release v38](https://www.gencodegenes.org/human/release_38.html) | [GRCh38](https://www.gencodegenes.org/human/release_38.html), Annotation Release date: 05/2021 | +| hg38_41 | Homo sapiens (human) | [Gencode Release v41](https://www.gencodegenes.org/human/release_41.html) | [GRCh38](https://www.gencodegenes.org/human/release_41.html), Annotation Release date: 07/2022 | +| mm10_M21 | Mus musculus (mouse) | [Gencode Release M21](https://www.gencodegenes.org/mouse/release_M21.html) | [GRCm38](https://www.gencodegenes.org/mouse/release_M21.html), Annotation Release date: 04/2019 | +| mm10_M23 | Mus musculus (mouse) | [Gencode Release M23](https://www.gencodegenes.org/mouse/release_M23.html) | [GRCm38](https://www.gencodegenes.org/mouse/release_M23.html), Annotation Release date: 09/2019 | +| mm10_M25 | Mus musculus (mouse) | [Gencode Release M25](https://www.gencodegenes.org/mouse/release_M25.html) | [GRCm38](https://www.gencodegenes.org/mouse/release_M25.html), Annotation Release date: 04/2020 | + +However, building new reference genomes is easy! If you do not have access to Biowulf or you are looking for a reference genome and/or annotation **_that is currently not available_**, it can be built with RENEE's build sub-command. Given a genomic FASTA file (ref.fa) and a GTF file (genes.gtf), `renee build` will create all of the required reference files to run the RENEE pipeline. Once the build pipeline completes, you can supply the newly generated reference.json to the `--genome` of `renee run`. For more information, please see the help page for the run and build sub commands. 
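
As a concrete picture of that hand-off (a minimal sketch only: the FASTA/GTF file names, output paths, and FastQ glob below are placeholders, and the full worked example lives in the TLDR guide), the JSON written by `renee build` is what `renee run` takes via `--genome`:

```bash
# Sketch: build the reference once, then point --genome at the JSON it produces.
renee build --ref-fa GRCh38.primary_assembly.genome.fa \
            --ref-name hg38 \
            --ref-gtf gencode.v36.primary_assembly.annotation.gtf \
            --gtf-ver 36 \
            --output /data/$USER/hg38_36

# The build output directory contains hg38_36.json, which renee run consumes.
renee run --input rawdata/*.R?.fastq.gz \
          --output /data/$USER/my_project \
          --genome /data/$USER/hg38_36/hg38_36.json
```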
@@ -26,31 +26,31 @@ If you do not have access to Biowulf or you are looking for a reference genome a > _Raw data > Adapter Trimming > Alignment > Quantification (genes and isoforms, gene-fusions)_ -| **Tool** | **Version** | **Docker** | **Notes** | -|--------------------------|:-----------:|:------------|-------------------------------------------------------------------------------------------------| -| FastQC2 | 0.11.9 | [nciccbr/ccbr_fastqc_0.11.9](https://hub.docker.com/repository/docker/nciccbr/ccbr_fastqc_0.11.9) | **Quality-control step** to assess sequencing quality, run before and after adapter trimming | -| Cutadapt3 | 1.18 | [nciccbr/ccbr_cutadapt_1.18](https://hub.docker.com/repository/docker/nciccbr/ccbr_cutadapt_1.18) | **Data processing step** to remove adapter sequences and perform quality trimming | -| Kraken4 | 2.1.1 | [nciccbr/ccbr_kraken_v2.1.1](https://hub.docker.com/repository/docker/nciccbr/ccbr_kraken_v2.1.1) | **Quality-control step** to assess microbial taxonomic composition | -| KronaTools5 | 2.7.1 | [nciccbr/ccbr_kraken_v2.1.1](https://hub.docker.com/repository/docker/nciccbr/ccbr_kraken_v2.1.1) | **Quality-control step** to visualize kraken output | -| FastQ Screen6 | 0.13.0 | [nciccbr/ccbr_fastq_screen_0.13.0](https://hub.docker.com/repository/docker/nciccbr/ccbr_fastq_screen_0.13.0) | **Quality-control step** to assess contamination; additional dependencies: `bowtie2/2.3.4`, `perl/5.24.3` | -| STAR7 | 2.7.6a | [nciccbr/ccbr_arriba_2.0.0](https://hub.docker.com/repository/docker/nciccbr/ccbr_arriba_2.0.0) | **Data processing step** to align reads against reference genome (using its two-pass mode) | -| bbtools8 | 38.87 | [nciccbr/ccbr_bbtools_38.87](https://hub.docker.com/repository/docker/nciccbr/ccbr_bbtools_38.87) | **Quality-control step** to calculate insert_size of assembled reads pairs with `bbmerge` | -| QualiMap9 | 2.2.1 | [nciccbr/ccbr_qualimap](https://hub.docker.com/repository/docker/nciccbr/ccbr_qualimap) | **Quality-control step** to assess various alignment metrics | -| Picard10 | 2.18.20 | [nciccbr/ccbr_picard](https://hub.docker.com/repository/docker/nciccbr/ccbr_picard) | **Quality-control step** to run `MarkDuplicates`, `CollectRnaSeqMetrics` and `AddOrReplaceReadGroups` | -| Preseq11 | 2.0.3 | [nciccbr/ccbr_preseq](https://hub.docker.com/repository/docker/nciccbr/ccbr_preseq) | **Quality-control step** to estimate library complexity | -| SAMtools12 | 1.7 | [nciccbr/ccbr_arriba_2.0.0](https://hub.docker.com/repository/docker/nciccbr/ccbr_arriba_2.0.0) | **Quality-control step** to run `flagstat` to calculate alignment statistics | -| bam2strandedbw | [custom](https://github.com/CCBR/Pipeliner/blob/master/Results-template/Scripts/bam2strandedbw.pe.sh) | [nciccbr/ccbr_bam2strandedbw](https://hub.docker.com/repository/docker/nciccbr/ccbr_bam2strandedbw) | **Summarization step** to convert STAR aligned PE bam file into forward and reverse strand bigwigs suitable for a genomic track viewer like IGV | -| RSeQC13 | 4.0.0 | [nciccbr/ccbr_rseqc_4.0.0](https://hub.docker.com/repository/docker/nciccbr/ccbr_rseqc_4.0.0) | **Quality-control step** to infer stranded-ness and read distributions over specific genomic features | -| RSEM14 | 1.3.3 | [nciccbr/ccbr_rsem_1.3.3](https://hub.docker.com/repository/docker/nciccbr/ccbr_rsem_1.3.3) | **Data processing step** to quantify gene and isoform counts | -| Arriba15 | 2.0.0 | [nciccbr/ccbr_arriba_2.0.0](https://hub.docker.com/repository/docker/nciccbr/ccbr_arriba_2.0.0) | **Data processing step** to quantify 
gene-fusions | -| RNA Report | [custom](https://github.com/CCBR/rNA) | [nciccbr/ccbr_rna](https://hub.docker.com/repository/docker/nciccbr/ccbr_rna) | **Summarization step** to identify outliers and assess techincal sources of variation | -| MultiQC16 | 1.12 | [skchronicles/multiqc](https://hub.docker.com/repository/docker/skchronicles/multiqc/) | **Reporting step** to aggregate sample statistics and quality-control information across all sample | +| **Tool** | **Version** | **Docker** | **Notes** | +| ------------------------ | :---------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------- | +| FastQC2 | 0.11.9 | [nciccbr/ccbr_fastqc_0.11.9](https://hub.docker.com/repository/docker/nciccbr/ccbr_fastqc_0.11.9) | **Quality-control step** to assess sequencing quality, run before and after adapter trimming | +| Cutadapt3 | 1.18 | [nciccbr/ccbr_cutadapt_1.18](https://hub.docker.com/repository/docker/nciccbr/ccbr_cutadapt_1.18) | **Data processing step** to remove adapter sequences and perform quality trimming | +| Kraken4 | 2.1.1 | [nciccbr/ccbr_kraken_v2.1.1](https://hub.docker.com/repository/docker/nciccbr/ccbr_kraken_v2.1.1) | **Quality-control step** to assess microbial taxonomic composition | +| KronaTools5 | 2.7.1 | [nciccbr/ccbr_kraken_v2.1.1](https://hub.docker.com/repository/docker/nciccbr/ccbr_kraken_v2.1.1) | **Quality-control step** to visualize kraken output | +| FastQ Screen6 | 0.13.0 | [nciccbr/ccbr_fastq_screen_0.13.0](https://hub.docker.com/repository/docker/nciccbr/ccbr_fastq_screen_0.13.0) | **Quality-control step** to assess contamination; additional dependencies: `bowtie2/2.3.4`, `perl/5.24.3` | +| STAR7 | 2.7.6a | [nciccbr/ccbr_arriba_2.0.0](https://hub.docker.com/repository/docker/nciccbr/ccbr_arriba_2.0.0) | **Data processing step** to align reads against reference genome (using its two-pass mode) | +| bbtools8 | 38.87 | [nciccbr/ccbr_bbtools_38.87](https://hub.docker.com/repository/docker/nciccbr/ccbr_bbtools_38.87) | **Quality-control step** to calculate insert_size of assembled reads pairs with `bbmerge` | +| QualiMap9 | 2.2.1 | [nciccbr/ccbr_qualimap](https://hub.docker.com/repository/docker/nciccbr/ccbr_qualimap) | **Quality-control step** to assess various alignment metrics | +| Picard10 | 2.18.20 | [nciccbr/ccbr_picard](https://hub.docker.com/repository/docker/nciccbr/ccbr_picard) | **Quality-control step** to run `MarkDuplicates`, `CollectRnaSeqMetrics` and `AddOrReplaceReadGroups` | +| Preseq11 | 2.0.3 | [nciccbr/ccbr_preseq](https://hub.docker.com/repository/docker/nciccbr/ccbr_preseq) | **Quality-control step** to estimate library complexity | +| SAMtools12 | 1.7 | [nciccbr/ccbr_arriba_2.0.0](https://hub.docker.com/repository/docker/nciccbr/ccbr_arriba_2.0.0) | **Quality-control step** to run `flagstat` to calculate alignment statistics | +| bam2strandedbw | [custom](https://github.com/CCBR/Pipeliner/blob/master/Results-template/Scripts/bam2strandedbw.pe.sh) | [nciccbr/ccbr_bam2strandedbw](https://hub.docker.com/repository/docker/nciccbr/ccbr_bam2strandedbw) | **Summarization step** to convert STAR aligned PE bam file into forward and reverse strand bigwigs suitable for a genomic track viewer like IGV | +| RSeQC13 | 4.0.0 | 
[nciccbr/ccbr_rseqc_4.0.0](https://hub.docker.com/repository/docker/nciccbr/ccbr_rseqc_4.0.0) | **Quality-control step** to infer stranded-ness and read distributions over specific genomic features | +| RSEM14 | 1.3.3 | [nciccbr/ccbr_rsem_1.3.3](https://hub.docker.com/repository/docker/nciccbr/ccbr_rsem_1.3.3) | **Data processing step** to quantify gene and isoform counts | +| Arriba15 | 2.0.0 | [nciccbr/ccbr_arriba_2.0.0](https://hub.docker.com/repository/docker/nciccbr/ccbr_arriba_2.0.0) | **Data processing step** to quantify gene-fusions | +| RNA Report | [custom](https://github.com/CCBR/rNA) | [nciccbr/ccbr_rna](https://hub.docker.com/repository/docker/nciccbr/ccbr_rna) | **Summarization step** to identify outliers and assess techincal sources of variation | +| MultiQC16 | 1.12 | [skchronicles/multiqc](https://hub.docker.com/repository/docker/skchronicles/multiqc/) | **Reporting step** to aggregate sample statistics and quality-control information across all sample | ## 3. Acknowledgements -### 3.1 Biowulf -If you [utilized NIH's Biowulf cluster](https://hpc.nih.gov/Research/) to run RENEE, *please do not forget to provide an acknowlegement*! +### 3.1 Biowulf +If you [utilized NIH's Biowulf cluster](https://hpc.nih.gov/Research/) to run RENEE, _please do not forget to provide an acknowlegement_! > The continued growth and support of NIH's Biowulf cluster is dependent upon its demonstrable value to the NIH Intramural Research Program. If you publish research that involved significant use of Biowulf, please cite the cluster. @@ -61,32 +61,32 @@ This work utilized the computational resources of the NIH HPC Biowulf cluster. ( ``` ## 4. References -**1.** Harrow, J., et al., GENCODE: the reference human genome annotation for The ENCODE Project. Genome Res, 2012. 22(9): p. 1760-74. -**2.** Andrews, S. (2010). FastQC: a quality control tool for high throughput sequence data. -**3.** Martin, M. (2011). "Cutadapt removes adapter sequences from high-throughput sequencing reads." EMBnet 17(1): 10-12. -**4.** Wood, D. E. and S. L. Salzberg (2014). "Kraken: ultrafast metagenomic sequence classification using exact alignments." Genome Biol 15(3): R46. -**5.** Ondov, B. D., et al. (2011). "Interactive metagenomic visualization in a Web browser." BMC Bioinformatics 12(1): 385. -**6.** Wingett, S. and S. Andrews (2018). "FastQ Screen: A tool for multi-genome mapping and quality control." F1000Research 7(2): 1338. -**7.** Dobin, A., et al., STAR: ultrafast universal RNA-seq aligner. Bioinformatics, 2013. 29(1): p. 15-21. -**8.** Bushnell, B., Rood, J., & Singer, E. (2017). BBMerge - Accurate paired shotgun read merging via overlap. PloS one, 12(10), e0185056. -**9.** Okonechnikov, K., et al. (2015). "Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data." Bioinformatics 32(2): 292-294. + +**1.** Harrow, J., et al., GENCODE: the reference human genome annotation for The ENCODE Project. Genome Res, 2012. 22(9): p. 1760-74. +**2.** Andrews, S. (2010). FastQC: a quality control tool for high throughput sequence data. +**3.** Martin, M. (2011). "Cutadapt removes adapter sequences from high-throughput sequencing reads." EMBnet 17(1): 10-12. +**4.** Wood, D. E. and S. L. Salzberg (2014). "Kraken: ultrafast metagenomic sequence classification using exact alignments." Genome Biol 15(3): R46. +**5.** Ondov, B. D., et al. (2011). "Interactive metagenomic visualization in a Web browser." BMC Bioinformatics 12(1): 385. +**6.** Wingett, S. and S. Andrews (2018). 
"FastQ Screen: A tool for multi-genome mapping and quality control." F1000Research 7(2): 1338. +**7.** Dobin, A., et al., STAR: ultrafast universal RNA-seq aligner. Bioinformatics, 2013. 29(1): p. 15-21. +**8.** Bushnell, B., Rood, J., & Singer, E. (2017). BBMerge - Accurate paired shotgun read merging via overlap. PloS one, 12(10), e0185056. +**9.** Okonechnikov, K., et al. (2015). "Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data." Bioinformatics 32(2): 292-294. **10.** The Picard toolkit. https://broadinstitute.github.io/picard/. **11.** Daley, T. and A.D. Smith, Predicting the molecular complexity of sequencing libraries. Nat Methods, 2013. 10(4): p. 325-7. -**12.** Li, H., et al. (2009). "The Sequence Alignment/Map format and SAMtools." Bioinformatics 25(16): 2078-2079. +**12.** Li, H., et al. (2009). "The Sequence Alignment/Map format and SAMtools." Bioinformatics 25(16): 2078-2079. **13.** Wang, L., et al. (2012). "RSeQC: quality control of RNA-seq experiments." Bioinformatics 28(16): 2184-2185. **14.** Li, B. and C.N. Dewey, RSEM: accurate transcript quantification from RNA-Seq data with or without a reference genome. BMC Bioinformatics, 2011. 12: p. 323. **15.** Uhrig, S., et al. (2021). "Accurate and efficient detection of gene fusions from RNA sequencing data". Genome Res. 31(3): 448-460. -**16.** Ewels, P., et al. (2016). "MultiQC: summarize analysis results for multiple tools and samples in a single report." Bioinformatics 32(19): 3047-3048. - +**16.** Ewels, P., et al. (2016). "MultiQC: summarize analysis results for multiple tools and samples in a single report." Bioinformatics 32(19): 3047-3048. diff --git a/docs/RNA-seq/TLDR-RNA-seq.md b/docs/RNA-seq/TLDR-RNA-seq.md index 53f3943..c2baaad 100644 --- a/docs/RNA-seq/TLDR-RNA-seq.md +++ b/docs/RNA-seq/TLDR-RNA-seq.md @@ -5,36 +5,39 @@ When processing RNA-sequencing data, there are often many steps that we must rep With RENEE, you can run your samples through our highly-reproducible pipeline, build resources for new reference genomes, and more! Here is a list of available renee `sub commands`: - - - [**`run`**](../run): run the rna-seq pipeline - - [**`build`**](../build): build reference files - - [**`cache`**](../cache): cache remote resources locally - - [**`unlock`**](../unlock): unlock a working directory -> This page contains information for building reference files and running the RENEE pipeline. For more information about each of the available sub commands, please see the [usage section](./run.md). +- [**`run`**](../run): run the rna-seq pipeline +- [**`build`**](../build): build reference files +- [**`cache`**](../cache): cache remote resources locally +- [**`unlock`**](../unlock): unlock a working directory + +> This page contains information for building reference files and running the RENEE pipeline. For more information about each of the available sub commands, please see the [usage section](./run.md). ## 2. Setup RENEE + _Estimated Reading Time: 3 Mintutes_ RENEE has two dependencies: `singularity` and `snakemake`. These dependencies can be installed by a sysadmin; however, snakemake is readily available through conda. Before running the pipeline or any of the commands below, please ensure singularity and snakemake are in your `$PATH`. Please see follow the instructions below for getting started with the RENEE pipeline. ### 2.1 Login to cluster + ```bash # Setup Step 0.) 
ssh into cluster's head node # example below for Biowulf cluster ssh -Y $USER@biowulf.nih.gov ``` - ### 2.2 Grab an interactive node -```bash + +```bash # Setup Step 1.) Please do not run RENEE on the head node! # Grab an interactive node first srun -N 1 -n 1 --time=12:00:00 -p interactive --mem=8gb --cpus-per-task=4 --pty bash ``` ### 2.3 Load dependecies -```bash + +```bash # Setup Step 2.) Add singularity and snakemake executables to $PATH module purge module load ccbrpipeliner @@ -42,7 +45,7 @@ module load ccbrpipeliner ## 3. Building Reference files -In this example, we will start off by building reference files downloaded from [GENCODE](https://www.gencodegenes.org/). We recommend downloading the `PRI` Genome FASTA file and annotation from [GENCODE](https://www.gencodegenes.org/). These `PRI` reference files contain the primary chromosomes and scaffolds. We **do not** recommend downloading the `CHR` reference files! +In this example, we will start off by building reference files downloaded from [GENCODE](https://www.gencodegenes.org/). We recommend downloading the `PRI` Genome FASTA file and annotation from [GENCODE](https://www.gencodegenes.org/). These `PRI` reference files contain the primary chromosomes and scaffolds. We **do not** recommend downloading the `CHR` reference files! Checkout [this](./Resources.md) list for currently avaiable resources on Biowulf. If your required **genome + annotation combination** is NOT available, only then proceed to building your own reference files. Also, if you think that your **genome + annotation combination** may be beneficial for other Biowulf users of RENEE as well, then please request it to be added to RENEE's default resources by [opening an issue on Github](https://github.com/CCBR/RENEE/issues). @@ -63,7 +66,8 @@ wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_36/gencode. gzip -d gencode.v36.primary_assembly.annotation.gtf.gz ``` -### 3.2 Run Build pipeline +### 3.2 Run Build pipeline + ```bash # Build Step 3.) Load dependencies module purge @@ -80,17 +84,16 @@ renee build --ref-fa GRCh38.primary_assembly.genome.fa \ renee build --ref-fa GRCh38.primary_assembly.genome.fa \ --ref-name hg38 \ --ref-gtf gencode.v36.primary_assembly.annotation.gtf \ - --gtf-ver 36 --output /data/$USER/hg38_36 + --gtf-ver 36 --output /data/$USER/hg38_36 ``` An email notification will be sent out when the pipeline starts and ends. Once the build pipeline completes, you can run RENEE with the provided test dataset. Please see the intructions below for more information. -## 4. Running RENEE - -Run RENEE with the reference files we built above using hg38 (GRCh38.p13) Genome FASTA file and GENCODE release 36 annotation (GTF). For more information about how the reference files we generated, please see the intructions above. You can use those instructions as a guide for building any new reference genomes in the future. +## 4. Running RENEE +Run RENEE with the reference files we built above using hg38 (GRCh38.p13) Genome FASTA file and GENCODE release 36 annotation (GTF). For more information about how the reference files we generated, please see the intructions above. You can use those instructions as a guide for building any new reference genomes in the future. -### 4.1 Dry-run pipeline +### 4.1 Dry-run pipeline Dry-run the pipeline prior to submiting the pipeline's master job. 
Please note that if you wish to run RENEE with a new dataset, you will only need to update the values provided to the `--input` and `--output` arguments (and maybe `--genome`). The `--input` argument supports globbing. If this is the first time running RENEE with for given dataset, the `--output` directory should _**not**_ exist on your local filesystem. It will be created automatically during runtime. @@ -106,7 +109,7 @@ module load ccbrpipeliner # Run Step 2.) Dry-run the pipeline with test dataset # And reference genome generated in the steps above -# Test data consists of sub sampled FastQ files +# Test data consists of sub sampled FastQ files renee run \ --input ${RENEE_HOME}/.tests/*.R?.fastq.gz \ --output /data/${USER}/runner_hg38_36/ \ @@ -116,13 +119,13 @@ renee run \ --dry-run ``` -### 4.2 Run pipeline +### 4.2 Run pipeline -Kick off the pipeline by submiting the master job to the cluster. It is essentially the same command above without the `--dry-run` flag. +Kick off the pipeline by submiting the master job to the cluster. It is essentially the same command above without the `--dry-run` flag. ```bash # Run Step 3.) Submit the master job -# Runs the RENEE pipeline with the +# Runs the RENEE pipeline with the # reference genome generated in the steps above # and with the test dataset renee run \ @@ -134,4 +137,4 @@ renee run \ --dry-run ``` -An email notification will be sent out when the pipeline starts and ends. \ No newline at end of file +An email notification will be sent out when the pipeline starts and ends. diff --git a/docs/RNA-seq/Theory.md b/docs/RNA-seq/Theory.md index cda2d2f..4b72272 100644 --- a/docs/RNA-seq/Theory.md +++ b/docs/RNA-seq/Theory.md @@ -1,85 +1,99 @@ ## 1. Introduction -RNA-sequencing (*RNA-seq*) has a wide variety of applications; this transcriptome profiling method can be used to quantify gene and isoform expression, find changes in alternative splicing, detect gene-fusion events, call variants and much more. -It is also worth noting that RNA-seq can be coupled with other biochemical assays to analyze many other aspects of RNA biology, such as RNA–protein binding (CLIP-seq, RIP-seq), RNA structure (SHAPE-seq), or RNA–RNA interactions (CLASH-seq). These applications are, however, beyond the scope of this documentation as we focus on *typical* RNA-seq project (i.e. quantifying expression and gene fusions). Our focus is to outline current standards and resources for the bioinformatics analysis of RNA-seq data. We do not aim to provide an exhaustive compilation of resources or software tools. Rather, we aim to provide a guideline and conceptual overview for RNA-seq data analysis based on our best-practices RNA-seq pipeline. +RNA-sequencing (_RNA-seq_) has a wide variety of applications; this transcriptome profiling method can be used to quantify gene and isoform expression, find changes in alternative splicing, detect gene-fusion events, call variants and much more. -Here we review all of the *typical* major steps in RNA-seq data analysis, starting from experimental design, quality control, read alignment, quantification of gene and transcript levels, and visualization. +It is also worth noting that RNA-seq can be coupled with other biochemical assays to analyze many other aspects of RNA biology, such as RNA–protein binding (CLIP-seq, RIP-seq), RNA structure (SHAPE-seq), or RNA–RNA interactions (CLASH-seq). These applications are, however, beyond the scope of this documentation as we focus on _typical_ RNA-seq project (i.e. 
quantifying expression and gene fusions). Our focus is to outline current standards and resources for the bioinformatics analysis of RNA-seq data. We do not aim to provide an exhaustive compilation of resources or software tools. Rather, we aim to provide a guideline and conceptual overview for RNA-seq data analysis based on our best-practices RNA-seq pipeline. + +Here we review all of the _typical_ major steps in RNA-seq data analysis, starting from experimental design, quality control, read alignment, quantification of gene and transcript levels, and visualization. ## 2. Experimental Design + Just like any other scientific experiment, a good RNA-seq experiment is hypothesis-driven. If you cannot describe the problem you are trying to address, throwing NGS at the problem is not a cure-all solution. Fishing for results is a waste of your time and is bad science. As so, designing a well-thought-out experiment around a testable question will maximize the likelihood of generating high-impact results. The data that is generated will determine whether you have the potential to answer your biological question of interest. As a prerequisite, you need to think about how you will construct your libraries; the correct sequencing depth to address your question of interest; the number of replicates, and strategies to reduce/mitigate batch effects. ### 2.1 Library construction -rRNA can comprise up to 80% of the RNA in a cell. An important consideration is the RNA extraction protocol that will be used to remove the highly abundant ribosomal RNA (rRNA). For eukaryotic cells, there are two major considerations: choosing whether to enrich for mRNA or whether to deplete rRNA. + +rRNA can comprise up to 80% of the RNA in a cell. An important consideration is the RNA extraction protocol that will be used to remove the highly abundant ribosomal RNA (rRNA). For eukaryotic cells, there are two major considerations: choosing whether to enrich for mRNA or whether to deplete rRNA. #### 2.1.1 mRNA -Poly-(A) selection is a common method used to enrich for mRNA. This method generates the highest percentage of reads which will ultimately map to protein-coding genes-- making it a common choice for most applications. That being said, poly(A)-selection requires your RNA to be of high quality with minimal degradation. Degraded samples that are followed with ploy(A)-selection may result in a 3’ bias, which in effect, may introduce downstream biases into your results. + +Poly-(A) selection is a common method used to enrich for mRNA. This method generates the highest percentage of reads which will ultimately map to protein-coding genes-- making it a common choice for most applications. That being said, poly(A)-selection requires your RNA to be of high quality with minimal degradation. Degraded samples that are followed with ploy(A)-selection may result in a 3’ bias, which in effect, may introduce downstream biases into your results. #### 2.1.2 total RNA -The second method captures total RNA through the depletion of rRNA. This method allows you to examine both mRNA and other non-coding RNA species such as lncRNAs. Again, depending on the question you are trying to answer this may be the right method for you. Although, it should be noted that both methods, mRNA and total RNA, require RINs (>8). But if you samples do contain slightly degraded RNA, you might be able to use the total RNA method over poly(A)-selection. + +The second method captures total RNA through the depletion of rRNA. 
This method allows you to examine both mRNA and other non-coding RNA species such as lncRNAs. Again, depending on the question you are trying to answer this may be the right method for you. Although, it should be noted that both methods, mRNA and total RNA, require RINs (>8). But if you samples do contain slightly degraded RNA, you might be able to use the total RNA method over poly(A)-selection. ### 2.2 Sequencing Depth + Sequencing depth or library size is another important design factor. As sequencing depth is increased, more transcripts will be detected (up until a saturation point), and their relative abundance will be quantified more accurately. -At the end of the day, the targeted sequencing depth depends on the aims of the experiment. Are you trying to quantify differences in gene expression, are you trying to quantify differential isoform usage or alternative splicing events? The numbers quoted below are more or less tailored to quantify differences in gene expression. If you are trying to quantify changes in alternative splicing or isoform regulation, you are going to much higher coverage (~ 100M paired-end reads). +At the end of the day, the targeted sequencing depth depends on the aims of the experiment. Are you trying to quantify differences in gene expression, are you trying to quantify differential isoform usage or alternative splicing events? The numbers quoted below are more or less tailored to quantify differences in gene expression. If you are trying to quantify changes in alternative splicing or isoform regulation, you are going to much higher coverage (~ 100M paired-end reads). #### 2.2.1 mRNA -For mRNA libraries or libraries generated from a prep kit using poly-(A) selection, we recommend a minimum sequencing depth of 10-20M paired-end reads (or 20-40M reads). RNA must be of high quality or a 3' bias may be observed. +For mRNA libraries or libraries generated from a prep kit using poly-(A) selection, we recommend a minimum sequencing depth of 10-20M paired-end reads (or 20-40M reads). RNA must be of high quality or a 3' bias may be observed. #### 2.2.2 total RNA + For total RNA libraries, we recommend a sequencing depth of 25-60M paired-end reads (or 50-120M reads). RNA must be of high quality. > Note: In the sections above and below, when I say to paired-end reads I am referring to read pairs generated from paired-end sequencing of a given cDNA fragment. You will sometimes see reads reported as pairs of reads or total reads. ### 2.3 Replicates -Another important design factor is the number of replicates. That being said, biological replicates are always preferred over technical replicates. + +Another important design factor is the number of replicates. That being said, biological replicates are always preferred over technical replicates. #### 2.3.1 Recommendation -We recommend 4 biological replicates per experimental condition or group. Having more replicates is good for several reasons because in the real world problems arise. If you have a bad sample that cannot be used due to severe QC issues, you are still left with 3 biological replicates. This allows you to drop a bad sample without comprising statistical power downstream. + +We recommend 4 biological replicates per experimental condition or group. Having more replicates is good for several reasons because in the real world problems arise. If you have a bad sample that cannot be used due to severe QC issues, you are still left with 3 biological replicates. 
This allows you to drop a bad sample without comprising statistical power downstream. #### 2.3.2 Bare Minimum -If cost is a factor, at a minimum, 3 biological replicates will ensure good statistical power for downstream analysis. -### 2.4 Reducing Batch Effects +If cost is a factor, at a minimum, 3 biological replicates will ensure good statistical power for downstream analysis. + +### 2.4 Reducing Batch Effects -Batch effects represent unwanted sources of technical variation. Batch effects introduce non-biological variation into your data, which if not accounted for can influence the results. Through the process of library preparation to sequencing, there are a number of steps (such as RNA extraction to adapter ligation to lane loading, etc.) that might introduce biases into the resulting data. +Batch effects represent unwanted sources of technical variation. Batch effects introduce non-biological variation into your data, which if not accounted for can influence the results. Through the process of library preparation to sequencing, there are a number of steps (such as RNA extraction to adapter ligation to lane loading, etc.) that might introduce biases into the resulting data. -As a general rule of thumb, the best way to reduce the introduction of batch effects is through uniform processing-- meaning you need to ensure that differences in sample handling are minimal. This means that samples should be processed by the same lab technician and everything should be done in a uniform manner. That being said, do not extract your RNA at different times, do not use different lots of reagents! If a large number of samples are being processed and everything cannot be done at the same time, process representative samples from each biological group at the same time. This will ensure that batches and your variable of interest do not become confounded. Also, keep note of which samples belong to each batch. This information will be needed for batch correction. +As a general rule of thumb, the best way to reduce the introduction of batch effects is through uniform processing-- meaning you need to ensure that differences in sample handling are minimal. This means that samples should be processed by the same lab technician and everything should be done in a uniform manner. That being said, do not extract your RNA at different times, do not use different lots of reagents! If a large number of samples are being processed and everything cannot be done at the same time, process representative samples from each biological group at the same time. This will ensure that batches and your variable of interest do not become confounded. Also, keep note of which samples belong to each batch. This information will be needed for batch correction. -To reduce the possibility of introducing batch effects from sequencing, all samples should be multiplexed together on the same lane(s). +To reduce the possibility of introducing batch effects from sequencing, all samples should be multiplexed together on the same lane(s). 
-| Sample | Group | **Batch** | Batch* | -|----------|:-----:|:-----:|:------:| -| Treatment_rep_1 | KO | 1 | 1 | -| Treatment_rep_2 | KO | 2 | 1 | -| Treatment_rep_3 | KO | 1 | 1 | -| Treatment_rep_4 | KO | 2 | 1 | -| Control_rep_1 | WT | 1 | 2 | -| Control_rep_2 | WT | 2 | 2 | -| Control_rep_3 | WT | 1 | 2 | -| Control_rep_4 | WT | 2 | 2 | +| Sample | Group | **Batch** | Batch\* | +| --------------- | :---: | :-------: | :------------: | +| Treatment_rep_1 | KO | 1 | 1 | +| Treatment_rep_2 | KO | 2 | 1 | +| Treatment_rep_3 | KO | 1 | 1 | +| Treatment_rep_4 | KO | 2 | 1 | +| Control_rep_1 | WT | 1 | 2 | +| Control_rep_2 | WT | 2 | 2 | +| Control_rep_3 | WT | 1 | 2 | +| Control_rep_4 | WT | 2 | 2 | > **Batch** = properly balanced batches, easily corrected :relaxed: -> Batch* = groups and batch totally confounded, cannot be corrected :worried: +> Batch\* = groups and batch totally confounded, cannot be corrected :worried: That being said, some problems cannot be bioinformatically corrected. If your variable of interest is totally confounded with your batches, applying batch correction to fix the problem is not going to work, and will lead to undesired results (i.e. `Batch*` column). If batches must be introduced due to other constraining factors, please keep note which samples belong to each batch, and please put some thought into how to properly balance samples across your batches. ## 3. Quality Control -Quality-control (**QC**) is extremely important! As the old adage goes: *Garbage in, Garbage out!* If there is one thing that to take away from this document, let it be that. Performing QC checks will help ensure that your results are reliable and reproducible. -It is worth noting that there is a large variety of open-source tools that can be used to assess the quality of your data so there is no reason to re-invent the wheel. Please keep this in mind but also be aware that there are many wheels *per se*, and you will need to know which to use and when. In this next section, we will cover different quality-control checks that can be applied at different stages of your RNA-seq analysis. These recommendations are based on a few tools our best-practices RNA-seq pipeline employs. +Quality-control (**QC**) is extremely important! As the old adage goes: _Garbage in, Garbage out!_ If there is one thing that to take away from this document, let it be that. Performing QC checks will help ensure that your results are reliable and reproducible. + +It is worth noting that there is a large variety of open-source tools that can be used to assess the quality of your data so there is no reason to re-invent the wheel. Please keep this in mind but also be aware that there are many wheels _per se_, and you will need to know which to use and when. In this next section, we will cover different quality-control checks that can be applied at different stages of your RNA-seq analysis. These recommendations are based on a few tools our best-practices RNA-seq pipeline employs. ### 3.1 Pre-aligment -Before drawing biological conclusions, it is important to perform quality control checks to ensure that there are no signs of sequencing error, biases in your data, or other sources of contamination. Modern high-throughput sequencers generate millions of reads per run, and in the real world, problems can arise. + +Before drawing biological conclusions, it is important to perform quality control checks to ensure that there are no signs of sequencing error, biases in your data, or other sources of contamination. 
Modern high-throughput sequencers generate millions of reads per run, and in the real world, problems can arise.

The general idea is to assess the quality of your reads before and after adapter removal and to check for different sources of contamination before proceeding to alignment. Here are a few of the tools that we use and recommend.

#### 3.1.1 Sequencing Quality
+
To assess the sequencing quality of your data, we recommend running FastQC before and after adapter trimming. FastQC generates a set of basic statistics to identify problems that can arise during sequencing or library preparation. FastQC will summarize per base and per read QC metrics such as quality scores and GC content (ideally, this plot should have a normal distribution with no forms of bimodality). It will also summarize the distribution of sequence lengths and will report the presence of adapter sequences, which is one reason we run it after removing adapters.

#### 3.1.2 Contamination Screening
+
During the process of sample collection to library preparation, there is a risk of introducing unwanted sources of DNA. FastQ Screen compares your sequencing data to a set of different reference genomes to determine if there is contamination. It allows a user to see if the composition of your library matches what you expect. If your data has high levels of human, mouse, fungi, or bacterial contamination, FastQ Screen will tell you. FastQ Screen will tell you what percentage of your library aligns against different reference genomes.

If there are high levels of microbial contamination, Kraken will provide an estimation of the taxonomic composition. Kraken can be used in conjunction with Krona to produce interactive reports.

@@ -87,51 +101,59 @@ If there are high levels of microbial contamination, Kraken will provide an esti
> Note: Due to high levels of homology between organisms, there may be a small portion of your reads that align to an unexpected reference genome. Again, this should be a minimal percentage of your reads.

### 3.2 Post-alignment
+
Again, there are many tools available to assess the quality of your data post-alignment, and as stated before, there is no need to re-invent the wheel. Please see the table below for a generalized set of guidelines for different pre/post QC metrics.

-#### 3.2.1 Library Complexity
+#### 3.2.1 Library Complexity
+
Preseq can be used to estimate the complexity of a library for each of your samples. If the duplication rate is very high, the overall library complexity will be low. Low library complexity could signal an issue with library preparation or sample preparation (FFPE samples) where very little input RNA was over-amplified or the sample may be degraded.

-#### 3.2.2 Library Composition
+#### 3.2.2 Library Composition
+
Picard has a particularly useful sub-command called CollectRNAseqMetrics which reports the number and percentage of reads that align to various regions, such as coding, intronic, UTR, intergenic and ribosomal regions. This is particularly useful as you would expect a library constructed with poly(A)-selection to have a high percentage of reads that map to coding regions. Picard CollectRNAseqMetrics will also report the uniformity of coverage across all genes, which is useful for determining whether a sample has a 3' bias (observed in libraries containing degraded RNA).

#### 3.2.3 RNA Quality
-This is another particularity useful package that is tailored for RNA-seq data.
The package is made up of over 20 sub-module that can be used to do things like calculate the average insert size between paired-end reads (which is useful for GEO upload), annotate the percentage of reads spanning known or novel splice junctions, convert a BAM file into a normalized BigWig file, and infer RNA quality. - -### 3.3 Guidelines -Here is a set of generalized guidelines for different QC metrics. Some of these metrics will vary genome-to-genome depending on the quality of the assembly and annotation but that has been taken into consideration for our set of supported reference genomes. - -| QC Metric Guidelines | **mRNA** | **total RNA** | -|-----------------------------------|:--------------------------:|:------------------------:| -| *RNA Type(s)* | Coding | Coding + non-coding | -| *RIN* | >= 8 [low RIN ~ 3' bias] | >= 8 | -| *Single-end vs Paired-end* | Paired-end | Paired-end | -| *Sequencing Depth* | 10-20M PE reads | 25-60M PE reads | -| *FastQC* | Q30 > 70% | Q30 > 70% | -| *Percent Aligned to Reference* | > 70% | > 65% | -| *Million Reads Aligned Reference* | > 7M PE reads | > 16.5M PE reads | -| *Percent Aligned to rRNA* | < 5% | < 15% | -| *Picard RNAseqMetrics* | Coding > 50% | Coding > 35% | -| *Picard RNAseqMetrics* | Intronic + Intergenic < 25% | Intronic + Intergenic < 40% | -| *RSeQC TIN* | medTIN > 65 | medTIN > 60 | - -The [median TIN value reported by RSeQC](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-016-0922-z) works reasonably well for quickly identifying problematic samples. + +This is another particularity useful package that is tailored for RNA-seq data. The package is made up of over 20 sub-module that can be used to do things like calculate the average insert size between paired-end reads (which is useful for GEO upload), annotate the percentage of reads spanning known or novel splice junctions, convert a BAM file into a normalized BigWig file, and infer RNA quality. + +### 3.3 Guidelines + +Here is a set of generalized guidelines for different QC metrics. Some of these metrics will vary genome-to-genome depending on the quality of the assembly and annotation but that has been taken into consideration for our set of supported reference genomes. + +| QC Metric Guidelines | **mRNA** | **total RNA** | +| --------------------------------- | :-------------------------: | :-------------------------: | +| _RNA Type(s)_ | Coding | Coding + non-coding | +| _RIN_ | >= 8 [low RIN ~ 3' bias] | >= 8 | +| _Single-end vs Paired-end_ | Paired-end | Paired-end | +| _Sequencing Depth_ | 10-20M PE reads | 25-60M PE reads | +| _FastQC_ | Q30 > 70% | Q30 > 70% | +| _Percent Aligned to Reference_ | > 70% | > 65% | +| _Million Reads Aligned Reference_ | > 7M PE reads | > 16.5M PE reads | +| _Percent Aligned to rRNA_ | < 5% | < 15% | +| _Picard RNAseqMetrics_ | Coding > 50% | Coding > 35% | +| _Picard RNAseqMetrics_ | Intronic + Intergenic < 25% | Intronic + Intergenic < 40% | +| _RSeQC TIN_ | medTIN > 65 | medTIN > 60 | + +The [median TIN value reported by RSeQC](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-016-0922-z) works reasonably well for quickly identifying problematic samples. ## 4. Data Processing -Starting from raw data (FastQ files), how do we get a raw counts matrix, or how do we get a list of differential expressed genes? Before feeding your data into an R package for differential expression analysis, it needs to be processed to add biological context to it. 
In this section, we will talk about the data processing pipeline in more detail-- more specifically focusing on primary and secondary analysis.
+Starting from raw data (FastQ files), how do we get a raw counts matrix, or how do we get a list of differentially expressed genes? Before feeding your data into an R package for differential expression analysis, it needs to be processed to add biological context to it. In this section, we will talk about the data processing pipeline in more detail-- more specifically focusing on primary and secondary analysis.

### 4.1 Primary Analysis

->Raw data > Adapter Trimming > Alignment > Quantification
+> Raw data > Adapter Trimming > Alignment > Quantification
+
+#### 4.1.1 Adapter Trimming

-#### 4.1.1 Adapter Trimming
One of the first steps in this process is to remove any unwanted adapter sequences from your reads before alignment. Adapters are composed of synthetic sequences and should be removed prior to alignment. Adapter removal is especially important in certain protocols, such as miRNA-seq. When smaller fragments are sequenced, it is almost certain there will be some form of adapter contamination.

-#### 4.1.2 Alignment
-In the alignment step, we add biological context to the raw data. In this step, we align reads to the reference genome to find where the sequenced fragments originate.
+#### 4.1.2 Alignment
+
+In the alignment step, we add biological context to the raw data. In this step, we align reads to the reference genome to find where the sequenced fragments originate.

-Accurate alignment of the cDNA fragments (which are derived from RNA) is difficult. Alternative splicing introduces the problem of aligning to non-contiguous regions, and using traditional genomic alignment algorithms can produce inaccurate or low-quality alignments due to the combination of alternative splicing and genomic variation (substitutions, insertions, and deletions). This has lead to the development of *splice-aware* aligners like STAR, which are designed to overcome these issues. STAR can also be run in a two-pass mode for enhanced detection of reads mapping to novel splice junctions.
+Accurate alignment of the cDNA fragments (which are derived from RNA) is difficult. Alternative splicing introduces the problem of aligning to non-contiguous regions, and using traditional genomic alignment algorithms can produce inaccurate or low-quality alignments due to the combination of alternative splicing and genomic variation (substitutions, insertions, and deletions). This has led to the development of _splice-aware_ aligners like STAR, which are designed to overcome these issues. STAR can also be run in a two-pass mode for enhanced detection of reads mapping to novel splice junctions.

#### 4.1.3 Quantification
-In the quantification step, the number of reads that mapped to a particular genomic feature (such as a gene or isoform) is counted. It is important to keep in mind that raw counts are biased by a number of factors such as library size, feature-length, and other compositional biases. As so, it is important to normalize your data to remove these biases before summarizing differences between groups of samples.
+
+In the quantification step, the number of reads that mapped to a particular genomic feature (such as a gene or isoform) is counted. It is important to keep in mind that raw counts are biased by a number of factors such as library size, feature-length, and other compositional biases. 
As so, it is important to normalize your data to remove these biases before summarizing differences between groups of samples. diff --git a/docs/RNA-seq/build.md b/docs/RNA-seq/build.md index 64e21ac..27eb411 100644 --- a/docs/RNA-seq/build.md +++ b/docs/RNA-seq/build.md @@ -1,13 +1,15 @@ # renee build -## 1. About +## 1. About + The `renee` executable is composed of several inter-related sub commands. Please see `renee -h` for all available options. -This part of the documentation describes options and concepts for renee build sub command in more detail. With minimal configuration, the **`build`** sub command enables you to build new reference files for the renee run pipeline. +This part of the documentation describes options and concepts for renee build sub command in more detail. With minimal configuration, the **`build`** sub command enables you to build new reference files for the renee run pipeline. -Setting up the RENEE build pipeline is fast and easy! In its most basic form, renee build only has *five required inputs*. +Setting up the RENEE build pipeline is fast and easy! In its most basic form, renee build only has _five required inputs_. ## 2. Synopsis + ```text $ renee build [--help] \ [--shared-resources SHARED_RESOURCES] [--small-genome] \ @@ -24,152 +26,169 @@ The synopsis for each command shows its parameters and their usage. Optional par A user **must** provide the genomic sequence of the reference's assembly in FASTA format via `--ref-fa` argument, an alias for the reference genome via `--ref-name` argument, a gene annotation for the reference assembly via `--ref-gtf` argument, an alias or version for the gene annotation via the ` --gtf-ver` argument, and an output directory to store the built reference files via `--output` argument. If you are running the pipeline outside of Biowulf, you will need to additionally provide the the following options: `--shared-resources`, `--tmp-dir`. More information about each of these options can be found below. -For [human](https://www.gencodegenes.org/human/) and [mouse](https://www.gencodegenes.org/mouse/) data, we highly recommend downloading the latest available **PRI** genome assembly and corresponding gene annotation from [GENCODE](https://www.gencodegenes.org/). These reference files contain chromosomes and scaffolds sequences. +For [human](https://www.gencodegenes.org/human/) and [mouse](https://www.gencodegenes.org/mouse/) data, we highly recommend downloading the latest available **PRI** genome assembly and corresponding gene annotation from [GENCODE](https://www.gencodegenes.org/). These reference files contain chromosomes and scaffolds sequences. - The build pipeline will generate a JSON file containing key, value pairs to required reference files for the renee run pipeline. This file will be located in the path provided to `--output`. The name of this JSON file is dependent on the values provided to `--ref-name` and `--gtf-ver` and has the following naming convention: `{OUTPUT}/{REF_NAME}_{GTF_VER}.json`. Once the build pipeline completes, this reference JSON file can be passed to the `--genome` option of renee run. This is how new references are built for the RENEE pipeline. +The build pipeline will generate a JSON file containing key, value pairs to required reference files for the renee run pipeline. This file will be located in the path provided to `--output`. 
The name of this JSON file is dependent on the values provided to `--ref-name` and `--gtf-ver` and has the following naming convention: `{OUTPUT}/{REF_NAME}_{GTF_VER}.json`. Once the build pipeline completes, this reference JSON file can be passed to the `--genome` option of renee run. This is how new references are built for the RENEE pipeline. -Use you can always use the `-h` option for information on a specific command. +Use you can always use the `-h` option for information on a specific command. ### 2.1 Required Arguments Each of the following arguments are required. Failure to provide a required argument will result in a non-zero exit-code. - `--ref-fa REF_FA` +`--ref-fa REF_FA` + > **Genomic FASTA file of the reference genome.** -> *type: file* -> -> This file represents the genome sequence of the reference assembly in FASTA format. If you are downloading this from GENCODE, you should select the *PRI* genomic FASTA file. This file will contain the primary genomic assembly (contains chromosomes and scaffolds). This input file should not be compressed. Sequence identifers in this file must match with sequence identifers in the GTF file provided to `--ref-gtf`. -> -> ***Example:*** -> `--ref-fa GRCh38.primary_assembly.genome.fa` +> _type: file_ +> +> This file represents the genome sequence of the reference assembly in FASTA format. If you are downloading this from GENCODE, you should select the _PRI_ genomic FASTA file. This file will contain the primary genomic assembly (contains chromosomes and scaffolds). This input file should not be compressed. Sequence identifers in this file must match with sequence identifers in the GTF file provided to `--ref-gtf`. +> +> **_Example:_** > `--ref-fa GRCh38.primary_assembly.genome.fa` + +--- +`--ref-name REF_NAME` ---- - `--ref-name REF_NAME` > **Name of the reference genome.** -> *type: string* -> -> Name or alias for the reference genome. This can be the common name for the reference genome. Here is a list of common examples for different model organisms: mm10, hg38, rn6, danRer11, dm6, canFam3, sacCer3, ce11. If the provided values contains one of the following sub-strings (hg19, hs37d, grch37, hg38, hs38d, grch38, mm10, grcm38), then Arriba will run with its corresponding blacklist. -> -> ***Example:*** `--ref-name hg38` - ---- - `--ref-gtf REF_GTF` +> _type: string_ +> +> Name or alias for the reference genome. This can be the common name for the reference genome. Here is a list of common examples for different model organisms: mm10, hg38, rn6, danRer11, dm6, canFam3, sacCer3, ce11. If the provided values contains one of the following sub-strings (hg19, hs37d, grch37, hg38, hs38d, grch38, mm10, grcm38), then Arriba will run with its corresponding blacklist. +> +> **_Example:_** `--ref-name hg38` + +--- + +`--ref-gtf REF_GTF` + > **Gene annotation or GTF file for the reference genome.** -> *type: file* -> +> _type: file_ +> > This file represents the reference genome's gene annotation in GTF format. If you are downloading this from GENCODE, you should select the 'PRI' GTF file. This file contains gene annotations for the primary assembly (contains chromosomes and scaffolds). This input file should not be compressed. Sequence identifers (column 1) in this file must match with sequence identifers in the FASTA file provided to `--ref-fa`. 
-> ***Example:*** `--ref-gtf gencode.v36.primary_assembly.annotation.gtf` +> **_Example:_** `--ref-gtf gencode.v36.primary_assembly.annotation.gtf` --- - `--gtf-ver GTF_VER` + +`--gtf-ver GTF_VER` + > **Version of the gene annotation or GTF file provided.** -> *type: string or int* -> -> This is the version of the supplied gene annotation or GTF file. If you are using a GTF file from GENCODE, use the release number or version (i.e. *M25* for mouse or *37* for human). Visit gencodegenes.org for more details. -> ***Example:*** `--gtf-ver 36` - ---- - `--output OUTPUT` +> _type: string or int_ +> +> This is the version of the supplied gene annotation or GTF file. If you are using a GTF file from GENCODE, use the release number or version (i.e. _M25_ for mouse or _37_ for human). Visit gencodegenes.org for more details. +> **_Example:_** `--gtf-ver 36` + +--- + +`--output OUTPUT` + > **Path to an output directory.** -> *type: path* -> +> _type: path_ +> > This location is where the build pipeline will create all of its output files. If the user-provided working directory has not been initialized, it will automatically be created. -> ***Example:*** `--output /data/$USER/refs/hg38_v36/` +> **_Example:_** `--output /data/$USER/refs/hg38_v36/` ### 2.2 Build Options Each of the following arguments are optional and do not need to be provided. If you are running the pipeline outside of Biowulf, the `--shared-resources` option only needs to be provided at least once. This will ensure reference files that are shared across different genomes are downloaded locally. - `--shared-resources SHARED_RESOURCES` +`--shared-resources SHARED_RESOURCES` + > **Local path to shared resources.** -> *type: path* +> _type: path_ +> +> The pipeline uses a set of shared reference files that can be re-used across reference genomes. These currently include reference files for kraken and FQScreen. These reference files can be downloaded with the build sub command's `--shared-resources` option. With that being said, these files only need to be downloaded once. We recommend storing this files in a shared location on the filesystem that other people can access. If you are running the pipeline on Biowulf, you do NOT need to download these reference files! They already exist on the filesystem in a location that anyone can acceess; however, if you are running the pipeline on another cluster or target system, you will need to download the shared resources with the build sub command, and you will need to provide this option every time you run the pipeline. Please provide the same path that was provided to the build sub command's --shared-resources option. Again, if you are running the pipeline on Biowulf, you do NOT need to provide this option. For more information about how to download shared resources, please reference the build sub command's `--shared-resources` option. > -> The pipeline uses a set of shared reference files that can be re-used across reference genomes. These currently include reference files for kraken and FQScreen. These reference files can be downloaded with the build sub command's `--shared-resources` option. With that being said, these files only need to be downloaded once. We recommend storing this files in a shared location on the filesystem that other people can access. If you are running the pipeline on Biowulf, you do NOT need to download these reference files! 
They already exist on the filesystem in a location that anyone can acceess; however, if you are running the pipeline on another cluster or target system, you will need to download the shared resources with the build sub command, and you will need to provide this option every time you run the pipeline. Please provide the same path that was provided to the build sub command's --shared-resources option. Again, if you are running the pipeline on Biowulf, you do NOT need to provide this option. For more information about how to download shared resources, please reference the build sub command's `--shared-resources` option. -> -> ***Example:*** `--shared-resources /data/shared/renee` +> **_Example:_** `--shared-resources /data/shared/renee` + +--- + +`--small-genome` ---- - `--small-genome` > **Builds a small genome index.** -> *type: boolean* -> +> _type: boolean_ +> > For small genomes, it is recommeded running STAR with a scaled down `--genomeSAindexNbases` value. This option runs the build pipeline in a mode where it dynamically finds the optimal value for this option using the following formula: `min(14, log2(GenomeSize)/2 - 1)`. Generally speaking, this option is not really applicable for most mammalian reference genomes, i.e. human and mouse; however, researcher working with very small reference genomes, like S. cerevisiae ~ 12Mb, should provide this option. > -> When in doubt feel free to provide this option, as the optimal value will be found based on your input. +> When in doubt feel free to provide this option, as the optimal value will be found based on your input. > -> ***Example:*** `--small-genome` +> **_Example:_** `--small-genome` ### 2.3 Orchestration Options - `--dry-run` +`--dry-run` + > **Dry run the build pipeline.** -> *type: boolean* -> +> _type: boolean_ +> > Displays what steps in the build pipeline remain or will be run. Does not execute anything! > -> ***Example:*** `--dry-run` +> **_Example:_** `--dry-run` + +--- + +`--singularity-cache SINGULARITY_CACHE` ---- - `--singularity-cache SINGULARITY_CACHE` > **Overrides the $SINGULARITY_CACHEDIR environment variable.** -> *type: path* -> *default: `--output OUTPUT/.singularity`* +> _type: path_ +> _default: `--output OUTPUT/.singularity`_ +> +> Singularity will cache image layers pulled from remote registries. This ultimately speeds up the process of pull an image from DockerHub if an image layer already exists in the singularity cache directory. By default, the cache is set to the value provided to the `--output` argument. Please note that this cache cannot be shared across users. Singularity strictly enforces you own the cache directory and will return a non-zero exit code if you do not own the cache directory! See the `--sif-cache` option to create a shareable resource. > -> Singularity will cache image layers pulled from remote registries. This ultimately speeds up the process of pull an image from DockerHub if an image layer already exists in the singularity cache directory. By default, the cache is set to the value provided to the `--output` argument. Please note that this cache cannot be shared across users. Singularity strictly enforces you own the cache directory and will return a non-zero exit code if you do not own the cache directory! See the `--sif-cache` option to create a shareable resource. 
-> -> ***Example:*** `--singularity-cache /data/$USER/.singularity` +> **_Example:_** `--singularity-cache /data/$USER/.singularity` + +--- + +`--sif-cache SIF_CACHE` ---- - `--sif-cache SIF_CACHE` > **Path where a local cache of SIFs are stored.** -> *type: path* +> _type: path_ > > Uses a local cache of SIFs on the filesystem. This SIF cache can be shared across users if permissions are set correctly. If a SIF does not exist in the SIF cache, the image will be pulled from Dockerhub and a warning message will be displayed. The `renee cache` subcommand can be used to create a local SIF cache. Please see `renee cache` for more information. This command is extremely useful for avoiding DockerHub pull rate limits. It also remove any potential errors that could occur due to network issues or DockerHub being temporarily unavailable. We recommend running RENEE with this option when ever possible. -> -> ***Example:*** `--singularity-cache /data/$USER/SIFs` -> +> +> **_Example:_** `--singularity-cache /data/$USER/SIFs` + +--- + +`--tmp-dir TMP_DIR` ---- - `--tmp-dir TMP_DIR` > **Path on the file system for writing temporary files.** -> *type: path* -> *default: `/lscratch/$SLURM_JOBID`* -> -> This is a path on the file system for writing temporary output files. By default, the temporary directory is set to '/lscratch/$SLURM_JOBID' for backwards compatibility with the NIH's Biowulf cluster; however, if you are running the pipeline on another cluster, this option will need to be specified. Ideally, this path should point to a dedicated location on the filesystem for writing tmp files. On many systems, this location is set to somewhere in /scratch. If you need to inject a variable into this string that should NOT be expanded, please quote this options value in single quotes. Again, if you are running the pipeline on Biowulf, you do NOT need to provide this option. -> -> ***Example:*** `--tmp-dir /cluster_scratch/$USER/` +> _type: path_ +> _default: `/lscratch/$SLURM_JOBID`_ +> +> This is a path on the file system for writing temporary output files. By default, the temporary directory is set to '/lscratch/$SLURM_JOBID' for backwards compatibility with the NIH's Biowulf cluster; however, if you are running the pipeline on another cluster, this option will need to be specified. Ideally, this path should point to a dedicated location on the filesystem for writing tmp files. On many systems, this location is set to somewhere in /scratch. If you need to inject a variable into this string that should NOT be expanded, please quote this options value in single quotes. Again, if you are running the pipeline on Biowulf, you do NOT need to provide this option. +> +> **_Example:_** `--tmp-dir /cluster_scratch/$USER/` ### 2.4 Misc Options -Each of the following arguments are optional and do not need to be provided. +Each of the following arguments are optional and do not need to be provided. + +`-h, --help` - `-h, --help` > **Display Help.** -> *type: boolean* -> +> _type: boolean_ +> > Shows command's synopsis, help message, and an example command -> -> ***Example:*** `--help` +> +> **_Example:_** `--help` -## 3. Hybrid Genomes +## 3. Hybrid Genomes -If you have two GTF files, e.g. hybrid genomes (host + virus), then you need to create one genomic FASTA file and one GTF file for the hybrid genome prior to running the renee build command. +If you have two GTF files, e.g. 
hybrid genomes (host + virus), then you need to create one genomic FASTA file and one GTF file for the hybrid genome prior to running the renee build command. We recommend creating an artifical chromosome for the non-host sequence. The sequence identifer in the FASTA file must match the sequence identifer in the GTF file (column 1). Generally speaking, since the host annotation is usually downloaded from Ensembl or GENCODE, it will be correctly formatted; however, that may not be the case for the non-host sequence! -Please ensure the non-host annotation contains the following features and/or constraints: +Please ensure the non-host annotation contains the following features and/or constraints: - * for a given `gene` feature - * each `gene` entry has at least one `transcript` feature - * and each `transcript` entry has atleast one `exon` feature - * `gene_id`, `gene_name` and `gene_biotype` are required - * for a given `transcipt` feature - * along with `gene_id`, `gene_name` and `gene_biotype` ... `transcript_id` is also required - * for a given `exon` feature - * `gene_id`, `gene_name`, `gene_biotype`, `transcript_id` are required +- for a given `gene` feature + - each `gene` entry has at least one `transcript` feature + - and each `transcript` entry has atleast one `exon` feature + - `gene_id`, `gene_name` and `gene_biotype` are required +- for a given `transcipt` feature + - along with `gene_id`, `gene_name` and `gene_biotype` ... `transcript_id` is also required +- for a given `exon` feature + - `gene_id`, `gene_name`, `gene_biotype`, `transcript_id` are required -If not, the GTF file may need to be manually curated until these conditions are satisfied. +If not, the GTF file may need to be manually curated until these conditions are satisfied. Here is an example feature from a hand-curated Biotyn_probe GTF file: @@ -179,20 +198,19 @@ Biot1 BiotynProbe transcript 1 21 0.000000 + . gene_id "Biot1"; gen Biot1 BiotynProbe exon 1 21 0.000000 + . gene_id "Biot1"; gene_biotype "biotynlated_probe_control"; transcript_id "Biot1"; transcript_type "biotynlated_probe_control"; ``` -In this tab-delimited example above, - - * ***line 1:*** the `gene` feature has 3 required attributes in column 9: `gene_id` and `gene_name` and `gene_biotype` - * ***line 2:*** the `transcript` entry for the above `gene` repeats the same attributes with following required fields: `transcript_id ` and `transcript_name` - * *Please note:* `transcript_type` is *optional* - * ***line 3:*** the `exon` entry for the above `transcript` has 3 required attributes: `gene_id` and `transcript_id` and `gene_biotype` - * *Please note:* `transcript_type` is *optional* +In this tab-delimited example above, -For a given gene, the combination of the `gene_id` AND `gene_name` should form a unique string. There should be no instances where two different genes share the same `gene_id` AND `gene_name`. +- **_line 1:_** the `gene` feature has 3 required attributes in column 9: `gene_id` and `gene_name` and `gene_biotype` +- **_line 2:_** the `transcript` entry for the above `gene` repeats the same attributes with following required fields: `transcript_id ` and `transcript_name` + - _Please note:_ `transcript_type` is _optional_ +- **_line 3:_** the `exon` entry for the above `transcript` has 3 required attributes: `gene_id` and `transcript_id` and `gene_biotype` + - _Please note:_ `transcript_type` is _optional_ +For a given gene, the combination of the `gene_id` AND `gene_name` should form a unique string. 
There should be no instances where two different genes share the same `gene_id` AND `gene_name`. ## 4. Convert NCBI GFF3 to GTF format -It is worth noting that RENEE comes bundled with a script to convert GFF3 files downloaded from NCBI to GTF file format. This convenience script is useful as the `renee build` sub command takes a GTF file as one of its inputs. +It is worth noting that RENEE comes bundled with a script to convert GFF3 files downloaded from NCBI to GTF file format. This convenience script is useful as the `renee build` sub command takes a GTF file as one of its inputs. Please note that this script has only been tested with GFF3 files downloaded from NCBI, and _it is **not** recommended to use with GFF3 files originating from other sources like Ensembl or GENCODE_. If you are selecting an annotation from Ensembl or GENCODE, please download the GTF file option. @@ -204,17 +222,18 @@ pip install argparse ``` For more information about the script and its usage, please run: + ```bash ./resources/gff3togtf.py -h ``` ## 5. Example -### 5.1 Biowulf +### 5.1 Biowulf -On Biowulf getting started with the pipeline is fast and easy! In this example, we build a mouse reference genome. +On Biowulf getting started with the pipeline is fast and easy! In this example, we build a mouse reference genome. -```bash +```bash # Step 0.) Grab an interactive node (do not run on head node) srun -N 1 -n 1 --time=2:00:00 -p interactive --mem=8gb --cpus-per-task=4 --pty bash module purge @@ -229,20 +248,20 @@ renee build --ref-fa GRCm39.primary_assembly.genome.fa \ --sif-cache /data/CCBR_Pipeliner/SIFs/ \ --dry-run -# Step 2.) Build new RENEE reference files +# Step 2.) Build new RENEE reference files renee build --ref-fa GRCm39.primary_assembly.genome.fa \ --ref-name mm39 \ --ref-gtf gencode.vM26.annotation.gtf \ --gtf-ver M26 \ --output /data/$USER/refs/mm39_M26 \ - --sif-cache /data/CCBR_Pipeliner/SIFs/ + --sif-cache /data/CCBR_Pipeliner/SIFs/ ``` ### 5.2 Generic SLURM Cluster -Running the pipeline outside of Biowulf is easy; however, there are a few extra options you must provide. Please note when running the build sub command for the first time, you will also need to provide the `--shared-resources` option. This option will download our kraken2 database and bowtie2 indices for FastQ Screen. The path provided to this option should be provided to the `--shared-resources` option of the [run](./RNA-seq/cache/) sub command. Next, you will also need to provide a path to write temporary output files via the `--tmp-dir` option. We also recommend providing a path to a SIF cache. You can cache software containers locally with the [cache](./RNA-seq/cache/) sub command. +Running the pipeline outside of Biowulf is easy; however, there are a few extra options you must provide. Please note when running the build sub command for the first time, you will also need to provide the `--shared-resources` option. This option will download our kraken2 database and bowtie2 indices for FastQ Screen. The path provided to this option should be provided to the `--shared-resources` option of the [run](./RNA-seq/cache/) sub command. Next, you will also need to provide a path to write temporary output files via the `--tmp-dir` option. We also recommend providing a path to a SIF cache. You can cache software containers locally with the [cache](./RNA-seq/cache/) sub command. -```bash +```bash # Step 0.) 
Grab an interactive node (do not run on head node)
srun -N 1 -n 1 --time=2:00:00 -p interactive --mem=8gb --cpus-per-task=4 --pty bash

# Add snakemake and singularity to $PATH,
@@ -270,7 +289,7 @@ renee build --ref-fa GRCm39.primary_assembly.genome.fa \
    --sif-cache /data/$USER/cache \
    --dry-run

-# Step 2.) Build new RENEE reference files
+# Step 2.) Build new RENEE reference files
 renee build --ref-fa GRCm39.primary_assembly.genome.fa \
    --ref-name mm39 \
    --ref-gtf gencode.vM26.annotation.gtf \
@@ -278,5 +297,5 @@ renee build --ref-fa GRCm39.primary_assembly.genome.fa \
    --output /data/$USER/refs/mm39_M26 \
    --shared-resources /data/shared/renee \
    --tmp-dir /cluster_scratch/$USER/ \
-   --sif-cache /data/$USER/cache
+   --sif-cache /data/$USER/cache
 ```
diff --git a/docs/RNA-seq/cache.md b/docs/RNA-seq/cache.md
index 8540377..7dd75e3 100644
--- a/docs/RNA-seq/cache.md
+++ b/docs/RNA-seq/cache.md
@@ -1,62 +1,68 @@
# renee cache

-## 1. About
+## 1. About
+
The `renee` executable is composed of several inter-related sub commands. Please see `renee -h` for all available options.

-This part of the documentation describes options and concepts for renee cache sub command in more detail. With minimal configuration, the **`cache`** sub command enables you to cache remote resources for the RENEE pipeline. Caching remote resources allows the pipeline to run in an offline mode.
+This part of the documentation describes options and concepts for the renee cache sub command in more detail. With minimal configuration, the **`cache`** sub command enables you to cache remote resources for the RENEE pipeline. Caching remote resources allows the pipeline to run in an offline mode.

-`renee cache` when run successfully submits a SLURM job to the job schedule and quits. `squeue` can then be used to track the progress of the caching.
+`renee cache`, when run successfully, submits a SLURM job to the job scheduler and quits. `squeue` can then be used to track the progress of the caching.

The cache sub command creates a local cache on the filesystem for resources hosted on DockerHub or AWS S3. These resources are normally pulled onto the filesystem when the pipeline runs; however, due to network issues or DockerHub pull rate limits, it may make sense to pull the resources once so a shared cache can be created and re-used. It is worth noting that a singularity cache cannot normally be shared across users. Singularity strictly enforces that its cache is owned by the user. To get around this issue, the cache subcommand can be used to create local SIFs on the filesystem from images on DockerHub.

-Caching remote resources for the RENEE pipeline is fast and easy! In its most basic form, renee cache only has *one required input*.
+Caching remote resources for the RENEE pipeline is fast and easy! In its most basic form, renee cache only has _one required input_.

## 2. Synopsis
+
```text
$ renee cache [-h] --sif-cache SIF_CACHE \
-                   [--dry-run]
+                   [--dry-run]
```

The synopsis for each command shows its parameters and their usage. Optional parameters are shown in square brackets.

A user **must** provide a directory to cache remote Docker images via the `--sif-cache` argument. Once the cache pipeline has completed, the local SIF cache can be passed to the `--sif-cache` option of the renee build and renee run subcommands. This enables the build and run pipelines to run in an offline mode.

-Use you can always use the `-h` option for information on a specific command.
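As a quick illustration of the behavior described above, you can watch the submitted caching job and then confirm that the SIF files landed in your cache directory. This is a minimal sketch using standard SLURM and shell commands; the cache path is taken from the examples in this guide, and the exact job name shown by `squeue` depends on your cluster's configuration.

```bash
# Track the caching job submitted by `renee cache` in the SLURM queue
squeue -u $USER

# Once the job finishes, the pulled SIF images should be in the cache directory
ls -lh /data/$USER/cache
```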
### 2.1 Required Arguments - `--sif-cache SIF_CACHE` - +`--sif-cache SIF_CACHE` + > **Path where a local cache of SIFs will be stored.** -> *type: path* -> -> Any images defined in *config/containers/images.json* will be pulled into the local filesystem. The path provided to this option can be passed to the `--sif-cache` option of the renee build and renee run subcomand. This allows for running the build and run pipelines in an offline mode where no requests are made to external sources. This is useful for avoiding network issues or DockerHub pull rate limits. Please see renee build and run for more information. -> -> ***Example:*** `--sif-cache /data/$USER/cache` +> _type: path_ +> +> Any images defined in _config/containers/images.json_ will be pulled into the local filesystem. The path provided to this option can be passed to the `--sif-cache` option of the renee build and renee run subcomand. This allows for running the build and run pipelines in an offline mode where no requests are made to external sources. This is useful for avoiding network issues or DockerHub pull rate limits. Please see renee build and run for more information. +> +> **_Example:_** `--sif-cache /data/$USER/cache` ### 2.2 Options -Each of the following arguments are optional and do not need to be provided. +Each of the following arguments are optional and do not need to be provided. + +`-h, --help` - `-h, --help` > **Display Help.** -> *type: boolean* -> +> _type: boolean_ +> > Shows command's synopsis, help message, and an example command -> -> ***Example:*** `--help` +> +> **_Example:_** `--help` + +--- + +`--dry-run` ---- - `--dry-run` > **Dry run the pipeline.** -> *type: boolean* -> +> _type: boolean_ +> > Displays what steps in the pipeline remain or will be run. Does not execute anything! > -> ***Example:*** `--dry-run` +> **_Example:_** `--dry-run` ## 3. Example -```bash + +```bash # Step 0.) Grab an interactive node (do not run on head node) srun -N 1 -n 1 --time=12:00:00 -p interactive --mem=8gb --cpus-per-task=4 --pty bash module purge @@ -64,8 +70,8 @@ module load ccbrpipeliner # Step 1.) Dry run cache to see what will be pulled renee cache --sif-cache /data/$USER/cache \ - --dry-run + --dry-run -# Step 2.) Cache remote resources locally -renee cache --sif-cache /data/$USER/cache +# Step 2.) Cache remote resources locally +renee cache --sif-cache /data/$USER/cache ``` diff --git a/docs/RNA-seq/images/RENEE_Pipeline.svg b/docs/RNA-seq/images/RENEE_Pipeline.svg index 4d14b3e..4955e61 100644 --- a/docs/RNA-seq/images/RENEE_Pipeline.svg +++ b/docs/RNA-seq/images/RENEE_Pipeline.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/docs/RNA-seq/run.md b/docs/RNA-seq/run.md index c444319..32d7950 100644 --- a/docs/RNA-seq/run.md +++ b/docs/RNA-seq/run.md @@ -1,13 +1,15 @@ # renee run -## 1. About +## 1. About + The `renee` executable is composed of several inter-related sub commands. Please see `renee -h` for all available options. -This part of the documentation describes options and concepts for renee run sub command in more detail. With minimal configuration, the **`run`** sub command enables you to start running the data processing and quality-control pipeline. +This part of the documentation describes options and concepts for renee run sub command in more detail. With minimal configuration, the **`run`** sub command enables you to start running the data processing and quality-control pipeline. -Setting up the RENEE pipeline is fast and easy! 
In its most basic form, renee run only has *three required inputs*. +Setting up the RENEE pipeline is fast and easy! In its most basic form, renee run only has _three required inputs_. ## 2. Synopsis + ```text $ renee run [--help] \ [--small-rna] [--star-2-pass-basic] \ @@ -26,163 +28,183 @@ The synopsis for each command shows its parameters and their usage. Optional par A user **must** provide a list of FastQ files (globbing is supported) to analyze via `--input` argument, an output directory to store results via `--output` argument and select reference genome for alignment and annotation via the `--genome` argument. If you are running the pipeline outside of Biowulf, you will need to additionally provide the the following options: `--shared-resources`, `--tmp-dir`. More information about each of these options can be found below. -Use you can always use the `-h` option for information on a specific sub command. +Use you can always use the `-h` option for information on a specific sub command. ### 2.1 Required Arguments Each of the following arguments are required. Failure to provide a required argument will result in a non-zero exit-code. - `--input INPUT [INPUT ...]` +`--input INPUT [INPUT ...]` + > **Input FastQ file(s) to process.** -> *type: file* -> -> One or more FastQ files can be provided. From the command-line, each FastQ file should seperated by a space. Globbing is supported! This makes selecting FastQ files easier. Input FastQ files should be gzipp-ed. The pipeline supports single-end and pair-end RNA-seq data; however, the pipeline will not process a mixture of single-end and paired-end samples together. If you have a mixture of single-end and pair-end samples to process, please process them as two seperate instances of the RENEE pipeline (with two seperate output directories). -> -> ***Example:*** `--input .tests/*.R?.fastq.gz` - ---- - `--output OUTPUT` -> **Path to an output directory.** -> *type: path* -> +> _type: file_ +> +> One or more FastQ files can be provided. From the command-line, each FastQ file should seperated by a space. Globbing is supported! This makes selecting FastQ files easier. Input FastQ files should be gzipp-ed. The pipeline supports single-end and pair-end RNA-seq data; however, the pipeline will not process a mixture of single-end and paired-end samples together. If you have a mixture of single-end and pair-end samples to process, please process them as two seperate instances of the RENEE pipeline (with two seperate output directories). +> +> **_Example:_** `--input .tests/*.R?.fastq.gz` + +--- + +`--output OUTPUT` + +> **Path to an output directory.** +> _type: path_ +> > This location is where the pipeline will create all of its output files, also known as the pipeline's working directory. If the provided output directory does not exist, it will be initialized automatically. -> -> ***Example:*** `--output /data/$USER/RNA_hg38` +> +> **_Example:_** `--output /data/$USER/RNA_hg38` + +--- ---- - `--genome {hg38_30,mm10_M21,custom.json}` -> **Reference genome.** -> *type: string or file* -> -> This option defines the reference genome for your set of samples. On Biowulf, RENEE does comes bundled with pre built reference files for human and mouse samples; however, it is worth noting that the pipeline does accept a custom reference genome built with the build sub command. Building a new reference genome is easy! You can create a custom reference genome with a single command. This is extremely useful when working with non-model organisms. 
New users can reference the documentation's [getting started](../TLDR-RNA-seq/#3-building-reference-files) section to see how a reference genome is built. +`--genome {hg38_30,mm10_M21,custom.json}` + +> **Reference genome.** +> _type: string or file_ +> +> This option defines the reference genome for your set of samples. On Biowulf, RENEE does comes bundled with pre built reference files for human and mouse samples; however, it is worth noting that the pipeline does accept a custom reference genome built with the build sub command. Building a new reference genome is easy! You can create a custom reference genome with a single command. This is extremely useful when working with non-model organisms. New users can reference the documentation's [getting started](../TLDR-RNA-seq/#3-building-reference-files) section to see how a reference genome is built. > -> ***Pre built Option*** +> **_Pre built Option_** > Pre build genomes are avaiable with RENEE. Please see the [resources page](../Resources/#1.-Reference-genomes) for more information about each pre built option. > -> ***Custom Option*** -> A user can also supply a custom reference genome built with the build sub command. Please supply the custom reference JSON file that was generated by the build sub command. The name of this custom reference JSON file is dependent on the values provided to the following *renee build* args, `--ref-name REF_NAME` and `--gtf-ver GTF_VER`, where the name of the provided custom reference JSON file would be: `{REF_NAME}_{GTF_VER}.json`. -> -> ***Example:*** `--genome hg38_30` *OR* `--genome /data/${USER}/hg38_36/hg38_36.json` +> **_Custom Option_** +> A user can also supply a custom reference genome built with the build sub command. Please supply the custom reference JSON file that was generated by the build sub command. The name of this custom reference JSON file is dependent on the values provided to the following _renee build_ args, `--ref-name REF_NAME` and `--gtf-ver GTF_VER`, where the name of the provided custom reference JSON file would be: `{REF_NAME}_{GTF_VER}.json`. +> +> **_Example:_** `--genome hg38_30` _OR_ `--genome /data/${USER}/hg38_36/hg38_36.json` ### 2.2 Analysis Options - `--small-rna` +`--small-rna` + > **Run STAR using ENCODE's recomendations for small RNA.** -> *type: boolean* -> -> This option should only be used with small RNA libraries. These are rRNA-depleted libraries that have been size selected to contain fragments shorter than 200bp. Size selection enriches for small RNA species such as miRNAs, siRNAs, or piRNAs. Also, this option should not be combined with the star 2-pass basic option. If the two options are combined, STAR will run in pass basic mode. This means that STAR will not run with ENCODE's recommendations for small RNA alignment. As so, please take caution not to combine both options together. -> -> Please note: This option is only supported with single-end data. +> _type: boolean_ +> +> This option should only be used with small RNA libraries. These are rRNA-depleted libraries that have been size selected to contain fragments shorter than 200bp. Size selection enriches for small RNA species such as miRNAs, siRNAs, or piRNAs. Also, this option should not be combined with the star 2-pass basic option. If the two options are combined, STAR will run in pass basic mode. This means that STAR will not run with ENCODE's recommendations for small RNA alignment. As so, please take caution not to combine both options together. 
> -> ***Example:*** `--small-rna` +> Please note: This option is only supported with single-end data. +> +> **_Example:_** `--small-rna` + +--- + +`--star-2-pass-basic` ---- - `--star-2-pass-basic` > **Run STAR in per sample 2-pass mapping mode.** -> *type: boolean* -> -> It is recommended to use this option when processing a set of unrelated samples or when processing samples in a clinical setting. It is not adivsed to use this option for a study with multiple related samples. -> +> _type: boolean_ +> +> It is recommended to use this option when processing a set of unrelated samples or when processing samples in a clinical setting. It is not adivsed to use this option for a study with multiple related samples. +> > By default, the pipeline ultilizes a multi sample 2-pass mapping approach where the set of splice junctions detected across all samples are provided to the second pass of STAR. This option overrides the default behavior so each sample will be processed in a per sample two-pass basic mode. This option should not be combined with the small RNA option. If the two options are combined, STAR will run in pass basic mode. -> -> ***Example:*** `--star-2-pass-basic` +> +> **_Example:_** `--star-2-pass-basic` ### 2.3 Orchestration Options -Each of the following arguments are optional and do not need to be provided. - - `--dry-run` +Each of the following arguments are optional and do not need to be provided. + +`--dry-run` + > **Dry run the pipeline.** -> *type: boolean* -> +> _type: boolean_ +> > Displays what steps in the pipeline remain or will be run. Does not execute anything! > -> ***Example:*** `--dry-run` - ---- - `--mode {slurm,local}` -> **Execution Method.** -> *type: string* -> *default: slurm* -> -> Execution Method. Defines the mode or method of execution. Vaild mode options include: slurm or local. -> -> ***local*** -> Local executions will run serially on compute instance. This is useful for testing, debugging, or when a users does not have access to a high performance computing environment. If this option is not provided, it will default to a local execution mode. -> -> ***slurm*** +> **_Example:_** `--dry-run` + +--- + +`--mode {slurm,local}` + +> **Execution Method.** > _type: string_ +> _default: slurm_ +> +> Execution Method. Defines the mode or method of execution. Vaild mode options include: slurm or local. +> +> **_local_** +> Local executions will run serially on compute instance. This is useful for testing, debugging, or when a users does not have access to a high performance computing environment. If this option is not provided, it will default to a local execution mode. +> +> **_slurm_** > The slurm execution method will submit jobs to a cluster using a slurm + singularity backend. This method will automatically submit the master job to the cluster. It is recommended running RENEE in this mode as execution will be significantly faster in a distributed environment. -> -> ***Example:*** `--mode slurm` +> +> **_Example:_** `--mode slurm` + +--- + +`--shared-resources SHARED_RESOURCES` ---- - `--shared-resources SHARED_RESOURCES` > **Local path to shared resources.** -> *type: path* +> _type: path_ > -> The pipeline uses a set of shared reference files that can be re-used across reference genomes. These currently include reference files for kraken and FQScreen. These reference files can be downloaded with the build sub command's `--shared-resources` option. With that being said, these files only need to be downloaded once. 
We recommend storing this files in a shared location on the filesystem that other people can access. If you are running the pipeline on Biowulf, you do NOT need to download these reference files! They already exist on the filesystem in a location that anyone can acceess; however, if you are running the pipeline on another cluster or target system, you will need to download the shared resources with the build sub command, and you will need to provide this option every time you run the pipeline. Please provide the same path that was provided to the build sub command's --shared-resources option. Again, if you are running the pipeline on Biowulf, you do NOT need to provide this option. For more information about how to download shared resources, please reference the build sub command's `--shared-resources` option. -> -> ***Example:*** `--shared-resources /data/shared/renee` +> The pipeline uses a set of shared reference files that can be re-used across reference genomes. These currently include reference files for kraken and FQScreen. These reference files can be downloaded with the build sub command's `--shared-resources` option. With that being said, these files only need to be downloaded once. We recommend storing this files in a shared location on the filesystem that other people can access. If you are running the pipeline on Biowulf, you do NOT need to download these reference files! They already exist on the filesystem in a location that anyone can acceess; however, if you are running the pipeline on another cluster or target system, you will need to download the shared resources with the build sub command, and you will need to provide this option every time you run the pipeline. Please provide the same path that was provided to the build sub command's --shared-resources option. Again, if you are running the pipeline on Biowulf, you do NOT need to provide this option. For more information about how to download shared resources, please reference the build sub command's `--shared-resources` option. +> +> **_Example:_** `--shared-resources /data/shared/renee` + +--- + +`--singularity-cache SINGULARITY_CACHE` ---- - `--singularity-cache SINGULARITY_CACHE` > **Overrides the $SINGULARITY_CACHEDIR environment variable.** -> *type: path* -> *default: `--output OUTPUT/.singularity`* +> _type: path_ +> _default: `--output OUTPUT/.singularity`_ +> +> Singularity will cache image layers pulled from remote registries. This ultimately speeds up the process of pull an image from DockerHub if an image layer already exists in the singularity cache directory. By default, the cache is set to the value provided to the `--output` argument. Please note that this cache cannot be shared across users. Singularity strictly enforces you own the cache directory and will return a non-zero exit code if you do not own the cache directory! See the `--sif-cache` option to create a shareable resource. > -> Singularity will cache image layers pulled from remote registries. This ultimately speeds up the process of pull an image from DockerHub if an image layer already exists in the singularity cache directory. By default, the cache is set to the value provided to the `--output` argument. Please note that this cache cannot be shared across users. Singularity strictly enforces you own the cache directory and will return a non-zero exit code if you do not own the cache directory! See the `--sif-cache` option to create a shareable resource. 
-> -> ***Example:*** `--singularity-cache /data/$USER/.singularity` +> **_Example:_** `--singularity-cache /data/$USER/.singularity` + +--- + +`--sif-cache SIF_CACHE` ---- - `--sif-cache SIF_CACHE` > **Path where a local cache of SIFs are stored.** -> *type: path* +> _type: path_ > > Uses a local cache of SIFs on the filesystem. This SIF cache can be shared across users if permissions are set correctly. If a SIF does not exist in the SIF cache, the image will be pulled from Dockerhub and a warning message will be displayed. The `renee cache` subcommand can be used to create a local SIF cache. Please see `renee cache` for more information. This command is extremely useful for avoiding DockerHub pull rate limits. It also remove any potential errors that could occur due to network issues or DockerHub being temporarily unavailable. We recommend running RENEE with this option when ever possible. -> -> ***Example:*** `--singularity-cache /data/$USER/SIFs` +> +> **_Example:_** `--singularity-cache /data/$USER/SIFs` + +--- + +`--tmp-dir TMP_DIR` ---- - `--tmp-dir TMP_DIR` > **Path on the file system for writing temporary files.** -> *type: path* -> *default: `/lscratch/$SLURM_JOBID`* -> -> This is a path on the file system for writing temporary output files. By default, the temporary directory is set to '/lscratch/$SLURM_JOBID' for backwards compatibility with the NIH's Biowulf cluster; however, if you are running the pipeline on another cluster, this option will need to be specified. Ideally, this path should point to a dedicated location on the filesystem for writing tmp files. On many systems, this location is set to somewhere in /scratch. If you need to inject a variable into this string that should NOT be expanded, please quote this options value in single quotes. Again, if you are running the pipeline on Biowulf, you do NOT need to provide this option. -> -> ***Example:*** `--tmp-dir /cluster_scratch/$USER/` - ---- - `--threads THREADS` +> _type: path_ +> _default: `/lscratch/$SLURM_JOBID`_ +> +> This is a path on the file system for writing temporary output files. By default, the temporary directory is set to '/lscratch/$SLURM_JOBID' for backwards compatibility with the NIH's Biowulf cluster; however, if you are running the pipeline on another cluster, this option will need to be specified. Ideally, this path should point to a dedicated location on the filesystem for writing tmp files. On many systems, this location is set to somewhere in /scratch. If you need to inject a variable into this string that should NOT be expanded, please quote this options value in single quotes. Again, if you are running the pipeline on Biowulf, you do NOT need to provide this option. +> +> **_Example:_** `--tmp-dir /cluster_scratch/$USER/` + +--- + +`--threads THREADS` + > **Max number of threads for each process.** -> *type: int* -> *default: 2* -> -> Max number of threads for each process. This option is more applicable when running the pipeline with `--mode local`. It is recommended setting this vaule to the maximum number of CPUs available on the host machine. -> -> ***Example:*** `--threads 12` +> _type: int_ +> _default: 2_ +> +> Max number of threads for each process. This option is more applicable when running the pipeline with `--mode local`. It is recommended setting this vaule to the maximum number of CPUs available on the host machine. +> +> **_Example:_** `--threads 12` ### 2.4 Misc Options -Each of the following arguments are optional and do not need to be provided. 
+Each of the following arguments are optional and do not need to be provided. + +`-h, --help` - `-h, --help` > **Display Help.** -> *type: boolean* -> +> _type: boolean_ +> > Shows command's synopsis, help message, and an example command -> -> ***Example:*** `--help` - +> +> **_Example:_** `--help` ## 3. Example -### 3.1 Biowulf +### 3.1 Biowulf -On Biowulf getting started with the pipeline is fast and easy! The pipeline comes bundled with pre-built human and mouse reference genomes. In the example below, we will use the pre-built human reference genome. +On Biowulf getting started with the pipeline is fast and easy! The pipeline comes bundled with pre-built human and mouse reference genomes. In the example below, we will use the pre-built human reference genome. -```bash +```bash # Step 0.) Grab an interactive node (do not run on head node) srun -N 1 -n 1 --time=12:00:00 -p interactive --mem=8gb --cpus-per-task=4 --pty bash module purge @@ -210,9 +232,9 @@ renee run --input .tests/*.R?.fastq.gz \ ### 3.2 Generic SLURM Cluster -Running the pipeline outside of Biowulf is easy; however, there are a few extra steps you must first take. Before getting started, you will need to [build](../TLDR-RNA-seq/#3-building-reference-files) reference files for the pipeline. Please note when running the build sub command for the first time, you will also need to provide the `--shared-resources` option. This option will download our kraken2 database and bowtie2 indices for FastQ Screen. The path provided to this option should be provided to the `--shared-resources` option of the run sub command. Next, you will also need to provide a path to write temporary output files via the `--tmp-dir` option. We also recommend providing a path to a SIF cache. You can cache software containers locally with the [cache](../cache/) sub command. +Running the pipeline outside of Biowulf is easy; however, there are a few extra steps you must first take. Before getting started, you will need to [build](../TLDR-RNA-seq/#3-building-reference-files) reference files for the pipeline. Please note when running the build sub command for the first time, you will also need to provide the `--shared-resources` option. This option will download our kraken2 database and bowtie2 indices for FastQ Screen. The path provided to this option should be provided to the `--shared-resources` option of the run sub command. Next, you will also need to provide a path to write temporary output files via the `--tmp-dir` option. We also recommend providing a path to a SIF cache. You can cache software containers locally with the [cache](../cache/) sub command. -```bash +```bash # Step 0.) Grab an interactive node (do not run on head node) srun -N 1 -n 1 --time=2:00:00 -p interactive --mem=8gb --cpus-per-task=4 --pty bash # Add snakemake and singularity to $PATH, diff --git a/docs/RNA-seq/unlock.md b/docs/RNA-seq/unlock.md index 4d42e35..a5470c5 100644 --- a/docs/RNA-seq/unlock.md +++ b/docs/RNA-seq/unlock.md @@ -1,51 +1,55 @@ # renee unlock -## 1. About +## 1. About + The `renee` executable is composed of several inter-related sub commands. Please see `renee -h` for all available options. -This part of the documentation describes options and concepts for renee unlock sub command in more detail. With minimal configuration, the **`unlock`** sub command enables you to unlock a pipeline output directory. +This part of the documentation describes options and concepts for renee unlock sub command in more detail. 
With minimal configuration, the **`unlock`** sub command enables you to unlock a pipeline output directory. -If the pipeline fails ungracefully, it maybe required to unlock the working directory before proceeding again. Snakemake will inform a user when it maybe necessary to unlock a working directory with an error message stating: `Error: Directory cannot be locked`. +If the pipeline fails ungracefully, it maybe required to unlock the working directory before proceeding again. Snakemake will inform a user when it maybe necessary to unlock a working directory with an error message stating: `Error: Directory cannot be locked`. Please verify that the pipeline is not running before running this command. If the pipeline is currently running, the workflow manager will report the working directory is locked. The is the default behavior of snakemake, and it is normal. Do NOT run this command if the pipeline is still running! Please kill the master job and it's child jobs prior to running this command. -Unlocking an RENEE pipeline output directory is fast and easy! In its most basic form, renee run only has *one required inputs*. +Unlocking an RENEE pipeline output directory is fast and easy! In its most basic form, renee run only has _one required inputs_. ## 2. Synopsis + ```text $ renee unlock [-h] --output OUTPUT ``` The synopsis for this command shows its parameters and their usage. Optional parameters are shown in square brackets. -A user **must** provide an output directory to unlock via `--output` argument. After running the unlock sub command, you can resume the build or run pipeline from where it left off by re-running it. +A user **must** provide an output directory to unlock via `--output` argument. After running the unlock sub command, you can resume the build or run pipeline from where it left off by re-running it. + +Use you can always use the `-h` option for information on a specific command. -Use you can always use the `-h` option for information on a specific command. +### 2.1 Required Arguments -### 2.1 Required Arguments +`--output OUTPUT` - `--output OUTPUT` > **Output directory to unlock.** -> *type: path* -> -> Path to a previous run's output directory to unlock. This will remove a lock on the working directory. Please verify that the pipeline is not running before running this command. -> ***Example:*** `--output /data/$USER/RNA_hg38` +> _type: path_ +> +> Path to a previous run's output directory to unlock. This will remove a lock on the working directory. Please verify that the pipeline is not running before running this command. +> **_Example:_** `--output /data/$USER/RNA_hg38` ### 2.2 Options -Each of the following arguments are optional and do not need to be provided. +Each of the following arguments are optional and do not need to be provided. + +`-h, --help` - `-h, --help` > **Display Help.** -> *type: boolean* -> +> _type: boolean_ +> > Shows command's synopsis, help message, and an example command -> -> ***Example:*** `--help` - +> +> **_Example:_** `--help` ## 3. Example -```bash + +```bash # Step 0.) 
Grab an interactive node (do not run on head node) srun -N 1 -n 1 --time=12:00:00 -p interactive --mem=8gb --cpus-per-task=4 --pty bash module purge diff --git a/docs/assets/icons/analytics-black-48dp.svg b/docs/assets/icons/analytics-black-48dp.svg index b38bb56..51599d0 100644 --- a/docs/assets/icons/analytics-black-48dp.svg +++ b/docs/assets/icons/analytics-black-48dp.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/docs/assets/icons/analytics-white-48dp.svg b/docs/assets/icons/analytics-white-48dp.svg index 15c7fbe..2d637bb 100644 --- a/docs/assets/icons/analytics-white-48dp.svg +++ b/docs/assets/icons/analytics-white-48dp.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/docs/dev/coming-soon.md b/docs/dev/coming-soon.md index 6c8bad7..39a4570 100644 --- a/docs/dev/coming-soon.md +++ b/docs/dev/coming-soon.md @@ -1,5 +1,5 @@ # Coming Soon -This page is under construction, and our team is actively working on bringing you the most up-to-date documentation. +This page is under construction, and our team is actively working on bringing you the most up-to-date documentation. Thank you for your patience! diff --git a/docs/dev/lorem_ipsum.md b/docs/dev/lorem_ipsum.md index a1ed15f..2c16034 100644 --- a/docs/dev/lorem_ipsum.md +++ b/docs/dev/lorem_ipsum.md @@ -3,49 +3,52 @@ Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. ## h2 Heading + ### h3 Heading + #### h4 Heading + ##### h5 Heading -###### h6 Heading +###### h6 Heading ## Horizontal Rules --- -*** + +--- ## Emphasis **This is bold text** -__This is bold text__ +**This is bold text** -*This is italic text* +_This is italic text_ _This is italic text_ ~~Strikethrough~~ - ## Blockquotes - > Blockquotes can also be nested... ->> ...by using additional greater-than signs right next to each other... +> +> > ...by using additional greater-than signs right next to each other... +> > > > > ...or with spaces between arrows. - ## Lists Unordered -+ Create a list by starting a line with `+`, `-`, or `*` -+ Sub-lists are made by indenting 2 spaces: +- Create a list by starting a line with `+`, `-`, or `*` +- Sub-lists are made by indenting 2 spaces: - Marker character change forces new list start: - * Ac tristique libero volutpat at - + Facilisis in pretium nisl aliquet + - Ac tristique libero volutpat at + * Facilisis in pretium nisl aliquet - Nulla volutpat aliquam velit -+ Very easy! +- Very easy! Ordered @@ -53,16 +56,14 @@ Ordered 2. Consectetur adipiscing elit 3. Integer molestie lorem at massa - -1. You can use sequential numbers... -1. ...or keep all the numbers as `1.` +4. You can use sequential numbers... +5. ...or keep all the numbers as `1.` Start numbering with offset: 57. foo 1. bar - ## Code Inline `code` @@ -74,7 +75,6 @@ Indented code line 2 of code line 3 of code - Block code "fences" ``` @@ -83,7 +83,7 @@ Sample text here... 
Syntax highlighting -``` js +```js var foo = function (bar) { return bar++; }; @@ -93,20 +93,19 @@ console.log(foo(5)); ## Tables -| Option | Description | -| ------ | ----------- | +| Option | Description | +| ------ | ------------------------------------------------------------------------- | | data | path to data files to supply the data that will be passed into templates. | -| engine | engine to be used for processing templates. Handlebars is the default. | -| ext | extension to be used for dest files. | +| engine | engine to be used for processing templates. Handlebars is the default. | +| ext | extension to be used for dest files. | Right-aligned columns -| Option | Description | -| ------:| -----------:| -| data | path to data files to supply the data that will be passed into templates. | -| engine | engine to be used for processing templates. Handlebars is the default. | -| ext | extension to be used for dest files. | - +| Option | Description | +| -----: | ------------------------------------------------------------------------: | +| data | path to data files to supply the data that will be passed into templates. | +| engine | engine to be used for processing templates. Handlebars is the default. | +| ext | extension to be used for dest files. | ## Links @@ -114,7 +113,6 @@ Right-aligned columns [link with title](http://nodeca.github.io/pica/demo/ "title text!") - ## Images ![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg "The Stormtroopocat") @@ -125,15 +123,13 @@ Like links, Images also have a footnote style syntax With a reference later in the document defining the URL location: -[id]: https://octodex.github.com/images/dojocat.jpg "The Dojocat" - +[id]: https://octodex.github.com/images/dojocat.jpg "The Dojocat" ## Plugins The killer feature of `markdown-it` is very effective support of [syntax plugins](https://www.npmjs.org/browse/keyword/markdown-it-plugin). - ### [Footnotes](https://github.com/markdown-it/markdown-it-footnote) Footnote 1 link[^first]. @@ -150,17 +146,16 @@ Duplicated footnote reference[^second]. [^second]: Footnote text. - ### [Definition lists](https://github.com/markdown-it/markdown-it-deflist) Term 1 -: Definition 1 +: Definition 1 with lazy continuation. -Term 2 with *inline markup* +Term 2 with _inline markup_ -: Definition 2 +: Definition 2 { some code, part of Definition 2 } @@ -169,9 +164,8 @@ Term 2 with *inline markup* _Compact style:_ Term 1 - ~ Definition 1 +~ Definition 1 Term 2 - ~ Definition 2a - ~ Definition 2b - +~ Definition 2a +~ Definition 2b diff --git a/docs/general-questions.md b/docs/general-questions.md index 07f4b6e..c44962c 100644 --- a/docs/general-questions.md +++ b/docs/general-questions.md @@ -3,6 +3,7 @@ If you are experiencing an issue, please read through this list first before con We have compiled this FAQ from the most common questions. If you have a question that is not on this page, please feel free to [reach out to our team](https://github.com/CCBR/RENEE/issues). ## Contribute + **Q. I would like to contribute to RNA-seek. How do I get involved?** **A.** There are several ways you can get involved with the project. @@ -15,11 +16,11 @@ If you have added new features or adding new changes, please consider contributi 4. Commit and push your changes to your fork. 5. Create a [pull request](https://help.github.com/en/articles/creating-a-pull-request) to this repository. -If you would like to create or tackle an issue, please reference our [issue tracker](https://github.com/CCBR/RENEE/issues) on Github. 
+If you would like to create or tackle an issue, please reference our [issue tracker](https://github.com/CCBR/RENEE/issues) on Github. Also, feel free to like or :star: the project on Github! - ## Additional Support + **Q. I have a few questions about the pipeline. How can I reach you?** -**A.** For general questions and/or support, please free to [open an issue](https://github.com/CCBR/RENEE/issues) on Github or send an email to [CCBR](mailto:CCBR_Pipeliner@nih.gov) \ No newline at end of file +**A.** For general questions and/or support, please free to [open an issue](https://github.com/CCBR/RENEE/issues) on Github or send an email to [CCBR](mailto:CCBR_Pipeliner@nih.gov) diff --git a/docs/index.md b/docs/index.md index e0567ee..f6af076 100644 --- a/docs/index.md +++ b/docs/index.md @@ -5,11 +5,12 @@ > NOTE: Currently, RENEE takes sequencing FASTQ files as input and generates **genes** X **samples** _counts matrix_ as one of the outputs. Differential Gene Expression or DEG analysis can be performed by uploading the counts matrix to [NIDAP](https://nidap.nih.gov/workspace/slate/documents/nidap-home) or [iDEP](http://bioinformatics.sdstate.edu/idep/) ## 1. Introduction -Welcome to RENEE's documentation! -This guide is the main source of documentation for users that are getting started with the [RENEE pipeline](https://github.com/CCBR/RENEE). If you are not familiar with RNA-sequencing, please checkout our [theory and practical guide](RNA-seq/Theory.md). That section provides a conceptual overview to RNA-seq analysis and as well as a set of generalized guidelines to interpret different quality-control metrics. If you are a new user, we highly recommend reading through our [getting started](RNA-seq/TLDR-RNA-seq.md) section. This page contains information needed to quickly build new reference files and setup the pipeline for running in your compute environment. +Welcome to RENEE's documentation! -RENEE is composed several inter-related sub commands to faciliate the analysis of RNA-sequencing data. For more information about each available sub command, please see the [usage section](RNA-seq/run.md). To help out new users, an example of each command is also provided. The [resources page](RNA-seq/Resources.md) contains more information about the pipeline's default reference genomes along with every tool and Docker image the pipeline employs. +This guide is the main source of documentation for users that are getting started with the [RENEE pipeline](https://github.com/CCBR/RENEE). If you are not familiar with RNA-sequencing, please checkout our [theory and practical guide](RNA-seq/Theory.md). That section provides a conceptual overview to RNA-seq analysis and as well as a set of generalized guidelines to interpret different quality-control metrics. If you are a new user, we highly recommend reading through our [getting started](RNA-seq/TLDR-RNA-seq.md) section. This page contains information needed to quickly build new reference files and setup the pipeline for running in your compute environment. + +RENEE is composed several inter-related sub commands to faciliate the analysis of RNA-sequencing data. For more information about each available sub command, please see the [usage section](RNA-seq/run.md). To help out new users, an example of each command is also provided. The [resources page](RNA-seq/Resources.md) contains more information about the pipeline's default reference genomes along with every tool and Docker image the pipeline employs. 
For more information about issues or trouble-shooting a problem, please checkout our [FAQ](troubleshooting.md) prior to [opening an issue on Github](https://github.com/CCBR/RENEE/issues). @@ -25,45 +26,46 @@ A bioinformatics pipeline is more than the sum of its data processing steps. A p ## 3. Pipeline -The accuracy of the downstream interpretations made from transcriptomic data are highly dependent on initial sample library. Unwanted sources of technical variation, which if not accounted for properly, can influence the results. - -In addition to generating a MultiQC report, the RENEE pipeline also generates a [rNA Report](https://github.com/CCBR/rNA) to interactively allow users to identify problematic samples prior to performing any downstream analysis. RENEE's comprehensive quality-control helps ensure your results are reliable and reproducible across experiments. In the data processing steps, RENEE quantifies gene and isoform expression and predicts gene fusions. Please note that the detection of alternative splicing events and variant calling will be incorporated in a later release. +The accuracy of the downstream interpretations made from transcriptomic data are highly dependent on initial sample library. Unwanted sources of technical variation, which if not accounted for properly, can influence the results. +In addition to generating a MultiQC report, the RENEE pipeline also generates a [rNA Report](https://github.com/CCBR/rNA) to interactively allow users to identify problematic samples prior to performing any downstream analysis. RENEE's comprehensive quality-control helps ensure your results are reliable and reproducible across experiments. In the data processing steps, RENEE quantifies gene and isoform expression and predicts gene fusions. Please note that the detection of alternative splicing events and variant calling will be incorporated in a later release. ### 3.1 Quality Control -**Quality Control** -[*FastQC*4](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) is used to assess the sequencing quality. FastQC is run twice, before and after adapter trimming. It generates a set of basic statistics to identify problems that can arise during sequencing or library preparation. FastQC will summarize per base and per read QC metrics such as quality scores and GC content. It will also summarize the distribution of sequence lengths and will report the presence of adapter sequences. - -[*Kraken2*5](http://ccb.jhu.edu/software/kraken2/) and [*FastQ Screen*6](https://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/) are used to screen for various sources of contamination. During the process of sample collection to library preparation, there is a risk for introducing wanted sources of DNA. FastQ Screen compares your sequencing data to a set of different reference genomes to determine if there is contamination. It allows a user to see if the composition of your library matches what you expect. Also, if there are high levels of microbial contamination, Kraken can provide an estimation of the taxonomic composition. Kraken can be used in conjunction with [*Krona*7](https://github.com/marbl/Krona/wiki/KronaTools) to produce interactive reports. -[*Preseq*8](http://smithlabresearch.org/software/preseq/) is used to estimate the complexity of a library for each samples. If the duplication rate is very high, the overall library complexity will be low. 
Low library complexity could signal an issue with library preparation where very little input RNA was over-amplified or the sample may be degraded. +**Quality Control** +[_FastQC_4](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) is used to assess the sequencing quality. FastQC is run twice, before and after adapter trimming. It generates a set of basic statistics to identify problems that can arise during sequencing or library preparation. FastQC will summarize per base and per read QC metrics such as quality scores and GC content. It will also summarize the distribution of sequence lengths and will report the presence of adapter sequences. + +[_Kraken2_5](http://ccb.jhu.edu/software/kraken2/) and [_FastQ Screen_6](https://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/) are used to screen for various sources of contamination. During the process of sample collection to library preparation, there is a risk for introducing wanted sources of DNA. FastQ Screen compares your sequencing data to a set of different reference genomes to determine if there is contamination. It allows a user to see if the composition of your library matches what you expect. Also, if there are high levels of microbial contamination, Kraken can provide an estimation of the taxonomic composition. Kraken can be used in conjunction with [_Krona_7](https://github.com/marbl/Krona/wiki/KronaTools) to produce interactive reports. -[*Picard*9](https://broadinstitute.github.io/picard/) can be used to estimate the duplication rate, and it has another particularly useful sub-command called CollectRNAseqMetrics which reports the number and percentage of reads that align to various regions: such as coding, intronic, UTR, intergenic and ribosomal regions. This is particularly useful as you would expect a library constructed with ploy(A)-selection to have a high percentage of reads that map to coding regions. Picard CollectRNAseqMetrics will also report the uniformity of coverage across all genes, which is useful for determining whether a sample has a 3' bias (observed in ploy(A)-selection libraries containing degraded RNA). +[_Preseq_8](http://smithlabresearch.org/software/preseq/) is used to estimate the complexity of a library for each samples. If the duplication rate is very high, the overall library complexity will be low. Low library complexity could signal an issue with library preparation where very little input RNA was over-amplified or the sample may be degraded. -[*RSeQC*10](http://rseqc.sourceforge.net/) is another particularity useful package that is tailored for RNA-seq data. It is used to calculate the inner distance between paired-end reads and calculate TIN values for a set of canonical protein-coding transcripts. A median TIN value is calucated for each sample, which analogous to a computationally derived RIN. +[_Picard_9](https://broadinstitute.github.io/picard/) can be used to estimate the duplication rate, and it has another particularly useful sub-command called CollectRNAseqMetrics which reports the number and percentage of reads that align to various regions: such as coding, intronic, UTR, intergenic and ribosomal regions. This is particularly useful as you would expect a library constructed with ploy(A)-selection to have a high percentage of reads that map to coding regions. 
Picard CollectRNAseqMetrics will also report the uniformity of coverage across all genes, which is useful for determining whether a sample has a 3' bias (observed in ploy(A)-selection libraries containing degraded RNA). -[MultiQC11](https://multiqc.info/) is used to aggreate the results of each tool into a single interactive report. +[_RSeQC_10](http://rseqc.sourceforge.net/) is another particularity useful package that is tailored for RNA-seq data. It is used to calculate the inner distance between paired-end reads and calculate TIN values for a set of canonical protein-coding transcripts. A median TIN value is calucated for each sample, which analogous to a computationally derived RIN. + +[MultiQC11](https://multiqc.info/) is used to aggreate the results of each tool into a single interactive report. ### 3.2 Data Processing - -[*Cutadapt*12](https://cutadapt.readthedocs.io/en/stable/) is used to remove adapter sequences, perform quality trimming, and remove very short sequences that would otherwise multi-map all over the genome prior to alignment. -[*STAR*13](https://github.com/alexdobin/STAR) is used to align reads to the reference genome. The RENEE pipeline runs STAR in a two-passes where splice-junctions are collected and aggregated across all samples and provided to the second-pass of STAR. In the second pass of STAR, the splice-junctions detected in the first pass are inserted into the genome indices prior to alignment. +[_Cutadapt_12](https://cutadapt.readthedocs.io/en/stable/) is used to remove adapter sequences, perform quality trimming, and remove very short sequences that would otherwise multi-map all over the genome prior to alignment. + +[_STAR_13](https://github.com/alexdobin/STAR) is used to align reads to the reference genome. The RENEE pipeline runs STAR in a two-passes where splice-junctions are collected and aggregated across all samples and provided to the second-pass of STAR. In the second pass of STAR, the splice-junctions detected in the first pass are inserted into the genome indices prior to alignment. -[*RSEM*14](https://github.com/deweylab/RSEM) is used to quantify gene and isoform expression. The expected counts from RSEM are merged across samples to create a two counts matrices for gene counts and isoform counts. +[_RSEM_14](https://github.com/deweylab/RSEM) is used to quantify gene and isoform expression. The expected counts from RSEM are merged across samples to create a two counts matrices for gene counts and isoform counts. -[*Arriba*15](https://arriba.readthedocs.io/en/latest/) is used to predict gene-fusion events. The pre-built human and mouse reference genomes use Arriba blacklists to reduce the false-positive rate. +[_Arriba_15](https://arriba.readthedocs.io/en/latest/) is used to predict gene-fusion events. The pre-built human and mouse reference genomes use Arriba blacklists to reduce the false-positive rate. ## 4. Contribute -This site is a living document, created for and by the genomics community. RENEE is maintained by the developers and bioinformaticians at [CCBR](https://ccbr.github.io/) and is improved by feedback from external collaborators like *you*! +This site is a living document, created for and by the genomics community. RENEE is maintained by the developers and bioinformaticians at [CCBR](https://ccbr.github.io/) and is improved by feedback from external collaborators like _you_! -We want to make it easy for users to connect with us to share ideas, solve problems, and to continuously deliver the best pipelines. 
We encourage you to contribute new content and make improvements to existing content via pull request to our [GitHub repository](https://github.com/CCBR/RENEE). *You* can also contribute by reporting bugs/enhancement requests/etc. by [opening an issue on Github](https://github.com/CCBR/RENEE/issues). +We want to make it easy for users to connect with us to share ideas, solve problems, and to continuously deliver the best pipelines. We encourage you to contribute new content and make improvements to existing content via pull request to our [GitHub repository](https://github.com/CCBR/RENEE). _You_ can also contribute by reporting bugs/enhancement requests/etc. by [opening an issue on Github](https://github.com/CCBR/RENEE/issues). ## 5. References -**1.** Merkel, D. (2014). Docker: lightweight linux containers for consistent development and deployment. Linux Journal, 2014(239), 2. -**2.** Kurtzer GM, Sochat V, Bauer MW (2017). Singularity: Scientific containers for mobility of compute. PLoS ONE 12(5): e0177459. -**3.** Koster, J. and S. Rahmann (2018). "Snakemake-a scalable bioinformatics workflow engine." Bioinformatics 34(20): 3600. + +**1.** Merkel, D. (2014). Docker: lightweight linux containers for consistent development and deployment. Linux Journal, 2014(239), 2. +**2.** Kurtzer GM, Sochat V, Bauer MW (2017). Singularity: Scientific containers for mobility of compute. PLoS ONE 12(5): e0177459. +**3.** Koster, J. and S. Rahmann (2018). "Snakemake-a scalable bioinformatics workflow engine." Bioinformatics 34(20): 3600. **4.** Andrews, S. (2010). FastQC: a quality control tool for high throughput sequence data. **5.** Wood, D. E. and S. L. Salzberg (2014). "Kraken: ultrafast metagenomic sequence classification using exact alignments." Genome Biol 15(3): R46. **6.** Wingett, S. and S. Andrews (2018). "FastQ Screen: A tool for multi-genome mapping and quality control." F1000Research 7(2): 1338. @@ -75,10 +77,8 @@ We want to make it easy for users to connect with us to share ideas, solve probl **12.** Martin, M. (2011). "Cutadapt removes adapter sequences from high-throughput sequencing reads." EMBnet 17(1): 10-12. **13.** Dobin, A., et al., STAR: ultrafast universal RNA-seq aligner. Bioinformatics, 2013. 29(1): p. 15-21. **14.** Li, B. and C.N. Dewey, RSEM: accurate transcript quantification from RNA-Seq data with or without a reference genome. BMC Bioinformatics, 2011. 12: p. 323. -**15.** Uhrig, S., et al. (2021). "Accurate and efficient detection of gene fusions from RNA sequencing data". Genome Res. 31(3): 448-460. - - - +**15.** Uhrig, S., et al. (2021). "Accurate and efficient detection of gene fusions from RNA sequencing data". Genome Res. 31(3): 448-460. - [1]: contact-us.md + +[1]: contact-us.md diff --git a/docs/license.md b/docs/license.md index c8573b3..dd2938f 100644 --- a/docs/license.md +++ b/docs/license.md @@ -1,6 +1,6 @@ # MIT License -*Copyright (c) 2023 CCBR* +_Copyright (c) 2023 CCBR_ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index a6077c7..5f0576a 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -47,7 +47,6 @@ Here are a few suggestions: Each job that RNA-seek submits to the cluster starts with the `pl:` prefix. - **Q. How do I identify failed jobs?** **A.** If there are errors, you'll need to identify which jobs failed and check its corresponding SLURM output file. 
@@ -67,13 +66,11 @@ The SLURM output file may contain a clue as to why the job failed. [Bash script]( https://github.com/CCBR/Tools/blob/master/Biowulf/get_slurm_file_with_error.sh) identify the SLURM ID of the first failed job and check if the output file exists. - Many failures are caused by filesystem or network issues on Biowulf, and in such cases, simply re-starting the Pipeline should resolve the issue. Snakemake will dynamically determine which steps have been completed, and which steps still need to be run. If you are still running into problems after re-running the pipeline, there may be another issue. If that is the case, please feel free to [contact us](https://github.com/skchronicles/RNA-seek/issues). - **Q. How do I cancel ongoing RNA-seek jobs?** -**A.** Sometimes, you might need to manually stop a RNA-seek run prematurely, perhaps because the run was configured incorrectly or if a job is stalled. Although the walltime limits will eventually stop the workflow, this can take up to 5 or 10 days depending on the pipeline. +**A.** Sometimes, you might need to manually stop a RNA-seek run prematurely, perhaps because the run was configured incorrectly or if a job is stalled. Although the walltime limits will eventually stop the workflow, this can take up to 5 or 10 days depending on the pipeline. To stop RNA-seek jobs that are currently running, you can follow these options. @@ -113,7 +110,6 @@ Once you've ensured that all running jobs have been stopped, you need to unlock **A.** Are you running the `rna-seek` on `helix.nih.gov` by mistake. [Helix](https://hpc.nih.gov/systems/) does not have a job scheduler. One may be able to fire up the singularity module, initial working directory and perform dry-run on `helix`. But to submit jobs, you need to log into `biowulf` using `ssh -Y username@biowulf.nih.gov`. - **Q. Why am I getting a message saying `Error: Directory cannot be locked. ...` when I do the dry-run?** **A.** This is caused when a run is stopped prematurely, either accidentally or on purpose, or the pipeline is still running in your working directory. Snakemake will lock a working directory to prevent two concurrent pipelines from writing to the same location. This can be remedied easily by running `rna-seek unlock` sub command. Please check to see if the pipeline is still running prior to running the commands below. If you would like to cancel a submitted or running pipeline, please reference the instructions above. @@ -128,7 +124,7 @@ rna-seek unlock --output /path/to/working/dir **Q. Why am I getting a message saying `MissingInputException in line ...` when I do the dry-run?** -**A.** This error usually occurs when snakemake is terminated ungracefully. Did you forcefully cancel a running pipeline? Or did one of your running pipelines abruptly end? Either way, the solution is straight-forward. Please go to your pipeline's output directory, and rename or delete the following hidden directory: `.snakemake/`. This directory contains metadata pertaining any snakemake runs inside that working directory. Sometimes when a pipeline is pre-maturely or forcefully terminated, a few files related to tracking temp() files are not deleted and snakemake raises a MissingInputException. +**A.** This error usually occurs when snakemake is terminated ungracefully. Did you forcefully cancel a running pipeline? Or did one of your running pipelines abruptly end? Either way, the solution is straight-forward. 
Please go to your pipeline's output directory, and rename or delete the following hidden directory: `.snakemake/`. This directory contains metadata pertaining any snakemake runs inside that working directory. Sometimes when a pipeline is pre-maturely or forcefully terminated, a few files related to tracking temp() files are not deleted and snakemake raises a MissingInputException. ```bash # Navigate to working directory @@ -138,4 +134,3 @@ cd /path/to/working/dir # And try re-dry running the pipeline mv .snakemake .old_snakemake ``` - diff --git a/mkdocs.yml b/mkdocs.yml index 85749e4..5746fd4 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -3,7 +3,7 @@ site_name: RENEE Documentation site_author: CCBR site_description: >- RENEE is an open-source, reproducible, and scalable best - practices pipeline for analyzing RNA-sequencing data. + practices pipeline for analyzing RNA-sequencing data. # Repository repo_name: CCBR/RENEE @@ -19,7 +19,7 @@ theme: features: - navigation.tabs - navigation.top - - toc.integrate + - toc.integrate palette: - scheme: default primary: indigo @@ -43,7 +43,6 @@ plugins: - minify: minify_html: true - # Customization extra: social: @@ -56,7 +55,6 @@ extra: version: provider: mike - # Extensions markdown_extensions: - markdown.extensions.admonition @@ -98,13 +96,13 @@ nav: - About: index.md - Theory: RNA-seq/Theory.md - Getting started: RNA-seq/TLDR-RNA-seq.md - - Usage: - - run: RNA-seq/run.md - - build: RNA-seq/build.md - - cache: RNA-seq/cache.md - - unlock: RNA-seq/unlock.md + - Usage: + - run: RNA-seq/run.md + - build: RNA-seq/build.md + - cache: RNA-seq/cache.md + - unlock: RNA-seq/unlock.md - Resources: RNA-seq/Resources.md - FAQ: - - General Questions: general-questions.md - - Troubleshooting: troubleshooting.md + - General Questions: general-questions.md + - Troubleshooting: troubleshooting.md - License: license.md diff --git a/renee b/renee index 670d6a4..d099a54 100755 --- a/renee +++ b/renee @@ -45,7 +45,7 @@ except AssertionError: def scontrol_show(): - """ Run scontrol show config and parse the output as a dictionary + """Run scontrol show config and parse the output as a dictionary @return scontrol_dict : """ scontrol_dict = dict() @@ -61,24 +61,26 @@ def scontrol_show(): def get_hpcname(): - """ Get the HPC name (biowulf, frce, or an empty string) + """Get the HPC name (biowulf, frce, or an empty string) @return hpcname """ scontrol_out = scontrol_show() - hpc = scontrol_out["ClusterName"] if "ClusterName" in scontrol_out.keys() else '' - if hpc == 'fnlcr': - hpc = 'frce' + hpc = scontrol_out["ClusterName"] if "ClusterName" in scontrol_out.keys() else "" + if hpc == "fnlcr": + hpc = "frce" return hpc -def get_genomes_list(renee_path, hpcname = get_hpcname()): - """ Get list of genome annotations available for the current platform +def get_genomes_list(renee_path, hpcname=get_hpcname()): + """Get list of genome annotations available for the current platform @return genomes_list """ genome_config_dir = os.path.join(renee_path, "config", "genomes", hpcname) json_files = glob.glob(genome_config_dir + "/*.json") if not json_files: - warnings.warn(f"WARNING: No Genome Annotation JSONs found in {genome_config_dir}. Please specify a custom genome json file with `--genome`") + warnings.warn( + f"WARNING: No Genome Annotation JSONs found in {genome_config_dir}. 
Please specify a custom genome json file with `--genome`" + ) genomes = [os.path.basename(file).replace(".json", "") for file in json_files] return sorted(genomes) @@ -722,16 +724,20 @@ def setup(sub_args, ifiles, repo_path, output_path): "genome": genome_config, # Template for tool information "tools": os.path.join(output_path, "config", "templates", "tools.json"), - } + } # Global config file for pipeline, config.json config = join_jsons(required.values()) # uses templates in the renee repo # Update cluster-specific paths for fastq screen & kraken db - if hpcname == 'biowulf' or hpcname == 'frce': - db_json_filename = os.path.join(output_path, 'config', 'templates', f"dbs_{hpcname}.json") - with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), db_json_filename), "r") as json_file: - config['bin']['rnaseq']['tool_parameters'].update(json.load(json_file)) - + if hpcname == "biowulf" or hpcname == "frce": + db_json_filename = os.path.join( + output_path, "config", "templates", f"dbs_{hpcname}.json" + ) + with open( + os.path.join(os.path.dirname(os.path.abspath(__file__)), db_json_filename), + "r", + ) as json_file: + config["bin"]["rnaseq"]["tool_parameters"].update(json.load(json_file)) config = add_user_information(config) config = add_rawdata_information(sub_args, config, ifiles) @@ -1084,22 +1090,28 @@ def run(sub_args): # hpcname is either biowulf, frce, or blank hpcname = get_hpcname() - if sub_args.runmode == 'init' or not os.path.exists(os.path.join(sub_args.output, 'config.json')): + if sub_args.runmode == "init" or not os.path.exists( + os.path.join(sub_args.output, "config.json") + ): # Initialize working directory, copy over required pipeline resources - input_files = initialize(sub_args, repo_path=git_repo, output_path=sub_args.output) + input_files = initialize( + sub_args, repo_path=git_repo, output_path=sub_args.output + ) # Step pipeline for execution, create config.json config file from templates config = setup( - sub_args, ifiles=input_files, repo_path=git_repo, output_path=sub_args.output + sub_args, + ifiles=input_files, + repo_path=git_repo, + output_path=sub_args.output, ) # load config from existing file else: with open(os.path.join(sub_args.output, "config.json"), "r") as config_file: config = json.load(config_file) - # ensure the working dir is read/write friendly - scripts_path = os.path.join(sub_args.output,'workflow','scripts') + scripts_path = os.path.join(sub_args.output, "workflow", "scripts") os.chmod(scripts_path, 0o755) # Optional Step: Dry-run pipeline @@ -1133,7 +1145,7 @@ def run(sub_args): list(config["references"]["rnaseq"].values()) + fq_screen_paths + kraken_db_path ) all_bind_paths = "{},{}".format(",".join(genome_bind_paths), rawdata_bind_paths) - + if sub_args.dry_run: # print singularity bind baths and exit print("\nSingularity Bind Paths:{}".format(all_bind_paths)) sys.exit(0) @@ -1286,8 +1298,10 @@ def _configure(sub_args, filename, git_repo): fh.write(' {}: "{}"\n'.format(tag, uri)) print("Done!") + def _reset_write_permission(target): - os.system("chmod -R u+w,g-w,o-w "+target) + os.system("chmod -R u+w,g-w,o-w " + target) + def configure_build(sub_args, git_repo, output_path): """Setups up working directory for build option and creates config file (build.yml) @@ -1825,9 +1839,7 @@ def parsed_arguments(name, description): subparser_run.add_argument( "--genome", required=True, - type=lambda option: str( - genome_options(subparser_run, option, GENOMES_LIST) - ), + type=lambda option: str(genome_options(subparser_run, option, 
GENOMES_LIST)), help=argparse.SUPPRESS, ) @@ -1865,16 +1877,17 @@ def parsed_arguments(name, description): default=False, help=argparse.SUPPRESS, ) - subparser_run.add_argument('--runmode', + subparser_run.add_argument( + "--runmode", # Determines how to run the pipeline: init, run # TODO: this API is different from XAVIER & CARLISLE, which have a --runmode=dryrun option instead of a --dry-run flag. - required = False, - default = 'run', - choices = ['init','run'], - type = str, - help = argparse.SUPPRESS + required=False, + default="run", + choices=["init", "run"], + type=str, + help=argparse.SUPPRESS, ) - + # Execution Method, run locally # on a compute node or submit to # a supported job scheduler, etc. @@ -2090,7 +2103,7 @@ def parsed_arguments(name, description): Wait until master job completes. This is required if the job is submitted using HPC API. If not provided the API may interpret submission of master job as - completion of the pipeline! + completion of the pipeline! {1}{2}Misc Options:{4} -h, --help Show usage information, help message, and exit. @@ -2259,7 +2272,6 @@ def parsed_arguments(name, description): completion of the pipeline!", ) - # Sub-parser for the "unlock" sub-command # Grouped sub-parser arguments are currently # not supported: https://bugs.python.org/issue9341 diff --git a/resources/RENEE_Pipeline.svg b/resources/RENEE_Pipeline.svg index 4d14b3e..4955e61 100644 --- a/resources/RENEE_Pipeline.svg +++ b/resources/RENEE_Pipeline.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/resources/biowulf/fastq_screen.conf b/resources/biowulf/fastq_screen.conf index a403359..d02f490 100644 --- a/resources/biowulf/fastq_screen.conf +++ b/resources/biowulf/fastq_screen.conf @@ -3,10 +3,10 @@ ###################### ## Bowtie or Bowtie2 # ###################### -## If the Bowtie1/2 binary is not in your PATH then you can +## If the Bowtie1/2 binary is not in your PATH then you can ## set this value to tell the program where to find it. ## Uncomment the line below and set the appropriate location. -## Please note, this path should include the executable +## Please note, this path should include the executable ## filename. #BOWTIE /usr/local/bin/bowtie/bowtie @@ -16,10 +16,10 @@ ########################################### ## Bismark (for bisulfite sequencing only)# ########################################### -## If the Bismark binary is not in your PATH then you can +## If the Bismark binary is not in your PATH then you can ## set this value to tell the program where to find it. ## Uncomment the line below and set the appropriate location. -## Please note, this path should include the executable +## Please note, this path should include the executable ## filename. #BISMARK /usr/local/bin/bismark/bismark @@ -40,26 +40,26 @@ THREADS 24 ## This section allows you to configure multiple databases ## to search against in your screen. For each database ## you need to provide a database name (which can't contain -## spaces) and the location of the bowtie indices which +## spaces) and the location of the bowtie indices which ## you created for that database. -## -## The entries shown below are only suggested examples, you +## +## The entries shown below are only suggested examples, you ## can add as many DATABASE sections as required, and you ## can comment out or remove as many of the existing entries ## as desired. ## ## Either the original bowtie or bowtie2 may be used for the -## mapping. 
Specify the aligner to use with the command line -## flag --aligner with arguments 'bowtie' or +## mapping. Specify the aligner to use with the command line +## flag --aligner with arguments 'bowtie' or ## 'bowtie2' (default). -## -## The configuration file may list paths to both bowtie and +## +## The configuration file may list paths to both bowtie and ## bowtie2 indices. FastQ Screen automatically detects whether -## a specified index is compatible with bowtie or bowtie2. +## a specified index is compatible with bowtie or bowtie2. ## -## Although the configuration file may list paths to both -## bowtie and bowtie2 indices, only one aligner will be used -## for the mapping, as specified by the --aligner flag. +## Although the configuration file may list paths to both +## bowtie and bowtie2 indices, only one aligner will be used +## for the mapping, as specified by the --aligner flag. ## ## The path to the index files SHOULD INCLUDE THE BASENAME of ## the index, e.g: @@ -72,13 +72,13 @@ THREADS 24 ## ## Human - sequences available from ## ftp://ftp.ensembl.org/pub/current/fasta/homo_sapiens/dna/ -DATABASE Human /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/hg19/hg19 -DATABASE Mouse /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/mm9/mm9 +DATABASE Human /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/hg19/hg19 +DATABASE Mouse /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/mm9/mm9 #DATABASE Phix /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/PhiX/phix #DATABASE Salmo /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/Salmo_salar_clone/Salmo_salar -#DATABASE Uni_Vec /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/UniVec_vectors/UniVec_vectors +#DATABASE Uni_Vec /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/UniVec_vectors/UniVec_vectors DATABASE Bacteria /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/Bacteria/bacteria DATABASE Fungi /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/Fungi/fungi DATABASE Virus /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/Virus/virus #DATABASE rRNA /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/rRNA/rRNA -#DATABASE Lambda /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/Lambda/Lambda +#DATABASE Lambda /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/Lambda/Lambda diff --git a/resources/biowulf/fastq_screen_2.conf b/resources/biowulf/fastq_screen_2.conf index cb7c72e..25554ac 100644 --- a/resources/biowulf/fastq_screen_2.conf +++ b/resources/biowulf/fastq_screen_2.conf @@ -3,10 +3,10 @@ ###################### ## Bowtie or Bowtie2 # ###################### -## If the Bowtie1/2 binary is not in your PATH then you can +## If the Bowtie1/2 binary is not in your PATH then you can ## set this value to tell the program where to find it. ## Uncomment the line below and set the appropriate location. -## Please note, this path should include the executable +## Please note, this path should include the executable ## filename. #BOWTIE /usr/local/bin/bowtie/bowtie @@ -16,10 +16,10 @@ ########################################### ## Bismark (for bisulfite sequencing only)# ########################################### -## If the Bismark binary is not in your PATH then you can +## If the Bismark binary is not in your PATH then you can ## set this value to tell the program where to find it. ## Uncomment the line below and set the appropriate location. -## Please note, this path should include the executable +## Please note, this path should include the executable ## filename. 
#BISMARK /usr/local/bin/bismark/bismark @@ -40,26 +40,26 @@ THREADS 24 ## This section allows you to configure multiple databases ## to search against in your screen. For each database ## you need to provide a database name (which can't contain -## spaces) and the location of the bowtie indices which +## spaces) and the location of the bowtie indices which ## you created for that database. -## -## The entries shown below are only suggested examples, you +## +## The entries shown below are only suggested examples, you ## can add as many DATABASE sections as required, and you ## can comment out or remove as many of the existing entries ## as desired. ## ## Either the original bowtie or bowtie2 may be used for the -## mapping. Specify the aligner to use with the command line -## flag --aligner with arguments 'bowtie' or +## mapping. Specify the aligner to use with the command line +## flag --aligner with arguments 'bowtie' or ## 'bowtie2' (default). -## -## The configuration file may list paths to both bowtie and +## +## The configuration file may list paths to both bowtie and ## bowtie2 indices. FastQ Screen automatically detects whether -## a specified index is compatible with bowtie or bowtie2. +## a specified index is compatible with bowtie or bowtie2. ## -## Although the configuration file may list paths to both -## bowtie and bowtie2 indices, only one aligner will be used -## for the mapping, as specified by the --aligner flag. +## Although the configuration file may list paths to both +## bowtie and bowtie2 indices, only one aligner will be used +## for the mapping, as specified by the --aligner flag. ## ## The path to the index files SHOULD INCLUDE THE BASENAME of ## the index, e.g: @@ -72,12 +72,12 @@ THREADS 24 ## ## Human - sequences available from ## ftp://ftp.ensembl.org/pub/current/fasta/homo_sapiens/dna/ -#DATABASE Human /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/hg19/hg19 -#DATABASE Mouse /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/mm9/mm9 +#DATABASE Human /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/hg19/hg19 +#DATABASE Mouse /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/mm9/mm9 #DATABASE Phix /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/PhiX/phix #DATABASE Salmo /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/Salmo_salar_clone/Salmo_salar -DATABASE Uni_Vec /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/UniVec_vectors/UniVec_vectors +DATABASE Uni_Vec /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/UniVec_vectors/UniVec_vectors #DATABASE Bacteria /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/Bacteria/bacteria #DATABASE Virus /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/Virus/virus DATABASE rRNA /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/rRNA/rRNA -#DATABASE Lambda /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/Lambda/Lambda +#DATABASE Lambda /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/Lambda/Lambda diff --git a/resources/builder b/resources/builder index ee66b1a..a484dfa 100755 --- a/resources/builder +++ b/resources/builder @@ -40,13 +40,13 @@ Required Arguments: the rawdata directory(s) along with the pipeline's output directory. Please see example usage below. -t, --tmp-dir [Type:Path] Temporary directory. The pipeline generates - intermediate, temporary output files. Any + intermediate, temporary output files. Any temporary output files will be written to - this location. On Biowulf, it should be + this location. On Biowulf, it should be set to '/lscratch/\$SLURM_JOBID/'. 
On FRCE, - this value should be set to the following: + this value should be set to the following: '/scratch/cluster_scratch/\$USER/'. - -h, --hpc-name [Type: Str] biowulf or frce or unknown + -h, --hpc-name [Type: Str] biowulf or frce or unknown OPTIONS: -o, --outdir [Type: Path] Path to output directory. If not provided, the Path will default to the current working directory of @@ -186,17 +186,17 @@ function submit(){ triggeroptions="" fi # Check if NOT running on Biowulf - # Assumes other clusters do NOT + # Assumes other clusters do NOT # have GRES for local node disk, - # long term it might be worth - # adding a new option to allow - # a user to decide whether to + # long term it might be worth + # adding a new option to allow + # a user to decide whether to # use GRES at job submission, # trying to infer this because # most users will not even know # what GRES is and how or why # it should be used and by default - # SLURM is not configured to use + # SLURM is not configured to use # GRES, remove prefix single quote # if [[ ${6#\'} != /lscratch* ]]; then # CLUSTER_OPTS="sbatch --cpus-per-task {cluster.threads} -t {cluster.time} --mem {cluster.mem} --job-name={params.rname}" diff --git a/resources/cacher b/resources/cacher index fb19a94..2f0ee6a 100755 --- a/resources/cacher +++ b/resources/cacher @@ -8,7 +8,7 @@ USAGE: SYNOPSIS: This script submits the cacher master job to the cluster. This main process dictates how subsequent resources are pulled onto the cluster's filesystem. cacher utilizes SLURM -to avoid pull into resources on a compute node but support for additional job schedulers +to avoid pull into resources on a compute node but support for additional job schedulers (i.e. PBS, SGE, LSF, Tibanna) may be added in the near future. The main entry point of the pipeline 'renee' calls this job submission wrapper script. As so, this script can be used to manually by-pass 'renee' for a previously failed cache. @@ -25,14 +25,14 @@ Required Positional Argument: most of the steps are computationally intensive. Required Arguments: -s, --sif-cache [Type: Path] Path to output directory to cache remote resources. - -i, --image-uris [Type: Str] Image(s) to pull from Dockerhub. Multiple images + -i, --image-uris [Type: Str] Image(s) to pull from Dockerhub. Multiple images are seperated by a comma. OPTIONS: -t, --tmp-dir [Type: Path] Path to tmp singularity dir. Singularity uses this directory when images are pulled from DockerHub - and coverted into SIFs. If not provided, the - location to the temp dir will default to the - following "/tmp/$USER/cacher/.singularity/" + and coverted into SIFs. If not provided, the + location to the temp dir will default to the + following "/tmp/$USER/cacher/.singularity/" directory. -h, --help [Type: Bool] Displays usage and help information. 
Example: @@ -103,19 +103,19 @@ function check(){ function retry() { - # Tries to run a cmd 5 times before failing + # Tries to run a cmd 5 times before failing # If a command is successful, it will break out of attempt loop - # Failed attempts are padding with the following exponential + # Failed attempts are padding with the following exponential # back-off strategy {4, 16, 64, 256, 1024} in seconds - # @INPUTS "$@"" = cmd to run + # @INPUTS "$@"" = cmd to run # @CALLS fatal() if command cannot be run in 5 attempts local n=1 local max=5 local attempt=true # flag for while loop while $attempt; do - # Attempt command and break if successful + # Attempt command and break if successful "$@" && attempt=false || { - # Try again up to 5 times + # Try again up to 5 times if [[ $n -le $max ]]; then err "Command failed: $@" delay=$(( 4**$n )) @@ -139,8 +139,8 @@ function _pull(){ # Check if singularity in $PATH # If not, try to module load singularity as a last resort - command -V singularity &> /dev/null || { - command -V module &> /dev/null && + command -V singularity &> /dev/null || { + command -V module &> /dev/null && module purge && module load singularity } || fatal "Fail to find or load 'singularity', not installed on target system." @@ -159,7 +159,7 @@ function _pull(){ case "$executor" in slurm) # Create directory for logfiles - for image in ${4//,/$'\t'}; do + for image in ${4//,/$'\t'}; do # Try to pull image from URI with 5 max attempt echo "Singularity pulling ${image}" retry singularity pull -F ${image} @@ -209,7 +209,7 @@ function main(){ echo -e "Running with the following parameters:" for key in "${!Arguments[@]}"; do echo -e "\t${key}\t${Arguments["$key"]}"; done - # Pull remote resources into RENEE cache + # Pull remote resources into RENEE cache # Cache remote image from DockerHub # INPUT $1 = Snakemake Mode of execution # INPUT $2 = Cache output directory diff --git a/resources/clean_gtf.py b/resources/clean_gtf.py index 15b478a..2150ce3 100755 --- a/resources/clean_gtf.py +++ b/resources/clean_gtf.py @@ -15,9 +15,9 @@ # Steps for converting messy gff into properly formatted GTF file # 1. Pull image from registry and create SIF -module load singularity +module load singularity SINGULARITY_CACHEDIR=$PWD singularity pull \\ - docker://quay.io/biocontainers/agat:0.8.0--pl5262hdfd78af_0 + docker://quay.io/biocontainers/agat:0.8.0--pl5262hdfd78af_0 # 2. Run AGAT todo the heavy lifting of gtf conversion singularity exec -B $PWD \\ @@ -30,30 +30,33 @@ ./clean_gtf.py /path/to/converted.gtf > /path/to/clean.gtf """ -def replace_nested_quotes(line, find_char = '"', replace_char = ''): + +def replace_nested_quotes(line, find_char='"', replace_char=""): """ - Assumes the quote character in the 9th column is a double - quote or <"> character. This is the correct character to - use based on the speficiation. + Assumes the quote character in the 9th column is a double + quote or <"> character. This is the correct character to + use based on the speficiation. """ # Normal: - # protein_id "XP_040355194.1"; - # Bad: + # protein_id "XP_040355194.1"; + # Bad: # transl_except "(pos:956284..956286)" "(pos:956290..956292)"; # Fixed: # transl_except "(pos:956284..956286) (pos:956290..956292)"; quote_count = 0 inside_quotes = False - fixed = '' + fixed = "" for i in range(len(line)): curr_char = line[i] # Scan for next character to determine # if reached end of quotation. 
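        # (Look-ahead note: at the last character of the line the
        # IndexError handled just below is caught and next_char
        # simply falls back to "".)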
- try: next_char = line[i+1] - except IndexError: next_char = '' + try: + next_char = line[i + 1] + except IndexError: + next_char = "" - if curr_char == '"': + if curr_char == '"': # Entered the border or ending of # a quote, increase the counter and # check where we are in the string @@ -61,7 +64,7 @@ def replace_nested_quotes(line, find_char = '"', replace_char = ''): if quote_count == 1: inside_quotes = True - if next_char == ';': + if next_char == ";": # Reached end border of quote, # reset boolean flag and counters inside_quotes = False @@ -72,8 +75,8 @@ def replace_nested_quotes(line, find_char = '"', replace_char = ''): # replace reserved delimeter with # another character, let's use a # url encoding of the character - if curr_char == find_char and quote_count > 1: - curr_char = replace_char + if curr_char == find_char and quote_count > 1: + curr_char = replace_char # Add the existing/converted character fixed += curr_char @@ -81,21 +84,20 @@ def replace_nested_quotes(line, find_char = '"', replace_char = ''): return fixed - -def url_escape_inside_quotes(line, delimiter=';', url_encoding = '%3B'): +def url_escape_inside_quotes(line, delimiter=";", url_encoding="%3B"): """See the following issue for description and context: https://github.com/NBISweden/AGAT/issues/250 - Assumes the quote character in the 9th column is a double - quote or <"> character. This is the correct character to - use based on the speficiation. + Assumes the quote character in the 9th column is a double + quote or <"> character. This is the correct character to + use based on the speficiation. """ quote_count = 0 inside_quotes = False - fixed = '' + fixed = "" for c in line: if c == '"': - # Entered the border or ending of + # Entered the border or ending of # a quote, increase the counter and # check where we are in the string quote_count += 1 @@ -108,17 +110,17 @@ def url_escape_inside_quotes(line, delimiter=';', url_encoding = '%3B'): quote_count = 0 if inside_quotes: - # Fix evil mistakes of the past, - # replace reserved delimeter with - # another character, let's use a + # Fix evil mistakes of the past, + # replace reserved delimeter with + # another character, let's use a # url encoding of the character if c == delimiter: c = url_encoding - # Add the existing/converted character + # Add the existing/converted character fixed += c - return fixed + return fixed def stripped(v): @@ -127,10 +129,10 @@ def stripped(v): def lookup(mykey, dictionary): - """ Tries to lookup value in dictionary using an - exact match if the key. Returns empty string if - not found. """ - v = '' + """Tries to lookup value in dictionary using an + exact match if the key. Returns empty string if + not found.""" + v = "" if mykey in dictionary: v = dictionary[mykey] v = stripped(v) @@ -142,7 +144,7 @@ def contains(pattern, dictionary): instead of a key. Returns empty string if pattern is not found in dictionary. """ - v = '' + v = "" kys = dictionary.keys() for k in kys: if pattern in k: @@ -155,10 +157,12 @@ def parse(linelist): """Parses key, value pairs in 9th column and returns and index (dictionary) of all fields. 
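    For example (illustrative), the attribute string
        gene_id "G1"; gene_name "Foo";
    is returned as {'gene_id': 'G1', 'gene_name': 'Foo'}.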
""" - tags = {} # store key, value pairs in 9th column - metadata = re.split('; ', replace_nested_quotes(url_escape_inside_quotes(linelist[8].rstrip(';')))) - for field in metadata: - k,v = field.split(' ', 1) + tags = {} # store key, value pairs in 9th column + metadata = re.split( + "; ", replace_nested_quotes(url_escape_inside_quotes(linelist[8].rstrip(";"))) + ) + for field in metadata: + k, v = field.split(" ", 1) tags[k] = v.strip('"').strip("'") return tags @@ -173,35 +177,35 @@ def default(v, d): def biotypes(gtf): """Creates dictionary to map each gene to its biotype. - biotype features listed as mRNA will be converted to + biotype features listed as mRNA will be converted to protein_coding. """ gene2type = {} with open(sys.argv[1]) as file: for line in file: - if line.startswith('#'): + if line.startswith("#"): # Skip over comments in header section continue - linelist = line.strip().split('\t') + linelist = line.strip().split("\t") metadata = parse(linelist) - # Get gene and biotype - gene = lookup('gene_id', metadata) + # Get gene and biotype + gene = lookup("gene_id", metadata) # Setting biotype to unknown as default # value, then checking if metadata contains - # any fields with biotype as a sub string, - # then if biotype is not an empty string + # any fields with biotype as a sub string, + # then if biotype is not an empty string # set it to whatever is in the gtf file if gene not in gene2type: gene2type[gene] = "unknown" - biotype = contains('biotype', metadata) - if default(biotype, 'unknown') != 'unknown': - if biotype.lower() == 'mrna': - # agat_convert_sp_gff2gtf.pl does - # not set this value correct even - # when it is in the original GTF - # file, fixing the problem for + biotype = contains("biotype", metadata) + if default(biotype, "unknown") != "unknown": + if biotype.lower() == "mrna": + # agat_convert_sp_gff2gtf.pl does + # not set this value correct even + # when it is in the original GTF + # file, fixing the problem for # RSeQC TIN reference file - biotype = 'protein_coding' + biotype = "protein_coding" gene2type[gene] = biotype return gene2type @@ -211,18 +215,18 @@ def formatted(metadata): """Reformats key, value metadata to be written into the 9th column. 
""" - out = '' - for k,v in metadata.items(): - out += '{} "{}"; '.format(k,v) - out = out.rstrip(' ') + out = "" + for k, v in metadata.items(): + out += '{} "{}"; '.format(k, v) + out = out.rstrip(" ") return out def main(): if len(sys.argv) != 2: print(_help) - print('Usage: python {} genes.gtf > clean.gtf'.format(sys.argv[0])) - print('Error: failed to provide all positional arguments!', file=sys.stderr) + print("Usage: python {} genes.gtf > clean.gtf".format(sys.argv[0])) + print("Error: failed to provide all positional arguments!", file=sys.stderr) sys.exit(1) input_gtf = sys.argv[1] @@ -230,37 +234,41 @@ def main(): with open(input_gtf) as file: for line in file: - if line.startswith('#'): + if line.startswith("#"): # Skip over comments in header section print(line.strip()) continue - linelist = line.strip().split('\t') + linelist = line.strip().split("\t") feature = linelist[2] metadata = parse(linelist) # Should always be in GTF file - gene_id = lookup('gene_id', metadata) - if feature == 'gene': + gene_id = lookup("gene_id", metadata) + if feature == "gene": # May not be in GTF, add as needed - gene_name = default(lookup('gene_name', metadata) , gene_id) - metadata['gene_name'] = gene_name - gene_biotype = default(lookup('gene_biotype', metadata) , g2b[gene_id]) - metadata['gene_biotype'] = gene_biotype - elif feature in ['transcript', 'exon']: + gene_name = default(lookup("gene_name", metadata), gene_id) + metadata["gene_name"] = gene_name + gene_biotype = default(lookup("gene_biotype", metadata), g2b[gene_id]) + metadata["gene_biotype"] = gene_biotype + elif feature in ["transcript", "exon"]: # May not be in GTF, add as needed # assumes transcript_id is in gtf - gene_name = default(lookup('gene_name', metadata) , gene_id) - metadata['gene_name'] = gene_name - gene_biotype = default(lookup('gene_biotype', metadata) , g2b[gene_id]) - metadata['gene_biotype'] = gene_biotype - transcript_id = lookup('transcript_id', metadata) - transcript_name = default(lookup('transcript_name', metadata) , transcript_id) - metadata['transcript_name'] = transcript_name - transcript_type = default(lookup('transcript_type', metadata) , g2b[gene_id]) - metadata['transcript_type'] = transcript_type + gene_name = default(lookup("gene_name", metadata), gene_id) + metadata["gene_name"] = gene_name + gene_biotype = default(lookup("gene_biotype", metadata), g2b[gene_id]) + metadata["gene_biotype"] = gene_biotype + transcript_id = lookup("transcript_id", metadata) + transcript_name = default( + lookup("transcript_name", metadata), transcript_id + ) + metadata["transcript_name"] = transcript_name + transcript_type = default( + lookup("transcript_type", metadata), g2b[gene_id] + ) + metadata["transcript_type"] = transcript_type tags = formatted(metadata) linelist[8] = tags print("\t".join(linelist)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/resources/download_dme_files b/resources/download_dme_files index 5dfc50f..0fb4d75 100755 --- a/resources/download_dme_files +++ b/resources/download_dme_files @@ -17,12 +17,12 @@ Usage: Synopsis: This script provides a high level wrapper to the HPC DME API. Given a list of input dme file paths and an API token, it will -download those files locally to the user provided output path. +download those files locally to the user provided output path. Required Arguments: - -f, --files FILE [Type: Str] Files to download from DME. + -f, --files FILE [Type: Str] Files to download from DME. One or more dme file paths can - be provided. 
Multiple files can + be provided. Multiple files can be downloaded at once by providing a quoted space separated list of files. -o, --output OUTPUT [Type: Path] Path to an output directory. @@ -30,23 +30,23 @@ Required Arguments: downloaded. If the provided output directory does not exist, it will be initialized automatically. - -t, --token TOKEN [Type: Str] API token for HPC DME. A text file + -t, --token TOKEN [Type: Str] API token for HPC DME. A text file containing an API token for DME - can be provided, or the API token + can be provided, or the API token can be provided as a string. Options: - -r, --rid RID [Type: Str] Request Identifer. This is an optional - string to help track a given request. - This identifer is appended to any log - files. If a request identifer is not - provided, a request identifer will be - generated from an MD5 checksum based - on the time of the request and the - other user provided options. - -p, --proxy PROXY [Type: Str] HTTPS Proxy. This option can be used to + -r, --rid RID [Type: Str] Request Identifer. This is an optional + string to help track a given request. + This identifer is appended to any log + files. If a request identifer is not + provided, a request identifer will be + generated from an MD5 checksum based + on the time of the request and the + other user provided options. + -p, --proxy PROXY [Type: Str] HTTPS Proxy. This option can be used to set or override the following environment - variable: https_proxy. By default, a - https proxy will not be utilized unless + variable: https_proxy. By default, a + https proxy will not be utilized unless it is inherited from a parent shell. -h, --help [Type: Bool] Displays usage and help information. -v, --version [Type: Bool] Displays version information. @@ -85,8 +85,8 @@ function parser() { -f | --files) provided "$key" "${2:-}"; Arguments["f"]="$2"; shift; shift;; -o | --output) provided "$key" "${2:-}"; Arguments["o"]="$2"; shift; shift;; -t | --token) provided "$key" "${2:-}"; Arguments["t"]="$2"; shift; shift;; - -r | --rid) provided "$key" "${2:-}"; Arguments["r"]="$2"; shift; shift;; - -p | --proxy) provided "$key" "${2:-}"; Arguments["p"]="$2"; shift; shift;; + -r | --rid) provided "$key" "${2:-}"; Arguments["r"]="$2"; shift; shift;; + -p | --proxy) provided "$key" "${2:-}"; Arguments["p"]="$2"; shift; shift;; -* | --*) err "Error: Failed to parse unsupported argument: '${key}'."; usage && exit 1;; *) err "Error: Failed to parse unrecognized argument: '${key}'. 
Do any of your inputs have spaces?"; usage && exit 1;; esac @@ -128,9 +128,9 @@ function check(){ function grab(){ # Grabs the contents of a file # else returns input that was provided - # to allow for flexiable API token input - # so a user can directly provide a token - # as a string or point to a file containing + # to allow for flexiable API token input + # so a user can directly provide a token + # as a string or point to a file containing # the token # INPUT $1 = token file or string @@ -148,11 +148,11 @@ function grab(){ function retry() { - # Tries to run a cmd 5 times before failing + # Tries to run a cmd 5 times before failing # If a command is successful, it will break out of attempt loop - # Failed attempts are padding with the following exponential + # Failed attempts are padding with the following exponential # back-off strategy {4, 16, 64, 256, 1024} in seconds - # @INPUTS "$@"" = cmd to run + # @INPUTS "$@"" = cmd to run # @CALLS timestamp() to log time of encountered error # @CALLS err() to redirect logging information to stderr # @CALLS fatal() if command cannot be run in 5 attempts @@ -161,9 +161,9 @@ function retry() { local max=5 local attempt=true # flag for while loop while $attempt; do - # Attempt command and break if successful + # Attempt command and break if successful "$@" && attempt=false || { - # Try again up to 5 times + # Try again up to 5 times if [[ $n -le $max ]]; then err "[$(timestamp)] Command failed: $@" delay=$(( 4**$n )) @@ -180,15 +180,15 @@ function retry() { function require(){ # Requires an executable is in $PATH - # as a last resort it will attempt to load + # as a last resort it will attempt to load # the executable as a module. If an exe is # not in $PATH raises fatal(). # INPUT $1 = executable to check # Check if $1 in $PATH # If not, try to module load $1 as a last resort - command -V "$1" &> /dev/null || { - command -V module &> /dev/null && + command -V "$1" &> /dev/null || { + command -V module &> /dev/null && module purge && module load "$1" } || fatal "Error: failed to find or load '$1', not installed on target system." @@ -196,12 +196,12 @@ function require(){ function _id(){ - # Generates a default request identifer + # Generates a default request identifer # if the -r or --rid option is not provided. # The default identifer is generated by generating - # an md5 checksum of the required user inputs and + # an md5 checksum of the required user inputs and # the timestamp - + # List of required arguments local features local identifer @@ -210,15 +210,15 @@ function _id(){ value=${Arguments[${arg}]:-} features+="${value}" done - + identifer=$(md5sum <<< "${features}" | awk '{print $1}') echo "${identifer}" -} +} function _download(){ - # Downloads a file from HPC DME + # Downloads a file from HPC DME # INPUT $1 = File to download from DME # INPUT $2 = Local output directory # INPUT $3 = DME API token @@ -226,16 +226,16 @@ function _download(){ # INPUT $5 = HTTPS Proxy, defaults to no proxy set # @CALLS require() to enfore cURL installation # @CALLS timestamp() to log time of file download - # @CALLS fatal() if curl returns a non-200 http response + # @CALLS fatal() if curl returns a non-200 http response # Require curl is installed require curl # Check if a proxy needs to be set if [[ ! 
-z "${5:-}" ]]; then export https_proxy="${5}"; fi - + # Try to download each file from DME with 5 max attempts - for file in ${1// /$'\t'}; do + for file in ${1// /$'\t'}; do local fname="$(basename "$file")" echo "[$(timestamp) @ ${4}] Downloading '${file}' from HPC DME to '$2/${fname}'" response=$(retry \ @@ -251,10 +251,10 @@ function _download(){ -w "%{http_code}" ) - # Check http response code for any failures - if [ $response != "200" ]; then + # Check http response code for any failures + if [ $response != "200" ]; then fatal "Error: download request for '$file' failed with http response of '$response'!" - fi + fi done } @@ -278,7 +278,7 @@ function main(){ requestid="${Arguments[r]:-$(_id)}" proxy="${Arguments[p]:-}" - # Download files from DME to local output directory + # Download files from DME to local output directory # INPUT $1 = Files to download from DME # INPUT $2 = Local output directory # INPUT $3 = DME API token diff --git a/resources/frce/fastq_screen.conf b/resources/frce/fastq_screen.conf index 0e2ee82..e396832 100644 --- a/resources/frce/fastq_screen.conf +++ b/resources/frce/fastq_screen.conf @@ -3,10 +3,10 @@ ###################### ## Bowtie or Bowtie2 # ###################### -## If the Bowtie1/2 binary is not in your PATH then you can +## If the Bowtie1/2 binary is not in your PATH then you can ## set this value to tell the program where to find it. ## Uncomment the line below and set the appropriate location. -## Please note, this path should include the executable +## Please note, this path should include the executable ## filename. #BOWTIE /usr/local/bin/bowtie/bowtie @@ -16,10 +16,10 @@ ########################################### ## Bismark (for bisulfite sequencing only)# ########################################### -## If the Bismark binary is not in your PATH then you can +## If the Bismark binary is not in your PATH then you can ## set this value to tell the program where to find it. ## Uncomment the line below and set the appropriate location. -## Please note, this path should include the executable +## Please note, this path should include the executable ## filename. #BISMARK /usr/local/bin/bismark/bismark @@ -40,26 +40,26 @@ THREADS 24 ## This section allows you to configure multiple databases ## to search against in your screen. For each database ## you need to provide a database name (which can't contain -## spaces) and the location of the bowtie indices which +## spaces) and the location of the bowtie indices which ## you created for that database. -## -## The entries shown below are only suggested examples, you +## +## The entries shown below are only suggested examples, you ## can add as many DATABASE sections as required, and you ## can comment out or remove as many of the existing entries ## as desired. ## ## Either the original bowtie or bowtie2 may be used for the -## mapping. Specify the aligner to use with the command line -## flag --aligner with arguments 'bowtie' or +## mapping. Specify the aligner to use with the command line +## flag --aligner with arguments 'bowtie' or ## 'bowtie2' (default). -## -## The configuration file may list paths to both bowtie and +## +## The configuration file may list paths to both bowtie and ## bowtie2 indices. FastQ Screen automatically detects whether -## a specified index is compatible with bowtie or bowtie2. +## a specified index is compatible with bowtie or bowtie2. 
## -## Although the configuration file may list paths to both -## bowtie and bowtie2 indices, only one aligner will be used -## for the mapping, as specified by the --aligner flag. +## Although the configuration file may list paths to both +## bowtie and bowtie2 indices, only one aligner will be used +## for the mapping, as specified by the --aligner flag. ## ## The path to the index files SHOULD INCLUDE THE BASENAME of ## the index, e.g: @@ -72,8 +72,8 @@ THREADS 24 ## ## Human - sequences available from ## ftp://ftp.ensembl.org/pub/current/fasta/homo_sapiens/dna/ -DATABASE Human /mnt/projects/CCBR-Pipelines/db/fastq_screen_db/hg19/hg19 -DATABASE Mouse /mnt/projects/CCBR-Pipelines/db/fastq_screen_db/mm9/mm9 +DATABASE Human /mnt/projects/CCBR-Pipelines/db/fastq_screen_db/hg19/hg19 +DATABASE Mouse /mnt/projects/CCBR-Pipelines/db/fastq_screen_db/mm9/mm9 DATABASE Bacteria /mnt/projects/CCBR-Pipelines/db/fastq_screen_db/Bacteria/bacteria DATABASE Fungi /mnt/projects/CCBR-Pipelines/db/fastq_screen_db/Fungi/fungi DATABASE Virus /mnt/projects/CCBR-Pipelines/db/fastq_screen_db/Virus/virus diff --git a/resources/frce/fastq_screen_2.conf b/resources/frce/fastq_screen_2.conf index 879c273..8fc9a91 100644 --- a/resources/frce/fastq_screen_2.conf +++ b/resources/frce/fastq_screen_2.conf @@ -3,10 +3,10 @@ ###################### ## Bowtie or Bowtie2 # ###################### -## If the Bowtie1/2 binary is not in your PATH then you can +## If the Bowtie1/2 binary is not in your PATH then you can ## set this value to tell the program where to find it. ## Uncomment the line below and set the appropriate location. -## Please note, this path should include the executable +## Please note, this path should include the executable ## filename. #BOWTIE /usr/local/bin/bowtie/bowtie @@ -16,10 +16,10 @@ ########################################### ## Bismark (for bisulfite sequencing only)# ########################################### -## If the Bismark binary is not in your PATH then you can +## If the Bismark binary is not in your PATH then you can ## set this value to tell the program where to find it. ## Uncomment the line below and set the appropriate location. -## Please note, this path should include the executable +## Please note, this path should include the executable ## filename. #BISMARK /usr/local/bin/bismark/bismark @@ -40,26 +40,26 @@ THREADS 24 ## This section allows you to configure multiple databases ## to search against in your screen. For each database ## you need to provide a database name (which can't contain -## spaces) and the location of the bowtie indices which +## spaces) and the location of the bowtie indices which ## you created for that database. -## -## The entries shown below are only suggested examples, you +## +## The entries shown below are only suggested examples, you ## can add as many DATABASE sections as required, and you ## can comment out or remove as many of the existing entries ## as desired. ## ## Either the original bowtie or bowtie2 may be used for the -## mapping. Specify the aligner to use with the command line -## flag --aligner with arguments 'bowtie' or +## mapping. Specify the aligner to use with the command line +## flag --aligner with arguments 'bowtie' or ## 'bowtie2' (default). -## -## The configuration file may list paths to both bowtie and +## +## The configuration file may list paths to both bowtie and ## bowtie2 indices. FastQ Screen automatically detects whether -## a specified index is compatible with bowtie or bowtie2. 
+## a specified index is compatible with bowtie or bowtie2. ## -## Although the configuration file may list paths to both -## bowtie and bowtie2 indices, only one aligner will be used -## for the mapping, as specified by the --aligner flag. +## Although the configuration file may list paths to both +## bowtie and bowtie2 indices, only one aligner will be used +## for the mapping, as specified by the --aligner flag. ## ## The path to the index files SHOULD INCLUDE THE BASENAME of ## the index, e.g: @@ -74,4 +74,4 @@ THREADS 24 ## ftp://ftp.ensembl.org/pub/current/fasta/homo_sapiens/dna/ DATABASE Uni_Vec /mnt/projects/CCBR-Pipelines/db/fastq_screen_db/UniVec_vectors/UniVec_vectors -DATABASE rRNA /mnt/projects/CCBR-Pipelines/db/fastq_screen_db/rRNA/rRNA +DATABASE rRNA /mnt/projects/CCBR-Pipelines/db/fastq_screen_db/rRNA/rRNA diff --git a/resources/gff3togtf.py b/resources/gff3togtf.py index 965d20e..f3f47bb 100755 --- a/resources/gff3togtf.py +++ b/resources/gff3togtf.py @@ -5,12 +5,12 @@ Created by: Dr. Tovah Markowitz, NCBR Date: 11/22/21 Purpose: - The script converts a NCBI Nucleotide GFF3 into a GTF - that will work with RSEM and qualimap. In the gtf, each + The script converts a NCBI Nucleotide GFF3 into a GTF + that will work with RSEM and qualimap. In the gtf, each gene will have at least one transcript and at least one - exon. Transcripts will be defined by ncRNA or mRNA when - available or by CDS otherwise. Also, adds the necessary - information for column 9 to match up the multiple rows + exon. Transcripts will be defined by ncRNA or mRNA when + available or by CDS otherwise. Also, adds the necessary + information for column 9 to match up the multiple rows for each gene and other required functionality. """ @@ -20,68 +20,112 @@ def readGff3(inputName): - f = open(inputName,'r') + f = open(inputName, "r") inputData = f.readlines() f.close() - inputData = [ row.strip().split('\t') for row in inputData ] - header = [ row[0] for row in inputData if row[0].startswith("#") ] - inputData = [ row for row in inputData if not row[0].startswith("#") ] - inputData = [ row for row in inputData if len(row) == 9 ] - return(header, inputData) + inputData = [row.strip().split("\t") for row in inputData] + header = [row[0] for row in inputData if row[0].startswith("#")] + inputData = [row for row in inputData if not row[0].startswith("#")] + inputData = [row for row in inputData if len(row) == 9] + return (header, inputData) def makeGTF(header, inputData): - genes = [ row for row in inputData if row[2] == "gene" ] - RNAs = [ row for row in inputData if (row[2] == "mRNA") | (row[2] == "ncRNA") ] - CDS = [ row for row in inputData if row[2] == "CDS" ] - exons = [ row for row in inputData if row[2] == "exon" ] + genes = [row for row in inputData if row[2] == "gene"] + RNAs = [row for row in inputData if (row[2] == "mRNA") | (row[2] == "ncRNA")] + CDS = [row for row in inputData if row[2] == "CDS"] + exons = [row for row in inputData if row[2] == "exon"] outAll = header for i in range(len(genes)): - column9 = genes[i][8].split(';') - geneID = column9[0].split("-",1)[1] - geneName = [ pos.split("=")[1] for pos in column9 if pos.startswith("Name=") ][0] - geneBiotype = [ pos.split("=")[1] for pos in column9 if pos.startswith("gene_biotype") ][0] + column9 = genes[i][8].split(";") + geneID = column9[0].split("-", 1)[1] + geneName = [pos.split("=")[1] for pos in column9 if pos.startswith("Name=")][0] + geneBiotype = [ + pos.split("=")[1] for pos in column9 if pos.startswith("gene_biotype") + ][0] outGene = 
genes[i][0:8] - outGene.append('gene_id "' + geneID + '"; gene_name "' + geneName + '"; gene_biotype "' + - geneBiotype + '";') - outAll.append( "\t".join(outGene) ) - geneRNAs = [ RNA for RNA in RNAs if re.search(geneID + ";", RNA[8]) ] + outGene.append( + 'gene_id "' + + geneID + + '"; gene_name "' + + geneName + + '"; gene_biotype "' + + geneBiotype + + '";' + ) + outAll.append("\t".join(outGene)) + geneRNAs = [RNA for RNA in RNAs if re.search(geneID + ";", RNA[8])] if len(geneRNAs) != 0: geneTranscript = geneRNAs else: - geneTranscript = [ CDSrow for CDSrow in CDS if re.search(geneID + ";", CDSrow[8]) ] - geneExons = [ exon for exon in exons if re.search(geneID + ";", exon[8]) ] + geneTranscript = [ + CDSrow for CDSrow in CDS if re.search(geneID + ";", CDSrow[8]) + ] + geneExons = [exon for exon in exons if re.search(geneID + ";", exon[8])] if len(geneTranscript) != 0: for row in geneTranscript: transcriptRow = row[0:8] outTranscript = transcriptRow outTranscript[2] = "transcript" tmp = row[8].split(";") - transcriptID = tmp[0].split("-",1)[1] + transcriptID = tmp[0].split("-", 1)[1] if row[2] == "CDS": - transcriptName = [ pos.split("=")[1] for pos in tmp if pos.startswith("Name=") ][0] + transcriptName = [ + pos.split("=")[1] for pos in tmp if pos.startswith("Name=") + ][0] else: - transcriptName = [ pos.split("=")[1] for pos in tmp if pos.startswith("gene=") ][0] - outTranscript.append('gene_id "' + geneID + '"; gene_name "' + geneName + - '"; gene_biotype "' + geneBiotype + '"; transcript_id "' + - transcriptID + '"; transcript_name "' + transcriptName + '";') - outAll.append( "\t".join(outTranscript) ) + transcriptName = [ + pos.split("=")[1] for pos in tmp if pos.startswith("gene=") + ][0] + outTranscript.append( + 'gene_id "' + + geneID + + '"; gene_name "' + + geneName + + '"; gene_biotype "' + + geneBiotype + + '"; transcript_id "' + + transcriptID + + '"; transcript_name "' + + transcriptName + + '";' + ) + outAll.append("\t".join(outTranscript)) else: transcriptRow = genes[i][0:8] outTranscript = transcriptRow outTranscript[2] = "transcript" - outTranscript.append('gene_id "' + geneID + '"; gene_name "' + geneName + - '"; gene_biotype "' + geneBiotype + '"; transcript_id "' + - geneID + '"; transcript_name "' + geneName + '";') - outAll.append( "\t".join(outTranscript) ) + outTranscript.append( + 'gene_id "' + + geneID + + '"; gene_name "' + + geneName + + '"; gene_biotype "' + + geneBiotype + + '"; transcript_id "' + + geneID + + '"; transcript_name "' + + geneName + + '";' + ) + outAll.append("\t".join(outTranscript)) if len(geneExons) != 0: for row in geneExons: outExon = row[0:8] tmp = row[8].split(";") - transcriptID2 = [ pos.split("=")[1] for pos in tmp if pos.startswith("Parent=") ][0].split("-",1)[1] - outExon.append('gene_id "' + geneID + '"; transcript_id "' + - transcriptID2 + '"; gene_biotype "' + geneBiotype + '";') - outAll.append( "\t".join(outExon) ) + transcriptID2 = [ + pos.split("=")[1] for pos in tmp if pos.startswith("Parent=") + ][0].split("-", 1)[1] + outExon.append( + 'gene_id "' + + geneID + + '"; transcript_id "' + + transcriptID2 + + '"; gene_biotype "' + + geneBiotype + + '";' + ) + outAll.append("\t".join(outExon)) else: outExon = transcriptRow[0:8] outExon[2] = "exon" @@ -89,32 +133,54 @@ def makeGTF(header, inputData): # Cannot find transcript ID, # set to gene id transcriptID = geneID - outExon.append('gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_biotype "' + geneBiotype + '";') - outAll.append( "\t".join(outExon) ) - 
return(outAll) + outExon.append( + 'gene_id "' + + geneID + + '"; transcript_id "' + + transcriptID + + '"; gene_biotype "' + + geneBiotype + + '";' + ) + outAll.append("\t".join(outExon)) + return outAll def writeGTF(GTF, outputName): - f = open(outputName, 'w') - f.write( "\n".join(GTF) ) + f = open(outputName, "w") + f.write("\n".join(GTF)) f.close() if __name__ == "__main__": - descriptionText = """ The script converts a NCBI Nucleotide GFF3 into a GTF - that will work with RSEM and qualimap. In the gtf, each - gene will have at least one transcript and at least one - exon. Transcripts will be defined by ncRNA or mRNA when - available or by CDS otherwise. Also, adds the necessary - information for column 9 to match up the multiple rows + that will work with RSEM and qualimap. In the gtf, each + gene will have at least one transcript and at least one + exon. Transcripts will be defined by ncRNA or mRNA when + available or by CDS otherwise. Also, adds the necessary + information for column 9 to match up the multiple rows for each gene and other required functionality. """ - parser = argparse.ArgumentParser(description = descriptionText, formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument("-i", action="store", required="true", dest="inputName", help="Input GFF3 file name.") - parser.add_argument("-o", action="store", required="true", dest="outputName", help="Output GTF file name.") + parser = argparse.ArgumentParser( + description=descriptionText, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "-i", + action="store", + required="true", + dest="inputName", + help="Input GFF3 file name.", + ) + parser.add_argument( + "-o", + action="store", + required="true", + dest="outputName", + help="Output GTF file name.", + ) args = parser.parse_args() inputName = args.inputName @@ -123,4 +189,3 @@ def writeGTF(GTF, outputName): (header, inputData) = readGff3(inputName) GTF = makeGTF(header, inputData) writeGTF(GTF, outputName) - diff --git a/resources/jobby b/resources/jobby index cb5f584..747183e 100755 --- a/resources/jobby +++ b/resources/jobby @@ -4,14 +4,14 @@ # -*- coding: UTF-8 -*- """ -ABOUT: +ABOUT: `jobby` will take your past jobs and display their job information. - Why? We have pipelines running on several different clusters and + Why? We have pipelines running on several different clusters and job schedulers. `jobby` is an attempt to centralize and abstract the process of querying different job schedulers. On each supported - target system, `jobby` will attempt to determine the best method for - getting job information to return to the user in a standardized - format and unified cli. + target system, `jobby` will attempt to determine the best method for + getting job information to return to the user in a standardized + format and unified cli. REQUIRES: - python>=3.5 @@ -22,26 +22,26 @@ DISCLAIMER: National Institute of Allergy and Infectious Diseases (NIAID) This software/database is a "United States Government Work" under - the terms of the United States Copyright Act. It was written as + the terms of the United States Copyright Act. It was written as part of the author's official duties as a United States Government employee and thus cannot be copyrighted. This software is freely available to the public for use. 
- + Although all reasonable efforts have been taken to ensure the accuracy and reliability of the software and data, NCBR do not and - cannot warrant the performance or results that may be obtained by + cannot warrant the performance or results that may be obtained by using this software or data. NCBR and NIH disclaim all warranties, - express or implied, including warranties of performance, + express or implied, including warranties of performance, merchantability or fitness for any particular purpose. - - Please cite the author and NIH resources like the "Biowulf Cluster" + + Please cite the author and NIH resources like the "Biowulf Cluster" in any work or product based on this material. USAGE: $ jobby [OPTIONS] JOB_ID [JOB_ID ...] EXAMPLE: - $ jobby 18627545 15627516 58627597 + $ jobby 18627545 15627516 58627597 """ # Python standard library @@ -50,52 +50,53 @@ import sys, os, subprocess, math, re from subprocess import PIPE import argparse # added in python/3.5 import textwrap # added in python/3.5 -import tempfile # added in python/3.5 +import tempfile # added in python/3.5 # Jobby metadata -__version__ = 'v0.2.0' -__authors__ = 'Skyler Kuhn' -__email__ = 'skyler.kuhn@nih.gov' -__home__ = os.path.dirname(os.path.abspath(__file__)) +__version__ = "v0.2.0" +__authors__ = "Skyler Kuhn" +__email__ = "skyler.kuhn@nih.gov" +__home__ = os.path.dirname(os.path.abspath(__file__)) _name = os.path.basename(sys.argv[0]) -_description = 'Will take your job(s)... and display their information!' +_description = "Will take your job(s)... and display their information!" # Classes -class Colors(): +class Colors: """Class encoding for ANSI escape sequeces for styling terminal text. Any string that is formatting with these styles must be terminated with the escape sequence, i.e. `Colors.end`. """ + # Escape sequence - end = '\33[0m' + end = "\33[0m" # Formatting options - bold = '\33[1m' - italic = '\33[3m' - url = '\33[4m' - blink = '\33[5m' - higlighted = '\33[7m' + bold = "\33[1m" + italic = "\33[3m" + url = "\33[4m" + blink = "\33[5m" + higlighted = "\33[7m" # Text Colors - black = '\33[30m' - red = '\33[31m' - green = '\33[32m' - yellow = '\33[33m' - blue = '\33[34m' - pink = '\33[35m' - cyan = '\33[96m' - white = '\33[37m' + black = "\33[30m" + red = "\33[31m" + green = "\33[32m" + yellow = "\33[33m" + blue = "\33[34m" + pink = "\33[35m" + cyan = "\33[96m" + white = "\33[37m" # Background fill colors - bg_black = '\33[40m' - bg_red = '\33[41m' - bg_green = '\33[42m' - bg_yellow = '\33[43m' - bg_blue = '\33[44m' - bg_pink = '\33[45m' - bg_cyan = '\33[46m' - bg_white = '\33[47m' + bg_black = "\33[40m" + bg_red = "\33[41m" + bg_green = "\33[42m" + bg_yellow = "\33[43m" + bg_blue = "\33[44m" + bg_pink = "\33[45m" + bg_cyan = "\33[46m" + bg_white = "\33[47m" -# Helper Functions +# Helper Functions def which(cmd, path=None): """Checks if an executable is in $PATH @param cmd : @@ -120,7 +121,7 @@ def which(cmd, path=None): def err(*message, **kwargs): """Prints any provided args to standard error. - kwargs can be provided to modify print functions + kwargs can be provided to modify print functions behavior. @param message : Values printed to standard error @@ -130,7 +131,6 @@ def err(*message, **kwargs): print(*message, file=sys.stderr, **kwargs) - def fatal(*message, **kwargs): """Prints any provided args to standard error and exits with an exit code of 1. @@ -146,42 +146,40 @@ def fatal(*message, **kwargs): def get_toolkit(tool_list): """Finds the best suited tool from a list of possible choices. 
Assumes tool list is already - ordered from the best to worst choice. The first + ordered from the best to worst choice. The first tool found in a user's $PATH is returned. @param tool_list list[]: List of ordered tools to find @returns best_choice : First tool found in tool_list """ - best_choice = None + best_choice = None for exe in tool_list: if which(exe): best_choice = exe break - + # Did not find any tools # to potentially use if not best_choice: - err( - 'Error: Did not find any tools to get job information!' - ) + err("Error: Did not find any tools to get job information!") fatal( - 'Expected one of the following tools to be in $PATH:' - '\t{0}'.format(tool_list) + "Expected one of the following tools to be in $PATH:" + "\t{0}".format(tool_list) ) - + return best_choice def add_missing(linelist, insertion_dict): - """Adds missing information to a list. This can be used - to add missing job information fields to the results of + """Adds missing information to a list. This can be used + to add missing job information fields to the results of job querying tool. @param linelist list[]: List containing job information for each field of interest @param insertion_dict dict[] = str Dictionary used to insert missing information to a given - index, where the keys are indices of the `linelist` and the + index, where the keys are indices of the `linelist` and the values are information to add. Please note that the indices should be zero based. Note that multiple consequetive values should be inserted at once as a list, see example below: @@ -192,16 +190,16 @@ def add_missing(linelist, insertion_dict): # Get the order of indices # add missing information # starting from largest to - # smallest, if we insert - # missing values in this + # smallest, if we insert + # missing values in this # order we do not need to - # calculate the offset of + # calculate the offset of # new indices tmp_list = linelist indices = sorted(list(insertion_dict.keys()), reverse=True) for i in indices: # Check if multiple values - # need to be inserted at a + # need to be inserted at a # given index if isinstance(insertion_dict[i], list): for v in reversed(insertion_dict[i]): @@ -212,17 +210,12 @@ def add_missing(linelist, insertion_dict): def convert_size(size_bytes): - """Converts bytes to a human readable format. - """ - # Sizes range from B to YiB, + """Converts bytes to a human readable format.""" + # Sizes range from B to YiB, # warning larger sizes storage - # may results in blackhole - size_name = ( - "B", "KiB", "MiB", - "GiB", "TiB", "PiB", - "EiB", "ZiB", "YiB" - ) - if size_bytes == 0: + # may results in blackhole + size_name = ("B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB") + if size_bytes == 0: return "0B" i = int(math.floor(math.log(size_bytes, 1024))) p = math.pow(1024, i) @@ -234,36 +227,53 @@ def to_bytes(size): """Convert a human readable size unit into bytes. 
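    For example (illustrative), to_bytes("2 GiB") returns 2147483648,
    while to_bytes("2 GB") returns 2000000000.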
Returns None if cannot convert/parse provided size.""" size2bytes = { - "b":1, "bytes":1, "byte":1, - "k":1024, "kib":1024, "kb":1000, - "m": 1024**2, "mib": 1024**2, "mb": 1000**2, - "g": 1024**3, "gib": 1024**3, "gb": 1000**3, - "t": 1024**4, "tib": 1024**4, "tb": 1000**4, - "p": 1024**5, "pib": 1024**5, "pb": 1000**5, - "e": 1024**6, "eib": 1024**6, "eb": 1000**6, - "z": 1024**7, "zib": 1024**7, "zb": 1000**7, - "y": 1024**8, "yib": 1024**8, "yb": 1000**8 + "b": 1, + "bytes": 1, + "byte": 1, + "k": 1024, + "kib": 1024, + "kb": 1000, + "m": 1024**2, + "mib": 1024**2, + "mb": 1000**2, + "g": 1024**3, + "gib": 1024**3, + "gb": 1000**3, + "t": 1024**4, + "tib": 1024**4, + "tb": 1000**4, + "p": 1024**5, + "pib": 1024**5, + "pb": 1000**5, + "e": 1024**6, + "eib": 1024**6, + "eb": 1000**6, + "z": 1024**7, + "zib": 1024**7, + "zb": 1000**7, + "y": 1024**8, + "yib": 1024**8, + "yb": 1000**8, } - - size = size.replace(' ','') - match = re.search('(?P[0-9.]+)(?P[a-zA-Z]+)$', size) - + + size = size.replace(" ", "") + match = re.search("(?P[0-9.]+)(?P[a-zA-Z]+)$", size) + if match: - human_units = match.group('units').lower() + human_units = match.group("units").lower() human_units = human_units.lstrip().rstrip() scaling_factor = size2bytes[human_units] - bytes = int(math.ceil(scaling_factor * float(match.group('size')))) + bytes = int(math.ceil(scaling_factor * float(match.group("size")))) else: # Cannot parse units, # cannot convert value # into bytes return None - - return bytes + return bytes -# Core logic for getting +# Core logic for getting # job information def sge(jobs, threads, tmp_dir): """Displays SGE job information to standard output. @@ -281,19 +291,19 @@ def uge(jobs, threads, tmp_dir): Parsed command-line arguments @return None """ - # NOTE: add later for LOCUS cluster + # NOTE: add later for LOCUS cluster pass def dashboard_cli(jobs, threads=1, tmp_dir=None): """Biowulf-specific tool to get SLURM job information. - HPC staff recommend using this over the default slurm - `sacct` command for performance reasons. By default, + HPC staff recommend using this over the default slurm + `sacct` command for performance reasons. 
By default, the `dashboard_cli` returns information for the following fields: - jobid state submit_time partition nodes - cpus mem timelimit gres dependency - queued_time state_reason start_time elapsed_time end_time + jobid state submit_time partition nodes + cpus mem timelimit gres dependency + queued_time state_reason start_time elapsed_time end_time cpu_max mem_max eval Runs command: $ dashboard_cli jobs \\ @@ -302,41 +312,50 @@ def dashboard_cli(jobs, threads=1, tmp_dir=None): --tab --archive """ fields = [ - "jobid","jobname", - "state","partition", - "gres","cpus","mem", - "cpu_max","mem_max", - "timelimit","queued_time", - "start_time","end_time", - "elapsed_time","nodelist", - "user", "std_out", "std_err", - "work_dir" + "jobid", + "jobname", + "state", + "partition", + "gres", + "cpus", + "mem", + "cpu_max", + "mem_max", + "timelimit", + "queued_time", + "start_time", + "end_time", + "elapsed_time", + "nodelist", + "user", + "std_out", + "std_err", + "work_dir", ] - + # Display header information, # --tab option does not print # the header - print('\t'.join(fields)) + print("\t".join(fields)) # Display job information cmd = subprocess.run( - 'dashboard_cli jobs --archive --tab --joblist {0} --fields {1}'.format( - ','.join(jobs), - ','.join(fields) + "dashboard_cli jobs --archive --tab --joblist {0} --fields {1}".format( + ",".join(jobs), ",".join(fields) ), stdout=PIPE, stderr=PIPE, universal_newlines=True, - shell=True + shell=True, ) # Check for failure # of the last command if cmd.returncode != 0: err("\nError: Failed to get job information with 'dashboard_cli'!") - err('Please see error message below:') - fatal(' └── ', cmd.stderr) + err("Please see error message below:") + fatal(" └── ", cmd.stderr) - print(cmd.stdout.rstrip('\n')) + print(cmd.stdout.rstrip("\n")) def sacct(jobs, threads=1, tmp_dir=None): @@ -344,11 +363,11 @@ def sacct(jobs, threads=1, tmp_dir=None): `sacct` should be available on all SLURM clusters. The `dashboard_cli` is prioritized over using `sacct` due to perform reasons; however, this method will be - portable across different SLURM clusters. To get maximum - memory usage for a job, we will need to parse the MaxRSS + portable across different SLURM clusters. To get maximum + memory usage for a job, we will need to parse the MaxRSS field from the `$SLURM_JOBID.batch` lines. 
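    For example (illustrative values), a '12345679.batch' row whose MaxRSS
    reads '1523456K' supplies the peak memory reported for job 12345679.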
Returns job information for the following fields: - jobid jobname state partition reqtres + jobid jobname state partition reqtres alloccpus reqmem maxrss timelimit reserved start end elapsed nodelist user workdir @@ -357,49 +376,70 @@ def sacct(jobs, threads=1, tmp_dir=None): Runs command: $ sacct -j 12345679,12345680 \\ --fields FIELD,FIELD,FIELD \\ - -P --delimiter $'\t' + -P --delimiter $'\t' """ - header = [ - "jobid","jobname","state","partition", - "gres","cpus","mem","cpu_max","mem_max", - "timelimit","queued_time","start_time", - "end_time","elapsed_time","nodelist", - "user","std_out","std_err", "work_dir" + header = [ + "jobid", + "jobname", + "state", + "partition", + "gres", + "cpus", + "mem", + "cpu_max", + "mem_max", + "timelimit", + "queued_time", + "start_time", + "end_time", + "elapsed_time", + "nodelist", + "user", + "std_out", + "std_err", + "work_dir", ] fields = [ - "jobid", "jobname", - "state", "partition", - "reqtres", "alloccpus", - "reqmem", "maxrss", - "timelimit", "reserved", - "start", "end", - "elapsed", "nodelist", - "user", "workdir" + "jobid", + "jobname", + "state", + "partition", + "reqtres", + "alloccpus", + "reqmem", + "maxrss", + "timelimit", + "reserved", + "start", + "end", + "elapsed", + "nodelist", + "user", + "workdir", ] # Missing std_out and std_err - missing_fields = {15:['-','-']} + missing_fields = {15: ["-", "-"]} # Display header information, - print('\t'.join(header)) + print("\t".join(header)) # Display job information cmd = subprocess.run( "sacct -j {0} -P --delimiter $'\\t' --format={1}".format( - ','.join(jobs), - ','.join(fields) + ",".join(jobs), ",".join(fields) ), - stdout=PIPE, + stdout=PIPE, stderr=PIPE, universal_newlines=True, - shell=True + shell=True, ) # Check for failure # of the last command if cmd.returncode != 0: err("\nError: Failed to get job information with 'dashboard_cli'!") - err('Please see error message below:') - fatal(' └── ', cmd.stderr) - + err("Please see error message below:") + fatal(" └── ", cmd.stderr) + # Get max memory information, # Stored as $SLURM_JOBID.batch # in the MaxRSS field @@ -407,22 +447,22 @@ def sacct(jobs, threads=1, tmp_dir=None): # Remove trailing newline from # standard output and split lines # on remaining newline characters - job_information = cmd.stdout.rstrip('\n').split('\n') + job_information = cmd.stdout.rstrip("\n").split("\n") for i, line in enumerate(job_information): if i < 1: # skip over header continue - linelist = line.lstrip().rstrip().split('\t') - if linelist[0].endswith('.batch'): - jobid = linelist[0].strip().split('.')[0] - maxmem = linelist[7].replace(' ', '') + linelist = line.lstrip().rstrip().split("\t") + if linelist[0].endswith(".batch"): + jobid = linelist[0].strip().split(".")[0] + maxmem = linelist[7].replace(" ", "") mem_bytes = to_bytes(maxmem) if not mem_bytes: # Could not convert - # max_mem value into + # max_mem value into # bytes - j2m[jobid] = '-' - continue # goto next line + j2m[jobid] = "-" + continue # goto next line human_readable_mem = convert_size(mem_bytes) j2m[jobid] = human_readable_mem @@ -432,22 +472,22 @@ def sacct(jobs, threads=1, tmp_dir=None): if i < 1: # skip over header continue - linelist = line.lstrip().rstrip().split('\t') + linelist = line.lstrip().rstrip().split("\t") jobid = linelist[0].strip() - if '.' not in jobid: + if "." not in jobid: try: max_mem = j2m[jobid] except KeyError: - # Job maybe still be + # Job maybe still be # running or in a non- # completed state. 
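                # Fall back to the '-' placeholder so the mem_max
                # column is still populated in the printed table.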
- max_mem = '-' - status = linelist[2].split(' ')[0] + max_mem = "-" + status = linelist[2].split(" ")[0] linelist[2] = status missing_fields[8] = max_mem linelist = add_missing(linelist, missing_fields) - linelist = [info if info else '-' for info in linelist] - print('\t'.join(linelist)) + linelist = [info if info else "-" for info in linelist] + print("\t".join(linelist)) def slurm(jobs, threads, tmp_dir): @@ -456,11 +496,11 @@ def slurm(jobs, threads, tmp_dir): Parsed command-line arguments @return None """ - # Try to use the following tools in this + # Try to use the following tools in this # order to get job information! # [1] `dashboard_cli` is Biowulf-specific # [2] `sacct` should always be there - tool_priority = ['dashboard_cli', 'sacct'] + tool_priority = ["dashboard_cli", "sacct"] job_tool = get_toolkit(tool_priority) # Get information about each job # must use eval() to make string @@ -470,65 +510,57 @@ def slurm(jobs, threads, tmp_dir): def jobby(args): """ - Wrapper to each supported job scheduler: slurm, etc. + Wrapper to each supported job scheduler: slurm, etc. Each scheduler has a custom handler to most effectively - get and parse job information. + get and parse job information. @param sub_args : Parsed command-line arguments @return None """ # Get command line options abstract_handler = None - job_ids = args.JOB_ID + job_ids = args.JOB_ID scheduler = args.scheduler - threads = args.threads - tmp_dir = args.tmp_dir + threads = args.threads + tmp_dir = args.tmp_dir - # Set handler for each - # supported scheduler - if scheduler == 'slurm': + # Set handler for each + # supported scheduler + if scheduler == "slurm": abstract_handler = slurm else: # Unsupported job scheduler, # needs to be implemented - fatal( - 'Error: "{0}" is an unsupported job scheduler!'.format(scheduler) - ) - - # Display job(s) information + fatal('Error: "{0}" is an unsupported job scheduler!'.format(scheduler)) + + # Display job(s) information # to standard output - abstract_handler( - jobs=job_ids, - threads=threads, - tmp_dir=tmp_dir - ) + abstract_handler(jobs=job_ids, threads=threads, tmp_dir=tmp_dir) # Parse command-line arguments def parsed_arguments(name, description): - """Parses user-provided command-line arguments. This requires - argparse and textwrap packages. To create custom help formatting - a text wrapped docstring is used to create the help message for - required options. As so, the help message for require options - must be suppressed. If a new required argument is added to the + """Parses user-provided command-line arguments. This requires + argparse and textwrap packages. To create custom help formatting + a text wrapped docstring is used to create the help message for + required options. As so, the help message for require options + must be suppressed. If a new required argument is added to the cli, it must be updated in the usage statement docstring below. @param name : - Name of the pipeline or command-line tool + Name of the pipeline or command-line tool @param description : - Short description of pipeline or command-line tool + Short description of pipeline or command-line tool """ # Add styled name and description c = Colors - styled_name = "{0}{1}{2}{3}{4}".format( - c.bold, c.bg_black, - c.cyan, name, c.end - ) + styled_name = "{0}{1}{2}{3}{4}".format(c.bold, c.bg_black, c.cyan, name, c.end) description = "{0}{1}{2}".format(c.bold, description, c.end) temp = tempfile.gettempdir() # Please note: update the usage statement # below if a new option is added! 
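    As an aside on the `tool_priority` logic in `slurm()` above: `get_toolkit`
    is defined outside this hunk, so the snippet below is only a standalone
    sketch of the same "first available tool wins" idea under assumed names,
    not the script's actual implementation.

        # Hypothetical sketch: return the first tool from a priority list
        # that can be found on $PATH.
        import shutil

        def first_available(tools):
            for tool in tools:
                if shutil.which(tool) is not None:
                    return tool
            raise OSError("none of {0} found on PATH".format(", ".join(tools)))

        # first_available(["dashboard_cli", "sacct"]) would typically return
        # "dashboard_cli" on Biowulf and "sacct" on other SLURM clusters.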
- usage_statement = textwrap.dedent("""\ + usage_statement = textwrap.dedent( + """\ {0}: {1} {3}{4}Synopsis:{5} @@ -538,138 +570,136 @@ def parsed_arguments(name, description): {3}{4}Description:{5} - {2} will take your past jobs and display their job information + {2} will take your past jobs and display their job information in a standardized format. Why???! We have pipelines running on several - different clusters (using different job schedulers). {2} centralizes + different clusters (using different job schedulers). {2} centralizes and abstracts the process of querying different job schedulers within - a unified command-line interface. - + a unified command-line interface. + For each supported scheduler, jobby will determine the best method - on a given target system for getting job information to return to the + on a given target system for getting job information to return to the user in a common output format. {3}{4}Required Positional Arguments:{5} - Identiers of past jobs. One or more JOB_IDs - can be provided. Multiple JOB_IDs should be - seperated by a space. Information for each - of the JOB_IDs will be displayed to standard - output. Please see example section below for + Identiers of past jobs. One or more JOB_IDs + can be provided. Multiple JOB_IDs should be + seperated by a space. Information for each + of the JOB_IDs will be displayed to standard + output. Please see example section below for more information. {3}{4}Options:{5} - -s,--scheduler {{slurm | ...}} + -s,--scheduler {{slurm | ...}} @Default: slurm - Job scheduler. Defines the job scheduler + Job scheduler. Defines the job scheduler of the target system. Additional support - for more schedulers coming soon! + for more schedulers coming soon! @Example: --scheduler slurm - -n, --threads THREADS + -n, --threads THREADS @Default: 1 - Number of threads to query the scheduler + Number of threads to query the scheduler in parallel. @Example: --threads: 8 - -t, --tmp-dir TMP_DIR + -t, --tmp-dir TMP_DIR @Default: {7}/ - Temporary directory. Path on the filesystem - for writing temporary output files. Ideally, - this path should point to a dedicated space - on the filesystem for writing tmp files. If - you need to inject a variable into this path - that should NOT be expanded, please quote the - options value in single quotes. The default + Temporary directory. Path on the filesystem + for writing temporary output files. Ideally, + this path should point to a dedicated space + on the filesystem for writing tmp files. If + you need to inject a variable into this path + that should NOT be expanded, please quote the + options value in single quotes. The default location of this option is set to the system default via the $TMPDIR environment variable. @Example: --tmp-dir '/scratch/$USER/' - + -h, --help Shows help and usage information and exits. @Example: --help - + -v, --version Displays version information and exits. @Example: --version - """.format(styled_name, description, name, c.bold, c.url, c.end, c.italic, temp)) + """.format( + styled_name, description, name, c.bold, c.url, c.end, c.italic, temp + ) + ) # Display example usage in epilog - run_epilog = textwrap.dedent("""\ + run_epilog = textwrap.dedent( + """\ {2}{3}Example:{4} - # Please avoid running jobby + # Please avoid running jobby # on a cluster's head node! 
./jobby -s slurm -n 4 18627542 13627516 58627597 48627666 {2}{3}Version:{4} {1} - """.format(name, __version__, c.bold, c.url, c.end)) + """.format( + name, __version__, c.bold, c.url, c.end + ) + ) # Create a top-level parser parser = argparse.ArgumentParser( - usage = argparse.SUPPRESS, + usage=argparse.SUPPRESS, formatter_class=argparse.RawDescriptionHelpFormatter, - description = usage_statement, - epilog = run_epilog, - add_help=False + description=usage_statement, + epilog=run_epilog, + add_help=False, ) # Required Positional Arguments # List of JOB_IDs, 1 ... N_JOB_IDS - parser.add_argument( - 'JOB_ID', - nargs = '+', - help = argparse.SUPPRESS - ) + parser.add_argument("JOB_ID", nargs="+", help=argparse.SUPPRESS) # Options # Adding verison information parser.add_argument( - '-v', '--version', - action = 'version', - version = '%(prog)s {}'.format(__version__), - help = argparse.SUPPRESS + "-v", + "--version", + action="version", + version="%(prog)s {}".format(__version__), + help=argparse.SUPPRESS, ) # Add custom help message - parser.add_argument( - '-h', '--help', - action='help', - help=argparse.SUPPRESS - ) + parser.add_argument("-h", "--help", action="help", help=argparse.SUPPRESS) - # Base directory to write - # temporary/intermediate files + # Base directory to write + # temporary/intermediate files parser.add_argument( - '-t', '--tmp-dir', - type = str, - required = False, - default = temp, - help = argparse.SUPPRESS + "-t", + "--tmp-dir", + type=str, + required=False, + default=temp, + help=argparse.SUPPRESS, ) - # Number of threads for the + # Number of threads for the # pipeline's main proceess - # This is only applicable for - # local rules or when running + # This is only applicable for + # local rules or when running # in local mode. parser.add_argument( - '-n', '--threads', - type = int, - required = False, - default = 1, - help = argparse.SUPPRESS + "-n", "--threads", type=int, required=False, default=1, help=argparse.SUPPRESS ) # Job scheduler to query, # available: SLURM, ... # More coming soon! parser.add_argument( - '-s', '--scheduler', - type = lambda s: str(s).lower(), - required = False, - default = "slurm", - choices = ['slurm'], - help = argparse.SUPPRESS + "-s", + "--scheduler", + type=lambda s: str(s).lower(), + required=False, + default="slurm", + choices=["slurm"], + help=argparse.SUPPRESS, ) # Define handlers for each sub-parser - parser.set_defaults(func = jobby) + parser.set_defaults(func=jobby) # Parse command-line args args = parser.parse_args() @@ -680,20 +710,17 @@ def main(): # Sanity check for usage if len(sys.argv) == 1: # Nothing was provided - fatal('Invalid usage: {} [-h] [--version] ...'.format(_name)) - + fatal("Invalid usage: {} [-h] [--version] ...".format(_name)) + # Collect args for sub-command - args = parsed_arguments( - name = _name, - description = _description - ) - + args = parsed_arguments(name=_name, description=_description) + # Display version information - err('{} ({})'.format(_name, __version__)) - # Mediator method to call the + err("{} ({})".format(_name, __version__)) + # Mediator method to call the # default handler function args.func(args) -if __name__ == '__main__': - main() \ No newline at end of file +if __name__ == "__main__": + main() diff --git a/resources/multiqc_config.yaml b/resources/multiqc_config.yaml index 19bfd30..622adfa 100755 --- a/resources/multiqc_config.yaml +++ b/resources/multiqc_config.yaml @@ -2,7 +2,7 @@ # eg. 
FastQC is usually the first step, so should be last in this list # USAGE: multiqc -f -c multiqc_config.yaml --interactive -d ../ -n multiqc_report.html -custom_logo: 'resources/nih-bnfo-logo.png' +custom_logo: "resources/nih-bnfo-logo.png" fastqscreen_simpleplot: true @@ -11,146 +11,145 @@ fn_ignore_files: - Log.final.out module_order: -# Pre-alignment QC - - 'fastq_screen' - - 'kraken' - - 'fastqc': - name: 'FastQC' - info: 'This section of the report shows FastQC results before and after adapter trimming.' - path_filters: - - '*_fastqc.zip' - - 'cutadapt' - - 'trimmomatic' -# Alignment tool stats - - 'bismark' - - 'hicup' - - 'salmon' - - 'kallisto' - - 'star' - - 'tophat' - - 'bowtie2' - - 'bowtie1' -# Post-alignment analysis results - - 'quast' - - 'preseq' - - 'rsem' - - 'snpeff' - - 'qualimap' - - 'bcftools' - - 'featureCounts' - - 'methylQA' - - 'rseqc' - - 'picard' - - 'prokka' - - 'samblaster' - - 'samtools' - - 'bamtools' + # Pre-alignment QC + - "fastq_screen" + - "kraken" + - "fastqc": + name: "FastQC" + info: "This section of the report shows FastQC results before and after adapter trimming." + path_filters: + - "*_fastqc.zip" + - "cutadapt" + - "trimmomatic" + # Alignment tool stats + - "bismark" + - "hicup" + - "salmon" + - "kallisto" + - "star" + - "tophat" + - "bowtie2" + - "bowtie1" + # Post-alignment analysis results + - "quast" + - "preseq" + - "rsem" + - "snpeff" + - "qualimap" + - "bcftools" + - "featureCounts" + - "methylQA" + - "rseqc" + - "picard" + - "prokka" + - "samblaster" + - "samtools" + - "bamtools" # Example of adding a comment #section_comments: # featurecounts: 'This comment is for a module header, but should still work' # star_alignments: 'This new way of commenting above sections is **awesome**!' - table_columns_visible: - QualiMap: - avg_gc: False - median_insert_size: True - 30_x_pc: False - featureCounts: - percent_assigned: False - Assigned: False + QualiMap: + avg_gc: False + median_insert_size: True + 30_x_pc: False + featureCounts: + percent_assigned: False + Assigned: False # Using MultiQC/1.9 extensions -# for cleaning sample name to +# for cleaning sample name to # ensure backwards compatibility fn_clean_exts: - - '.gz' - - '.fastq' - - '.fq' - - '.bam' - - '.sam' - - '.sra' - - '.vcf' - - '.dat' - - '_tophat' - - '.log' - - '.stderr' - - '.out' - - '.spp' - - '.fa' - - '.fasta' - - '.png' - - '.jpg' - - '.jpeg' - - '.html' - - 'Log.final' - - 'ReadsPerGene' - - '.flagstat' - - '_star_aligned' - - '_fastqc' - - '.hicup' - - '.counts' - - '_counts' - - '.txt' - - '.tsv' - - '.csv' - - '.aligned' - - 'Aligned' - - '.merge' - - '.deduplicated' - - '.dedup' - - '.clean' - - '.sorted' - - '.report' - - '| stdin' - - '.geneBodyCoverage' - - '.inner_distance_freq' - - '.junctionSaturation_plot.r' - - '.pos.DupRate.xls' - - '.GC.xls' - - '_slamdunk' - - '_bismark' - - '.conpair' - - '.concordance' - - '.contamination' - - '.BEST.results' - - '_peaks.xls' - - '.relatedness' - - '.cnt' - - '.aqhist' - - '.bhist' - - '.bincov' - - '.bqhist' - - '.covhist' - - '.covstats' - - '.ehist' - - '.gchist' - - '.idhist' - - '.ihist' - - '.indelhist' - - '.lhist' - - '.mhist' - - '.qahist' - - '.qhist' - - '.rpkm' - - '.selfSM' - - '.extendedFrags' - - '_SummaryStatistics' + - ".gz" + - ".fastq" + - ".fq" + - ".bam" + - ".sam" + - ".sra" + - ".vcf" + - ".dat" + - "_tophat" + - ".log" + - ".stderr" + - ".out" + - ".spp" + - ".fa" + - ".fasta" + - ".png" + - ".jpg" + - ".jpeg" + - ".html" + - "Log.final" + - "ReadsPerGene" + - ".flagstat" + - "_star_aligned" + - 
"_fastqc" + - ".hicup" + - ".counts" + - "_counts" + - ".txt" + - ".tsv" + - ".csv" + - ".aligned" + - "Aligned" + - ".merge" + - ".deduplicated" + - ".dedup" + - ".clean" + - ".sorted" + - ".report" + - "| stdin" + - ".geneBodyCoverage" + - ".inner_distance_freq" + - ".junctionSaturation_plot.r" + - ".pos.DupRate.xls" + - ".GC.xls" + - "_slamdunk" + - "_bismark" + - ".conpair" + - ".concordance" + - ".contamination" + - ".BEST.results" + - "_peaks.xls" + - ".relatedness" + - ".cnt" + - ".aqhist" + - ".bhist" + - ".bincov" + - ".bqhist" + - ".covhist" + - ".covstats" + - ".ehist" + - ".gchist" + - ".idhist" + - ".ihist" + - ".indelhist" + - ".lhist" + - ".mhist" + - ".qahist" + - ".qhist" + - ".rpkm" + - ".selfSM" + - ".extendedFrags" + - "_SummaryStatistics" # These are removed after the above extra_fn_clean_exts: - - type: regex - pattern: 'QualiMap \| \w.* |' - - type: regex - pattern: 'DEG_ALL \| \w.* |' - - type: regex - pattern: 'STAR_files \|' - - type: regex - pattern: 'logfiles \|' - - type: remove - pattern: '.star_rg_added' - - type: remove - pattern: '.RSEM' - - type: remove - pattern: '.p2' + - type: regex + pattern: 'QualiMap \| \w.* |' + - type: regex + pattern: 'DEG_ALL \| \w.* |' + - type: regex + pattern: 'STAR_files \|' + - type: regex + pattern: 'logfiles \|' + - type: remove + pattern: ".star_rg_added" + - type: remove + pattern: ".RSEM" + - type: remove + pattern: ".p2" diff --git a/resources/overview.svg b/resources/overview.svg index 52a5db7..cdc72ed 100644 --- a/resources/overview.svg +++ b/resources/overview.svg @@ -6,4 +6,4 @@ tspan { white-space:pre } - \ No newline at end of file + diff --git a/resources/upload_to_nidap b/resources/upload_to_nidap index 6943fc0..70c2fec 100755 --- a/resources/upload_to_nidap +++ b/resources/upload_to_nidap @@ -17,29 +17,29 @@ Usage: Synopsis: This script provides a high level wrapper to the NIDAP API. Given a list of local file paths, a NIDAP API token, and a NIDAP -dataset identifer, it will upload those files to a dataset on NIDAP. +dataset identifer, it will upload those files to a dataset on NIDAP. Required Arguments: - -f, --files FILE [Type: Str] Files to upload to NIDAP. + -f, --files FILE [Type: Str] Files to upload to NIDAP. One or more local file paths can - be provided. Multiple files can + be provided. Multiple files can be uploaded at once by providing - a quoted space separated list of + a quoted space separated list of local files. -d, --dataid DATAID [Type: Path] Dataset Identifer for NIDAP upload. - Identifer to a dataset on NIDAP + Identifer to a dataset on NIDAP where file(s) will be uploaded. -r, --rid RID [Type: Str] Request Identifer. This transaction - identifer is used to help track a - given request. This identifer is - also appended to any log files. - -t, --token TOKEN [Type: Str] API token for NIDAP. A text file + identifer is used to help track a + given request. This identifer is + also appended to any log files. + -t, --token TOKEN [Type: Str] API token for NIDAP. A text file containing an API token for NIDAP - can be provided, or the API token + can be provided, or the API token can be provided as a string. Options: - -p, --proxy PROXY [Type: Str] HTTPS Proxy. This option can be used - to set or override the following + -p, --proxy PROXY [Type: Str] HTTPS Proxy. This option can be used + to set or override the following environment variable: https_proxy. 
By default, a https proxy will not be utilized unless it is inherited @@ -81,9 +81,9 @@ function parser() { -v | --version) version && exit 0;; -f | --files) provided "$key" "${2:-}"; Arguments["f"]="$2"; shift; shift;; -d | --dataid) provided "$key" "${2:-}"; Arguments["d"]="$2"; shift; shift;; - -r | --rid) provided "$key" "${2:-}"; Arguments["r"]="$2"; shift; shift;; + -r | --rid) provided "$key" "${2:-}"; Arguments["r"]="$2"; shift; shift;; -t | --token) provided "$key" "${2:-}"; Arguments["t"]="$2"; shift; shift;; - -p | --proxy) provided "$key" "${2:-}"; Arguments["p"]="$2"; shift; shift;; + -p | --proxy) provided "$key" "${2:-}"; Arguments["p"]="$2"; shift; shift;; -* | --*) err "Error: Failed to parse unsupported argument: '${key}'."; usage && exit 1;; *) err "Error: Failed to parse unrecognized argument: '${key}'. Do any of your inputs have spaces?"; usage && exit 1;; esac @@ -123,11 +123,11 @@ function check(){ function retry() { - # Tries to run a cmd 5 times before failing + # Tries to run a cmd 5 times before failing # If a command is successful, it will break out of attempt loop - # Failed attempts are padding with the following exponential + # Failed attempts are padding with the following exponential # back-off strategy {4, 16, 64, 256, 1024} in seconds - # @INPUTS "$@"" = cmd to run + # @INPUTS "$@"" = cmd to run # @CALLS timestamp() to log time of encountered error # @CALLS err() to redirect logging information to stderr # @CALLS fatal() if command cannot be run in 5 attempts @@ -136,9 +136,9 @@ function retry() { local max=5 local attempt=true # flag for while loop while $attempt; do - # Attempt command and break if successful + # Attempt command and break if successful "$@" && attempt=false || { - # Try again up to 5 times + # Try again up to 5 times if [[ $n -le $max ]]; then err "[$(timestamp)] Command failed: $@" delay=$(( 4**$n )) @@ -155,15 +155,15 @@ function retry() { function require(){ # Requires an executable is in $PATH - # as a last resort it will attempt to load + # as a last resort it will attempt to load # the executable as a module. If an exe is # not in $PATH raises fatal(). # INPUT $1 = executable to check # Check if $1 in $PATH # If not, try to module load $1 as a last resort - command -V "$1" &> /dev/null || { - command -V module &> /dev/null && + command -V "$1" &> /dev/null || { + command -V module &> /dev/null && module purge && module load "$1" } || fatal "Error: failed to find or load '$1', not installed on target system." @@ -173,9 +173,9 @@ function require(){ function grab(){ # Grabs the contents of a file # else returns input that was provided - # to allow for flexiable API token input - # so a user can directly provide a token - # as a string or point to a file containing + # to allow for flexiable API token input + # so a user can directly provide a token + # as a string or point to a file containing # the token # INPUT $1 = token file or string @@ -193,16 +193,16 @@ function grab(){ function _commit(){ - # Closes an open upload API transaction on NIDAP + # Closes an open upload API transaction on NIDAP # and commits uploaded the files to the NIDAP dataset - # An open transaction must be closed or the uploaded + # An open transaction must be closed or the uploaded # files will remain in an un-usable/un-findable state. 
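    For reference, the exponential back-off described in `retry()` above
    amounts to `delay=$(( 4**$n ))` for attempts 1 through 5; the short Python
    check below only spells out that arithmetic and is not part of the script.

        # Back-off schedule used by retry(): delay = 4**n seconds, n = 1..5
        delays = [4 ** n for n in range(1, 6)]
        print(delays)                  # [4, 16, 64, 256, 1024]
        print(sum(delays), "seconds")  # 1364 seconds, roughly 22 min 44 s in total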
# https://nidap.nih.gov/workspace/documentation/developer/api/general/api-overview # INPUT $1 = NIDAP dataset rid # INPUT $2 = NIDAP API token # INPUT $3 = NIDAP Request or transaction ID # @CALLS timestamp() to log time of file uploads - # @CALLS fatal() if curl returns a non-200 http response + # @CALLS fatal() if curl returns a non-200 http response echo "[$(timestamp) @ ${3}] Committing upload transaction '$3' to NIDAP dataset '$1'" @@ -216,19 +216,19 @@ function _commit(){ -f \ -s \ -w "%{http_code}" \ - "https://nidap.nih.gov/foundry-catalog/api/catalog/datasets/${1}/transactions/${3}/commit" + "https://nidap.nih.gov/foundry-catalog/api/catalog/datasets/${1}/transactions/${3}/commit" ) - - # Check http response code for any failures - if [[ $response != 2?? ]]; then + + # Check http response code for any failures + if [[ $response != 2?? ]]; then fatal "Error: commit request for transaction '${3}' failed with http response of '$response'!" - fi + fi } function _upload(){ - # Uploads a file to NIDAP dataset + # Uploads a file to NIDAP dataset # https://nidap.nih.gov/workspace/documentation/developer/api/general/api-overview # INPUT $1 = File(s) to upload # INPUT $2 = NIDAP dataset rid @@ -237,7 +237,7 @@ function _upload(){ # INPUT $5 = HTTPS Proxy, defaults to no proxy set # @CALLS require() to enfore cURL installation # @CALLS timestamp() to log time of file uploads - # @CALLS fatal() if curl returns a non-200 http response + # @CALLS fatal() if curl returns a non-200 http response # @CALLS _commit() to close an open upload transaction and commit the files to dataset # Require curl is installed @@ -245,9 +245,9 @@ function _upload(){ # Check if a proxy needs to be set if [[ ! -z "${5:-}" ]]; then export https_proxy="${5}"; fi - + # Try to upload each file from NIDAP with 5 max attempts - for file in ${1// /$'\t'}; do + for file in ${1// /$'\t'}; do local fname="$(basename <<< clean "$file")" echo "[$(timestamp) @ ${4}] Uploading '${file}' to NIDAP dataset '$2' as ${fname}" response=$(retry \ @@ -260,15 +260,15 @@ function _upload(){ -F file=@"${file}" \ "https://nidap.nih.gov/foundry-data-proxy/api/dataproxy/datasets/${2}/transactions/${4}" \ ) - - # Check http response code for any failures - if [[ $response != 2?? ]]; then + + # Check http response code for any failures + if [[ $response != 2?? ]]; then fatal "Error: upload request for '$file' failed with http response of '$response'!" - fi + fi done - - # Close open upload transaction and commit files to dataset + + # Close open upload transaction and commit files to dataset _commit "${2}" "${3}" "${4}" } @@ -284,14 +284,14 @@ function main(){ declare -Ag Arguments # Parses user provided command-line arguments - parser "${@}" + parser "${@}" local_files="${Arguments[f]}" upload_nidap_dataset="${Arguments[d]}" token=$(grab "${Arguments[t]}") # grab contents if file provided requestid="${Arguments[r]}" proxy="${Arguments[p]:-}" - # Upload local files to NIDAP Dataset + # Upload local files to NIDAP Dataset # INPUT $1 = File(s) to upload # INPUT $2 = NIDAP dataset rid # INPUT $3 = NIDAP API token diff --git a/workflow/rules/build.smk b/workflow/rules/build.smk index b3f2f6c..cc91c35 100644 --- a/workflow/rules/build.smk +++ b/workflow/rules/build.smk @@ -3,26 +3,26 @@ import json # Helper Functions def allocated(resource, rule, lookup, default="__default__"): - """Pulls resource information for a given rule. If a rule does not have any information + """Pulls resource information for a given rule. 
If a rule does not have any information for a given resource type, then it will pull from the default. Information is pulled from - definitions in the cluster.json (which is used a job submission). This ensures that any + definitions in the cluster.json (which is used a job submission). This ensures that any resources used at runtime mirror the resources that were allocated. :param resource : resource type to look in cluster.json (i.e. threads, mem, time, gres) :param rule : rule to lookup its information :param lookup : Lookup containing allocation information (i.e. cluster.json) :param default : default information to use if rule information cannot be found - :return allocation : + :return allocation : allocation information for a given resource type for a given rule """ - try: + try: # Try to get allocation information # for a given rule allocation = lookup[rule][resource] except KeyError: # Use default allocation information allocation = lookup[default][resource] - + return allocation @@ -31,11 +31,11 @@ def provided(samplelist, condition): Determines if optional rules should run. If an empty list is provided to rule all, snakemake will not try to generate that set of target files. If a given condition is not met (i.e. False) then it will not try to run that rule. - """ + """ if not str_bool(condition): - # If condition is False, - # returns an empty list - # to prevent rule from + # If condition is False, + # returns an empty list + # to prevent rule from # running samplelist = [] return samplelist @@ -43,9 +43,9 @@ def provided(samplelist, condition): def str_bool(s): """Converts a string to boolean. It is dangerous to try to - typecast a string into a boolean value using the built-in + typecast a string into a boolean value using the built-in `bool()` function. This function avoids any issues that can - arise when using `bool()`. + arise when using `bool()`. Example: boolean('True') returns True boolean('False') returns False @@ -76,14 +76,14 @@ tmpdir=config["TMP_DIR"] workdir:OUTDIR # Read in resource information, -# containing information about +# containing information about # threads, mem, walltimes, etc. # TODO: Add handler for when the # mode is set to local. with open(join(OUTDIR, 'resources', 'build_cluster.json')) as fh: cluster = json.load(fh) -# Ensures backwards compatibility +# Ensures backwards compatibility try: SMALL_GENOME=config["SMALL_GENOME"] except KeyError: @@ -236,7 +236,7 @@ rule star_rl: container: config['images']['arriba'] shell: """ # Setups temporary directory for - # intermediate files with built-in + # intermediate files with built-in # mechanism for deletion on exit if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi tmp=$(mktemp -d -p "{params.tmpdir}") @@ -258,16 +258,16 @@ rule star_rl: if SMALL_GENOME == "True": - # Build a index that is optimized for - # small genomes. For small genomes, the - # parameter --genomeSAindexNbases must - # to be scaled down, with a typical value - # of min(14, log2(GenomeLength)/2 - 1). - # For example, for 1 megaBase genome, this - # is equal to 9, for 100 kiloBase genome, + # Build a index that is optimized for + # small genomes. For small genomes, the + # parameter --genomeSAindexNbases must + # to be scaled down, with a typical value + # of min(14, log2(GenomeLength)/2 - 1). + # For example, for 1 megaBase genome, this + # is equal to 9, for 100 kiloBase genome, # this is equal to 7. 
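    # A worked check of that guidance is sketched below; it is illustrative
    # only (the helper name is made up, and the pipeline computes its own
    # value inside the rule that follows, which this hunk does not show in full).

        # Sketch of min(14, log2(GenomeLength)/2 - 1), rounded to the nearest
        # integer, which reproduces the 100 kb -> 7 and 1 Mb -> 9 examples above.
        import math

        def sa_index_nbases(genome_length):
            return min(14, round(math.log2(genome_length) / 2 - 1))

        # sa_index_nbases(100_000)        -> 7
        # sa_index_nbases(1_000_000)      -> 9
        # sa_index_nbases(3_100_000_000)  -> 14 (capped for human-sized genomes)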
Using this guidance # from the author of STAR, we will dynamically - # determine what the optimal value should be + # determine what the optimal value should be # based on the provided reference genome size. rule star_genome: """ @@ -277,9 +277,9 @@ if SMALL_GENOME == "True": base index from which processing annotations from a GTF and insert junctions on the fly. This has the advantage of saving diskspace as an index will not be created for a list of predefined readlengths. This rule replaces star_rl above. - This rule dynamically determine the optimal --genomeSAindexNbases value before - running STAR generateGenome based on the size of the provided reference. This is - needed for very small reference genomes. + This rule dynamically determine the optimal --genomeSAindexNbases value before + running STAR generateGenome based on the size of the provided reference. This is + needed for very small reference genomes. @Input: Genomic FASTA file @Output: @@ -301,7 +301,7 @@ if SMALL_GENOME == "True": container: config['images']['arriba'] shell: """ # Setups temporary directory for - # intermediate files with built-in + # intermediate files with built-in # mechanism for deletion on exit if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi tmp=$(mktemp -d -p "{params.tmpdir}") @@ -357,7 +357,7 @@ else: container: config['images']['arriba'] shell: """ # Setups temporary directory for - # intermediate files with built-in + # intermediate files with built-in # mechanism for deletion on exit if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi tmp=$(mktemp -d -p "{params.tmpdir}") @@ -528,7 +528,7 @@ rule qualimapinfo: rule fqscreen_db1: """ Downloads fastq screen bowtie2 databases from the OpenOmics public resource bundle. - Currently, there are fastq screen indices for the following organisms: hg19, mm9, + Currently, there are fastq screen indices for the following organisms: hg19, mm9, bateria, fungi, virus, univec vector sequences, and rRNA. @Input: @@ -564,7 +564,7 @@ rule fqscreen_db1: rule fqscreen_db2: """ Downloads fastq screen bowtie2 databases from the OpenOmics public resource bundle. - Currently, there are fastq screen indices for the following organisms: hg19, mm9, + Currently, there are fastq screen indices for the following organisms: hg19, mm9, bateria, fungi, virus, univec vector sequences, and rRNA. @Input: @@ -729,4 +729,3 @@ rule jsonmaker: with open(output.json, 'w') as fp: json.dump(refdict, fp, indent=4) - diff --git a/workflow/rules/nidap.smk b/workflow/rules/nidap.smk index 317640e..3e75e3b 100644 --- a/workflow/rules/nidap.smk +++ b/workflow/rules/nidap.smk @@ -11,10 +11,10 @@ rule nidap: outdir=join(workpath,"NIDAP") shell:""" set -exo pipefail -if [ -d {params.outdir} ];then rm -rf {params.outdir};fi +if [ -d {params.outdir} ];then rm -rf {params.outdir};fi mkdir -p {params.outdir} cd {params.outdir} for input in {input};do ln $input . done -""" \ No newline at end of file +""" diff --git a/workflow/rules/single-end.smk b/workflow/rules/single-end.smk index 5e95f70..7a1ef1d 100644 --- a/workflow/rules/single-end.smk +++ b/workflow/rules/single-end.smk @@ -1,6 +1,6 @@ # Single-end snakemake rules imported in the main Snakefile. from scripts.common import ( - abstract_location, + abstract_location, allocated, references ) @@ -58,7 +58,7 @@ rule rawfastqc: if config['options']['small_rna']: # Run STAR with ENCODE's recommendations for small RNA sequencing. 
- # Set the min read legth to + # Set the min read legth to rule trim_se: """ Data-processing step to remove adapter sequences and perform quality trimming @@ -224,13 +224,13 @@ rule kraken_se: container: config['images']['kraken'] shell: """ # Setups temporary directory for - # intermediate files with built-in + # intermediate files with built-in # mechanism for deletion on exit if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi tmp=$(mktemp -d -p "{params.tmpdir}") trap 'rm -rf "${{tmp}}"' EXIT - # Copy kraken2 db to /lscratch or temp + # Copy kraken2 db to /lscratch or temp # location to reduce filesytem strain cp -rv {params.bacdb}/* ${{tmp}}/ kraken2 --db ${{tmp}} \ @@ -299,7 +299,7 @@ if config['options']['star_2_pass_basic']: container: config['images']['arriba'] shell: """ # Setups temporary directory for - # intermediate files with built-in + # intermediate files with built-in # mechanism for deletion on exit if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi tmp=$(mktemp -d -p "{params.tmpdir}") @@ -346,7 +346,7 @@ elif config['options']['small_rna']: rule star_small: """ Data processing step to align reads against reference genome using STAR using - ENCODE's recommendations for small RNA. + ENCODE's recommendations for small RNA. Please see this links for more information: https://www.encodeproject.org/pipelines/ENCPL337CSA/ https://github.com/ENCODE-DCC/long-rna-seq-pipeline/tree/master/dnanexus/small-rna @@ -393,7 +393,7 @@ elif config['options']['small_rna']: container: config['images']['arriba'] shell: """ # Setups temporary directory for - # intermediate files with built-in + # intermediate files with built-in # mechanism for deletion on exit if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi tmp=$(mktemp -d -p "{params.tmpdir}") @@ -479,7 +479,7 @@ else: container: config['images']['arriba'] shell: """ # Setups temporary directory for - # intermediate files with built-in + # intermediate files with built-in # mechanism for deletion on exit if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi tmp=$(mktemp -d -p "{params.tmpdir}") @@ -592,7 +592,7 @@ else: container: config['images']['arriba'] shell: """ # Setups temporary directory for - # intermediate files with built-in + # intermediate files with built-in # mechanism for deletion on exit if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi tmp=$(mktemp -d -p "{params.tmpdir}") @@ -664,7 +664,7 @@ rule rsem: container: config['images']['rsem'] shell: """ # Setups temporary directory for - # intermediate files with built-in + # intermediate files with built-in # mechanism for deletion on exit if [ ! 
-d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi tmp=$(mktemp -d -p "{params.tmpdir}") diff --git a/workflow/scripts/PcaReport.Rmd b/workflow/scripts/PcaReport.Rmd index 0841b90..6aed44f 100644 --- a/workflow/scripts/PcaReport.Rmd +++ b/workflow/scripts/PcaReport.Rmd @@ -12,48 +12,49 @@ params: ```{r, include=FALSE, warning=FALSE, message=FALSE} countMethod <- "subread" -if(grepl("rsem",params$data, ignore.case=TRUE)){ +if (grepl("rsem", params$data, ignore.case = TRUE)) { countMethod <- "rsem" } -projectId<-params$projectId -projectDesc<-params$projectDesc -dateandtime<-format(Sys.time(), "%a %b %d %Y - %X") +projectId <- params$projectId +projectDesc <- params$projectDesc +dateandtime <- format(Sys.time(), "%a %b %d %Y - %X") ``` ### **Project:** #### *`r projectId`* -### **Description:** +### **Description:** #### *`r projectDesc`* -### **Count method:** -#### *`r countMethod`* -### **Report generated:** -#### *`r dateandtime`* +### **Count method:** +#### *`r countMethod`* +### **Report generated:** +#### *`r dateandtime`* ```{r setup, echo=FALSE, warning=FALSE,message=FALSE,fig.align='center'} - suppressMessages(library(rgl)) suppressMessages(library(knitr)) -suppressMessages(library('edgeR')) -suppressMessages(library('statmod')) -suppressMessages(library('RColorBrewer')) -suppressMessages(library('gplots')) -suppressMessages(library('reshape')) -suppressMessages(library('ggplot2')) -suppressMessages(library('ggfortify')) +suppressMessages(library("edgeR")) +suppressMessages(library("statmod")) +suppressMessages(library("RColorBrewer")) +suppressMessages(library("gplots")) +suppressMessages(library("reshape")) +suppressMessages(library("ggplot2")) +suppressMessages(library("ggfortify")) suppressMessages(library(ggdendro)) suppressMessages(library(amap)) suppressMessages(library(DT)) suppressMessages(library(plotly)) -suppressMessages(library('geneplotter')) -suppressMessages(library('DESeq2')) +suppressMessages(library("geneplotter")) +suppressMessages(library("DESeq2")) knit_hooks$set(rgl = function(before, options, envir) { if (!before) { ## after a chunk has been evaluated - if (rgl.cur() == 0) return() # no active device - name = paste(options$fig.path, options$label, sep = '') - rgl.snapshot(paste(name, '.png', sep = ''), fmt = 'png') - return(paste('\\includegraphics{', name, '}\n', sep = '')) + if (rgl.cur() == 0) { + return() + } # no active device + name <- paste(options$fig.path, options$label, sep = "") + rgl.snapshot(paste(name, ".png", sep = ""), fmt = "png") + return(paste("\\includegraphics{", name, "}\n", sep = "")) } }) @@ -61,7 +62,6 @@ knit_hooks$set(webgl = hook_webgl) ``` ```{r, echo=FALSE, warning=FALSE,message=FALSE} - ## grab args & normalization DIR <- params$folder @@ -73,44 +73,44 @@ FILE2 <- params$data # ## setwd(DIR) # read files -sampleinfo=read.delim(FILE1) -x = read.delim(FILE2,row.names=1) -colnames(x)=as.character(sampleinfo[,4]) -sampleFiles=as.character(sampleinfo[,2]) +sampleinfo <- read.delim(FILE1) +x <- read.delim(FILE2, row.names = 1) +colnames(x) <- as.character(sampleinfo[, 4]) +sampleFiles <- as.character(sampleinfo[, 2]) ## read annotation file ## ann=read.delim(ANNOTATE) # DGElist object -------------------------------------------------------------- -condition = as.factor(sampleinfo$condition) -y = DGEList(counts=x,group=condition) +condition <- as.factor(sampleinfo$condition) +y <- DGEList(counts = x, group = condition) ## Normalization TMM ------------------------------------------------------------ ## method = 
=c("TMM","RLE","upperquartile","none") -y <- calcNormFactors(y,method="TMM") +y <- calcNormFactors(y, method = "TMM") # y$samples Group <- factor(sampleinfo$condition) -if ( length(levels(Group)) == 1 ) { - design=~1 - v1 <- voom(as.matrix(x),design=NULL,plot=FALSE,normalize="quantile") +if (length(levels(Group)) == 1) { + design <- ~1 + v1 <- voom(as.matrix(x), design = NULL, plot = FALSE, normalize = "quantile") } else { - design=model.matrix(~0+Group) - v1 <- voom(as.matrix(x),design,plot=FALSE,normalize="quantile") + design <- model.matrix(~ 0 + Group) + v1 <- voom(as.matrix(x), design, plot = FALSE, normalize = "quantile") } -ddsHTSeq<-DESeqDataSetFromMatrix(countData=x,colData=sampleinfo, design=design) -dds<-DESeq(ddsHTSeq) -dds.ndata=as.data.frame(counts(dds,normalized=TRUE)) +ddsHTSeq <- DESeqDataSetFromMatrix(countData = x, colData = sampleinfo, design = design) +dds <- DESeq(ddsHTSeq) +dds.ndata <- as.data.frame(counts(dds, normalized = TRUE)) ## estimating common and tagwise dispersions ----------------------------------------- y <- estimateCommonDisp(y) -y <- estimateTagwiseDisp(y) #default trend: moveingave +y <- estimateTagwiseDisp(y) # default trend: moveingave -ylog2=cpm(y,log=TRUE,normalized.lib.sizes=TRUE,prior.count=0.5) # prior count like avelogcpm -rawlog2= cpm(y,log=TRUE,normalized.lib.sizes=FALSE,prior.count=0.5) -#ddslog2= cpm(dds.ndata,log=TRUE,normalized.lib.sizes=FALSE,prior.count=0.5) +ylog2 <- cpm(y, log = TRUE, normalized.lib.sizes = TRUE, prior.count = 0.5) # prior count like avelogcpm +rawlog2 <- cpm(y, log = TRUE, normalized.lib.sizes = FALSE, prior.count = 0.5) +# ddslog2= cpm(dds.ndata,log=TRUE,normalized.lib.sizes=FALSE,prior.count=0.5) -rld <- rlogTransformation(dds, blind=TRUE) -rldm=assay(rld) -colnames(rldm)=colnames(x) +rld <- rlogTransformation(dds, blind = TRUE) +rldm <- assay(rld) +colnames(rldm) <- colnames(x) ## save it ``` @@ -120,28 +120,44 @@ colnames(rldm)=colnames(x) ### Before Normalization ```{r, echo=FALSE, warning=FALSE,message=FALSE} -beforehist <- ggplotly(ggplot(melt(as.data.frame(rawlog2))) + geom_line(stat="density", aes(x = value,colour = variable)) + labs(x = NULL) + theme(legend.position='right') + scale_x_log10()) +beforehist <- ggplotly(ggplot(melt(as.data.frame(rawlog2))) + + geom_line(stat = "density", aes(x = value, colour = variable)) + + labs(x = NULL) + + theme(legend.position = "right") + + scale_x_log10()) beforehist ``` ### TMM ```{r, echo=FALSE, warning=FALSE,message=FALSE} -tmmhist <- ggplotly(ggplot(melt(as.data.frame(ylog2))) + geom_line(stat="density", aes(x = value,colour = variable)) + labs(x = NULL) + theme(legend.position='right') + scale_x_log10()) +tmmhist <- ggplotly(ggplot(melt(as.data.frame(ylog2))) + + geom_line(stat = "density", aes(x = value, colour = variable)) + + labs(x = NULL) + + theme(legend.position = "right") + + scale_x_log10()) tmmhist ``` ### DESeq2 ```{r, echo=FALSE, warning=FALSE,message=FALSE} -deshist <- ggplotly(ggplot(melt(as.data.frame(rldm))) + geom_line(stat="density", aes(x = value,colour = variable)) + labs(x = NULL) + theme(legend.position='right') + scale_x_log10()) +deshist <- ggplotly(ggplot(melt(as.data.frame(rldm))) + + geom_line(stat = "density", aes(x = value, colour = variable)) + + labs(x = NULL) + + theme(legend.position = "right") + + scale_x_log10()) deshist ``` ### Limma ```{r, echo=FALSE, warning=FALSE,message=FALSE} -limmahist <- ggplotly(ggplot(melt(as.data.frame(v1$E))) + geom_line(stat="density", aes(x = value,colour = variable)) + labs(x = NULL) + 
theme(legend.position='right') + scale_x_log10()) +limmahist <- ggplotly(ggplot(melt(as.data.frame(v1$E))) + + geom_line(stat = "density", aes(x = value, colour = variable)) + + labs(x = NULL) + + theme(legend.position = "right") + + scale_x_log10()) limmahist ``` @@ -150,182 +166,198 @@ limmahist ### Before Normalization ```{r, echo=FALSE,webgl=TRUE,message=FALSE, warning=FALSE} - ## PCA for before norm -before.edf=rawlog2 -before.tedf= t(before.edf) -before.pca=prcomp(before.tedf,scale.=T) -before.tedf1 = data.frame(before.tedf) -Phenotype=sampleinfo$condition -cell_rep=sampleinfo$label -before.tedf1$group = as.factor(Phenotype) -before.pc1 = round(before.pca$sdev[1]^2/sum(before.pca$sdev^2)*100,2) -before.pc2 = round(before.pca$sdev[2]^2/sum(before.pca$sdev^2)*100,2) -before.pc3 = round(before.pca$sdev[3]^2/sum(before.pca$sdev^2)*100,2) +before.edf <- rawlog2 +before.tedf <- t(before.edf) +before.pca <- prcomp(before.tedf, scale. = T) +before.tedf1 <- data.frame(before.tedf) +Phenotype <- sampleinfo$condition +cell_rep <- sampleinfo$label +before.tedf1$group <- as.factor(Phenotype) +before.pc1 <- round(before.pca$sdev[1]^2 / sum(before.pca$sdev^2) * 100, 2) +before.pc2 <- round(before.pca$sdev[2]^2 / sum(before.pca$sdev^2) * 100, 2) +before.pc3 <- round(before.pca$sdev[3]^2 / sum(before.pca$sdev^2) * 100, 2) -pcafactor = as.factor(sampleinfo$condition) +pcafactor <- as.factor(sampleinfo$condition) library(RColorBrewer) col <- brewer.pal(nlevels(pcafactor), "Paired") -p <- plot_ly(as.data.frame(before.pca$x[,1:3]), x = ~PC1, y = ~PC2, z = ~PC3, color = pcafactor, colors = col, hoverinfo="text", - hovertext = ~sampleinfo$label) %>% +p <- plot_ly(as.data.frame(before.pca$x[, 1:3]), + x = ~PC1, y = ~PC2, z = ~PC3, color = pcafactor, colors = col, hoverinfo = "text", + hovertext = ~ sampleinfo$label +) %>% add_markers() %>% - layout(title = "Before Normalization PCA plot", - scene = list(xaxis = list(title = paste0("PC1 (",before.pc1,"%)")), - yaxis = list(title = paste0("PC2 (",before.pc2,"%)")), - zaxis = list(title = paste0("PC3 (",before.pc3,"%)")))) + layout( + title = "Before Normalization PCA plot", + scene = list( + xaxis = list(title = paste0("PC1 (", before.pc1, "%)")), + yaxis = list(title = paste0("PC2 (", before.pc2, "%)")), + zaxis = list(title = paste0("PC3 (", before.pc3, "%)")) + ) + ) p # plot(before.pca,type="lines") #Decide how many PC's are relevant for plotting - #before.pca$x[,1:3] #look at first 3 PC's - -#plot3d(before.pca$x[,1:3],col = as.integer(before.tedf1$group),type="s",size=2,main="PCA before normalization",xlab=paste0("PC1 (",before.pc1,"%)"),ylab=paste0("PC2 (",before.pc2,"%)"),zlab=paste0("PC3 (",before.pc3,"%)")) -#group.v<-as.vector(cell_rep) -#text3d(before.pca$x, before.pca$y, before.pca$z, group.v, cex=1.0, adj = 1.2) -#legend3d("topright", legend = levels(sampleinfo$condition), pch = 16, col = as.numeric(as.factor(levels(sampleinfo$condition))), cex=0.5) -#rgl.postscript("pca3d_raw.pdf","pdf") -#rgl.snapshot("pca3d_raw.png","png") - +# before.pca$x[,1:3] #look at first 3 PC's + +# plot3d(before.pca$x[,1:3],col = as.integer(before.tedf1$group),type="s",size=2,main="PCA before normalization",xlab=paste0("PC1 (",before.pc1,"%)"),ylab=paste0("PC2 (",before.pc2,"%)"),zlab=paste0("PC3 (",before.pc3,"%)")) +# group.v<-as.vector(cell_rep) +# text3d(before.pca$x, before.pca$y, before.pca$z, group.v, cex=1.0, adj = 1.2) +# legend3d("topright", legend = levels(sampleinfo$condition), pch = 16, col = as.numeric(as.factor(levels(sampleinfo$condition))), cex=0.5) +# 
rgl.postscript("pca3d_raw.pdf","pdf") +# rgl.snapshot("pca3d_raw.png","png") ``` ### TMM ```{r, echo=FALSE,webgl=TRUE,message=FALSE, warning=FALSE} - ## PCA for edgeR -edgeR.edf=ylog2 -edgeR.tedf= t(edgeR.edf) -edgeR.pca=prcomp(edgeR.tedf,scale.=T) -edgeR.tedf1 = data.frame(edgeR.tedf) -Phenotype=sampleinfo$condition -cell_rep=sampleinfo$label -edgeR.tedf1$group = as.factor(Phenotype) -edgeR.pc1 = round(edgeR.pca$sdev[1]^2/sum(edgeR.pca$sdev^2)*100,2) -edgeR.pc2 = round(edgeR.pca$sdev[2]^2/sum(edgeR.pca$sdev^2)*100,2) -edgeR.pc3 = round(edgeR.pca$sdev[3]^2/sum(edgeR.pca$sdev^2)*100,2) +edgeR.edf <- ylog2 +edgeR.tedf <- t(edgeR.edf) +edgeR.pca <- prcomp(edgeR.tedf, scale. = T) +edgeR.tedf1 <- data.frame(edgeR.tedf) +Phenotype <- sampleinfo$condition +cell_rep <- sampleinfo$label +edgeR.tedf1$group <- as.factor(Phenotype) +edgeR.pc1 <- round(edgeR.pca$sdev[1]^2 / sum(edgeR.pca$sdev^2) * 100, 2) +edgeR.pc2 <- round(edgeR.pca$sdev[2]^2 / sum(edgeR.pca$sdev^2) * 100, 2) +edgeR.pc3 <- round(edgeR.pca$sdev[3]^2 / sum(edgeR.pca$sdev^2) * 100, 2) -pcafactor = as.factor(sampleinfo$condition) +pcafactor <- as.factor(sampleinfo$condition) library(RColorBrewer) col <- brewer.pal(nlevels(pcafactor), "Paired") -p <- plot_ly(as.data.frame(edgeR.pca$x[,1:3]), x = ~PC1, y = ~PC2, z = ~PC3, color = pcafactor, colors = col, hoverinfo="text", - hovertext = ~sampleinfo$label) %>% +p <- plot_ly(as.data.frame(edgeR.pca$x[, 1:3]), + x = ~PC1, y = ~PC2, z = ~PC3, color = pcafactor, colors = col, hoverinfo = "text", + hovertext = ~ sampleinfo$label +) %>% add_markers() %>% - layout(title = "edgeR PCA plot", - scene = list(xaxis = list(title = paste0("PC1 (",edgeR.pc1,"%)")), - yaxis = list(title = paste0("PC2 (",edgeR.pc2,"%)")), - zaxis = list(title = paste0("PC3 (",edgeR.pc3,"%)")))) + layout( + title = "edgeR PCA plot", + scene = list( + xaxis = list(title = paste0("PC1 (", edgeR.pc1, "%)")), + yaxis = list(title = paste0("PC2 (", edgeR.pc2, "%)")), + zaxis = list(title = paste0("PC3 (", edgeR.pc3, "%)")) + ) + ) p # plot(edgeR.pca,type="lines") #Decide how many PC's are relevant for plotting - #edgeR.pca$x[,1:3] #look at first 3 PC's - -#plot3d(edgeR.pca$x[,1:3],col = as.integer(edgeR.tedf1$group),type="s",size=2,main="PCA after TMM normalization",xlab=paste0("PC1 (",edgeR.pc1,"%)"),ylab=paste0("PC2 (",edgeR.pc2,"%)"),zlab=paste0("PC3 (",edgeR.pc3,"%)")) -#group.v<-as.vector(cell_rep) -#text3d(edgeR.pca$x, edgeR.pca$y, edgeR.pca$z, group.v, cex=1.0, adj = 1.2) -#legend3d("topright", legend = levels(sampleinfo$condition), pch = 16, col = as.numeric(as.factor(levels(sampleinfo$condition))), cex=0.5) -#rgl.postscript("pca3d_edgeR.pdf","pdf") -#rgl.snapshot("pca3d_edgeR.png","png") - +# edgeR.pca$x[,1:3] #look at first 3 PC's + +# plot3d(edgeR.pca$x[,1:3],col = as.integer(edgeR.tedf1$group),type="s",size=2,main="PCA after TMM normalization",xlab=paste0("PC1 (",edgeR.pc1,"%)"),ylab=paste0("PC2 (",edgeR.pc2,"%)"),zlab=paste0("PC3 (",edgeR.pc3,"%)")) +# group.v<-as.vector(cell_rep) +# text3d(edgeR.pca$x, edgeR.pca$y, edgeR.pca$z, group.v, cex=1.0, adj = 1.2) +# legend3d("topright", legend = levels(sampleinfo$condition), pch = 16, col = as.numeric(as.factor(levels(sampleinfo$condition))), cex=0.5) +# rgl.postscript("pca3d_edgeR.pdf","pdf") +# rgl.snapshot("pca3d_edgeR.png","png") ``` ### DESeq2 ```{r, echo=FALSE,webgl=TRUE,message=FALSE, warning=FALSE} - ## PCA for deseq2 -#rld <- rlogTransformation(dds, blind=TRUE) -#rldm=assay(rld) -#colnames(rldm)=colnames(x) -deseq2.edf=as.matrix(rldm) -#deseq2.edf=ddslog2 
-deseq2.tedf= t(deseq2.edf) -deseq2.tedf=deseq2.tedf[,apply(deseq2.tedf,2,var)!= 0] -deseq2.pca=prcomp(deseq2.tedf,scale.=T) -deseq2.tedf1 = data.frame(deseq2.tedf) -Phenotype=sampleinfo$condition -cell_rep=sampleinfo$label -deseq2.tedf1$group = as.factor(Phenotype) -deseq2.pc1 = round(deseq2.pca$sdev[1]^2/sum(deseq2.pca$sdev^2)*100,2) -deseq2.pc2 = round(deseq2.pca$sdev[2]^2/sum(deseq2.pca$sdev^2)*100,2) -deseq2.pc3 = round(deseq2.pca$sdev[3]^2/sum(deseq2.pca$sdev^2)*100,2) - -pcafactor = as.factor(sampleinfo$condition) +# rld <- rlogTransformation(dds, blind=TRUE) +# rldm=assay(rld) +# colnames(rldm)=colnames(x) +deseq2.edf <- as.matrix(rldm) +# deseq2.edf=ddslog2 +deseq2.tedf <- t(deseq2.edf) +deseq2.tedf <- deseq2.tedf[, apply(deseq2.tedf, 2, var) != 0] +deseq2.pca <- prcomp(deseq2.tedf, scale. = T) +deseq2.tedf1 <- data.frame(deseq2.tedf) +Phenotype <- sampleinfo$condition +cell_rep <- sampleinfo$label +deseq2.tedf1$group <- as.factor(Phenotype) +deseq2.pc1 <- round(deseq2.pca$sdev[1]^2 / sum(deseq2.pca$sdev^2) * 100, 2) +deseq2.pc2 <- round(deseq2.pca$sdev[2]^2 / sum(deseq2.pca$sdev^2) * 100, 2) +deseq2.pc3 <- round(deseq2.pca$sdev[3]^2 / sum(deseq2.pca$sdev^2) * 100, 2) + +pcafactor <- as.factor(sampleinfo$condition) library(RColorBrewer) col <- brewer.pal(nlevels(pcafactor), "Paired") -p <- plot_ly(as.data.frame(deseq2.pca$x[,1:3]), x = ~PC1, y = ~PC2, z = ~PC3, color = pcafactor, colors = col, hoverinfo="text", - hovertext = ~sampleinfo$label) %>% +p <- plot_ly(as.data.frame(deseq2.pca$x[, 1:3]), + x = ~PC1, y = ~PC2, z = ~PC3, color = pcafactor, colors = col, hoverinfo = "text", + hovertext = ~ sampleinfo$label +) %>% add_markers() %>% - layout(title = "DESeq2 PCA plot", - scene = list(xaxis = list(title = paste0("PC1 (",deseq2.pc1,"%)")), - yaxis = list(title = paste0("PC2 (",deseq2.pc2,"%)")), - zaxis = list(title = paste0("PC3 (",deseq2.pc3,"%)")))) + layout( + title = "DESeq2 PCA plot", + scene = list( + xaxis = list(title = paste0("PC1 (", deseq2.pc1, "%)")), + yaxis = list(title = paste0("PC2 (", deseq2.pc2, "%)")), + zaxis = list(title = paste0("PC3 (", deseq2.pc3, "%)")) + ) + ) p -#plot3d(deseq2.pca$x[,1:3],col = as.integer(deseq2.tedf1$group),type="s",size=2,main="PCA after DESeq2 normalization",xlab=paste0("PC1 (",deseq2.pc1,"%)"),ylab=paste0("PC2 (",deseq2.pc2,"%)"),zlab=paste0("PC3 (",deseq2.pc3,"%)")) -#group.v<-as.vector(cell_rep) -#text3d(deseq2.pca$x, deseq2.pca$y, deseq2.pca$z, group.v, cex=1.0, adj = 1.2) -#legend3d("topright", legend = levels(sampleinfo$condition), pch = 16, col = as.numeric(as.factor(levels(sampleinfo$condition))), cex=0.5) -#rgl.postscript("pca3d_deseq2.pdf","pdf") -#rgl.snapshot("pca3d_deseq2.png","png") - +# plot3d(deseq2.pca$x[,1:3],col = as.integer(deseq2.tedf1$group),type="s",size=2,main="PCA after DESeq2 normalization",xlab=paste0("PC1 (",deseq2.pc1,"%)"),ylab=paste0("PC2 (",deseq2.pc2,"%)"),zlab=paste0("PC3 (",deseq2.pc3,"%)")) +# group.v<-as.vector(cell_rep) +# text3d(deseq2.pca$x, deseq2.pca$y, deseq2.pca$z, group.v, cex=1.0, adj = 1.2) +# legend3d("topright", legend = levels(sampleinfo$condition), pch = 16, col = as.numeric(as.factor(levels(sampleinfo$condition))), cex=0.5) +# rgl.postscript("pca3d_deseq2.pdf","pdf") +# rgl.snapshot("pca3d_deseq2.png","png") ``` ### Limma ```{r, echo=FALSE,webgl=TRUE,message=FALSE, warning=FALSE} - ## PCA for Limma -limma.edf=as.matrix(v1$E) -limma.tedf= t(limma.edf) -limma.tedf=limma.tedf[,apply(limma.tedf,2,var)!= 0] -limma.pca=prcomp(limma.tedf,scale.=T) -limma.tedf1 = data.frame(limma.tedf) 
-Phenotype=sampleinfo$condition -cell_rep=sampleinfo$label -limma.tedf1$group = as.factor(Phenotype) -limma.pc1 = round(limma.pca$sdev[1]^2/sum(limma.pca$sdev^2)*100,2) -limma.pc2 = round(limma.pca$sdev[2]^2/sum(limma.pca$sdev^2)*100,2) -limma.pc3 = round(limma.pca$sdev[3]^2/sum(limma.pca$sdev^2)*100,2) +limma.edf <- as.matrix(v1$E) +limma.tedf <- t(limma.edf) +limma.tedf <- limma.tedf[, apply(limma.tedf, 2, var) != 0] +limma.pca <- prcomp(limma.tedf, scale. = T) +limma.tedf1 <- data.frame(limma.tedf) +Phenotype <- sampleinfo$condition +cell_rep <- sampleinfo$label +limma.tedf1$group <- as.factor(Phenotype) +limma.pc1 <- round(limma.pca$sdev[1]^2 / sum(limma.pca$sdev^2) * 100, 2) +limma.pc2 <- round(limma.pca$sdev[2]^2 / sum(limma.pca$sdev^2) * 100, 2) +limma.pc3 <- round(limma.pca$sdev[3]^2 / sum(limma.pca$sdev^2) * 100, 2) -pcafactor = as.factor(sampleinfo$condition) +pcafactor <- as.factor(sampleinfo$condition) library(RColorBrewer) col <- brewer.pal(nlevels(pcafactor), "Paired") -p <- plot_ly(as.data.frame(limma.pca$x[,1:3]), x = ~PC1, y = ~PC2, z = ~PC3, color = pcafactor, colors = col, hoverinfo="text", - hovertext = ~sampleinfo$label) %>% +p <- plot_ly(as.data.frame(limma.pca$x[, 1:3]), + x = ~PC1, y = ~PC2, z = ~PC3, color = pcafactor, colors = col, hoverinfo = "text", + hovertext = ~ sampleinfo$label +) %>% add_markers() %>% - layout(title = "Limma PCA plot", - scene = list(xaxis = list(title = paste0("PC1 (",limma.pc1,"%)")), - yaxis = list(title = paste0("PC2 (",limma.pc2,"%)")), - zaxis = list(title = paste0("PC3 (",limma.pc3,"%)")))) + layout( + title = "Limma PCA plot", + scene = list( + xaxis = list(title = paste0("PC1 (", limma.pc1, "%)")), + yaxis = list(title = paste0("PC2 (", limma.pc2, "%)")), + zaxis = list(title = paste0("PC3 (", limma.pc3, "%)")) + ) + ) p -#plot3d(limma.pca$x[,1:3],col = as.integer(limma.tedf1$group),type="s",size=2,main="PCA after Limma normalization",xlab=paste0("PC1 (",limma.pc1,"%)"),ylab=paste0("PC2 (",limma.pc2,"%)"),zlab=paste0("PC3 (",limma.pc3,"%)")) -#group.v<-as.vector(cell_rep) -#text3d(limma.pca$x, limma.pca$y, limma.pca$z, group.v, cex=1.0, adj = 1.2) -#legend3d("topright", legend = levels(sampleinfo$condition), pch = 16, col = as.numeric(as.factor(levels(sampleinfo$condition))), cex=0.5) -#rgl.postscript("pca3d_limma.pdf","pdf") -#rgl.snapshot("pca3d_limma.png","png") - +# plot3d(limma.pca$x[,1:3],col = as.integer(limma.tedf1$group),type="s",size=2,main="PCA after Limma normalization",xlab=paste0("PC1 (",limma.pc1,"%)"),ylab=paste0("PC2 (",limma.pc2,"%)"),zlab=paste0("PC3 (",limma.pc3,"%)")) +# group.v<-as.vector(cell_rep) +# text3d(limma.pca$x, limma.pca$y, limma.pca$z, group.v, cex=1.0, adj = 1.2) +# legend3d("topright", legend = levels(sampleinfo$condition), pch = 16, col = as.numeric(as.factor(levels(sampleinfo$condition))), cex=0.5) +# rgl.postscript("pca3d_limma.pdf","pdf") +# rgl.snapshot("pca3d_limma.png","png") ``` ```{r, echo=FALSE,message=FALSE,warning=FALSE} @@ -340,159 +372,160 @@ limma.dfm <- melt(as.data.frame(v1$E)) ### Before Normalization ```{r, echo=FALSE,message=FALSE,fig.show='hold',fig.align='center', warning=FALSE} -par(mar=c(par("mar")[1]+5,par("mar")[-1])) -boxplot(value~variable,las=2,data=before.dfm,main="Before normalization",ylab="Counts",col=as.numeric(as.factor(sampleinfo$condition))) +par(mar = c(par("mar")[1] + 5, par("mar")[-1])) +boxplot(value ~ variable, las = 2, data = before.dfm, main = "Before normalization", ylab = "Counts", col = as.numeric(as.factor(sampleinfo$condition))) ``` ### TMM ```{r, 
echo=FALSE,message=FALSE,fig.show='hold',fig.align='center', warning=FALSE} -par(mar=c(par("mar")[1]+5,par("mar")[-1])) -boxplot(value~variable,las=2,data=edgeR.dfm,main="TMM",ylab="Counts",col=as.numeric(as.factor(sampleinfo$condition))) +par(mar = c(par("mar")[1] + 5, par("mar")[-1])) +boxplot(value ~ variable, las = 2, data = edgeR.dfm, main = "TMM", ylab = "Counts", col = as.numeric(as.factor(sampleinfo$condition))) ``` ### DESeq2 ```{r, echo=FALSE,message=FALSE,fig.show='hold',fig.align='center', warning=FALSE} -par(mar=c(par("mar")[1]+5,par("mar")[-1])) -boxplot(value~variable,las=2,data=deseq2.dfm,main="DESeq2",ylab="Counts",col=as.numeric(as.factor(sampleinfo$condition))) +par(mar = c(par("mar")[1] + 5, par("mar")[-1])) +boxplot(value ~ variable, las = 2, data = deseq2.dfm, main = "DESeq2", ylab = "Counts", col = as.numeric(as.factor(sampleinfo$condition))) ``` ### Limma ```{r, echo=FALSE,message=FALSE,fig.show='hold',fig.align='center', warning=FALSE} -par(mar=c(par("mar")[1]+5,par("mar")[-1])) -boxplot(value~variable,las=2,data=limma.dfm,main="Limma",ylab="Counts",col=as.numeric(as.factor(sampleinfo$condition))) +par(mar = c(par("mar")[1] + 5, par("mar")[-1])) +boxplot(value ~ variable, las = 2, data = limma.dfm, main = "Limma", ylab = "Counts", col = as.numeric(as.factor(sampleinfo$condition))) ``` ## **Similarity Heatmaps Using Pearson - Complete Linkage** {.tabset} ```{r, echo=FALSE,message=FALSE, warning=FALSE} -bothdendro = "both" -hmapsize = floor(length(sampleinfo$label)/20) + 8 -if(hmapsize>20){ -hmapsize = 15 -bothdendro = "none" +bothdendro <- "both" +hmapsize <- floor(length(sampleinfo$label) / 20) + 8 +if (hmapsize > 20) { + hmapsize <- 15 + bothdendro <- "none" } -create_heatmap <- function(data){ - dd_col <- as.dendrogram(hclust(dist(data))) - dd_row <- as.dendrogram(hclust(dist(t(data)))) - dendro_1 <- dendro_data(dd_col) - dendro_2 <- dendro_data(dd_row) - - hmcol <- colorRampPalette(c("black","red","yellow","white"),space="rgb")(100) - - ggdend <- function(df) { - ggplot() + - geom_segment(data = df, aes(x=x, y=y, xend=xend, yend=yend)) + - labs(x = "", y = "") + theme_minimal() + - theme(axis.text = element_blank(), axis.ticks = element_blank(), - panel.grid = element_blank()) - } - - dendro_columns <- ggdend(dendro_1$segments) - dendro_rows <- ggdend(dendro_2$segments) + coord_flip() - - melt_mat <- melt(data) - hmap <- ggplot(data = melt_mat, aes(x=X1, y=X2, fill=value)) + - geom_tile() + - scale_fill_gradientn(colours = rev(hmcol), limit = c(0,max(data)), space = "Lab", name="Correlation") + - theme_minimal()+ - theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1), - legend.justification = c(1, 0), - legend.position = c(0.6, 0.7), - legend.direction = "horizontal", - axis.title.x = element_blank(), - axis.title.y = element_blank()) + - coord_fixed() - - eaxis <- list(showticklabels = FALSE, showgrid = FALSE, zeroline = FALSE) - p_empty <- plot_ly() %>% layout(margin = list(l = 200), xaxis = eaxis,yaxis = eaxis) - - sply <- subplot(dendro_columns, p_empty, hmap, dendro_rows, nrows = 2) - - sply <- layout(sply, - yaxis = list(domain=c(0.47, 1)), - xaxis = list(domain=c(0, 0.5)), - xaxis3 = list(domain=c(0, 0.5)), - xaxis4 = list(domain=c(0.5, 1)), - margin = list(l = 150, r = 0, b = 50,t = 0)) - - return(sply) -} +create_heatmap <- function(data) { + dd_col <- as.dendrogram(hclust(dist(data))) + dd_row <- as.dendrogram(hclust(dist(t(data)))) + dendro_1 <- dendro_data(dd_col) + dendro_2 <- dendro_data(dd_row) + + hmcol <- 
colorRampPalette(c("black", "red", "yellow", "white"), space = "rgb")(100) + + ggdend <- function(df) { + ggplot() + + geom_segment(data = df, aes(x = x, y = y, xend = xend, yend = yend)) + + labs(x = "", y = "") + + theme_minimal() + + theme( + axis.text = element_blank(), axis.ticks = element_blank(), + panel.grid = element_blank() + ) + } + dendro_columns <- ggdend(dendro_1$segments) + dendro_rows <- ggdend(dendro_2$segments) + coord_flip() + + melt_mat <- melt(data) + hmap <- ggplot(data = melt_mat, aes(x = X1, y = X2, fill = value)) + + geom_tile() + + scale_fill_gradientn(colours = rev(hmcol), limit = c(0, max(data)), space = "Lab", name = "Correlation") + + theme_minimal() + + theme( + axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1), + legend.justification = c(1, 0), + legend.position = c(0.6, 0.7), + legend.direction = "horizontal", + axis.title.x = element_blank(), + axis.title.y = element_blank() + ) + + coord_fixed() + + eaxis <- list(showticklabels = FALSE, showgrid = FALSE, zeroline = FALSE) + p_empty <- plot_ly() %>% layout(margin = list(l = 200), xaxis = eaxis, yaxis = eaxis) + + sply <- subplot(dendro_columns, p_empty, hmap, dendro_rows, nrows = 2) + + sply <- layout(sply, + yaxis = list(domain = c(0.47, 1)), + xaxis = list(domain = c(0, 0.5)), + xaxis3 = list(domain = c(0, 0.5)), + xaxis4 = list(domain = c(0.5, 1)), + margin = list(l = 150, r = 0, b = 50, t = 0) + ) + + return(sply) +} ``` ### Before Normalization ```{r, echo=FALSE,message=FALSE,fig.show='hold',fig.align='center', warning=FALSE, fig.width=hmapsize, fig.height=hmapsize} +hmcol <- colorRampPalette(c("black", "red", "yellow", "white"), space = "rgb")(100) +before.distrawlog2 <- amap::Dist(t(rawlog2), method = "pearson") +before.mat <- as.matrix(before.distrawlog2) -hmcol <- colorRampPalette(c("black","red","yellow","white"),space="rgb")(100) -before.distrawlog2=amap::Dist(t(rawlog2),method="pearson") -before.mat = as.matrix(before.distrawlog2) - -heatmap.2(before.mat, trace="none", col = rev(hmcol), labCol=FALSE, Rowv=TRUE, Colv= TRUE, dendrogram = bothdendro, colRow=as.numeric(as.factor(sampleinfo$condition)), margin=c(16, 16), main="Before normalization") +heatmap.2(before.mat, trace = "none", col = rev(hmcol), labCol = FALSE, Rowv = TRUE, Colv = TRUE, dendrogram = bothdendro, colRow = as.numeric(as.factor(sampleinfo$condition)), margin = c(16, 16), main = "Before normalization") ``` ### TMM ```{r, echo=FALSE,message=FALSE,fig.show='hold',fig.align='center', warning=FALSE, fig.width=hmapsize, fig.height=hmapsize} - -edgeR.distylog2=amap::Dist(t(ylog2),method="pearson") -edgeR.mat = as.matrix(edgeR.distylog2) -heatmap.2(edgeR.mat, trace="none", col = rev(hmcol), labCol=FALSE, Rowv=TRUE, Colv= TRUE, dendrogram = bothdendro, colRow=as.numeric(as.factor(sampleinfo$condition)), margin=c(16, 16), main="TMM") +edgeR.distylog2 <- amap::Dist(t(ylog2), method = "pearson") +edgeR.mat <- as.matrix(edgeR.distylog2) +heatmap.2(edgeR.mat, trace = "none", col = rev(hmcol), labCol = FALSE, Rowv = TRUE, Colv = TRUE, dendrogram = bothdendro, colRow = as.numeric(as.factor(sampleinfo$condition)), margin = c(16, 16), main = "TMM") ``` ### DESeq2 ```{r, echo=FALSE,message=FALSE,fig.show='hold',fig.align='center', warning=FALSE, fig.width=hmapsize, fig.height=hmapsize} - -deseq2.dists <- amap::Dist(t(rldm),method="pearson") +deseq2.dists <- amap::Dist(t(rldm), method = "pearson") deseq2.mat <- as.matrix(deseq2.dists) -heatmap.2(deseq2.mat, trace="none", col = rev(hmcol), labCol=FALSE, Rowv=TRUE, Colv= TRUE, 
dendrogram = bothdendro, colRow=as.numeric(as.factor(sampleinfo$condition)), margin=c(16, 16), main="DESeq2") +heatmap.2(deseq2.mat, trace = "none", col = rev(hmcol), labCol = FALSE, Rowv = TRUE, Colv = TRUE, dendrogram = bothdendro, colRow = as.numeric(as.factor(sampleinfo$condition)), margin = c(16, 16), main = "DESeq2") ``` ### Limma ```{r, echo=FALSE,message=FALSE,fig.show='hold',fig.align='center', warning=FALSE, fig.width=hmapsize, fig.height=hmapsize} - -limma.dists <- amap::Dist(t(limma.edf),method="pearson") +limma.dists <- amap::Dist(t(limma.edf), method = "pearson") limma.mat <- as.matrix(limma.dists) -heatmap.2(limma.mat, trace="none", col = rev(hmcol), labCol=FALSE, Rowv=TRUE, Colv= TRUE, dendrogram = bothdendro, colRow=as.numeric(as.factor(sampleinfo$condition)), margin=c(16, 16), main="Limma") +heatmap.2(limma.mat, trace = "none", col = rev(hmcol), labCol = FALSE, Rowv = TRUE, Colv = TRUE, dendrogram = bothdendro, colRow = as.numeric(as.factor(sampleinfo$condition)), margin = c(16, 16), main = "Limma") ``` ## **MD Plots** (sample by sample mean vs fold change of expression) ```{r, echo=FALSE,message=FALSE, warning=FALSE} -mdchunkheight <- 3.3*length(sampleinfo$label) +mdchunkheight <- 3.3 * length(sampleinfo$label) mdflag <- TRUE -if(mdchunkheight>100){ - mdchunkheight = 8 - mdflag <- FALSE +if (mdchunkheight > 100) { + mdchunkheight <- 8 + mdflag <- FALSE } ``` ```{r, echo=FALSE,message=FALSE,fig.show='hold',fig.align='center', warning=FALSE, fig.height=mdchunkheight} -if(mdflag){ - par(mfcol=c(length(sampleinfo$label),4)) - for(i in 1:length(sampleinfo$label)){ - plotMD(rawlog2,column=i, main=paste0("Raw ",sampleinfo$label[i]), xlim=c(-5,15), ylim=c(-15,15)) - abline(h=0, col="red", lty=2, lwd=2) +if (mdflag) { + par(mfcol = c(length(sampleinfo$label), 4)) + for (i in 1:length(sampleinfo$label)) { + plotMD(rawlog2, column = i, main = paste0("Raw ", sampleinfo$label[i]), xlim = c(-5, 15), ylim = c(-15, 15)) + abline(h = 0, col = "red", lty = 2, lwd = 2) } - for(i in 1:length(sampleinfo$label)){ - plotMD(ylog2,column=i, main=paste0("TMM ",sampleinfo$label[i]), xlim=c(-5,15), ylim=c(-15,15)) - abline(h=0, col="red", lty=2, lwd=2) + for (i in 1:length(sampleinfo$label)) { + plotMD(ylog2, column = i, main = paste0("TMM ", sampleinfo$label[i]), xlim = c(-5, 15), ylim = c(-15, 15)) + abline(h = 0, col = "red", lty = 2, lwd = 2) } - for(i in 1:length(sampleinfo$label)){ - plotMD(rldm,column=i, main=paste0("DESeq2 ",sampleinfo$label[i]), xlim=c(-5,15), ylim=c(-15,15)) - abline(h=0, col="red", lty=2, lwd=2) + for (i in 1:length(sampleinfo$label)) { + plotMD(rldm, column = i, main = paste0("DESeq2 ", sampleinfo$label[i]), xlim = c(-5, 15), ylim = c(-15, 15)) + abline(h = 0, col = "red", lty = 2, lwd = 2) } - for(i in 1:length(sampleinfo$label)){ - plotMD(v1$E,column=i, main=paste0("Limma ",sampleinfo$label[i]), xlim=c(-5,15), ylim=c(-15,15)) - abline(h=0, col="red", lty=2, lwd=2) + for (i in 1:length(sampleinfo$label)) { + plotMD(v1$E, column = i, main = paste0("Limma ", sampleinfo$label[i]), xlim = c(-5, 15), ylim = c(-15, 15)) + abline(h = 0, col = "red", lty = 2, lwd = 2) } } ``` diff --git a/workflow/scripts/bam_count_concord_stats.py b/workflow/scripts/bam_count_concord_stats.py index b6dcca3..b295d0b 100644 --- a/workflow/scripts/bam_count_concord_stats.py +++ b/workflow/scripts/bam_count_concord_stats.py @@ -10,12 +10,12 @@ if DNAread.is_proper_pair: isproper_count += 1 if DNAread.has_tag("NH"): - tag_value=DNAread.get_tag("NH") + tag_value = DNAread.get_tag("NH") if tag_value 
!= 1: keep = False if keep: - count+=1 + count += 1 keep = True diff --git a/workflow/scripts/builder/create_rRNA_intervals.py b/workflow/scripts/builder/create_rRNA_intervals.py index 4da327a..56f8c0c 100644 --- a/workflow/scripts/builder/create_rRNA_intervals.py +++ b/workflow/scripts/builder/create_rRNA_intervals.py @@ -1,33 +1,42 @@ -import sys,os,pysam -fa=sys.argv[1] -genomename=sys.argv[3] -gtf=sys.argv[2] -if not os.path.exists(fa+".fai"): - pysam.faidx(fa) -unknown="\"Unknown\";" -#out=open(genomename+".rRNA_interval_list",'w') -for f in open(fa+".fai").readlines(): - f=f.strip().split("\t") -# out.write("@SQ\tSN:%s\tLN:%s\tAS:%s\n"%(f[0],f[1],genomename)) - print("@SQ\tSN:%s\tLN:%s\tAS:%s"%(f[0],f[1],genomename)) - +import sys, os, pysam -for i in list(filter(lambda x:x[2]=="gene",filter(lambda x:not x[0].startswith("#"),list(map(lambda x:x.strip().split("\t"),open(gtf).readlines()))))): - gene_id="" - j=i[8].split() - gene_id=unknown - gene_name=unknown - gene_biotype=unknown - for k in list(range(0,len(j)-1,2)): - if j[k]=="gene_id": - gene_id=j[k+1][1:-2] - elif j[k]=="gene_name": - gene_name=j[k+1][1:-2] - elif j[k]=="gene_biotype": - gene_biotype=j[k+1][1:-2] - elif j[k]=="gene_type": - gene_biotype=j[k+1][1:-2] - if gene_biotype.lower()=="rrna": - #out.write("%s\t%s\t%s\t%s\t%s\n"%(i[0],i[3],i[4],i[6],gene_id)) - print("%s\t%s\t%s\t%s\t%s"%(i[0],i[3],i[4],i[6],gene_id)) -#out.close() +fa = sys.argv[1] +genomename = sys.argv[3] +gtf = sys.argv[2] +if not os.path.exists(fa + ".fai"): + pysam.faidx(fa) +unknown = '"Unknown";' +# out=open(genomename+".rRNA_interval_list",'w') +for f in open(fa + ".fai").readlines(): + f = f.strip().split("\t") + # out.write("@SQ\tSN:%s\tLN:%s\tAS:%s\n"%(f[0],f[1],genomename)) + print("@SQ\tSN:%s\tLN:%s\tAS:%s" % (f[0], f[1], genomename)) + + +for i in list( + filter( + lambda x: x[2] == "gene", + filter( + lambda x: not x[0].startswith("#"), + list(map(lambda x: x.strip().split("\t"), open(gtf).readlines())), + ), + ) +): + gene_id = "" + j = i[8].split() + gene_id = unknown + gene_name = unknown + gene_biotype = unknown + for k in list(range(0, len(j) - 1, 2)): + if j[k] == "gene_id": + gene_id = j[k + 1][1:-2] + elif j[k] == "gene_name": + gene_name = j[k + 1][1:-2] + elif j[k] == "gene_biotype": + gene_biotype = j[k + 1][1:-2] + elif j[k] == "gene_type": + gene_biotype = j[k + 1][1:-2] + if gene_biotype.lower() == "rrna": + # out.write("%s\t%s\t%s\t%s\t%s\n"%(i[0],i[3],i[4],i[6],gene_id)) + print("%s\t%s\t%s\t%s\t%s" % (i[0], i[3], i[4], i[6], gene_id)) +# out.close() diff --git a/workflow/scripts/builder/gene2transcripts_add_length.py b/workflow/scripts/builder/gene2transcripts_add_length.py index f337534..7966c72 100644 --- a/workflow/scripts/builder/gene2transcripts_add_length.py +++ b/workflow/scripts/builder/gene2transcripts_add_length.py @@ -8,29 +8,33 @@ # Example: # $ python gene2transcripts_add_length.py gene2transcripts.protein_coding_only genes.gtf.genePred.bed > gene2transcripts.protein_coding_only.with_len -def get_len(s): - lengths=[int(num) for num in s.strip().rstrip(',').split(",")] - return sum(lengths) - - -if __name__ == '__main__': - - if len(sys.argv) != 3: - print('Usage: python {} gene2transcripts.protein_coding_only genes.gtf.genePred.bed'.format(sys.argv[0])) - print('\nError: failed to provide all positional arguments!', file=sys.stderr) - sys.exit(1) - transcript2length = {} - - # Read in genes.gtf.genePred.bed - with open(sys.argv[2]) as in_file: - for line in in_file: - linelist = line.strip().split("\t") - 
transcript2length[linelist[3]] = get_len(linelist[10]) - - # Read in gene2transcripts.protein_coding_only - with open(sys.argv[1]) as in_file: - for line in in_file: - linelist = line.strip().split("\t") - linelist.append(str(transcript2length[linelist[1]])) - print("\t".join(linelist)) +def get_len(s): + lengths = [int(num) for num in s.strip().rstrip(",").split(",")] + return sum(lengths) + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print( + "Usage: python {} gene2transcripts.protein_coding_only genes.gtf.genePred.bed".format( + sys.argv[0] + ) + ) + print("\nError: failed to provide all positional arguments!", file=sys.stderr) + sys.exit(1) + + transcript2length = {} + + # Read in genes.gtf.genePred.bed + with open(sys.argv[2]) as in_file: + for line in in_file: + linelist = line.strip().split("\t") + transcript2length[linelist[3]] = get_len(linelist[10]) + + # Read in gene2transcripts.protein_coding_only + with open(sys.argv[1]) as in_file: + for line in in_file: + linelist = line.strip().split("\t") + linelist.append(str(transcript2length[linelist[1]])) + print("\t".join(linelist)) diff --git a/workflow/scripts/builder/generate_qualimap_ref.py b/workflow/scripts/builder/generate_qualimap_ref.py index c14a3f0..c4fb322 100644 --- a/workflow/scripts/builder/generate_qualimap_ref.py +++ b/workflow/scripts/builder/generate_qualimap_ref.py @@ -15,15 +15,48 @@ def idsContainGiven(givenId, transcriptIds): return False -if __name__ == "__main__": +if __name__ == "__main__": descriptionText = "The script extracts features from a GTF file and a FASTA file into Qualimap annotation format. Note: exons have to be sorted according to exon number! This important for correct reverse transcribed cDNA sequences extraction." - parser = argparse.ArgumentParser(description = descriptionText,formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument("-g", action="store", required="true", dest="gtfFile", help="Input file with list of genes in GTF format") - parser.add_argument("-f", action="store", required="true", dest="fastaFile", help="Input genome sequence. ") - parser.add_argument("-o", action="store", dest="outFile", default="annotations.txt", help="Output file. Default is annotations.txt") - parser.add_argument("--filter", action="store", dest="filterStr", default="", help="Comma-separted list of entries to filter from GTF file based on given attribute id") - parser.add_argument("--ignore-strange-chrom", action="store_true", default=False, dest="ignoreStrangeChromosomes", help="All chromosomes except numbered and X,Y,MT are ignored ") + parser = argparse.ArgumentParser( + description=descriptionText, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "-g", + action="store", + required="true", + dest="gtfFile", + help="Input file with list of genes in GTF format", + ) + parser.add_argument( + "-f", + action="store", + required="true", + dest="fastaFile", + help="Input genome sequence. ", + ) + parser.add_argument( + "-o", + action="store", + dest="outFile", + default="annotations.txt", + help="Output file. 
Default is annotations.txt", + ) + parser.add_argument( + "--filter", + action="store", + dest="filterStr", + default="", + help="Comma-separted list of entries to filter from GTF file based on given attribute id", + ) + parser.add_argument( + "--ignore-strange-chrom", + action="store_true", + default=False, + dest="ignoreStrangeChromosomes", + help="All chromosomes except numbered and X,Y,MT are ignored ", + ) args = parser.parse_args() print(args) @@ -41,7 +74,7 @@ def idsContainGiven(givenId, transcriptIds): print("Filtering for: ", filtered_transcripts) for feature in gtf_file: - if feature.type == 'exon': + if feature.type == "exon": geneName = feature.attr[attr_id] if geneName in features: features[geneName].append(feature) @@ -52,10 +85,9 @@ def idsContainGiven(givenId, transcriptIds): seqData = SeqIO.to_dict(SeqIO.parse(fastaFileName, "fasta")) outFile = open(outFileName, "w") - header = "\"%s\"\t\"%s\"\t\"%s\"\n" % ("biotypes","length","gc") + header = '"%s"\t"%s"\t"%s"\n' % ("biotypes", "length", "gc") outFile.write(header) - for geneId in features: exons = features[geneId] print("Processing %s" % geneId) @@ -76,25 +108,29 @@ def idsContainGiven(givenId, transcriptIds): iv = exon.iv seqName = iv.chrom if seqName in seqData: - buf = seqData[ iv.chrom ].seq[ iv.start : iv.end ] - if iv.strand == '-': + buf = seqData[iv.chrom].seq[iv.start : iv.end] + if iv.strand == "-": buf = buf.reverse_complement() tSeq += buf else: - print("Can not locate sequence {} in {}, skipping region...".format(seqName, fastaFileName)) + print( + "Can not locate sequence {} in {}, skipping region...".format( + seqName, fastaFileName + ) + ) transcripts[transcriptId] = tSeq gc_array = [] lengths = [] for tSeq in transcripts.values(): - lengths.append( len(tSeq) ) - gc_array.append ( SeqUtils.GC(tSeq) ) + lengths.append(len(tSeq)) + gc_array.append(SeqUtils.GC(tSeq)) gene_length = np.mean(lengths) gene_gc = np.mean(gc_array) - line = "\"%s\"\t\"%s\"\t%d\t%.2f\n" % (geneId, biotype, gene_length, gene_gc) - outFile.write ( line ) + line = '"%s"\t"%s"\t%d\t%.2f\n' % (geneId, biotype, gene_length, gene_gc) + outFile.write(line) outFile.close() diff --git a/workflow/scripts/builder/get_gene_annotate.py b/workflow/scripts/builder/get_gene_annotate.py index 0897268..231001e 100644 --- a/workflow/scripts/builder/get_gene_annotate.py +++ b/workflow/scripts/builder/get_gene_annotate.py @@ -1,24 +1,32 @@ -import sys; -unknown="\"Unknown\";" +import sys -for i in list(filter(lambda x:x[2]=="gene",filter(lambda x:not x[0].startswith("#"),list(map(lambda x:x.strip().split("\t"),open(sys.argv[1]).readlines()))))): -#for i in list(map(lambda x:x.strip().split("\t")[8],open(sys.argv[1]).readlines())): - gene_id="" - j=i[8].split() - gene_id=unknown - gene_name=unknown - gene_biotype=unknown - for k in list(range(0,len(j)-1,2)): - if j[k]=="gene_id": - gene_id=j[k+1] - elif j[k]=="gene_name": - gene_name=j[k+1] - elif j[k]=="gene_biotype": - gene_biotype=j[k+1] - elif j[k]=="gene_type": - gene_biotype=j[k+1] - if gene_name == unknown and gene_id != unknown : - gene_name=gene_id - s="%s %s %s"%(gene_id[:-1],gene_name[:-1],gene_biotype[:-1]) - print(s) - +unknown = '"Unknown";' + +for i in list( + filter( + lambda x: x[2] == "gene", + filter( + lambda x: not x[0].startswith("#"), + list(map(lambda x: x.strip().split("\t"), open(sys.argv[1]).readlines())), + ), + ) +): + # for i in list(map(lambda x:x.strip().split("\t")[8],open(sys.argv[1]).readlines())): + gene_id = "" + j = i[8].split() + gene_id = unknown + gene_name = 
unknown + gene_biotype = unknown + for k in list(range(0, len(j) - 1, 2)): + if j[k] == "gene_id": + gene_id = j[k + 1] + elif j[k] == "gene_name": + gene_name = j[k + 1] + elif j[k] == "gene_biotype": + gene_biotype = j[k + 1] + elif j[k] == "gene_type": + gene_biotype = j[k + 1] + if gene_name == unknown and gene_id != unknown: + gene_name = gene_id + s = "%s %s %s" % (gene_id[:-1], gene_name[:-1], gene_biotype[:-1]) + print(s) diff --git a/workflow/scripts/builder/get_isoform_annotate.py b/workflow/scripts/builder/get_isoform_annotate.py index d5e44eb..232dd1f 100644 --- a/workflow/scripts/builder/get_isoform_annotate.py +++ b/workflow/scripts/builder/get_isoform_annotate.py @@ -1,27 +1,40 @@ -import sys; -unknown="\"Unknown\";" +import sys -for i in list(filter(lambda x:x[2]=="transcript",filter(lambda x:not x[0].startswith("#"),list(map(lambda x:x.strip().split("\t"),open(sys.argv[1]).readlines()))))): -#for i in list(map(lambda x:x.strip().split("\t")[8],open(sys.argv[1]).readlines())): - gene_id="" - j=i[8].split() - transcript_id=unknown - gene_id=unknown - gene_name=unknown - transcript_name=unknown - for k in list(range(0,len(j)-1,2)): - if j[k]=="transcript_id": - transcript_id=j[k+1] - elif j[k]=="gene_id": - gene_id=j[k+1] - elif j[k]=="transcript_name": - transcript_name=j[k+1] - elif j[k]=="gene_name": - gene_name=j[k+1] - if transcript_name == unknown and transcript_id != unknown : - transcript_name=transcript_id - if gene_name == unknown and gene_id != unknown : - gene_name=gene_id - s="%s %s %s %s"%(transcript_id[:-1],transcript_name[:-1],gene_id[:-1],gene_name[:-1]) - print(s) - +unknown = '"Unknown";' + +for i in list( + filter( + lambda x: x[2] == "transcript", + filter( + lambda x: not x[0].startswith("#"), + list(map(lambda x: x.strip().split("\t"), open(sys.argv[1]).readlines())), + ), + ) +): + # for i in list(map(lambda x:x.strip().split("\t")[8],open(sys.argv[1]).readlines())): + gene_id = "" + j = i[8].split() + transcript_id = unknown + gene_id = unknown + gene_name = unknown + transcript_name = unknown + for k in list(range(0, len(j) - 1, 2)): + if j[k] == "transcript_id": + transcript_id = j[k + 1] + elif j[k] == "gene_id": + gene_id = j[k + 1] + elif j[k] == "transcript_name": + transcript_name = j[k + 1] + elif j[k] == "gene_name": + gene_name = j[k + 1] + if transcript_name == unknown and transcript_id != unknown: + transcript_name = transcript_id + if gene_name == unknown and gene_id != unknown: + gene_name = gene_id + s = "%s %s %s %s" % ( + transcript_id[:-1], + transcript_name[:-1], + gene_id[:-1], + gene_name[:-1], + ) + print(s) diff --git a/workflow/scripts/builder/get_karyoplot_beds.py b/workflow/scripts/builder/get_karyoplot_beds.py index d8aff55..5394686 100644 --- a/workflow/scripts/builder/get_karyoplot_beds.py +++ b/workflow/scripts/builder/get_karyoplot_beds.py @@ -1,45 +1,47 @@ import sys + + def get_gene_name(j): - searchfor="gene_name" - if not searchfor in j: - searchfor="gene_id" - k=j.split() - ind=-1 - for i,l in enumerate(k): - if l==searchfor: - ind=i+1 - break - m=k[ind].split("\"")[1] - return m - -genelist=[] -chrs=[] -f=open("karyobed.bed",'w') + searchfor = "gene_name" + if not searchfor in j: + searchfor = "gene_id" + k = j.split() + ind = -1 + for i, l in enumerate(k): + if l == searchfor: + ind = i + 1 + break + m = k[ind].split('"')[1] + return m + + +genelist = [] +chrs = [] +f = open("karyobed.bed", "w") for i in open(sys.argv[1]).readlines(): - if i.startswith("#"): - continue - j=i.strip().split("\t") - if j[2]=="gene": 
- start=j[3] - end=j[4] - gene_name=get_gene_name(j[-1]) - if not gene_name in genelist: - genelist.append(gene_name) - outtxt=[] - outtxt.append(j[0]) - outtxt.append(start) - outtxt.append(end) - outtxt.append(j[6]) - outtxt.append(gene_name) - f.write("\t".join(outtxt)+"\n") - chrs.append(j[0]) + if i.startswith("#"): + continue + j = i.strip().split("\t") + if j[2] == "gene": + start = j[3] + end = j[4] + gene_name = get_gene_name(j[-1]) + if not gene_name in genelist: + genelist.append(gene_name) + outtxt = [] + outtxt.append(j[0]) + outtxt.append(start) + outtxt.append(end) + outtxt.append(j[6]) + outtxt.append(gene_name) + f.write("\t".join(outtxt) + "\n") + chrs.append(j[0]) f.close() -chrs=list(set(chrs)) +chrs = list(set(chrs)) for c in chrs: - f=open("karyobed."+c+".bed",'w') - for i in open("karyobed.bed").readlines(): - j=i.strip().split("\t") - if j[0]==c: - f.write(i) - f.close() - + f = open("karyobed." + c + ".bed", "w") + for i in open("karyobed.bed").readlines(): + j = i.strip().split("\t") + if j[0] == c: + f.write(i) + f.close() diff --git a/workflow/scripts/builder/get_karyoplot_gene_coordinates.py b/workflow/scripts/builder/get_karyoplot_gene_coordinates.py index d420eac..82520c7 100644 --- a/workflow/scripts/builder/get_karyoplot_gene_coordinates.py +++ b/workflow/scripts/builder/get_karyoplot_gene_coordinates.py @@ -1,26 +1,29 @@ import sys + + def get_gene_name(j): - searchfor="gene_name" - if not searchfor in j: - searchfor="gene_id" - k=j.split() - ind=-1 - for i,l in enumerate(k): - if l==searchfor: - ind=i+1 - break - m=k[ind].split("\"")[1] - return m - -print("chr","coord","gene_name","strand",sep="\t") -genelist=[] + searchfor = "gene_name" + if not searchfor in j: + searchfor = "gene_id" + k = j.split() + ind = -1 + for i, l in enumerate(k): + if l == searchfor: + ind = i + 1 + break + m = k[ind].split('"')[1] + return m + + +print("chr", "coord", "gene_name", "strand", sep="\t") +genelist = [] for i in open(sys.argv[1]).readlines(): - if i.startswith("#"): - continue - j=i.strip().split("\t") - if j[2]=="gene": - coord=int((int(j[3])+int(j[4]))*0.5) - gene_name=get_gene_name(j[-1]) - if not gene_name in genelist: - genelist.append(gene_name) - print(j[0],coord,gene_name,j[6],sep="\t") + if i.startswith("#"): + continue + j = i.strip().split("\t") + if j[2] == "gene": + coord = int((int(j[3]) + int(j[4])) * 0.5) + gene_name = get_gene_name(j[-1]) + if not gene_name in genelist: + genelist.append(gene_name) + print(j[0], coord, gene_name, j[6], sep="\t") diff --git a/workflow/scripts/builder/gtf2protein_coding_genes.py b/workflow/scripts/builder/gtf2protein_coding_genes.py index f74e2f5..1f0fcc2 100644 --- a/workflow/scripts/builder/gtf2protein_coding_genes.py +++ b/workflow/scripts/builder/gtf2protein_coding_genes.py @@ -9,55 +9,58 @@ def get_value(mykey, lookup): - try: - myvalue = lookup[mykey] - except KeyError: - myvalue = '' - return myvalue.strip('"').strip("'") + try: + myvalue = lookup[mykey] + except KeyError: + myvalue = "" + return myvalue.strip('"').strip("'") def seperated(pairslist): - for kv in pairslist: - k = kv.split(' ')[0] - v = " ".join(kv.split(' ')[1:]).rstrip(';') - yield k,v + for kv in pairslist: + k = kv.split(" ")[0] + v = " ".join(kv.split(" ")[1:]).rstrip(";") + yield k, v def get_id_and_type(last_column): - pairs = {} - kv_pairs_list = last_column.strip().split('; ') - - for k,v in seperated(kv_pairs_list): - pairs[k] = v - - gene_id = get_value('gene_id', pairs) - gene_type = get_value('gene_type', pairs) - if not gene_type: 
- # gene_type does not exist - # default to using gene_biotype - gene_type = get_value('gene_biotype', pairs) - - return gene_id, gene_type - -if __name__ == '__main__': - - - if len(sys.argv) != 2: - print('Usage: python {} genes.gtf > protein_coding_genes.lst'.format(sys.argv[0])) - print('\nError: failed to provide all positional arguments!', file=sys.stderr) - sys.exit(1) - - protein_coding_genes = [] - with open(sys.argv[1]) as file: - for line in file: - if line.startswith('#'): - # Skip over comments in header section - continue - - linelist = line.strip().split("\t") - if linelist[2]=="gene": - # Get gene_id and gene_type - gene_id, gene_type = get_id_and_type(last_column = linelist[-1]) - if gene_type=="protein_coding": - protein_coding_genes.append(gene_id) - print(gene_id,) + pairs = {} + kv_pairs_list = last_column.strip().split("; ") + + for k, v in seperated(kv_pairs_list): + pairs[k] = v + + gene_id = get_value("gene_id", pairs) + gene_type = get_value("gene_type", pairs) + if not gene_type: + # gene_type does not exist + # default to using gene_biotype + gene_type = get_value("gene_biotype", pairs) + + return gene_id, gene_type + + +if __name__ == "__main__": + if len(sys.argv) != 2: + print( + "Usage: python {} genes.gtf > protein_coding_genes.lst".format(sys.argv[0]) + ) + print("\nError: failed to provide all positional arguments!", file=sys.stderr) + sys.exit(1) + + protein_coding_genes = [] + with open(sys.argv[1]) as file: + for line in file: + if line.startswith("#"): + # Skip over comments in header section + continue + + linelist = line.strip().split("\t") + if linelist[2] == "gene": + # Get gene_id and gene_type + gene_id, gene_type = get_id_and_type(last_column=linelist[-1]) + if gene_type == "protein_coding": + protein_coding_genes.append(gene_id) + print( + gene_id, + ) diff --git a/workflow/scripts/builder/jsonmaker.py b/workflow/scripts/builder/jsonmaker.py index bdeaebe..89041af 100644 --- a/workflow/scripts/builder/jsonmaker.py +++ b/workflow/scripts/builder/jsonmaker.py @@ -1,31 +1,50 @@ import json -input_fa="/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/indexes/mm10.fa" -input_gtf="/data/CCBR_Pipeliner/db/PipeDB/Indices/GTFs/mm10/gencode.vM21.annotation.gtf" -params_workdir="/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_M21" -params_genome="mm10_M21" -output_json=params_workdir+"/"+params_genome+".json" -bigdict=dict() -bigdict["references"]=dict() + +input_fa = "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/indexes/mm10.fa" +input_gtf = ( + "/data/CCBR_Pipeliner/db/PipeDB/Indices/GTFs/mm10/gencode.vM21.annotation.gtf" +) +params_workdir = "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_M21" +params_genome = "mm10_M21" +output_json = params_workdir + "/" + params_genome + ".json" +bigdict = dict() +bigdict["references"] = dict() for i in ["exomeseq", "genomeseq", "rnaseq", "rnaseqvargerm", "ChIPseq"]: - bigdict["references"][i]=dict() -bigdict["references"]["rnaseq"]["GENOMEFILE"]=input_fa -bigdict["references"]["rnaseq"]["GENOME"]=input_fa -bigdict["references"]["rnaseq"]["GTFFILE"]=input_gtf -bigdict["references"]["rnaseq"]["STARDIR"]=params_workdir+"/STAR/2.7.0f/genes-" -bigdict["references"]["rnaseq"]["STARREF"]=params_workdir+"/STAR/2.7.0f/genes-" -bigdict["references"]["rnaseq"]["ANNOTATE"]=params_workdir+"/annotate.genes.txt" -bigdict["references"]["rnaseq"]["ANNOTATEISOFORMS"]=params_workdir+"/annotate.isoforms.txt" -bigdict["references"]["rnaseq"]["REFFLAT"]=params_workdir+"/refFlat.txt" 
-bigdict["references"]["rnaseq"]["BEDREF"]=params_workdir+"/genes.ref.bed" -bigdict["references"]["rnaseq"]["GENEINFO"]=params_workdir+"/geneinfo.bed" -bigdict["references"]["rnaseq"]["KARYOBEDS"]=params_workdir+"/karyobeds" -bigdict["references"]["rnaseq"]["RSEMREF"]=params_workdir+"/rsemref/"+params_genome -bigdict["references"]["rnaseq"]["RRNALIST"]=params_workdir+"/"+params_genome+".rRNA_interval_list" -bigdict["references"]["rnaseq"]["FASTQ_SCREEN_CONFIG"]="/data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen.conf" -bigdict["references"]["rnaseq"]["FASTAWITHADAPTERSETC"]="/data/CCBR_Pipeliner/db/PipeDB/dev/TruSeq_and_nextera_adapters_new.fa" -bigdict["references"]["rnaseq"]["adapter.file"]="/data/CCBR_Pipeliner/db/PipeDB/dev/TruSeq_and_nextera_adapters.ngsqc.dat" -bigdict["references"]["rnaseq"]["trimmomatic.adapters"]="/data/CCBR_Pipeliner/db/PipeDB/dev/adapters2.fa" -bigdict["references"]["rnaseq"]["fastqc.adapters"]="/data/CCBR_Pipeliner/db/PipeDB/dev/fastqc.adapters" -bigdict["references"]["rnaseq"]["ORGANISM"]="MOUSE" -with open(output_json, 'w') as fp: + bigdict["references"][i] = dict() +bigdict["references"]["rnaseq"]["GENOMEFILE"] = input_fa +bigdict["references"]["rnaseq"]["GENOME"] = input_fa +bigdict["references"]["rnaseq"]["GTFFILE"] = input_gtf +bigdict["references"]["rnaseq"]["STARDIR"] = params_workdir + "/STAR/2.7.0f/genes-" +bigdict["references"]["rnaseq"]["STARREF"] = params_workdir + "/STAR/2.7.0f/genes-" +bigdict["references"]["rnaseq"]["ANNOTATE"] = params_workdir + "/annotate.genes.txt" +bigdict["references"]["rnaseq"]["ANNOTATEISOFORMS"] = ( + params_workdir + "/annotate.isoforms.txt" +) +bigdict["references"]["rnaseq"]["REFFLAT"] = params_workdir + "/refFlat.txt" +bigdict["references"]["rnaseq"]["BEDREF"] = params_workdir + "/genes.ref.bed" +bigdict["references"]["rnaseq"]["GENEINFO"] = params_workdir + "/geneinfo.bed" +bigdict["references"]["rnaseq"]["KARYOBEDS"] = params_workdir + "/karyobeds" +bigdict["references"]["rnaseq"]["RSEMREF"] = ( + params_workdir + "/rsemref/" + params_genome +) +bigdict["references"]["rnaseq"]["RRNALIST"] = ( + params_workdir + "/" + params_genome + ".rRNA_interval_list" +) +bigdict["references"]["rnaseq"][ + "FASTQ_SCREEN_CONFIG" +] = "/data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen.conf" +bigdict["references"]["rnaseq"][ + "FASTAWITHADAPTERSETC" +] = "/data/CCBR_Pipeliner/db/PipeDB/dev/TruSeq_and_nextera_adapters_new.fa" +bigdict["references"]["rnaseq"][ + "adapter.file" +] = "/data/CCBR_Pipeliner/db/PipeDB/dev/TruSeq_and_nextera_adapters.ngsqc.dat" +bigdict["references"]["rnaseq"][ + "trimmomatic.adapters" +] = "/data/CCBR_Pipeliner/db/PipeDB/dev/adapters2.fa" +bigdict["references"]["rnaseq"][ + "fastqc.adapters" +] = "/data/CCBR_Pipeliner/db/PipeDB/dev/fastqc.adapters" +bigdict["references"]["rnaseq"]["ORGANISM"] = "MOUSE" +with open(output_json, "w") as fp: json.dump(bigdict, fp, indent=4) diff --git a/workflow/scripts/builder/make_geneinfo.py b/workflow/scripts/builder/make_geneinfo.py index 7a0f8b4..ef49dbf 100644 --- a/workflow/scripts/builder/make_geneinfo.py +++ b/workflow/scripts/builder/make_geneinfo.py @@ -1,25 +1,39 @@ from __future__ import print_function import sys -gtf=sys.argv[1] +gtf = sys.argv[1] -annotate_genes={a[0]:a for a in map(lambda x:x.strip().replace('"','').split(" "),open("annotate.genes.txt").readlines())} +annotate_genes = { + a[0]: a + for a in map( + lambda x: x.strip().replace('"', "").split(" "), + open("annotate.genes.txt").readlines(), + ) +} -for l in list(filter(lambda 
x:x[2]=="gene",filter(lambda x:not x[0].startswith("#"),list(map(lambda x:x.strip().split("\t"),open(gtf).readlines()))))): - newl=[] - newl.append(l[0]) - newl.append(l[3]) - newl.append(l[4]) - newl.append(l[6]) - col9=l[8].split(" ") - gene_id_index=col9.index("gene_id") - gene_id=col9[gene_id_index+1].strip(";").strip("\"") - newl.append(gene_id) - try: - #newl.append(annotate_genes["\""+gene_id+"\""][2].strip("\"")) - newl.append(annotate_genes[gene_id][2].strip("\"")) - except IndexError: - print(gene_id) - exit() - newl.append(annotate_genes[gene_id][1].strip("\"")) - print("\t".join(newl)) +for l in list( + filter( + lambda x: x[2] == "gene", + filter( + lambda x: not x[0].startswith("#"), + list(map(lambda x: x.strip().split("\t"), open(gtf).readlines())), + ), + ) +): + newl = [] + newl.append(l[0]) + newl.append(l[3]) + newl.append(l[4]) + newl.append(l[6]) + col9 = l[8].split(" ") + gene_id_index = col9.index("gene_id") + gene_id = col9[gene_id_index + 1].strip(";").strip('"') + newl.append(gene_id) + try: + # newl.append(annotate_genes["\""+gene_id+"\""][2].strip("\"")) + newl.append(annotate_genes[gene_id][2].strip('"')) + except IndexError: + print(gene_id) + exit() + newl.append(annotate_genes[gene_id][1].strip('"')) + print("\t".join(newl)) diff --git a/workflow/scripts/builder/make_refFlat.py b/workflow/scripts/builder/make_refFlat.py index 48febd4..276bb59 100644 --- a/workflow/scripts/builder/make_refFlat.py +++ b/workflow/scripts/builder/make_refFlat.py @@ -1,36 +1,45 @@ - -#while read a b c d e f g h i j k l;do -#a1=`grep -m1 $d hs38d1.annotate.isoforms.txt|awk '{print $2}'`; -#i1=`python add2all.py $l $b|sed "s/ //g"` -#j1=`python sum_lists.py $i1 $k|sed "s/ //g"` -#echo -e "$a1\t$d\t$a\t$f\t$b\t$c\t$g\t$h\t$i1\t$j1" -#done< genes.ref.bed|head +# while read a b c d e f g h i j k l;do +# a1=`grep -m1 $d hs38d1.annotate.isoforms.txt|awk '{print $2}'`; +# i1=`python add2all.py $l $b|sed "s/ //g"` +# j1=`python sum_lists.py $i1 $k|sed "s/ //g"` +# echo -e "$a1\t$d\t$a\t$f\t$b\t$c\t$g\t$h\t$i1\t$j1" +# done< genes.ref.bed|head import sys -def add2all(a,b): - lst=a.strip().split(",") - lst.pop(-1) - return ",".join([str(int(x)+int(b)) for x in lst])+"," -def sum_lists(a,b): - lst1=a.strip().split(",") - lst1.pop(-1) - lst2=b.strip().split(",") - lst2.pop(-1) - return ",".join([str(int(a)+int(b)) for a,b in zip(lst1,lst2)])+"," -tid2genename={a[0]: a[1] for a in map(lambda x:x.strip().split(" "),open("annotate.isoforms.txt").readlines())} -for l in map(lambda x:x.strip().split("\t"),open("genes.ref.bed").readlines()): - newl=[] - newl.append(tid2genename["\""+l[3]+"\""].strip("\"")) - newl.append(l[3]) - newl.append(l[0]) - newl.append(l[5]) - newl.append(l[1]) - newl.append(l[2]) - newl.append(l[6]) - newl.append(l[7]) - newl.append(l[9]) - lst=add2all(l[11],l[1]) - newl.append(lst) - newl.append(sum_lists(l[10],lst)) - print("\t".join(newl)) +def add2all(a, b): + lst = a.strip().split(",") + lst.pop(-1) + return ",".join([str(int(x) + int(b)) for x in lst]) + "," + + +def sum_lists(a, b): + lst1 = a.strip().split(",") + lst1.pop(-1) + lst2 = b.strip().split(",") + lst2.pop(-1) + return ",".join([str(int(a) + int(b)) for a, b in zip(lst1, lst2)]) + "," + + +tid2genename = { + a[0]: a[1] + for a in map( + lambda x: x.strip().split(" "), open("annotate.isoforms.txt").readlines() + ) +} + +for l in map(lambda x: x.strip().split("\t"), open("genes.ref.bed").readlines()): + newl = [] + newl.append(tid2genename['"' + l[3] + '"'].strip('"')) + newl.append(l[3]) + 
newl.append(l[0]) + newl.append(l[5]) + newl.append(l[1]) + newl.append(l[2]) + newl.append(l[6]) + newl.append(l[7]) + newl.append(l[9]) + lst = add2all(l[11], l[1]) + newl.append(lst) + newl.append(sum_lists(l[10], lst)) + print("\t".join(newl)) diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py index baeb129..c0cd7b6 100644 --- a/workflow/scripts/common.py +++ b/workflow/scripts/common.py @@ -1,5 +1,6 @@ # Common helper functions shared across the entire workflow + def check_existence(filename): """Checks if file exists on filesystem :param filename : Name of file to check @@ -13,8 +14,12 @@ def check_readaccess(filename): :param filename : Name of file to check """ check_existence(filename) - if not os.access(filename,os.R_OK): - sys.exit("File: {} exists, but user cannot read from file due to permissions!".format(filename)) + if not os.access(filename, os.R_OK): + sys.exit( + "File: {} exists, but user cannot read from file due to permissions!".format( + filename + ) + ) def check_writeaccess(filename): @@ -22,39 +27,45 @@ def check_writeaccess(filename): :param filename : Name of file to check """ check_existence(filename) - if not os.access(filename,os.W_OK): - sys.exit("File: {} exists, but user cannot write to file due to permissions!".format(filename)) + if not os.access(filename, os.W_OK): + sys.exit( + "File: {} exists, but user cannot write to file due to permissions!".format( + filename + ) + ) def references(config, pipeline, reflist): - ''' + """ Checks if a set of required reference files were provided. Some rules depend on a set of required reference files that may only exist for specific reference genomes. An example of this would be blasklists arriba. The blacklist are manually curated and only exist for a few reference genomes (mm10, hg38, hg19). If one of the required reference files does not exist, then it will return an empty list. - ''' + """ _all = True for ref in reflist: - try: tmp = config['references'][pipeline][ref] + try: + tmp = config["references"][pipeline][ref] # Check if ref exists in config except KeyError: _all = False break # Check if ref is empty key string - if not tmp: _all = False + if not tmp: + _all = False return _all def provided(samplelist, condition): - ''' + """ Determines if optional rules should run. If an empty list is provided to rule all, snakemake will not try to generate that set of target files. If a given condition is not met (i.e. False) then it will not try to run that rule. - ''' + """ if not condition: # If condition is False, returns an empty list to prevent rule from running @@ -64,7 +75,7 @@ def provided(samplelist, condition): def s3_configured(uri): - ''' + """ Determines if user can access s3 object using their credentials saved in "~/.aws/credentials" or "~/.boto". This handles an edge case where a user has aws configure on the target system but the AWS Access Key Id provided in @@ -74,13 +85,13 @@ def s3_configured(uri): :param uri : URI/URL to object in S3 bucket :return accessible : True if user can access S3 object, False if user cannot access the object (403) - ''' + """ import boto3 import botocore import re # Get bucket and key from s3 uri - parsed = re.match(r's3:\/\/(.+?)\/(.+)', uri) + parsed = re.match(r"s3:\/\/(.+?)\/(.+)", uri) bucket, key = parsed.groups() accessible = True @@ -97,7 +108,7 @@ def s3_configured(uri): def abstract_location(file_address, *args, **kwargs): - ''' + """ Determines if a provided file or list of file(s) resides in a remote location. 
If file(s) are determined to reside in remote store, like a S3 or Google Cloud Storage, Snakemake's remote wrapper is used to defined remote files. @@ -106,11 +117,15 @@ def abstract_location(file_address, *args, **kwargs): Supported remotes file options include: s3, gs, and sftp Input: File path or a list or file paths list[] Output: List of files or remote objects - ''' + """ # Check if user provided any input if not file_address or file_address is None: - raise IOError("Failed to provide any input files! Input(s) are required to resolve required files.".format(file_address)) + raise IOError( + "Failed to provide any input files! Input(s) are required to resolve required files.".format( + file_address + ) + ) # If given file path to one file, convert it a list[] file_list = [file_address] if isinstance(file_address, str) else file_address @@ -118,7 +133,7 @@ def abstract_location(file_address, *args, **kwargs): # Loop through list of provided files, and if a remote storage option # is given, convert its index to a remote file object. for i, uri in enumerate(file_list): - if uri.lower().startswith('s3://'): + if uri.lower().startswith("s3://"): # Remote option for S3 storage import snakemake.remote.S3 import botocore.session @@ -134,18 +149,22 @@ def abstract_location(file_address, *args, **kwargs): # s3 bucket are configured correctly. # If a file in provieded as input to a Snakemake rule, only read # access is needed to access the remote S3 object. - remote_provider = snakemake.remote.S3.RemoteProvider(config=botocore.client.Config(signature_version=botocore.UNSIGNED)) + remote_provider = snakemake.remote.S3.RemoteProvider( + config=botocore.client.Config(signature_version=botocore.UNSIGNED) + ) file_list[i] = remote_provider.remote(uri, *args, **kwargs) - elif uri.lower().startswith('gs://'): + elif uri.lower().startswith("gs://"): # Remote option for Google Cloud Storage import snakemake.remote.GS + remote_provider = snakemake.remote.GS.RemoteProvider() file_list[i] = remote_provider.remote(uri, *args, **kwargs) - elif uri.lower().startswith('sftp://'): + elif uri.lower().startswith("sftp://"): # Remote option for SFTP transfers import snakemake.remote.SFTP + remote_provider = snakemake.remote.SFTP.RemoteProvider() file_list[i] = remote_provider.remote(uri, *args, **kwargs) @@ -153,24 +172,24 @@ def abstract_location(file_address, *args, **kwargs): def allocated(resource, rule, lookup, default="__default__"): - """Pulls resource information for a given rule. If a rule does not have any information + """Pulls resource information for a given rule. If a rule does not have any information for a given resource type, then it will pull from the default. Information is pulled from - definitions in the cluster.json (which is used a job submission). This ensures that any + definitions in the cluster.json (which is used a job submission). This ensures that any resources used at runtime mirror the resources that were allocated. :param resource : resource type to look in cluster.json (i.e. threads, mem, time, gres) :param rule : rule to lookup its information :param lookup : Lookup containing allocation information (i.e. 
cluster.json) :param default : default information to use if rule information cannot be found - :return allocation : + :return allocation : allocation information for a given resource type for a given rule """ - try: + try: # Try to get allocation information # for a given rule allocation = lookup[rule][resource] except KeyError: # Use default allocation information allocation = lookup[default][resource] - - return allocation \ No newline at end of file + + return allocation diff --git a/workflow/scripts/create_tin_matrix.py b/workflow/scripts/create_tin_matrix.py index ecb634f..632b6c4 100644 --- a/workflow/scripts/create_tin_matrix.py +++ b/workflow/scripts/create_tin_matrix.py @@ -6,49 +6,46 @@ def create(file, tin_dict, key_index=0, parse_index=4): - """Populates the TIN nested dictionary - @param file : Path to RSEQC output file with TIN values to extract - @param tin_dict : Dictionary to populate where [samplebasename][transcriptid] = tin_value - @param key_index : Index of the field to join multiple files - @param parse_index : Index of field of interest (i.e. TIN value) - """ - - with open(file, 'r') as fh: - header = next(fh).strip().split('\t') - colid = header[key_index] - file = os.path.basename(file) # Remove PATH - sample = file.split(".star_rg_added.sorted.dmark.tin.xls")[0] - - for line in fh: - linelist = line.strip().split('\t') - tid = linelist[key_index] - tinvalue = linelist[parse_index] - if sample not in tin_dict: - tin_dict[sample] = {} - - tin_dict[sample][tid] = tinvalue - - return colid, tin_dict - - - - -if __name__ == '__main__': - - # Get filenames to parse - args = sys.argv - files = sys.argv[1:] - - # Check if at least two files were provided - if not len(args) >= 2: - print("FATAL: Failed to provide more than one input file!") - sys.exit("Usage:\n python {} *.tin.xls > combinedTIN.tsv".format(args[0])) - - # Populate tins with TINS values for all transcripts across all samples - tins = {} - for file in files: - keycolname, tins = create(file, tins) - - df = pandas.DataFrame(tins) - # Print dataframe to standard output - df.to_csv(sys.stdout, sep="\t", header=True, index=True, index_label = keycolname) + """Populates the TIN nested dictionary + @param file : Path to RSEQC output file with TIN values to extract + @param tin_dict : Dictionary to populate where [samplebasename][transcriptid] = tin_value + @param key_index : Index of the field to join multiple files + @param parse_index : Index of field of interest (i.e. 
TIN value) + """ + + with open(file, "r") as fh: + header = next(fh).strip().split("\t") + colid = header[key_index] + file = os.path.basename(file) # Remove PATH + sample = file.split(".star_rg_added.sorted.dmark.tin.xls")[0] + + for line in fh: + linelist = line.strip().split("\t") + tid = linelist[key_index] + tinvalue = linelist[parse_index] + if sample not in tin_dict: + tin_dict[sample] = {} + + tin_dict[sample][tid] = tinvalue + + return colid, tin_dict + + +if __name__ == "__main__": + # Get filenames to parse + args = sys.argv + files = sys.argv[1:] + + # Check if at least two files were provided + if not len(args) >= 2: + print("FATAL: Failed to provide more than one input file!") + sys.exit("Usage:\n python {} *.tin.xls > combinedTIN.tsv".format(args[0])) + + # Populate tins with TINS values for all transcripts across all samples + tins = {} + for file in files: + keycolname, tins = create(file, tins) + + df = pandas.DataFrame(tins) + # Print dataframe to standard output + df.to_csv(sys.stdout, sep="\t", header=True, index=True, index_label=keycolname) diff --git a/workflow/scripts/do_run_rMATS b/workflow/scripts/do_run_rMATS index f2f4e73..d10fee4 100755 --- a/workflow/scripts/do_run_rMATS +++ b/workflow/scripts/do_run_rMATS @@ -11,28 +11,28 @@ USAGE: $ ./do_run_rMATS [-h] \\ [--skip-index] SYNOPSIS: - Convience script to run rMATS with an RENEE output directory. A + Convience script to run rMATS with an RENEE output directory. A user just needs to create a 'groups.tab' and 'contrasts.tab' file in the RENEE output directory of interest. This script will run rMATS Turbo for each comparsion defined in 'contrasts.tab' file. The '--skip-index' - option can be provided if this script has already been run to generate + option can be provided if this script has already been run to generate an STAR index for rMATS. OPTIONS: - -s, --skip-index [Type: Bool] Skip building STAR's Index. - WARNING: there be dragens here! This option + -s, --skip-index [Type: Bool] Skip building STAR's Index. + WARNING: there be dragens here! This option should only be provided if ./do_run_rMATS has already built an index in the specified output - directory in the past. You may want to provide + directory in the past. You may want to provide this option if rMATS failed to run for some other weird issue AND you do not want to waste time rebuilding the index. Do not provide this - option if you do not know what you are doing! - Running rMATS without its index for STAR will - cause rMATS to fail. + option if you do not know what you are doing! + Running rMATS without its index for STAR will + cause rMATS to fail. -h, --help [Type: Bool] Displays usage and help information. Example: $ cd /path/to/RENEE/output/directory - $ nano groups.tab # create a groups.tab file similar to Pipeliner + $ nano groups.tab # create a groups.tab file similar to Pipeliner $ nano contrasts.tab # create a contrasts.tab file similar to Pipeliner $ ./do_run_rMATS Version: @@ -42,11 +42,11 @@ EOF function create_groups(){ - # Creates required sample sheet to detect differential - # AS events with rMATS turbo. - + # Creates required sample sheet to detect differential + # AS events with rMATS turbo. 
+ # Create sample sheet from groups.tab and contrasts.tab - while read g1 g2; do + while read g1 g2; do # Get list of samples for the first group s1=$(awk \ -F '\t' \ @@ -60,11 +60,11 @@ function create_groups(){ -v group="$g2" \ -v wd="$PWD" \ '$2==group {print wd"/"$1".R1.fastq.gz:"wd"/"$1".R2.fastq.gz"}' \ - groups.tab | tr '\n' ','); + groups.tab | tr '\n' ','); # Create sample sheet for the first group - echo -e "${s1%,}" > "rMATS/${g1}_v_${g2}/s1.txt"; + echo -e "${s1%,}" > "rMATS/${g1}_v_${g2}/s1.txt"; # Create sample sheet for the second group - echo -e "${s2%,}" > "rMATS/${g1}_v_${g2}/s2.txt"; + echo -e "${s2%,}" > "rMATS/${g1}_v_${g2}/s2.txt"; done < contrasts.tab } @@ -74,10 +74,10 @@ function initalize(){ # $1 = output directory local wd="$1" - ( # Initialize a base output directory structure + ( # Initialize a base output directory structure cd ${wd}; mkdir -p "${wd}/rMATS" - while read g1 g2; do + while read g1 g2; do # Create an output directory for each contrast mkdir -p "${wd}/rMATS/${g1}_v_${g2}/"; done < contrasts.tab @@ -87,7 +87,7 @@ function initalize(){ function _get_read_length(){ # Get max read length for rMATS model - # $1 = MultiQC matrix with read lengths + # $1 = MultiQC matrix with read lengths local metadata="$1" cut -f6 "$metadata" \ | tail -n+2 \ @@ -109,7 +109,7 @@ function build_star_index(){ # Required for parsing config file module load jq > /dev/null 2>&1 - # Get maximum read length + # Get maximum read length # and create output directory for new index read_length=$(_get_read_length "Reports/multiqc_matrix.tsv") mkdir -p rMATS/STAR/2.7.6a/genes-${read_length} @@ -118,16 +118,16 @@ function build_star_index(){ gtf=$(jq .references.rnaseq.GTFFILE config.json) genome=$(jq .references.rnaseq.GENOME config.json) rl=$((${read_length}-1)) - + # Create sbacth script to build index cat << EOF > build_star_index_submit.sh #!/usr/bin/env bash -#SBATCH --cpus-per-task=32 +#SBATCH --cpus-per-task=32 #SBATCH --mem=64g -#SBATCH --gres=lscratch:250 -#SBATCH --time=8:00:00 -#SBATCH --parsable -#SBATCH -J "STAR_INDEX" +#SBATCH --gres=lscratch:250 +#SBATCH --time=8:00:00 +#SBATCH --parsable +#SBATCH -J "STAR_INDEX" #SBATCH --mail-type=BEGIN,END,FAIL set -euo pipefail @@ -156,13 +156,13 @@ function do_run_rMATS(){ # Submits job to run rMATS Turbo a contrast # $1 = Group 1 # $2 = Group 2 - # $3 = STAR Index + # $3 = STAR Index # $4 = Output directory # $5 = Job dependency (slurm job id of build index) local gtf local read_length - local g1="${1}" - local g2="${2}" + local g1="${1}" + local g2="${2}" local star_index="${3}" local outdir="${4}" local dependency="${5:-}" @@ -171,25 +171,25 @@ function do_run_rMATS(){ # Required for parsing config file module load jq > /dev/null 2>&1 gtf=$(jq .references.rnaseq.GTFFILE config.json) - # Get maximum read length + # Get maximum read length read_length=$(_get_read_length "Reports/multiqc_matrix.tsv") # Create sbacth script to run rMATS cat << EOF > run_rmats_${g1}_${g2}.sh #!/usr/bin/env bash -#SBATCH --cpus-per-task=32 +#SBATCH --cpus-per-task=32 #SBATCH --mem=64g -#SBATCH --gres=lscratch:250 -#SBATCH --time=8:00:00 -#SBATCH --parsable -#SBATCH -J "rMATS_${g1}_${g2}" +#SBATCH --gres=lscratch:250 +#SBATCH --time=8:00:00 +#SBATCH --parsable +#SBATCH -J "rMATS_${g1}_${g2}" #SBATCH --mail-type=BEGIN,END,FAIL set -euo pipefail -module load rMATS/4.1.1 +module load rMATS/4.1.1 module load STAR/${star_version} -# Run rMATS for a given contrast +# Run rMATS for a given contrast # starting from untrimmed FastQ files python 
\${RMATS_SRC}/rmats.py \\ --s1 "${PWD}/rMATS/${g1}_v_${g2}/s1.txt" \\ @@ -202,7 +202,7 @@ python \${RMATS_SRC}/rmats.py \\ -t "paired" \\ --tmp /lscratch/\${SLURM_JOB_ID}/ EOF - + chmod +x run_rmats_${g1}_${g2}.sh if [ -z "${dependency}" ]; then # No job dependency @@ -215,11 +215,11 @@ EOF function main(){ # Pseudo main method - # Change directory to script's working directory + # Change directory to script's working directory # (i.e pipeline output directory) cd "$(dirname "${BASH_SOURCE[0]}")" - # Parser command line arguments + # Parser command line arguments # Associative array to store parsed args declare -Ag args args["skip_index"]=false # default behavior does build index! @@ -235,33 +235,32 @@ function main(){ # Step 1. Create a base output directory hierarchy initalize "${PWD}" - + # Step 2. Create samples sheet for each contrast # from a groups.tab file and contrasts.tab file - create_groups - - # Step 3. Build STAR Index + create_groups + + # Step 3. Build STAR Index if [ "${args["skip_index"]}" = false ]; then build_job_id=$(build_star_index) fi - # Get index for the correct read length + # Get index for the correct read length read_length_index=$(_get_read_length "Reports/multiqc_matrix.tsv" \ | awk -v wd="$PWD" \ '{print wd"/rMATS/STAR/2.7.6a/genes-"$1"/"}') - while read g1 g2; do + while read g1 g2; do # Run rMATS for each constrast # $1 = Group 1 # $2 = Group 2 - # $3 = STAR Index + # $3 = STAR Index # $4 = Output directory # $5 = Job dependency (slurm job id of build index) do_run_rMATS "${g1}" "${g2}" "${read_length_index}" "${PWD}/rMATS/${g1}_v_${g2}/" "${build_job_id:-}" - + done < contrasts.tab } main "$@" - diff --git a/workflow/scripts/files2spreadsheet.py b/workflow/scripts/files2spreadsheet.py index bb3ba19..0fb46a2 100644 --- a/workflow/scripts/files2spreadsheet.py +++ b/workflow/scripts/files2spreadsheet.py @@ -7,137 +7,136 @@ import os -def reader(filename, subset=[], skip='#', **kwargs): - """Reads in an MAF-like file as a dataframe. Determines the - correct handler for reading in a given MAF file. Supports reading - in TSV files (.tsv, .txt, .text, .vcf, or .maf), CSV files (.csv), - and excel files (.xls, .xlsx, .xlsm, .xlsb, .odf, .ods, .odt ). - The subset option allows a users to only select a few columns - given a list of column names. - @param filename : - Path of an MAF-like file to read and parse - @param subset list[]: - List of column names which can be used to subset the df - @param skip : - Skips over line starting with this character - @params kwargs - Key words to modify pandas.read_excel() function behavior - @return : - dataframe with spreadsheet contents - """ - # Get file extension - extension = os.path.splitext(filename)[-1].lower() - - # Assign a handler to read in the file - if extension in ['.xls', '.xlsx', '.xlsm', '.xlsb', '.odf', '.ods', '.odt']: - # Read in as an excel file - return excel(filename, subset, skip, **kwargs) - elif extension in ['.csv']: - # Read in as an CSV file - return csv(filename, subset, skip, **kwargs) - else: - # Default to reading in as an TSV file - # Tab is the normal delimeter for MAF or VCF files - # MAF files usually have one of the following - # extensions: '.tsv', '.txt', '.text', '.vcf', '.maf' - return tsv(filename, subset, skip, **kwargs) - - -def excel(filename, subset=[], skip='#', **kwargs): - """Reads in an excel file as a dataframe. The subset option - allows a users to only select a few columns given a list of - column names. 
- @param filename : - Path of an EXCEL file to read and parse - @param subset list[]: - List of column names which can be used to subset the df - @param skip : - Skips over line starting with this character - @params kwargs - Key words to modify pandas.read_excel() function behavior - @return : - dataframe with spreadsheet contents - """ - if subset: - return pd.read_excel(filename, comment=skip, **kwargs)[subset] - - return pd.read_excel(filename, comment=skip, **kwargs) - - -def tsv(filename, subset=[], skip='#', **kwargs): - """Reads in an TSV file as a dataframe. The subset option - allows a users to only select a few columns given a list of - column names. - @param filename : - Path of an TSV file to read and parse - @param subset list[]: - List of column names which can be used to subset the df - @param skip : - Skips over line starting with this character - @params kwargs - Key words to modify pandas.read_excel() function behavior - @return : - dataframe with spreadsheet contents - """ - if subset: - return pd.read_table(filename, comment=skip, **kwargs)[subset] - - return pd.read_table(filename, comment=skip, **kwargs) - - -def csv(filename, subset=[], skip='#', **kwargs): - """Reads in an CSV file as a dataframe. The subset option - allows a users to only select a few columns given a list of - column names. - @param filename : - Path of an CSV file to read and parse - @param subset list[]: - List of column names which can be used to subset the df - @param skip : - Skips over line starting with this character - @params kwargs - Key words to modify pandas.read_excel() function behavior - @return : - dataframe with spreadsheet contents - """ - if subset: - return pd.read_csv(filename, comment=skip, **kwargs)[subset] - - return pd.read_csv(filename, comment=skip, **kwargs) - - -def excel_writer(files, spreadsheet= 'test.xlsx'): - """Takes a list of files and creates one excel spreadsheet. - Each file will becomes a sheet in the spreadsheet where the - name of the sheet is the basename of the file with the extension - removed. - @param files list[]: - List of files to merge into one execl file - @param spreadsheet : - Output filename of the spreadsheet - """ - - writer = pd.ExcelWriter(spreadsheet, engine='xlsxwriter') - - # Create a spreadsheet from the contents of each file - for file in files: - print('Reading in {}'.format(file)) - df = reader(file) - sheet = os.path.splitext(os.path.basename(file))[0] - try: - # Sheet name cannot exceed 31 characters in length - df.to_excel(writer, sheet_name = sheet, index = False, freeze_panes = (1,0)) - except xlsxwriter.exceptions.InvalidWorksheetName as e: - df.to_excel(writer, sheet_name = sheet[:31], index = False,freeze_panes = (1,0)) - - writer.save() - - -if __name__ == '__main__': - - # List of file to convert into an excel file - files = sys.argv[1:-1] - # Output file name - outfh = sys.argv[-1] - - excel_writer(files, outfh) +def reader(filename, subset=[], skip="#", **kwargs): + """Reads in an MAF-like file as a dataframe. Determines the + correct handler for reading in a given MAF file. Supports reading + in TSV files (.tsv, .txt, .text, .vcf, or .maf), CSV files (.csv), + and excel files (.xls, .xlsx, .xlsm, .xlsb, .odf, .ods, .odt ). + The subset option allows a users to only select a few columns + given a list of column names. 
+ @param filename : + Path of an MAF-like file to read and parse + @param subset list[]: + List of column names which can be used to subset the df + @param skip : + Skips over line starting with this character + @params kwargs + Key words to modify pandas.read_excel() function behavior + @return : + dataframe with spreadsheet contents + """ + # Get file extension + extension = os.path.splitext(filename)[-1].lower() + + # Assign a handler to read in the file + if extension in [".xls", ".xlsx", ".xlsm", ".xlsb", ".odf", ".ods", ".odt"]: + # Read in as an excel file + return excel(filename, subset, skip, **kwargs) + elif extension in [".csv"]: + # Read in as an CSV file + return csv(filename, subset, skip, **kwargs) + else: + # Default to reading in as an TSV file + # Tab is the normal delimeter for MAF or VCF files + # MAF files usually have one of the following + # extensions: '.tsv', '.txt', '.text', '.vcf', '.maf' + return tsv(filename, subset, skip, **kwargs) + + +def excel(filename, subset=[], skip="#", **kwargs): + """Reads in an excel file as a dataframe. The subset option + allows a users to only select a few columns given a list of + column names. + @param filename : + Path of an EXCEL file to read and parse + @param subset list[]: + List of column names which can be used to subset the df + @param skip : + Skips over line starting with this character + @params kwargs + Key words to modify pandas.read_excel() function behavior + @return : + dataframe with spreadsheet contents + """ + if subset: + return pd.read_excel(filename, comment=skip, **kwargs)[subset] + + return pd.read_excel(filename, comment=skip, **kwargs) + + +def tsv(filename, subset=[], skip="#", **kwargs): + """Reads in an TSV file as a dataframe. The subset option + allows a users to only select a few columns given a list of + column names. + @param filename : + Path of an TSV file to read and parse + @param subset list[]: + List of column names which can be used to subset the df + @param skip : + Skips over line starting with this character + @params kwargs + Key words to modify pandas.read_excel() function behavior + @return : + dataframe with spreadsheet contents + """ + if subset: + return pd.read_table(filename, comment=skip, **kwargs)[subset] + + return pd.read_table(filename, comment=skip, **kwargs) + + +def csv(filename, subset=[], skip="#", **kwargs): + """Reads in an CSV file as a dataframe. The subset option + allows a users to only select a few columns given a list of + column names. + @param filename : + Path of an CSV file to read and parse + @param subset list[]: + List of column names which can be used to subset the df + @param skip : + Skips over line starting with this character + @params kwargs + Key words to modify pandas.read_excel() function behavior + @return : + dataframe with spreadsheet contents + """ + if subset: + return pd.read_csv(filename, comment=skip, **kwargs)[subset] + + return pd.read_csv(filename, comment=skip, **kwargs) + + +def excel_writer(files, spreadsheet="test.xlsx"): + """Takes a list of files and creates one excel spreadsheet. + Each file will becomes a sheet in the spreadsheet where the + name of the sheet is the basename of the file with the extension + removed. 
+ @param files list[]: + List of files to merge into one execl file + @param spreadsheet : + Output filename of the spreadsheet + """ + + writer = pd.ExcelWriter(spreadsheet, engine="xlsxwriter") + + # Create a spreadsheet from the contents of each file + for file in files: + print("Reading in {}".format(file)) + df = reader(file) + sheet = os.path.splitext(os.path.basename(file))[0] + try: + # Sheet name cannot exceed 31 characters in length + df.to_excel(writer, sheet_name=sheet, index=False, freeze_panes=(1, 0)) + except xlsxwriter.exceptions.InvalidWorksheetName as e: + df.to_excel(writer, sheet_name=sheet[:31], index=False, freeze_panes=(1, 0)) + + writer.save() + + +if __name__ == "__main__": + # List of file to convert into an excel file + files = sys.argv[1:-1] + # Output file name + outfh = sys.argv[-1] + + excel_writer(files, outfh) diff --git a/workflow/scripts/get_flowcell_lanes.py b/workflow/scripts/get_flowcell_lanes.py index 4d1dc59..678df9a 100644 --- a/workflow/scripts/get_flowcell_lanes.py +++ b/workflow/scripts/get_flowcell_lanes.py @@ -29,12 +29,17 @@ # +SRR6755966.1 1 length=101 # CC@FFFFFHHHHHJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJHIJJJJI -def usage(message = '', exitcode = 0): + +def usage(message="", exitcode=0): """Displays help and usage information. If provided invalid usage returns non-zero exit-code. Additional message can be displayed with the 'message' parameter. """ - print('Usage: python {} sampleName.R1.fastq.gz sampleName > sampleName.flowcell_lanes.txt'.format(sys.argv[0])) + print( + "Usage: python {} sampleName.R1.fastq.gz sampleName > sampleName.flowcell_lanes.txt".format( + sys.argv[0] + ) + ) if message: print(message) sys.exit(exitcode) @@ -45,7 +50,7 @@ def reader(fname): or non-gzipped FastQ files based on the file extension. Assumes gzipped files endwith the '.gz' extension. """ - if fname.endswith('.gz'): + if fname.endswith(".gz"): # Opens up file with gzip handler return gzip.open else: @@ -62,11 +67,11 @@ def get_flowcell_lane(sequence_identifer): IDs in its sequence indentifer. For more information visit: https://en.wikipedia.org/wiki/FASTQ_format """ - id_list = sequence_identifer.strip().split(':') + id_list = sequence_identifer.strip().split(":") if len(id_list) < 7: # No Flowcell IDs in this format # Return next instrument id instead (next best thing) - if sequence_identifer.startswith('@SRR'): + if sequence_identifer.startswith("@SRR"): # SRA format or downloaded SRA FastQ file # SRA format 1: contains machine and lane information # @SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36 @@ -79,20 +84,20 @@ def get_flowcell_lane(sequence_identifer): except IndexError: # SRA format 2 id1 = id_list[0].split()[0].split(".")[0] - id2 = id1.lstrip('@') - return id1,id2 + id2 = id1.lstrip("@") + return id1, id2 else: # Casava < 1.8 (fastq format) # @HWUSI-EAS100R:6:73:941:1973#0/1 - return id_list[0],id_list[1] + return id_list[0], id_list[1] else: # Casava >= 1.8 # Normal FastQ format # @J00170:88:HNYVJBBXX:8:1101:6390:1244 1:N:0:ACTTGA - return id_list[2],id_list[3] + return id_list[2], id_list[3] -def md5sum(filename, blocksize = 65536): +def md5sum(filename, blocksize=65536): """Gets md5checksum of a file in memory-safe manner. The file is read in blocks defined by the blocksize parameter. This is a safer option to reading the entire file into memory if the file is very large. 
@@ -106,7 +111,7 @@ def md5sum(filename, blocksize = 65536): import hashlib hasher = hashlib.md5() - with open(filename, 'rb') as fh: + with open(filename, "rb") as fh: buf = fh.read(blocksize) while len(buf) > 0: hasher.update(buf) @@ -115,13 +120,15 @@ def md5sum(filename, blocksize = 65536): return hasher.hexdigest() -if __name__ == '__main__': - +if __name__ == "__main__": # Check Usage - if '-h' in sys.argv or '--help' in sys.argv or '-help' in sys.argv: - usage(exitcode = 0) + if "-h" in sys.argv or "--help" in sys.argv or "-help" in sys.argv: + usage(exitcode=0) elif len(sys.argv) != 3: - usage(message = 'Error: failed to provide all required positional arguments!', exitcode = 1) + usage( + message="Error: failed to provide all required positional arguments!", + exitcode=1, + ) # Get file name and sample name prefix filename = sys.argv[1] @@ -131,22 +138,33 @@ def md5sum(filename, blocksize = 65536): # Get Flowcell and Lane information handle = reader(filename) - meta = {'flowcell': [], 'lane': [], 'flowcell_lane': []} + meta = {"flowcell": [], "lane": [], "flowcell_lane": []} i = 0 # keeps track of line number - with handle(filename, 'r') as file: - print('sample_name\ttotal_read_pairs\tflowcell_ids\tlanes\tflowcell_lanes\tmd5_checksum') + with handle(filename, "r") as file: + print( + "sample_name\ttotal_read_pairs\tflowcell_ids\tlanes\tflowcell_lanes\tmd5_checksum" + ) for line in file: line = line.strip() - if i%4 == 0: # read id or sequence identifer + if i % 4 == 0: # read id or sequence identifer fc, lane = get_flowcell_lane(line) - fc = fc.lstrip('@') - fc_lane = "{}_{}".format(fc,lane) - if fc not in meta['flowcell']: - meta['flowcell'].append(fc) - if lane not in meta['lane']: - meta['lane'].append(lane) - if fc_lane not in meta['flowcell_lane']: - meta['flowcell_lane'].append(fc_lane) + fc = fc.lstrip("@") + fc_lane = "{}_{}".format(fc, lane) + if fc not in meta["flowcell"]: + meta["flowcell"].append(fc) + if lane not in meta["lane"]: + meta["lane"].append(lane) + if fc_lane not in meta["flowcell_lane"]: + meta["flowcell_lane"].append(fc_lane) i += 1 - print("{}\t{}\t{}\t{}\t{}\t{}".format(sample, int(i/4),",".join(sorted(meta['flowcell'])),",".join(sorted(meta['lane'])),",".join(sorted(meta['flowcell_lane'])), md5)) + print( + "{}\t{}\t{}\t{}\t{}\t{}".format( + sample, + int(i / 4), + ",".join(sorted(meta["flowcell"])), + ",".join(sorted(meta["lane"])), + ",".join(sorted(meta["flowcell_lane"])), + md5, + ) + ) diff --git a/workflow/scripts/get_read_length.py b/workflow/scripts/get_read_length.py index 22a13ae..c07819f 100644 --- a/workflow/scripts/get_read_length.py +++ b/workflow/scripts/get_read_length.py @@ -1,27 +1,32 @@ from __future__ import print_function import sys, os, zipfile, glob -def get_max_read_length(f): - returnvalues = [] - zipf = zipfile.ZipFile(f) - foldername = os.path.splitext(os.path.basename(f))[0] - fastqc_data = zipf.open(foldername+"/fastqc_data.txt") - line = list(filter(lambda x:x.startswith(b'Sequence length'),list(map(lambda x:x.strip(),fastqc_data.readlines()))))[0] - if b'-' in line: - returnvalues.append(int(line.split(b'\t')[1].split(b'-')[1])) - else: - returnvalues.append(int(line.split(b'\t')[1])) +def get_max_read_length(f): + returnvalues = [] + zipf = zipfile.ZipFile(f) + foldername = os.path.splitext(os.path.basename(f))[0] + fastqc_data = zipf.open(foldername + "/fastqc_data.txt") + line = list( + filter( + lambda x: x.startswith(b"Sequence length"), + list(map(lambda x: x.strip(), fastqc_data.readlines())), + ) + )[0] - return 
max(returnvalues) + if b"-" in line: + returnvalues.append(int(line.split(b"\t")[1].split(b"-")[1])) + else: + returnvalues.append(int(line.split(b"\t")[1])) + return max(returnvalues) -if __name__ == '__main__': - basefolder = sys.argv[1] # QC or rawQC folder - fastqs = glob.glob(basefolder+"/*fastqc.zip") - read_lengths = [] - for f in fastqs: - read_lengths.append(get_max_read_length(f)) +if __name__ == "__main__": + basefolder = sys.argv[1] # QC or rawQC folder + fastqs = glob.glob(basefolder + "/*fastqc.zip") + read_lengths = [] + for f in fastqs: + read_lengths.append(get_max_read_length(f)) - print(max(read_lengths)) + print(max(read_lengths)) diff --git a/workflow/scripts/merge_rsem_results.py b/workflow/scripts/merge_rsem_results.py index 25fda4b..f40dedd 100644 --- a/workflow/scripts/merge_rsem_results.py +++ b/workflow/scripts/merge_rsem_results.py @@ -1,45 +1,61 @@ from __future__ import print_function from os.path import join from functools import reduce -import os,sys +import os, sys import pandas as pd def Counts(fpattern, searchpath, anno, ftype, mycols): - """ - Get each samples FPKM vaules from RSEMs *.RSEM.genes.results and *.RSEM.isoform.results - """ - # Collect RSEM Results - files = sorted(list(filter(lambda x: fpattern in x,os.listdir(searchpath)))) - # print(files) - for col in ["expected_count","TPM","FPKM"]: - dflist = [] - for f in files: - x=pd.read_csv(join(searchpath,f),sep="\t",usecols=mycols+[col]) - samplename=f.split(".RSEM")[0] - x.columns=mycols+[samplename] - dflist.append(x) - - mergeddf=reduce(lambda a,b:pd.merge(a,b,how="outer",on=mycols),dflist) - mergeddf=pd.merge(anno,mergeddf,on="gene_id") - mergeddf.fillna('UNKNOWN',inplace=True) - mergeddf=mergeddf.sort_values(by=['GeneName']) - outfile=join(searchpath, "RSEM." + ftype + "." + col + ".all_samples.txt") - mergeddf.to_csv(outfile,sep="\t",index=False) - - -if __name__ == '__main__': - # Parse Args - if len(sys.argv) != 4: - print(os.path.basename(__file__)+ ": Fail to provide all required arguments!") - exit("USAGE: python {} /path/to/annotate.genes.txt /path/to/rsem/genecounts/ /path/to/isoformcounts/".format(sys.argv[0])) - - ens2genefile = sys.argv[1] - rsemgenesfolder = sys.argv[2] - rsemisoformsfolder = sys.argv[3] - - annotations=pd.read_csv(ens2genefile,header=None,sep=" ",usecols=[0,2]) - annotations.columns=["gene_id","GeneName"] - - Counts(fpattern = "RSEM.genes.results", searchpath = rsemgenesfolder, anno = annotations, ftype = "genes", mycols = ["gene_id"]) - Counts(fpattern = "RSEM.isoforms.results", searchpath = rsemisoformsfolder, anno = annotations, ftype = "isoforms", mycols = ["transcript_id","gene_id"]) + """ + Get each samples FPKM vaules from RSEMs *.RSEM.genes.results and *.RSEM.isoform.results + """ + # Collect RSEM Results + files = sorted(list(filter(lambda x: fpattern in x, os.listdir(searchpath)))) + # print(files) + for col in ["expected_count", "TPM", "FPKM"]: + dflist = [] + for f in files: + x = pd.read_csv(join(searchpath, f), sep="\t", usecols=mycols + [col]) + samplename = f.split(".RSEM")[0] + x.columns = mycols + [samplename] + dflist.append(x) + + mergeddf = reduce(lambda a, b: pd.merge(a, b, how="outer", on=mycols), dflist) + mergeddf = pd.merge(anno, mergeddf, on="gene_id") + mergeddf.fillna("UNKNOWN", inplace=True) + mergeddf = mergeddf.sort_values(by=["GeneName"]) + outfile = join(searchpath, "RSEM." + ftype + "." 
+ col + ".all_samples.txt") + mergeddf.to_csv(outfile, sep="\t", index=False) + + +if __name__ == "__main__": + # Parse Args + if len(sys.argv) != 4: + print(os.path.basename(__file__) + ": Fail to provide all required arguments!") + exit( + "USAGE: python {} /path/to/annotate.genes.txt /path/to/rsem/genecounts/ /path/to/isoformcounts/".format( + sys.argv[0] + ) + ) + + ens2genefile = sys.argv[1] + rsemgenesfolder = sys.argv[2] + rsemisoformsfolder = sys.argv[3] + + annotations = pd.read_csv(ens2genefile, header=None, sep=" ", usecols=[0, 2]) + annotations.columns = ["gene_id", "GeneName"] + + Counts( + fpattern="RSEM.genes.results", + searchpath=rsemgenesfolder, + anno=annotations, + ftype="genes", + mycols=["gene_id"], + ) + Counts( + fpattern="RSEM.isoforms.results", + searchpath=rsemisoformsfolder, + anno=annotations, + ftype="isoforms", + mycols=["transcript_id", "gene_id"], + ) diff --git a/workflow/scripts/optimal_read_length.py b/workflow/scripts/optimal_read_length.py index 9c7f818..9150b54 100644 --- a/workflow/scripts/optimal_read_length.py +++ b/workflow/scripts/optimal_read_length.py @@ -4,34 +4,50 @@ def find_optimal_read_length(rl, dbrl): - '''Returns the best read length given a list of available STAR Indices''' - try: - best_index = next(x[1] for x in enumerate(dbrl) if x[1] >= rl) - except StopIteration: - # This could be due to not setting or incorrectly setting the singularity - # bind paths to the STAR indices for each read length. - # Please check how you are setting -B when running singularity. - print('Are you using singularity, and have you set the bind path to STAR indices?', file=sys.stderr) - sys.exit('Failed to find best index for STAR based on read length') - return best_index - - -if __name__ == '__main__': - # Parse required positional args - try: - readlength = sys.argv[1] # Max read length of all samples calculated from FastQC - stardir = sys.argv[2] # PATH to STAR Indices for different read lengths - except IndexError: - print('Failed to provide all required positional args!') - print('Example Usage: python {} QC/readlength.txt /data/CCBR_Pipeliner/db/PipeDB/Indices/hg38_30/STAR/2.7.0f/genes-'.format(sys.argv[0])) - sys.exit(1) - - # Get max read length of sample - my_read_length=int(open(readlength).readlines()[0].strip())-1 - - # Find all STAR Indice read lengths - star_read_lengths=sorted(list(map(lambda x:int(re.findall("genes-(\d+)",x)[0]), glob.glob(stardir+'*/')))) - - myrl = find_optimal_read_length(rl=my_read_length, dbrl=star_read_lengths) - - print(myrl) + """Returns the best read length given a list of available STAR Indices""" + try: + best_index = next(x[1] for x in enumerate(dbrl) if x[1] >= rl) + except StopIteration: + # This could be due to not setting or incorrectly setting the singularity + # bind paths to the STAR indices for each read length. + # Please check how you are setting -B when running singularity. 
+ print( + "Are you using singularity, and have you set the bind path to STAR indices?", + file=sys.stderr, + ) + sys.exit("Failed to find best index for STAR based on read length") + return best_index + + +if __name__ == "__main__": + # Parse required positional args + try: + readlength = sys.argv[ + 1 + ] # Max read length of all samples calculated from FastQC + stardir = sys.argv[2] # PATH to STAR Indices for different read lengths + except IndexError: + print("Failed to provide all required positional args!") + print( + "Example Usage: python {} QC/readlength.txt /data/CCBR_Pipeliner/db/PipeDB/Indices/hg38_30/STAR/2.7.0f/genes-".format( + sys.argv[0] + ) + ) + sys.exit(1) + + # Get max read length of sample + my_read_length = int(open(readlength).readlines()[0].strip()) - 1 + + # Find all STAR Indice read lengths + star_read_lengths = sorted( + list( + map( + lambda x: int(re.findall("genes-(\d+)", x)[0]), + glob.glob(stardir + "*/"), + ) + ) + ) + + myrl = find_optimal_read_length(rl=my_read_length, dbrl=star_read_lengths) + + print(myrl) diff --git a/workflow/scripts/pcacall.R b/workflow/scripts/pcacall.R index 581287a..52802ef 100644 --- a/workflow/scripts/pcacall.R +++ b/workflow/scripts/pcacall.R @@ -4,13 +4,12 @@ args <- commandArgs(trailingOnly = TRUE) DIR <- args[1] outHtml <- args[2] pcaRmd <- args[7] -Sys.setenv(RSTUDIO_PANDOC="/usr/local/apps/rstudio/rstudio-1.1.447/bin/pandoc/") -setwd(DIR) # new -rmarkdown::render(pcaRmd,output_file=outHtml, params = list( - folder = args[1], - sampleinfo = args[3], - data = args[4], - projectId = args[5], - projectDesc = args[6] - ) -) +Sys.setenv(RSTUDIO_PANDOC = "/usr/local/apps/rstudio/rstudio-1.1.447/bin/pandoc/") +setwd(DIR) # new +rmarkdown::render(pcaRmd, output_file = outHtml, params = list( + folder = args[1], + sampleinfo = args[3], + data = args[4], + projectId = args[5], + projectDesc = args[6] +)) diff --git a/workflow/scripts/phred_encoding.py b/workflow/scripts/phred_encoding.py index 5dd5ba6..6be4792 100755 --- a/workflow/scripts/phred_encoding.py +++ b/workflow/scripts/phred_encoding.py @@ -22,12 +22,12 @@ # AAAFFJJFJJJJJJFJJJJJJJJJJFJAJJJJJFJJJJJFFJJAJJJJ7JJ <- Determine if Phred-33 encoding or Phred-64 -def usage(message = '', exitcode = 0): +def usage(message="", exitcode=0): """Displays help and usage information. If provided invalid usage returns non-zero exit-code. Additional message can be displayed with the 'message' parameter. """ - print('Usage: python {} sampleName.R1.fastq.gz'.format(sys.argv[0])) + print("Usage: python {} sampleName.R1.fastq.gz".format(sys.argv[0])) if message: print(message) sys.exit(exitcode) @@ -38,7 +38,7 @@ def reader(fname): or non-gzipped FastQ files based on the file extension. Assumes gzipped files endwith the '.gz' extension. """ - if fname.endswith('.gz'): + if fname.endswith(".gz"): # Opens up file with gzip handler return gzip.open else: @@ -50,23 +50,73 @@ def decoded(qscore): """Returns Phred ASCII encoding type of FastQ quality scores. Older FastQ files may use Phred 64 encoding. 
""" - encoding = '' + encoding = "" # Unique set of characters across both Phred encoding types - encodings = { # Pred-33 Encoding characters - '!': '33', '#': '33', '"': '33', '%': '33', '$': '33', "'": '33', - '&': '33', ')': '33', '(': '33', '+': '33', '*': '33', '-': '33', - ',': '33', '/': '33', '.': '33', '1': '33', '0': '33', '3': '33', - '2': '33', '5': '33', '4': '33', '7': '33', '6': '33', '9': '33', - '8': '33', ';': '33', ':': '33', '=': '33', '<': '33', '?': '33', - '>': '33', - # Pred-64 Encoding characters - 'K': '64', 'M': '64', 'L': '64', 'O': '64', 'N': '64', 'Q': '64', - 'P': '64', 'S': '64', 'R': '64', 'U': '64', 'T': '64', 'W': '64', - 'V': '64', 'Y': '64', 'X': '64', '[': '64', 'Z': '64', ']': '64', - '\\': '64', '_': '64', '^': '64', 'a': '64', '`': '64', 'c': '64', - 'b': '64', 'e': '64', 'd': '64', 'g': '64', 'f': '64', 'i': '64', - 'h': '64' - } + encodings = { # Pred-33 Encoding characters + "!": "33", + "#": "33", + '"': "33", + "%": "33", + "$": "33", + "'": "33", + "&": "33", + ")": "33", + "(": "33", + "+": "33", + "*": "33", + "-": "33", + ",": "33", + "/": "33", + ".": "33", + "1": "33", + "0": "33", + "3": "33", + "2": "33", + "5": "33", + "4": "33", + "7": "33", + "6": "33", + "9": "33", + "8": "33", + ";": "33", + ":": "33", + "=": "33", + "<": "33", + "?": "33", + ">": "33", + # Pred-64 Encoding characters + "K": "64", + "M": "64", + "L": "64", + "O": "64", + "N": "64", + "Q": "64", + "P": "64", + "S": "64", + "R": "64", + "U": "64", + "T": "64", + "W": "64", + "V": "64", + "Y": "64", + "X": "64", + "[": "64", + "Z": "64", + "]": "64", + "\\": "64", + "_": "64", + "^": "64", + "a": "64", + "`": "64", + "c": "64", + "b": "64", + "e": "64", + "d": "64", + "g": "64", + "f": "64", + "i": "64", + "h": "64", + } for char in qscore: try: @@ -78,14 +128,15 @@ def decoded(qscore): return encoding - -if __name__ == '__main__': - +if __name__ == "__main__": # Check Arguments - if '-h' in sys.argv or '--help' in sys.argv or '-help' in sys.argv: - usage(exitcode = 0) + if "-h" in sys.argv or "--help" in sys.argv or "-help" in sys.argv: + usage(exitcode=0) elif len(sys.argv) != 2: - usage(message = 'Error: failed to provide all required positional arguments!', exitcode = 1) + usage( + message="Error: failed to provide all required positional arguments!", + exitcode=1, + ) # Get file name filename = sys.argv[1] @@ -93,21 +144,21 @@ def decoded(qscore): # Set handler for gzipped or uncompressed file handle = reader(filename) # Default encoding if not found - encoding = '33' + encoding = "33" # Open in 'rt' mode to maintain compatibility across python2 and python3 # python3 default mode is 'rb' and will return a byte string representation - with handle(filename, 'rt') as fastq: + with handle(filename, "rt") as fastq: i = 0 for line in fastq: line = line.strip() - if i%4 == 3: # Quality scores + if i % 4 == 3: # Quality scores encoded = decoded(line) if encoded: # Found Phred ASCII encoding type (33 vs. 
64) encoding = encoded - break # Stop Iteration - i+=1 + break # Stop Iteration + i += 1 # Print encoding to standard output print(encoding) diff --git a/workflow/scripts/pyparser.py b/workflow/scripts/pyparser.py index 3235efb..f045ad5 100644 --- a/workflow/scripts/pyparser.py +++ b/workflow/scripts/pyparser.py @@ -6,65 +6,104 @@ # Configuration for defining valid files, cleaning sample names, parse fields, rename fields # Add new files to parse and define their specifications below config = { - ".warning": ["\033[93m", "\033[00m"], ".error": ["\033[91m", "\033[00m"], - + ".warning": ["\033[93m", "\033[00m"], + ".error": ["\033[91m", "\033[00m"], ".rnaseq": { ".default": { ".output_preference": [ - "Sample", "Encoding", "total_read_pairs", "trimmed_read_pairs", - "avg_sequence_length", "sequence_length", "gc_content", "percent_duplication", - "percent_aligned","inner_distance_maxima", "median_insert_size", "mean_insert_size", "mean_mapping_quality", - "mean_coverage", "avg_aligned_read_length", "pct_mrna_bases", "pct_coding_bases", - "pct_intronic_bases", "pct_utr_bases", "pct_intergenic_bases", "median_cv_coverage", - "median_5prime_to_3prime_bias", "median_5prime_bias", "median_3prime_bias", - "rRNA_percent_aligned", "uni_vec_percent_aligned", "percent_antisense_strand", - "percent_sense_strand", "median_tin", "flowcell_lanes" + "Sample", + "Encoding", + "total_read_pairs", + "trimmed_read_pairs", + "avg_sequence_length", + "sequence_length", + "gc_content", + "percent_duplication", + "percent_aligned", + "inner_distance_maxima", + "median_insert_size", + "mean_insert_size", + "mean_mapping_quality", + "mean_coverage", + "avg_aligned_read_length", + "pct_mrna_bases", + "pct_coding_bases", + "pct_intronic_bases", + "pct_utr_bases", + "pct_intergenic_bases", + "median_cv_coverage", + "median_5prime_to_3prime_bias", + "median_5prime_bias", + "median_3prime_bias", + "rRNA_percent_aligned", + "uni_vec_percent_aligned", + "percent_antisense_strand", + "percent_sense_strand", + "median_tin", + "flowcell_lanes", ] } }, - - "multiqc_cutadapt.txt": { + "multiqc_cutadapt.txt": { "delimeter": "\t", - "clean_sample_name": ["\.R1$", "\.R2$"], - "parse_column": ["Sample", "pairs_processed", "r_processed"], - "rename_field": { - "pairs_processed": "total_read_pairs", - "r_processed": "total_read_pairs" - }, - "typecast": { - "total_read_pairs": int - } - }, - - "multiqc_fastqc.txt": { + "clean_sample_name": ["\.R1$", "\.R2$"], + "parse_column": ["Sample", "pairs_processed", "r_processed"], + "rename_field": { + "pairs_processed": "total_read_pairs", + "r_processed": "total_read_pairs", + }, + "typecast": {"total_read_pairs": int}, + }, + "multiqc_fastqc.txt": { "delimeter": "\t", - "clean_sample_name": ["^QC \\| ", "^rawQC \\| ", "\.trim$", "\.R1$", "\.R2$"], + "clean_sample_name": ["^QC \\| ", "^rawQC \\| ", "\.trim$", "\.R1$", "\.R2$"], "collapse": True, - "parse_column": ["Sample", "Encoding", "Total Sequences", "Sequence length", "%GC", "avg_sequence_length"], - "rename_field": { - "Total Sequences": "trimmed_read_pairs", - "Sequence length": "sequence_length", + "parse_column": [ + "Sample", + "Encoding", + "Total Sequences", + "Sequence length", + "%GC", + "avg_sequence_length", + ], + "rename_field": { + "Total Sequences": "trimmed_read_pairs", + "Sequence length": "sequence_length", "%GC": "gc_content", - }, - "typecast": { - "trimmed_read_pairs": int, - "avg_sequence_length": float - } - }, - - "multiqc_fastq_screen.txt": { + }, + "typecast": {"trimmed_read_pairs": int, "avg_sequence_length": 
float}, + }, + "multiqc_fastq_screen.txt": { "delimeter": "\t", - "clean_sample_name": ["^FQscreen \\| ", "^FQscreen2 \\| ", "_screen$", "\.trim$", "\.R1$", "\.R2$", "\.R1_2$", "\.R2_2$"], - "parse_column": ["Sample", "Uni_Vec percentage", "rRNA percentage", "Human percentage", "Mouse percentage", "Bacteria percentage", "Fungi percentage", "Virus percentage"], - "rename_field": { - "Uni_Vec percentage": "uni_vec_percent_aligned", - "rRNA percentage": "rRNA_percent_aligned", + "clean_sample_name": [ + "^FQscreen \\| ", + "^FQscreen2 \\| ", + "_screen$", + "\.trim$", + "\.R1$", + "\.R2$", + "\.R1_2$", + "\.R2_2$", + ], + "parse_column": [ + "Sample", + "Uni_Vec percentage", + "rRNA percentage", + "Human percentage", + "Mouse percentage", + "Bacteria percentage", + "Fungi percentage", + "Virus percentage", + ], + "rename_field": { + "Uni_Vec percentage": "uni_vec_percent_aligned", + "rRNA percentage": "rRNA_percent_aligned", "Human percentage": "human_percent_aligned", "Mouse percentage": "mouse_percent_aligned", "Bacteria percentage": "bacteria_percent_aligned", "Fungi percentage": "fungi_percent_aligned", - "Virus percentage": "virus_percent_aligned" - }, + "Virus percentage": "virus_percent_aligned", + }, "typecast": { "uni_vec_percent_aligned": float, "rRNA_percent_aligned": float, @@ -72,40 +111,43 @@ "mouse_percent_aligned": float, "bacteria_percent_aligned": float, "fungi_percent_aligned": float, - "virus_percent_aligned": float - } - }, - - "multiqc_picard_dups.txt": { - "delimeter": "\t", - "clean_sample_name": ["\.p2$"], - "parse_column": ["Sample", "PERCENT_DUPLICATION"], - "rename_field": { - "PERCENT_DUPLICATION": "percent_duplication" - }, - "typecast": { - "percent_duplication": float + "virus_percent_aligned": float, }, - "scaling_factor": { - "percent_duplication": 100.0 - }, - }, - - "multiqc_picard_RnaSeqMetrics.txt": { + }, + "multiqc_picard_dups.txt": { "delimeter": "\t", - "clean_sample_name": ["\.p2$"], - "parse_column": ["Sample", "PCT_CODING_BASES", "PCT_MRNA_BASES", "MEDIAN_CV_COVERAGE", "PCT_INTRONIC_BASES", "MEDIAN_3PRIME_BIAS", "MEDIAN_5PRIME_BIAS", "MEDIAN_5PRIME_TO_3PRIME_BIAS", "PCT_INTERGENIC_BASES", "PCT_UTR_BASES"], - "rename_field": { - "PCT_CODING_BASES": "pct_coding_bases", - "PCT_MRNA_BASES": "pct_mrna_bases", - "MEDIAN_CV_COVERAGE": "median_cv_coverage", - "PCT_INTRONIC_BASES": "pct_intronic_bases", - "MEDIAN_3PRIME_BIAS": "median_3prime_bias", - "MEDIAN_5PRIME_BIAS": "median_5prime_bias", - "MEDIAN_5PRIME_TO_3PRIME_BIAS": "median_5prime_to_3prime_bias", - "PCT_INTERGENIC_BASES": "pct_intergenic_bases", - "PCT_UTR_BASES": "pct_utr_bases" - }, + "clean_sample_name": ["\.p2$"], + "parse_column": ["Sample", "PERCENT_DUPLICATION"], + "rename_field": {"PERCENT_DUPLICATION": "percent_duplication"}, + "typecast": {"percent_duplication": float}, + "scaling_factor": {"percent_duplication": 100.0}, + }, + "multiqc_picard_RnaSeqMetrics.txt": { + "delimeter": "\t", + "clean_sample_name": ["\.p2$"], + "parse_column": [ + "Sample", + "PCT_CODING_BASES", + "PCT_MRNA_BASES", + "MEDIAN_CV_COVERAGE", + "PCT_INTRONIC_BASES", + "MEDIAN_3PRIME_BIAS", + "MEDIAN_5PRIME_BIAS", + "MEDIAN_5PRIME_TO_3PRIME_BIAS", + "PCT_INTERGENIC_BASES", + "PCT_UTR_BASES", + ], + "rename_field": { + "PCT_CODING_BASES": "pct_coding_bases", + "PCT_MRNA_BASES": "pct_mrna_bases", + "MEDIAN_CV_COVERAGE": "median_cv_coverage", + "PCT_INTRONIC_BASES": "pct_intronic_bases", + "MEDIAN_3PRIME_BIAS": "median_3prime_bias", + "MEDIAN_5PRIME_BIAS": "median_5prime_bias", + "MEDIAN_5PRIME_TO_3PRIME_BIAS": 
"median_5prime_to_3prime_bias", + "PCT_INTERGENIC_BASES": "pct_intergenic_bases", + "PCT_UTR_BASES": "pct_utr_bases", + }, "typecast": { "pct_coding_bases": float, "pct_mrna_bases": float, @@ -115,94 +157,93 @@ "median_5prime_bias": float, "median_5prime_to_3prime_bias": float, "pct_intergenic_bases": float, - "pct_utr_bases": float - } - }, - - "multiqc_rseqc_infer_experiment.txt": { + "pct_utr_bases": float, + }, + }, + "multiqc_rseqc_infer_experiment.txt": { "delimeter": "\t", - "clean_sample_name": ["^RSeQC \\| ", "\.strand\.info$","\.info\.strand$", "^output\.", "\.p2$"], - "parse_column": ["Sample", "pe_sense", "se_sense", "pe_antisense", "se_antisense"], - "rename_field": { - "pe_sense": "percent_sense_strand", + "clean_sample_name": [ + "^RSeQC \\| ", + "\.strand\.info$", + "\.info\.strand$", + "^output\.", + "\.p2$", + ], + "parse_column": [ + "Sample", + "pe_sense", + "se_sense", + "pe_antisense", + "se_antisense", + ], + "rename_field": { + "pe_sense": "percent_sense_strand", "se_sense": "percent_sense_strand", - "pe_antisense": "percent_antisense_strand", + "pe_antisense": "percent_antisense_strand", "se_antisense": "percent_antisense_strand", - }, - "typecast": { - "percent_sense_strand": float, - "percent_antisense_strand": float }, + "typecast": {"percent_sense_strand": float, "percent_antisense_strand": float}, "scaling_factor": { "percent_sense_strand": 100.0, - "percent_antisense_strand": 100.0 + "percent_antisense_strand": 100.0, }, - }, - + }, "rseqc_inner_distances.txt": { "delimeter": "\t", "clean_sample_name": ["\.inner_distance_freq\.txt$"], "parse_column": ["Sample", "Inner_Dist_Maxima"], - "rename_field": { - "Inner_Dist_Maxima": "inner_distance_maxima" - }, - "typecast": { - "inner_distance_maxima": float - }, + "rename_field": {"Inner_Dist_Maxima": "inner_distance_maxima"}, + "typecast": {"inner_distance_maxima": float}, }, - - "rseqc_median_tin.txt": { + "rseqc_median_tin.txt": { "delimeter": "\t", - "clean_sample_name": ["\.star_rg_added\.sorted\.dmark\.bam$"], - "parse_column": ["Sample", "median_tin"], - "typecast": { - "median_tin": float - } - }, - - "sample_group.txt": { + "clean_sample_name": ["\.star_rg_added\.sorted\.dmark\.bam$"], + "parse_column": ["Sample", "median_tin"], + "typecast": {"median_tin": float}, + }, + "sample_group.txt": { "delimeter": "\t", - "clean_sample_name": [], - "parse_column": ["Sample", "TissueType"], - }, - - "fastq_flowcell_lanes.txt": { + "clean_sample_name": [], + "parse_column": ["Sample", "TissueType"], + }, + "fastq_flowcell_lanes.txt": { "delimeter": "\t", - "clean_sample_name": [], - "parse_column": ["Sample", "flowcell_lanes"], - }, - - "multiqc_star.txt": { + "clean_sample_name": [], + "parse_column": ["Sample", "flowcell_lanes"], + }, + "multiqc_star.txt": { "delimeter": "\t", - "clean_sample_name": ["\.p2$"], - "parse_column": ["Sample", "uniquely_mapped_percent", "avg_input_read_length"], - "rename_field": { - "uniquely_mapped_percent": "percent_aligned", - "avg_input_read_length": "avg_aligned_read_length" - }, - "typecast": { - "percent_aligned": float, - "avg_aligned_read_length": int - } - }, - - "multiqc_qualimap_bamqc_genome_results.txt": { + "clean_sample_name": ["\.p2$"], + "parse_column": ["Sample", "uniquely_mapped_percent", "avg_input_read_length"], + "rename_field": { + "uniquely_mapped_percent": "percent_aligned", + "avg_input_read_length": "avg_aligned_read_length", + }, + "typecast": {"percent_aligned": float, "avg_aligned_read_length": int}, + }, + "multiqc_qualimap_bamqc_genome_results.txt": { 
"delimeter": "\t", - "clean_sample_name": ["\.p2$"], - "parse_column": ["Sample", "mean_insert_size", "median_insert_size", "mean_mapping_quality", "mean_coverage"], - "rename_field": {}, + "clean_sample_name": ["\.p2$"], + "parse_column": [ + "Sample", + "mean_insert_size", + "median_insert_size", + "mean_mapping_quality", + "mean_coverage", + ], + "rename_field": {}, "typecast": { "mean_insert_size": float, "median_insert_size": float, "mean_mapping_quality": float, - "mean_coverage": float - } - } + "mean_coverage": float, + }, + }, } def help(): - return """ + return """ pyparser.py - a config based file parser. USAGE: @@ -246,12 +287,17 @@ def args(argslist): *files, odir = argslist[1:] # Check for optional args - if '-h' in files or '--help' in files: + if "-h" in files or "--help" in files: print(help()) sys.exit(0) # Check to see if user provided input files to parse elif not files: - print("\n{}Error: Failed to provide input files to parse!{}".format(*config['.error']), file=sys.stderr) + print( + "\n{}Error: Failed to provide input files to parse!{}".format( + *config[".error"] + ), + file=sys.stderr, + ) print(help()) sys.exit(1) @@ -267,8 +313,12 @@ def isvalid(file): # Remove absolute or relateive PATH if os.path.basename(file) not in supported: - cstart, cend = config['.warning'] - print("{}Warning:{} {} is a not supported file to parse... Skipping over file!".format(cstart, cend, file)) + cstart, cend = config[".warning"] + print( + "{}Warning:{} {} is a not supported file to parse... Skipping over file!".format( + cstart, cend, file + ) + ) return False return True @@ -284,8 +334,12 @@ def exists(file): fh.close() # File cannot be opened for reading (may not exist) or permissions problem except IOError: - cstart, cend = config['.warning'] - print("{}Warning:{} Cannot open {}... File may not exist... Skipping over file!".format(cstart, cend, file)) + cstart, cend = config[".warning"] + print( + "{}Warning:{} Cannot open {}... File may not exist... Skipping over file!".format( + cstart, cend, file + ) + ) return False return True @@ -300,7 +354,7 @@ def column_indexes(line, filename, verbose=True): header = line # Remove file's PATH before cross-referencing config filename = os.path.basename(filename) - fields2parse = config[filename]["parse_column"] # Attributes or columns of interest + fields2parse = config[filename]["parse_column"] # Attributes or columns of interest # Get index of column to parse for i in range(0, len(header), 1): @@ -312,23 +366,28 @@ def column_indexes(line, filename, verbose=True): # Warning that an expected field could not be found fields_not_found = set(fields2parse) - set(found) for field in fields_not_found: - cstart, cend = config['.warning'] - print("{}Warning:{} Cannot find expected field '{}' in {}... skipping over parsing that field!".format(cstart, cend, field, filename)) + cstart, cend = config[".warning"] + print( + "{}Warning:{} Cannot find expected field '{}' in {}... skipping over parsing that field!".format( + cstart, cend, field, filename + ) + ) return indices def clean(linelist, sample_name_index, filename): """Cleans sample name from suffixes defined in config[filename]['clean_sample_name'] and - renames fields defined in config[filename]['rename_field']. Returns a list of cleaned fields.""" + renames fields defined in config[filename]['rename_field']. Returns a list of cleaned fields. 
+ """ samplename = linelist[sample_name_index] # Remove file's PATH before cross-referencing config filename = os.path.basename(filename) - for suffix in config[filename]['clean_sample_name']: - regex = '{}'.format(suffix) - samplename = re.sub(regex, '', samplename) + for suffix in config[filename]["clean_sample_name"]: + regex = "{}".format(suffix) + samplename = re.sub(regex, "", samplename) # Update linelist with new sample name linelist[sample_name_index] = samplename @@ -342,7 +401,7 @@ def rename(header, filename): filename = os.path.basename(filename) for field in header: try: - newname = config[filename]['rename_field'][field] + newname = config[filename]["rename_field"][field] renamed.append(newname) # Field is not in config, keep old name except KeyError: @@ -359,13 +418,13 @@ def cast_typed(value, column, filename, decimals=3): # Python witch-craft, functions are first-class objects and can be used accordingly # Storing function object into caster variable for typecasting as int() or float() caster = config[filename]["typecast"][column] - value = caster(value) # typecast to spec defined in config + value = caster(value) # typecast to spec defined in config if type(value) is float: value = round(value, decimals) except ValueError: # Must convert to float before converting to integer # cannot pass a string representation of a float into int() - if value: # case for when row/column is empty string + if value: # case for when row/column is empty string value = caster(float(value)) except KeyError: # No type is defined in config, pass @@ -386,9 +445,13 @@ def scaled(value, column, filename): except TypeError: # Did not typecast value using the config # Remove warning by typecasting value as float or int - cstart, cend = config['.warning'] - print("{}Warning:{} Attribute {} in {} is NOT defined in config... defaulting to float".format(cstart, cend, column, filename)) - if value: # case for when row/column is empty string + cstart, cend = config[".warning"] + print( + "{}Warning:{} Attribute {} in {} is NOT defined in config... defaulting to float".format( + cstart, cend, column, filename + ) + ) + if value: # case for when row/column is empty string value = float(value) * scaling_unit value = round(value, 3) except KeyError: @@ -401,7 +464,7 @@ def populate_table(parsed_header, parsed_line, file, data_dict): dictionary['Sample_Name']['QC_Attribute'] = QC_Metadata. Returns an updated dictionary containing new information for N-th line.""" - sample_index = parsed_header.index('Sample') + sample_index = parsed_header.index("Sample") sample_name = parsed_line[sample_index] # Add sample name to dictionary, if does not exist [key1] @@ -410,7 +473,7 @@ def populate_table(parsed_header, parsed_line, file, data_dict): for i in range(0, len(parsed_line), 1): # Skip over sample name (already first key) - if parsed_line[i]: # check if empty string + if parsed_line[i]: # check if empty string metadata = cast_typed(parsed_line[i], parsed_header[i], file) metadata = scaled(metadata, parsed_header[i], file) data_dict[sample_name][parsed_header[i]] = metadata @@ -418,33 +481,35 @@ def populate_table(parsed_header, parsed_line, file, data_dict): return data_dict -def parsed(file, delimeter='\t'): +def parsed(file, delimeter="\t"): """Parses columns of file according to specification in config[filename]['parse_column']. Column names are renamed according to specification in config[filename]['rename_field']. 
Sample names are cleaned to removed any prefixes or suffixes specified in config[filename]['clean_sample_name']. Yields a tuple consisting of the parsed header and N-th parsed line of the file. """ - #print('\nBeginning to parse {}'.format(file)) - with open(file, 'r') as fh: + # print('\nBeginning to parse {}'.format(file)) + with open(file, "r") as fh: # Parse header - header = next(fh).strip().split(delimeter) # Get file header - indexes = column_indexes(header, file) # Indexes of columns to parse + header = next(fh).strip().split(delimeter) # Get file header + indexes = column_indexes(header, file) # Indexes of columns to parse header = [header[i] for i in indexes] # Parse each column of interest - header = rename(header, file) # Rename columns + header = rename(header, file) # Rename columns # Parse QC metadata from file - sample_index = header.index('Sample') + sample_index = header.index("Sample") for line in fh: - #linelist = line.strip().split(delimiter) - linelist = line.rstrip('\n').split(delimeter) + # linelist = line.strip().split(delimiter) + linelist = line.rstrip("\n").split(delimeter) parsed_line = [linelist[i] for i in indexes] - parsed_line = clean(parsed_line, sample_index, file) # remove extensions from sample name + parsed_line = clean( + parsed_line, sample_index, file + ) # remove extensions from sample name yield header, parsed_line -def main(): +def main(): # Minor Todo(s): # 1. Get rid of pandas dependency (add transpose function and loop through dict to print table) # 2. Add more advanced argument parsing, make path to config an arg @@ -465,16 +530,15 @@ def main(): # Get default output peference try: - output_preference = config['.rnaseq']['.default']['.output_preference'] - df = df.reindex(columns = output_preference) + output_preference = config[".rnaseq"][".default"][".output_preference"] + df = df.reindex(columns=output_preference) except KeyError: # Output peference is not defined in config pass # Write to file - df.to_csv(os.path.join(outdir, 'multiqc_matrix.tsv'), index = False, sep='\t') - + df.to_csv(os.path.join(outdir, "multiqc_matrix.tsv"), index=False, sep="\t") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/workflow/scripts/rNA.R b/workflow/scripts/rNA.R index c0d2213..06ee766 100755 --- a/workflow/scripts/rNA.R +++ b/workflow/scripts/rNA.R @@ -8,36 +8,47 @@ library(argparse) parser <- ArgumentParser() # rNA Rmarkdown file -parser$add_argument("-m", "--rmarkdown", type="character", required=TRUE, - help="Required Input File: rNA Rmarkdown file") +parser$add_argument("-m", "--rmarkdown", + type = "character", required = TRUE, + help = "Required Input File: rNA Rmarkdown file" +) # Raw Counts Matrix -parser$add_argument("-r", "--raw_counts", type="character", required=TRUE, - help="Required Input File: Raw counts matrix") +parser$add_argument("-r", "--raw_counts", + type = "character", required = TRUE, + help = "Required Input File: Raw counts matrix" +) # TIN Counts Matrix -parser$add_argument("-t", "--tin_counts", type="character", required=TRUE, - help="Required Input File: TIN counts matrix") +parser$add_argument("-t", "--tin_counts", + type = "character", required = TRUE, + help = "Required Input File: TIN counts matrix" +) # QC Metadata Table -parser$add_argument("-q", "--qc_table", type="character", required=TRUE, - help="Required Input File: QC Metadata Table") +parser$add_argument("-q", "--qc_table", + type = "character", required = TRUE, + help = "Required Input File: QC Metadata Table" +) # Output HTML 
Filename -parser$add_argument("-f", "--output_filename", type="character", required=FALSE, default = 'rNA.html', - help="Optional Output HTML Filename: Defaults to 'rNA.html'") +parser$add_argument("-f", "--output_filename", + type = "character", required = FALSE, default = "rNA.html", + help = "Optional Output HTML Filename: Defaults to 'rNA.html'" +) # Display sample names -parser$add_argument("-a", "--annotate", action="store_true", default=FALSE, - help="Display sample names in complex heatmap: Defaults to FALSE") +parser$add_argument("-a", "--annotate", + action = "store_true", default = FALSE, + help = "Display sample names in complex heatmap: Defaults to FALSE" +) args <- parser$parse_args() # Generate HTML output -rmarkdown::render(args$rmarkdown, output_file=args$output_filename, params = list( +rmarkdown::render(args$rmarkdown, output_file = args$output_filename, params = list( raw = args$raw_counts, tin = args$tin_counts, qc = args$qc_table, annot = args$annotate - ) -) \ No newline at end of file +)) diff --git a/workflow/scripts/rNA_flowcells.Rmd b/workflow/scripts/rNA_flowcells.Rmd index e61c0a4..bd8c4af 100755 --- a/workflow/scripts/rNA_flowcells.Rmd +++ b/workflow/scripts/rNA_flowcells.Rmd @@ -38,48 +38,48 @@ suppressMessages(library(ComplexHeatmap)) suppressMessages(library(circlize)) # Reading in raw counts matrix, TIN matrix, and QC metadata -rawcounts = read.table(file = params$raw, sep = '\t', header = TRUE, row.names = 1, quote = "") -#rawcounts = read.table(file = 'data/Test_Raw_RSEM_Genes_Dataset.txt', sep = '\t', header = TRUE, row.names = 1) +rawcounts <- read.table(file = params$raw, sep = "\t", header = TRUE, row.names = 1, quote = "") +# rawcounts = read.table(file = 'data/Test_Raw_RSEM_Genes_Dataset.txt', sep = '\t', header = TRUE, row.names = 1) -tincounts = read.table(file = params$tin, sep = '\t', header = TRUE, row.names = 1) -#tincounts = read.table(file = 'data/Test_TIN_Dataset.txt', sep = '\t', header = TRUE, row.names = 1) +tincounts <- read.table(file = params$tin, sep = "\t", header = TRUE, row.names = 1) +# tincounts = read.table(file = 'data/Test_TIN_Dataset.txt', sep = '\t', header = TRUE, row.names = 1) # Remove zero variance rows prior to PC -tincounts = tincounts[apply(tincounts, 1, var) != 0, ] +tincounts <- tincounts[apply(tincounts, 1, var) != 0, ] -multiQC = read.table(file = params$qc, sep = '\t', header = TRUE, stringsAsFactors = TRUE) -rownames(multiQC) = make.names(multiQC$Sample) +multiQC <- read.table(file = params$qc, sep = "\t", header = TRUE, stringsAsFactors = TRUE) +rownames(multiQC) <- make.names(multiQC$Sample) # Create DGEList -deg = edgeR::DGEList(counts = rawcounts) +deg <- edgeR::DGEList(counts = rawcounts) # Filter lowly expressed genes -keep_genes = edgeR::filterByExpr(deg) # Using default: Gene must have 10 reads in >= 70% samples -deg = deg[keep_genes,,keep.lib.sizes=FALSE] # Recaluate new lib.sizes after filtering +keep_genes <- edgeR::filterByExpr(deg) # Using default: Gene must have 10 reads in >= 70% samples +deg <- deg[keep_genes, , keep.lib.sizes = FALSE] # Recaluate new lib.sizes after filtering # edgeR TMM normalization -deg = calcNormFactors(deg, method = "TMM") # calculate scaling norm.factors +deg <- calcNormFactors(deg, method = "TMM") # calculate scaling norm.factors # limma voom normalization -deg_voom = voom(deg, normalize="quantile", plot = TRUE, save.plot = TRUE) +deg_voom <- voom(deg, normalize = "quantile", plot = TRUE, save.plot = TRUE) # Order genes by MAD -deg_voom$E <- 
deg_voom$E[order(apply(deg_voom$E, 1, mad), decreasing = T),] +deg_voom$E <- deg_voom$E[order(apply(deg_voom$E, 1, mad), decreasing = T), ] # Remove zero variance rows prior to PC deg_voom$E <- deg_voom$E[apply(deg_voom$E, 1, var) != 0, ] # Principal Components Analysis -pca_exp = prcomp(t(as.matrix(deg_voom$E)), scale.=T)$x[,1:3] # Expression PC Analysis -pca_tin = prcomp(t(as.matrix(tincounts)), scale.=T)$x[,1:3] # Transcript Integrity Number PC Analysis -colnames(pca_tin) = c("PC1_tin", "PC2_tin", "PC3_tin") # Renaming PC cols to avoid collision with gene expression PCs +pca_exp <- prcomp(t(as.matrix(deg_voom$E)), scale. = T)$x[, 1:3] # Expression PC Analysis +pca_tin <- prcomp(t(as.matrix(tincounts)), scale. = T)$x[, 1:3] # Transcript Integrity Number PC Analysis +colnames(pca_tin) <- c("PC1_tin", "PC2_tin", "PC3_tin") # Renaming PC cols to avoid collision with gene expression PCs # Merge both dataframes on rowname -multiQC = transform(merge(multiQC, as.data.frame(pca_exp), by='row.names', all=TRUE), row.names=Row.names, Row.names=NULL) -multiQC = transform(merge(multiQC, as.data.frame(pca_tin), by='row.names', all=TRUE), row.names=Row.names, Row.names=NULL) +multiQC <- transform(merge(multiQC, as.data.frame(pca_exp), by = "row.names", all = TRUE), row.names = Row.names, Row.names = NULL) +multiQC <- transform(merge(multiQC, as.data.frame(pca_tin), by = "row.names", all = TRUE), row.names = Row.names, Row.names = NULL) # Crosstalk object (inter-widget connectivity) -shared_metadata = SharedData$new(multiQC) +shared_metadata <- SharedData$new(multiQC) ``` @@ -92,7 +92,6 @@ Inputs {.sidebar} ### Filters ```{r filters} - # Flowcell Lanes filter_select( id = "flowcell_lanes", @@ -272,7 +271,6 @@ filter_slider( sep = "", ticks = TRUE ) - ``` @@ -286,28 +284,28 @@ shared_metadata %>% # selection = 'none', # disable datatable row selection # filter = "top", # allows filtering on each column extensions = c( - "Buttons", # add download buttons - "Scroller" # for scrolling instead of pagination + "Buttons", # add download buttons + "Scroller" # for scrolling instead of pagination ), - rownames = FALSE, # remove rownames + rownames = FALSE, # remove rownames style = "bootstrap", class = "compact", width = "100%", options = list( - dom = "Blrtip", # specify content (search box, etc) + dom = "Blrtip", # specify content (search box, etc) deferRender = TRUE, scrollY = 300, scroller = TRUE, columnDefs = list( list( visible = FALSE, - targets = c(1, 11, 14, 15, 18, 19, 21:23, 27, 30, 31, 32, 33, 34, 35) # hide columes + targets = c(1, 11, 14, 15, 18, 19, 21:23, 27, 30, 31, 32, 33, 34, 35) # hide columes ) ), buttons = list( - I("colvis"), # turn columns on and off - "csv", # download as .csv - "excel" # download as .xlsx + I("colvis"), # turn columns on and off + "csv", # download as .csv + "excel" # download as .xlsx ) ), colnames = c( @@ -327,7 +325,7 @@ shared_metadata %>% "% UTR" = "pct_utr_bases", "% Intronic" = "pct_intronic_bases", "CV Coverage" = "median_cv_coverage", - "% rRNA" = "rRNA_percent_aligned" , + "% rRNA" = "rRNA_percent_aligned", "% UniVec" = "uni_vec_percent_aligned", "% Anti-sense" = "percent_antisense_strand", "medTIN" = "median_tin", @@ -344,25 +342,32 @@ Row {data-height=600} ```{r 3d-expression-pca} # Principal Components Analysis -pca = prcomp(t(as.matrix(deg_voom$E)), scale.=T) +pca <- prcomp(t(as.matrix(deg_voom$E)), scale. 
= T) # Variance explained for PCs: 1, 2, 3 -pc1 = round(pca$sdev[1]^2/sum(pca$sdev^2)*100,2) -pc2 = round(pca$sdev[2]^2/sum(pca$sdev^2)*100,2) -pc3 = round(pca$sdev[3]^2/sum(pca$sdev^2)*100,2) +pc1 <- round(pca$sdev[1]^2 / sum(pca$sdev^2) * 100, 2) +pc2 <- round(pca$sdev[2]^2 / sum(pca$sdev^2) * 100, 2) +pc3 <- round(pca$sdev[3]^2 / sum(pca$sdev^2) * 100, 2) -cgroups = as.factor(multiQC$flowcell_lanes) -cgroups = addNA(cgroups) +cgroups <- as.factor(multiQC$flowcell_lanes) +cgroups <- addNA(cgroups) cpalette <- brewer.pal(nlevels(cgroups), "Paired") -p <- plot_ly(shared_metadata, x = ~PC1, y = ~PC2, z = ~PC3, color=cgroups, colors=cpalette, hoverinfo="text", marker=list(size = 8), - text = ~paste('
<br>Sample: ', Sample, '<br>Flowcell Lanes: ', flowcell_lanes, '<br>medTIN: ', median_tin, '<br><br><br>% Aligned: ', percent_aligned,
-               '<br>% Dup: ', percent_duplication, '<br>% Coding: ', pct_coding_bases, '<br>% UTR: ', pct_utr_bases,
-               '<br>% Intronic: ', pct_intronic_bases, '<br><br>Sequence Range: ', sequence_length,
-               '<br>GC Content: ', gc_content, '<br>Inner Distance Maxima: ', inner_distance_maxima, '<br>Insert Size: ', median_insert_size) ) %>%
-  add_markers() %>% layout(scene = list(xaxis = list(title = paste0("PC1 (",pc1,"%)")),
-                                        yaxis = list(title = paste0("PC2 (",pc2,"%)")),
-                                        zaxis = list(title = paste0("PC3 (",pc3,"%)"))))
+p <- plot_ly(shared_metadata,
+  x = ~PC1, y = ~PC2, z = ~PC3, color = cgroups, colors = cpalette, hoverinfo = "text", marker = list(size = 8),
+  text = ~ paste(
+    "<br>Sample: ", Sample, "<br>Flowcell Lanes: ", flowcell_lanes, "<br>medTIN: ", median_tin, "<br><br><br>% Aligned: ", percent_aligned,
+    "<br>% Dup: ", percent_duplication, "<br>% Coding: ", pct_coding_bases, "<br>% UTR: ", pct_utr_bases,
+    "<br>% Intronic: ", pct_intronic_bases, "<br><br>Sequence Range: ", sequence_length,
+    "<br>GC Content: ", gc_content, "<br>Inner Distance Maxima: ", inner_distance_maxima, "<br>
Insert Size: ", median_insert_size + ) +) %>% + add_markers() %>% + layout(scene = list( + xaxis = list(title = paste0("PC1 (", pc1, "%)")), + yaxis = list(title = paste0("PC2 (", pc2, "%)")), + zaxis = list(title = paste0("PC3 (", pc3, "%)")) + )) # Important: disable onclick() events plotly::highlight(p, on = NULL) # fixes unexpected behavior when multiple plots via crosstalk ``` @@ -371,25 +376,32 @@ plotly::highlight(p, on = NULL) # fixes unexpected behavior when multiple plots ```{r 3d-tin-pca} # Principal Components Analysis -pca = prcomp(t(as.matrix(tincounts)), scale.=T) +pca <- prcomp(t(as.matrix(tincounts)), scale. = T) # Variance explained for PCs: 1, 2, 3 -pc1 = round(pca$sdev[1]^2/sum(pca$sdev^2)*100,2) -pc2 = round(pca$sdev[2]^2/sum(pca$sdev^2)*100,2) -pc3 = round(pca$sdev[3]^2/sum(pca$sdev^2)*100,2) +pc1 <- round(pca$sdev[1]^2 / sum(pca$sdev^2) * 100, 2) +pc2 <- round(pca$sdev[2]^2 / sum(pca$sdev^2) * 100, 2) +pc3 <- round(pca$sdev[3]^2 / sum(pca$sdev^2) * 100, 2) -cgroups = as.factor(multiQC$flowcell_lanes) -cgroups = addNA(cgroups) +cgroups <- as.factor(multiQC$flowcell_lanes) +cgroups <- addNA(cgroups) cpalette <- brewer.pal(nlevels(cgroups), "Paired") -p <- plot_ly(shared_metadata, x = ~PC1_tin, y = ~PC2_tin, z = ~PC3_tin, color=cgroups, colors=cpalette, hoverinfo="text", marker=list(size = 8), - text = ~paste('
<br>Sample: ', Sample, '<br>Flowcell Lanes: ', flowcell_lanes, '<br>medTIN: ', median_tin, '<br><br><br>% Aligned: ', percent_aligned,
-               '<br>% Dup: ', percent_duplication, '<br>% Coding: ', pct_coding_bases,
-               '<br>% Intronic: ', pct_intronic_bases, '<br><br>medTIN: ', median_tin, '<br>Sequence Range: ', sequence_length,
-               '<br>GC Content: ', gc_content, '<br>Inner Distance Maxima: ', inner_distance_maxima, '<br>Insert Size: ', median_insert_size) ) %>%
-  add_markers() %>% layout(scene = list(xaxis = list(title = paste0("PC1 (",pc1,"%)")),
-                                        yaxis = list(title = paste0("PC2 (",pc2,"%)")),
-                                        zaxis = list(title = paste0("PC3 (",pc3,"%)"))))
+p <- plot_ly(shared_metadata,
+  x = ~PC1_tin, y = ~PC2_tin, z = ~PC3_tin, color = cgroups, colors = cpalette, hoverinfo = "text", marker = list(size = 8),
+  text = ~ paste(
+    "<br>Sample: ", Sample, "<br>Flowcell Lanes: ", flowcell_lanes, "<br>medTIN: ", median_tin, "<br><br><br>% Aligned: ", percent_aligned,
+    "<br>% Dup: ", percent_duplication, "<br>% Coding: ", pct_coding_bases,
+    "<br>% Intronic: ", pct_intronic_bases, "<br><br>medTIN: ", median_tin, "<br>Sequence Range: ", sequence_length,
+    "<br>GC Content: ", gc_content, "<br>Inner Distance Maxima: ", inner_distance_maxima, "<br>
Insert Size: ", median_insert_size + ) +) %>% + add_markers() %>% + layout(scene = list( + xaxis = list(title = paste0("PC1 (", pc1, "%)")), + yaxis = list(title = paste0("PC2 (", pc2, "%)")), + zaxis = list(title = paste0("PC3 (", pc3, "%)")) + )) # Important: disable onclick() events plotly::highlight(p, on = NULL) # fixes unexpected behavior when multiple plots via crosstalk @@ -405,20 +417,21 @@ Row ```{r pca-initialize} # Principal Components Analysis -pca = prcomp(t(as.matrix(deg_voom$E)), scale.=T) +pca <- prcomp(t(as.matrix(deg_voom$E)), scale. = T) # Variance explained for PCs: 1, 2, 3 -pc1 = round(pca$sdev[1]^2/sum(pca$sdev^2)*100,2) -pc2 = round(pca$sdev[2]^2/sum(pca$sdev^2)*100,2) - +pc1 <- round(pca$sdev[1]^2 / sum(pca$sdev^2) * 100, 2) +pc2 <- round(pca$sdev[2]^2 / sum(pca$sdev^2) * 100, 2) ``` ### Flowcell Lanes ```{r colored-by-flowcell-lanes} # Gene Expression PCA colored by Flowcell Lanes -g <- ggplot(multiQC, aes(PC1, PC2, color = flowcell_lanes), xlab) + geom_point(size = multiQC$flowcell_lanes) + theme_minimal() + - labs(color = "Flowcell + Lanes", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) +g <- ggplot(multiQC, aes(PC1, PC2, color = flowcell_lanes), xlab) + + geom_point(size = multiQC$flowcell_lanes) + + theme_minimal() + + labs(color = "Flowcell + Lanes", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) g ``` @@ -428,9 +441,11 @@ g ```{r colored-by-dups} # Gene Expression PCA colored by % Duplicates -g <- ggplot(multiQC, aes(PC1, PC2, color = percent_duplication), xlab) + geom_point(size = multiQC$flowcell_lanes) + theme_minimal() + - labs(color = "% Dups", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = percent_duplication), xlab) + + geom_point(size = multiQC$flowcell_lanes) + + theme_minimal() + + labs(color = "% Dups", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -439,9 +454,11 @@ g ```{r colored-by-alignment} # Gene Expression PCA colored by % Aligned -g <- ggplot(multiQC, aes(PC1, PC2, color = percent_aligned), xlab) + geom_point(size = multiQC$flowcell_lanes) + theme_minimal() + - labs(color = "% Aligned", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = percent_aligned), xlab) + + geom_point(size = multiQC$flowcell_lanes) + + theme_minimal() + + labs(color = "% Aligned", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -450,9 +467,11 @@ g ```{r colored-by-utr-bases} # Gene Expression PCA colored by % UTR -g <- ggplot(multiQC, aes(PC1, PC2, color = pct_utr_bases), xlab) + geom_point(size = multiQC$flowcell_lanes) + theme_minimal() + - labs(color = "% UTR", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = pct_utr_bases), xlab) + + geom_point(size = multiQC$flowcell_lanes) + + theme_minimal() + + labs(color = "% UTR", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -461,9 +480,11 @@ g ```{r colored-by-intronic-bases} # Gene Expression PCA colored by % Intronic -g <- ggplot(multiQC, aes(PC1, PC2, color = pct_intronic_bases), xlab) + 
geom_point(size = multiQC$flowcell_lanes) + theme_minimal() + - labs(color = "% Intronic", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = pct_intronic_bases), xlab) + + geom_point(size = multiQC$flowcell_lanes) + + theme_minimal() + + labs(color = "% Intronic", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -476,9 +497,11 @@ Row ```{r colored-by-inner-distance-maxima} # Gene Expression PCA colored by Inner Distance -g <- ggplot(multiQC, aes(PC1, PC2, color = inner_distance_maxima), xlab) + geom_point(size = multiQC$flowcell_lanes) + theme_minimal() + - labs(color = "Inner Distance Maxima", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = inner_distance_maxima), xlab) + + geom_point(size = multiQC$flowcell_lanes) + + theme_minimal() + + labs(color = "Inner Distance Maxima", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -487,9 +510,11 @@ g ```{r colored-by-cv-coverage} # Gene Expression PCA colored by CV Coverage -g <- ggplot(multiQC, aes(PC1, PC2, color = median_cv_coverage), xlab) + geom_point(size = multiQC$flowcell_lanes) + theme_minimal() + - labs(color = "CV Coverage", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = median_cv_coverage), xlab) + + geom_point(size = multiQC$flowcell_lanes) + + theme_minimal() + + labs(color = "CV Coverage", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -498,9 +523,11 @@ g ```{r colored-by-3-prime-coverage} # Gene Expression PCA colored by 3' Prime Coverage -g <- ggplot(multiQC, aes(PC1, PC2, color = median_3prime_bias), xlab) + geom_point(size = multiQC$flowcell_lanes) + theme_minimal() + - labs(color = "3' Prime Coverage", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = median_3prime_bias), xlab) + + geom_point(size = multiQC$flowcell_lanes) + + theme_minimal() + + labs(color = "3' Prime Coverage", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -509,9 +536,11 @@ g ```{r colored-by-insert-size} # Gene Expression PCA colored by Insert Size -g <- ggplot(multiQC, aes(PC1, PC2, color = median_insert_size), xlab) + geom_point(size = multiQC$flowcell_lanes) + theme_minimal() + - labs(color = "Insert Size", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = median_insert_size), xlab) + + geom_point(size = multiQC$flowcell_lanes) + + theme_minimal() + + labs(color = "Insert Size", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -520,9 +549,11 @@ g ```{r colored-by-inner-distance} # Gene Expression PCA colored by GC Content -g <- ggplot(multiQC, aes(PC1, PC2, color = gc_content), xlab) + geom_point(size = multiQC$flowcell_lanes) + theme_minimal() + - labs(color = "GC 
Content", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = gc_content), xlab) + + geom_point(size = multiQC$flowcell_lanes) + + theme_minimal() + + labs(color = "GC Content", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -534,9 +565,11 @@ Row ```{r colored-by-medtin} # Gene Expression PCA colored by medTIN -g <- ggplot(multiQC, aes(PC1, PC2, color = median_tin), xlab) + geom_point(size = multiQC$flowcell_lanes) + theme_minimal() + - labs(color = "medTIN", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = median_tin), xlab) + + geom_point(size = multiQC$flowcell_lanes) + + theme_minimal() + + labs(color = "medTIN", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -546,9 +579,11 @@ g ```{r colored-by-coding} # Gene Expression PCA colored by % Coding -g <- ggplot(multiQC, aes(PC1, PC2, color = pct_coding_bases), xlab) + geom_point(size = multiQC$flowcell_lanes) + theme_minimal() + - labs(color = "% Coding", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = pct_coding_bases), xlab) + + geom_point(size = multiQC$flowcell_lanes) + + theme_minimal() + + labs(color = "% Coding", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -557,9 +592,11 @@ g ```{r colored-by-rrna} # Gene Expression PCA colored by % rRNA -g <- ggplot(multiQC, aes(PC1, PC2, color = rRNA_percent_aligned), xlab) + geom_point(size = multiQC$flowcell_lanes) + theme_minimal() + - labs(color = "% rRNA", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = rRNA_percent_aligned), xlab) + + geom_point(size = multiQC$flowcell_lanes) + + theme_minimal() + + labs(color = "% rRNA", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -568,9 +605,11 @@ g ```{r colored-by-anti-sense} # Gene Expression PCA colored by % Anti-sense -g <- ggplot(multiQC, aes(PC1, PC2, color = percent_antisense_strand), xlab) + geom_point(size = multiQC$flowcell_lanes) + theme_minimal() + - labs(color = "% Anti-sense", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = percent_antisense_strand), xlab) + + geom_point(size = multiQC$flowcell_lanes) + + theme_minimal() + + labs(color = "% Anti-sense", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -587,32 +626,33 @@ Column {data-width=500} ### Heirarchical clustering of pairwise spearman correlation coefficients ```{r heirarchical-correlation-matrix, dpi=300} - # Helper Function -reorder_cormat <- function(cormat){ +reorder_cormat <- function(cormat) { # Use correlation between variables as distance for heirarchincal clustering - dd <- as.dist((1-cormat)/2) + dd <- as.dist((1 - cormat) / 2) hc <- hclust(dd) - cormat <-cormat[hc$order, hc$order] + 
cormat <- cormat[hc$order, hc$order] } # Remove all columns that are categorical -numericQC = multiQC[,-which(sapply(multiQC, class) == "factor")] +numericQC <- multiQC[, -which(sapply(multiQC, class) == "factor")] # Additional columns to remove -additional_remove <- names(numericQC) %in% c("total_read_pairs", "mean_insert_size", "avg_aligned_read_length", "pct_mrna_bases", - "pct_intergenic_bases", "median_5prime_to_3prime_bias", "median_5prime_bias", - "median_3prime_bias", "percent_sense_strand", "mean_mapping_quality", - "PC1_tin", "PC2_tin", "PC3_tin") +additional_remove <- names(numericQC) %in% c( + "total_read_pairs", "mean_insert_size", "avg_aligned_read_length", "pct_mrna_bases", + "pct_intergenic_bases", "median_5prime_to_3prime_bias", "median_5prime_bias", + "median_3prime_bias", "percent_sense_strand", "mean_mapping_quality", + "PC1_tin", "PC2_tin", "PC3_tin" +) # Cleaned numerical QC dataframe -numericQC = numericQC[!additional_remove] +numericQC <- numericQC[!additional_remove] # Remove zero-variance columns to prevent any hlclust() errors -#numericQC = numericQC[,-which(apply(numericQC, 2, var) == 0)] -numericQC = numericQC[, apply(numericQC, 2, var) != 0] +# numericQC = numericQC[,-which(apply(numericQC, 2, var) == 0)] +numericQC <- numericQC[, apply(numericQC, 2, var) != 0] # Pair-wise spearman correlation matrix -cormatrix = round(cor(numericQC, method = "spearman"),2) +cormatrix <- round(cor(numericQC, method = "spearman"), 2) # Reorder the correlation matrix based on hierarchical clustering of the correlation coeff cormat <- reorder_cormat(cormatrix) @@ -624,19 +664,24 @@ cormat[lower.tri(cormat)] <- NA cormat <- melt(cormat, na.rm = TRUE) # Correlation ggheatmap -ggheatmap <- ggplot(cormat, aes(Var2, Var1, fill = value)) + geom_tile(color = "white") + - scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1), space = "Lab", name="Spearman\nCorrelation") + - theme_minimal() + theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 9, hjust = 1)) + coord_fixed() + - theme(axis.title.x = element_blank(), - axis.title.y = element_blank(), - panel.grid.major = element_blank(), - panel.border = element_blank(), - panel.background = element_blank(), - axis.ticks = element_blank(), - legend.justification = c(1, 0), - legend.position = c(0.6, 0.7), - legend.direction = "horizontal") + - guides(fill = guide_colorbar(barwidth = 8, barheight = 1, title.position = "top", title.hjust = 0.5)) +ggheatmap <- ggplot(cormat, aes(Var2, Var1, fill = value)) + + geom_tile(color = "white") + + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1, 1), space = "Lab", name = "Spearman\nCorrelation") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 9, hjust = 1)) + + coord_fixed() + + theme( + axis.title.x = element_blank(), + axis.title.y = element_blank(), + panel.grid.major = element_blank(), + panel.border = element_blank(), + panel.background = element_blank(), + axis.ticks = element_blank(), + legend.justification = c(1, 0), + legend.position = c(0.6, 0.7), + legend.direction = "horizontal" + ) + + guides(fill = guide_colorbar(barwidth = 8, barheight = 1, title.position = "top", title.hjust = 0.5)) ggheatmap ``` @@ -647,14 +692,12 @@ Column {data-width=500} ### Complete linkage clustering of PC loadings with QC annotations ```{r loadings-heatmap, dpi=300, fig.height=6} - # Principal Components Analysis: 5 PCs as heatmap input # Try to use first five Expression PCs 
-pca_exp = prcomp(t(as.matrix(deg_voom$E)), scale.=T) -pca_exp <- tryCatch(pca_exp$x[,1:5], error = function(e){ - pca_exp$x[,1:3] - } -) +pca_exp <- prcomp(t(as.matrix(deg_voom$E)), scale. = T) +pca_exp <- tryCatch(pca_exp$x[, 1:5], error = function(e) { + pca_exp$x[, 1:3] +}) # Input for heatmap hm_data <- as.matrix(t(pca_exp)) @@ -663,27 +706,33 @@ hm_data <- as.matrix(t(pca_exp)) additional_remove <- names(numericQC) %in% c("PC1", "PC2", "PC3") # Cleaned numerical QC dataframe with matched rownames -numericQC = numericQC[match(colnames(hm_data), rownames(numericQC), nomatch=0), !additional_remove] - -column_annotations = HeatmapAnnotation(df = numericQC) - -if (params$annot){ - cheatmap <- ComplexHeatmap::Heatmap(hm_data, - col=colorRamp2(seq(-max(abs(pca_exp), na.rm = T), max(abs(pca_exp), na.rm = T), length.out = 20), - rev(colorRampPalette(brewer.pal(9, "PuOr"))(20))), - bottom_annotation = column_annotations, - show_column_names=T, - column_names_rot = 45, - cluster_rows = FALSE, - show_heatmap_legend = F) # Turning off to control the placement -} else{ - cheatmap <- ComplexHeatmap::Heatmap(hm_data, - col=colorRamp2(seq(-max(abs(pca_exp), na.rm = T), max(abs(pca_exp), na.rm = T), length.out = 20), - rev(colorRampPalette(brewer.pal(9, "PuOr"))(20))), - bottom_annotation = column_annotations, - show_column_names=F, - cluster_rows = FALSE, - show_heatmap_legend = F) # Turning off to control the placement +numericQC <- numericQC[match(colnames(hm_data), rownames(numericQC), nomatch = 0), !additional_remove] + +column_annotations <- HeatmapAnnotation(df = numericQC) + +if (params$annot) { + cheatmap <- ComplexHeatmap::Heatmap(hm_data, + col = colorRamp2( + seq(-max(abs(pca_exp), na.rm = T), max(abs(pca_exp), na.rm = T), length.out = 20), + rev(colorRampPalette(brewer.pal(9, "PuOr"))(20)) + ), + bottom_annotation = column_annotations, + show_column_names = T, + column_names_rot = 45, + cluster_rows = FALSE, + show_heatmap_legend = F + ) # Turning off to control the placement +} else { + cheatmap <- ComplexHeatmap::Heatmap(hm_data, + col = colorRamp2( + seq(-max(abs(pca_exp), na.rm = T), max(abs(pca_exp), na.rm = T), length.out = 20), + rev(colorRampPalette(brewer.pal(9, "PuOr"))(20)) + ), + bottom_annotation = column_annotations, + show_column_names = F, + cluster_rows = FALSE, + show_heatmap_legend = F + ) # Turning off to control the placement } draw(cheatmap, show_annotation_legend = FALSE) @@ -732,25 +781,25 @@ Here is a set of generalized guidelines for different QC metrics. Some of these **References** -**1.** Daley, T. and A.D. Smith, Predicting the molecular complexity of sequencing libraries. Nat Methods, 2013. 10(4): p. 325-7. -**2.** Andrews, S. (2010). FastQC: a quality control tool for high throughput sequence data. -**3.** Martin, M. (2011). "Cutadapt removes adapter sequences from high-throughput sequencing reads." EMBnet 17(1): 10-12. -**4.** Dobin, A., et al., STAR: ultrafast universal RNA-seq aligner. Bioinformatics, 2013. 29(1): p. 15-21. -**5.** Li, B. and C.N. Dewey, RSEM: accurate transcript quantification from RNA-Seq data with or without a reference genome. BMC Bioinformatics, 2011. 12: p. 323. -**6.** Harrow, J., et al., GENCODE: the reference human genome annotation for The ENCODE Project. Genome Res, 2012. 22(9): p. 1760-74. -**7.** Law, C.W., et al., voom: Precision weights unlock linear model analysis tools for RNA-seq read counts. Genome Biol, 2014. 15(2): p. R29. 
-**8.** Smyth, G.K., Linear models and empirical bayes methods for assessing differential expression in microarray experiments. Stat Appl Genet Mol Biol, 2004. 3: p. Article3. -**9.** Wang, L., et al. (2012). "RSeQC: quality control of RNA-seq experiments." Bioinformatics 28(16): 2184-2185. -**10.** The Picard toolkit. https://broadinstitute.github.io/picard/. -**11.** Ewels, P., et al. (2016). "MultiQC: summarize analysis results for multiple tools and samples in a single report." Bioinformatics 32(19): 3047-3048. -**12.** R Core Team (2018). R: A Language and Environment for Statistical Computing. Vienna, Austria, R Foundation for Statistical Computing. -**13.** Li, H., et al. (2009). "The Sequence Alignment/Map format and SAMtools." Bioinformatics 25(16): 2078-2079. -**14.** Wood, D. E. and S. L. Salzberg (2014). "Kraken: ultrafast metagenomic sequence classification using exact alignments." Genome Biol 15(3): R46. -**15.** Ondov, B. D., et al. (2011). "Interactive metagenomic visualization in a Web browser." BMC Bioinformatics 12(1): 385. -**16.** Okonechnikov, K., et al. (2015). "Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data." Bioinformatics 32(2): 292-294. -**17.** Wingett, S. and S. Andrews (2018). "FastQ Screen: A tool for multi-genome mapping and quality control." F1000Research 7(2): 1338. -**18.** Robinson, M. D., et al. (2009). "edgeR: a Bioconductor package for differential expression analysis of digital gene expression data." Bioinformatics 26(1): 139-140. -**9.** Wang, L., et al. (2012). "RSeQC: quality control of RNA-seq experiments." Bioinformatics 28(16): 2184-2185. +**1.** Daley, T. and A.D. Smith, Predicting the molecular complexity of sequencing libraries. Nat Methods, 2013. 10(4): p. 325-7. +**2.** Andrews, S. (2010). FastQC: a quality control tool for high throughput sequence data. +**3.** Martin, M. (2011). "Cutadapt removes adapter sequences from high-throughput sequencing reads." EMBnet 17(1): 10-12. +**4.** Dobin, A., et al., STAR: ultrafast universal RNA-seq aligner. Bioinformatics, 2013. 29(1): p. 15-21. +**5.** Li, B. and C.N. Dewey, RSEM: accurate transcript quantification from RNA-Seq data with or without a reference genome. BMC Bioinformatics, 2011. 12: p. 323. +**6.** Harrow, J., et al., GENCODE: the reference human genome annotation for The ENCODE Project. Genome Res, 2012. 22(9): p. 1760-74. +**7.** Law, C.W., et al., voom: Precision weights unlock linear model analysis tools for RNA-seq read counts. Genome Biol, 2014. 15(2): p. R29. +**8.** Smyth, G.K., Linear models and empirical bayes methods for assessing differential expression in microarray experiments. Stat Appl Genet Mol Biol, 2004. 3: p. Article3. +**9.** Wang, L., et al. (2012). "RSeQC: quality control of RNA-seq experiments." Bioinformatics 28(16): 2184-2185. +**10.** The Picard toolkit. https://broadinstitute.github.io/picard/. +**11.** Ewels, P., et al. (2016). "MultiQC: summarize analysis results for multiple tools and samples in a single report." Bioinformatics 32(19): 3047-3048. +**12.** R Core Team (2018). R: A Language and Environment for Statistical Computing. Vienna, Austria, R Foundation for Statistical Computing. +**13.** Li, H., et al. (2009). "The Sequence Alignment/Map format and SAMtools." Bioinformatics 25(16): 2078-2079. +**14.** Wood, D. E. and S. L. Salzberg (2014). "Kraken: ultrafast metagenomic sequence classification using exact alignments." Genome Biol 15(3): R46. +**15.** Ondov, B. D., et al. (2011). 
"Interactive metagenomic visualization in a Web browser." BMC Bioinformatics 12(1): 385. +**16.** Okonechnikov, K., et al. (2015). "Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data." Bioinformatics 32(2): 292-294. +**17.** Wingett, S. and S. Andrews (2018). "FastQ Screen: A tool for multi-genome mapping and quality control." F1000Research 7(2): 1338. +**18.** Robinson, M. D., et al. (2009). "edgeR: a Bioconductor package for differential expression analysis of digital gene expression data." Bioinformatics 26(1): 139-140. +**9.** Wang, L., et al. (2012). "RSeQC: quality control of RNA-seq experiments." Bioinformatics 28(16): 2184-2185. Column {data-width=400} ------------------------------------- diff --git a/workflow/scripts/rNA_groups.Rmd b/workflow/scripts/rNA_groups.Rmd index 1091cb3..80d901f 100755 --- a/workflow/scripts/rNA_groups.Rmd +++ b/workflow/scripts/rNA_groups.Rmd @@ -23,7 +23,7 @@ set.seed(42) # Set the working directory knitr::opts_knit$set(root.dir = params$wdir) -knitr::opts_chunk$set(echo = FALSE, warning=FALSE, dev="png", fig.path=file.path(params$wdir, "figs/")) +knitr::opts_chunk$set(echo = FALSE, warning = FALSE, dev = "png", fig.path = file.path(params$wdir, "figs/")) ``` @@ -43,53 +43,53 @@ suppressMessages(library(ComplexHeatmap)) suppressMessages(library(circlize)) # Reading in raw counts matrix, TIN matrix, and QC metadata -rawcounts = read.table(file = params$raw, sep = '\t', header = TRUE, row.names = 1, quote = "") -#rawcounts = read.table(file = 'data/Test_Raw_RSEM_Genes_Dataset.txt', sep = '\t', header = TRUE, row.names = 1) +rawcounts <- read.table(file = params$raw, sep = "\t", header = TRUE, row.names = 1, quote = "") +# rawcounts = read.table(file = 'data/Test_Raw_RSEM_Genes_Dataset.txt', sep = '\t', header = TRUE, row.names = 1) -tincounts = read.table(file = params$tin, sep = '\t', header = TRUE, row.names = 1) -#tincounts = read.table(file = 'data/Test_TIN_Dataset.txt', sep = '\t', header = TRUE, row.names = 1) +tincounts <- read.table(file = params$tin, sep = "\t", header = TRUE, row.names = 1) +# tincounts = read.table(file = 'data/Test_TIN_Dataset.txt', sep = '\t', header = TRUE, row.names = 1) # Remove zero variance rows prior to PC -tincounts = tincounts[apply(tincounts, 1, var) != 0, ] +tincounts <- tincounts[apply(tincounts, 1, var) != 0, ] -multiQC = read.table(file = params$qc, sep = '\t', header = TRUE, stringsAsFactors = TRUE) -rownames(multiQC) = make.names(multiQC$Sample) +multiQC <- read.table(file = params$qc, sep = "\t", header = TRUE, stringsAsFactors = TRUE) +rownames(multiQC) <- make.names(multiQC$Sample) # Create DGEList -deg = edgeR::DGEList(counts = rawcounts) +deg <- edgeR::DGEList(counts = rawcounts) # Filter lowly expressed genes -keep_genes = edgeR::filterByExpr(deg) # Using default: Gene must have 10 reads in >= 70% samples -deg = deg[keep_genes,,keep.lib.sizes=FALSE] # Recaluate new lib.sizes after filtering +keep_genes <- edgeR::filterByExpr(deg) # Using default: Gene must have 10 reads in >= 70% samples +deg <- deg[keep_genes, , keep.lib.sizes = FALSE] # Recaluate new lib.sizes after filtering # edgeR TMM normalization -deg = calcNormFactors(deg, method = "TMM") # calculate scaling norm.factors +deg <- calcNormFactors(deg, method = "TMM") # calculate scaling norm.factors # limma voom normalization -deg_voom = voom(deg, normalize="quantile", plot = TRUE, save.plot = TRUE) +deg_voom <- voom(deg, normalize = "quantile", plot = TRUE, save.plot = TRUE) # Order genes by MAD -deg_voom$E <- 
deg_voom$E[order(apply(deg_voom$E, 1, mad), decreasing = T),] +deg_voom$E <- deg_voom$E[order(apply(deg_voom$E, 1, mad), decreasing = T), ] # Remove zero variance rows prior to PC deg_voom$E <- deg_voom$E[apply(deg_voom$E, 1, var) != 0, ] # Principal Components Analysis -pca_exp = prcomp(t(as.matrix(deg_voom$E)), scale.=T)$x[,1:3] # Expression PC Analysis -pca_tin = prcomp(t(as.matrix(tincounts)), scale.=T)$x[,1:3] # Transcript Integrity Number PC Analysis -colnames(pca_tin) = c("PC1_tin", "PC2_tin", "PC3_tin") # Renaming PC cols to avoid collision with gene expression PCs +pca_exp <- prcomp(t(as.matrix(deg_voom$E)), scale. = T)$x[, 1:3] # Expression PC Analysis +pca_tin <- prcomp(t(as.matrix(tincounts)), scale. = T)$x[, 1:3] # Transcript Integrity Number PC Analysis +colnames(pca_tin) <- c("PC1_tin", "PC2_tin", "PC3_tin") # Renaming PC cols to avoid collision with gene expression PCs # Merge both dataframes on rowname -multiQC = transform(merge(multiQC, as.data.frame(pca_exp), by='row.names', all=TRUE), row.names=Row.names, Row.names=NULL) -multiQC = transform(merge(multiQC, as.data.frame(pca_tin), by='row.names', all=TRUE), row.names=Row.names, Row.names=NULL) +multiQC <- transform(merge(multiQC, as.data.frame(pca_exp), by = "row.names", all = TRUE), row.names = Row.names, Row.names = NULL) +multiQC <- transform(merge(multiQC, as.data.frame(pca_tin), by = "row.names", all = TRUE), row.names = Row.names, Row.names = NULL) # Crosstalk object (inter-widget connectivity) -shared_metadata = SharedData$new(multiQC) +shared_metadata <- SharedData$new(multiQC) ``` Interactives {data-icon="ion-android-options"} -===================================== +===================================== Inputs {.sidebar} ------------------------------------- @@ -97,7 +97,6 @@ Inputs {.sidebar} ### Filters ```{r filters} - # Extracted Tissue filter_checkbox( id = "TissueType", @@ -286,7 +285,6 @@ filter_slider( sep = "", ticks = TRUE ) - ``` @@ -300,28 +298,28 @@ shared_metadata %>% # selection = 'none', # disable datatable row selection # filter = "top", # allows filtering on each column extensions = c( - "Buttons", # add download buttons - "Scroller" # for scrolling instead of pagination + "Buttons", # add download buttons + "Scroller" # for scrolling instead of pagination ), - rownames = FALSE, # remove rownames + rownames = FALSE, # remove rownames style = "bootstrap", class = "compact", width = "100%", options = list( - dom = "Blrtip", # specify content (search box, etc) + dom = "Blrtip", # specify content (search box, etc) deferRender = TRUE, scrollY = 300, scroller = TRUE, columnDefs = list( list( visible = FALSE, - targets = c(1, 11, 14, 15, 18, 19, 21:23, 27, 31, 32, 33, 34, 35, 36) # hide columes + targets = c(1, 11, 14, 15, 18, 19, 21:23, 27, 31, 32, 33, 34, 35, 36) # hide columes ) ), buttons = list( - I("colvis"), # turn columns on and off - "csv", # download as .csv - "excel" # download as .xlsx + I("colvis"), # turn columns on and off + "csv", # download as .csv + "excel" # download as .xlsx ) ), colnames = c( @@ -341,7 +339,7 @@ shared_metadata %>% "% UTR" = "pct_utr_bases", "% Intronic" = "pct_intronic_bases", "CV Coverage" = "median_cv_coverage", - "% rRNA" = "rRNA_percent_aligned" , + "% rRNA" = "rRNA_percent_aligned", "% UniVec" = "uni_vec_percent_aligned", "Tissue" = "TissueType", "% Anti-sense" = "percent_antisense_strand", @@ -359,25 +357,32 @@ Row {data-height=600} ```{r 3d-expression-pca} # Principal Components Analysis -pca = prcomp(t(as.matrix(deg_voom$E)), scale.=T) +pca <- 
prcomp(t(as.matrix(deg_voom$E)), scale. = T) # Variance explained for PCs: 1, 2, 3 -pc1 = round(pca$sdev[1]^2/sum(pca$sdev^2)*100,2) -pc2 = round(pca$sdev[2]^2/sum(pca$sdev^2)*100,2) -pc3 = round(pca$sdev[3]^2/sum(pca$sdev^2)*100,2) +pc1 <- round(pca$sdev[1]^2 / sum(pca$sdev^2) * 100, 2) +pc2 <- round(pca$sdev[2]^2 / sum(pca$sdev^2) * 100, 2) +pc3 <- round(pca$sdev[3]^2 / sum(pca$sdev^2) * 100, 2) -cgroups = as.factor(multiQC$TissueType) -cgroups = addNA(cgroups) +cgroups <- as.factor(multiQC$TissueType) +cgroups <- addNA(cgroups) cpalette <- brewer.pal(nlevels(cgroups), "Paired") -p <- plot_ly(shared_metadata, x = ~PC1, y = ~PC2, z = ~PC3, color=cgroups, colors=cpalette, hoverinfo="text", marker=list(size = 8), - text = ~paste('
<br>Sample: ', Sample, '<br>Flowcell Lanes: ', flowcell_lanes, '<br>medTIN: ', median_tin, '<br><br><br>% Aligned: ', percent_aligned, - '<br>% Dup: ', percent_duplication, '<br>% Coding: ', pct_coding_bases, '<br>% UTR: ', pct_utr_bases, - '<br>% Intronic: ', pct_intronic_bases, '<br><br>Sequence Range: ', sequence_length, - '<br>GC Content: ', gc_content, '<br>Inner Distance Maxima: ', inner_distance_maxima, '<br>
Insert Size: ', median_insert_size) ) %>% - add_markers() %>% layout(scene = list(xaxis = list(title = paste0("PC1 (",pc1,"%)")), - yaxis = list(title = paste0("PC2 (",pc2,"%)")), - zaxis = list(title = paste0("PC3 (",pc3,"%)")))) +p <- plot_ly(shared_metadata, + x = ~PC1, y = ~PC2, z = ~PC3, color = cgroups, colors = cpalette, hoverinfo = "text", marker = list(size = 8), + text = ~ paste( + "
Sample: ", Sample, "
Flowcell Lanes: ", flowcell_lanes, "
medTIN: ", median_tin, "


% Aligned: ", percent_aligned, + "
% Dup: ", percent_duplication, "
% Coding: ", pct_coding_bases, "
% UTR: ", pct_utr_bases, + "
% Intronic: ", pct_intronic_bases, "

Sequence Range: ", sequence_length, + "
GC Content: ", gc_content, "
Inner Distance Maxima: ", inner_distance_maxima, "
Insert Size: ", median_insert_size + ) +) %>% + add_markers() %>% + layout(scene = list( + xaxis = list(title = paste0("PC1 (", pc1, "%)")), + yaxis = list(title = paste0("PC2 (", pc2, "%)")), + zaxis = list(title = paste0("PC3 (", pc3, "%)")) + )) # Important: disable onclick() events plotly::highlight(p, on = NULL) # fixes unexpected behavior when multiple plots via crosstalk ``` @@ -386,25 +391,32 @@ plotly::highlight(p, on = NULL) # fixes unexpected behavior when multiple plots ```{r 3d-tin-pca} # Principal Components Analysis -pca = prcomp(t(as.matrix(tincounts)), scale.=T) +pca <- prcomp(t(as.matrix(tincounts)), scale. = T) # Variance explained for PCs: 1, 2, 3 -pc1 = round(pca$sdev[1]^2/sum(pca$sdev^2)*100,2) -pc2 = round(pca$sdev[2]^2/sum(pca$sdev^2)*100,2) -pc3 = round(pca$sdev[3]^2/sum(pca$sdev^2)*100,2) +pc1 <- round(pca$sdev[1]^2 / sum(pca$sdev^2) * 100, 2) +pc2 <- round(pca$sdev[2]^2 / sum(pca$sdev^2) * 100, 2) +pc3 <- round(pca$sdev[3]^2 / sum(pca$sdev^2) * 100, 2) -cgroups = as.factor(multiQC$TissueType) -cgroups = addNA(cgroups) +cgroups <- as.factor(multiQC$TissueType) +cgroups <- addNA(cgroups) cpalette <- brewer.pal(nlevels(cgroups), "Paired") -p <- plot_ly(shared_metadata, x = ~PC1_tin, y = ~PC2_tin, z = ~PC3_tin, color=cgroups, colors=cpalette, hoverinfo="text", marker=list(size = 8), - text = ~paste('
<br>Sample: ', Sample, '<br>Flowcell Lanes: ', flowcell_lanes, '<br>medTIN: ', median_tin, '<br><br><br>% Aligned: ', percent_aligned, - '<br>% Dup: ', percent_duplication, '<br>% Coding: ', pct_coding_bases, - '<br>% Intronic: ', pct_intronic_bases, '<br><br>medTIN: ', median_tin, '<br>Sequence Range: ', sequence_length, - '<br>GC Content: ', gc_content, '<br>Inner Distance Maxima: ', inner_distance_maxima, '<br>
Insert Size: ', median_insert_size) ) %>% - add_markers() %>% layout(scene = list(xaxis = list(title = paste0("PC1 (",pc1,"%)")), - yaxis = list(title = paste0("PC2 (",pc2,"%)")), - zaxis = list(title = paste0("PC3 (",pc3,"%)")))) +p <- plot_ly(shared_metadata, + x = ~PC1_tin, y = ~PC2_tin, z = ~PC3_tin, color = cgroups, colors = cpalette, hoverinfo = "text", marker = list(size = 8), + text = ~ paste( + "
Sample: ", Sample, "
Flowcell Lanes: ", flowcell_lanes, "
medTIN: ", median_tin, "


% Aligned: ", percent_aligned, + "
% Dup: ", percent_duplication, "
% Coding: ", pct_coding_bases, + "
% Intronic: ", pct_intronic_bases, "

medTIN: ", median_tin, "
Sequence Range: ", sequence_length, + "
GC Content: ", gc_content, "
Inner Distance Maxima: ", inner_distance_maxima, "
Insert Size: ", median_insert_size + ) +) %>% + add_markers() %>% + layout(scene = list( + xaxis = list(title = paste0("PC1 (", pc1, "%)")), + yaxis = list(title = paste0("PC2 (", pc2, "%)")), + zaxis = list(title = paste0("PC3 (", pc3, "%)")) + )) # Important: disable onclick() events plotly::highlight(p, on = NULL) # fixes unexpected behavior when multiple plots via crosstalk @@ -420,32 +432,35 @@ Row ```{r pca-initialize} # Principal Components Analysis -pca = prcomp(t(as.matrix(deg_voom$E)), scale.=T) +pca <- prcomp(t(as.matrix(deg_voom$E)), scale. = T) # Variance explained for PCs: 1, 2, 3 -pc1 = round(pca$sdev[1]^2/sum(pca$sdev^2)*100,2) -pc2 = round(pca$sdev[2]^2/sum(pca$sdev^2)*100,2) - +pc1 <- round(pca$sdev[1]^2 / sum(pca$sdev^2) * 100, 2) +pc2 <- round(pca$sdev[2]^2 / sum(pca$sdev^2) * 100, 2) ``` ### Tissue Type ```{r colored-by-tissue-type} # Gene Expression PCA colored by Tissue Type -g <- ggplot(multiQC, aes(PC1, PC2, color = TissueType), xlab) + geom_point(size = multiQC$TissueType) + theme_minimal() + - labs(color = "Tissue Type", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) +g <- ggplot(multiQC, aes(PC1, PC2, color = TissueType), xlab) + + geom_point(size = multiQC$TissueType) + + theme_minimal() + + labs(color = "Tissue Type", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) g -``` +``` ### % Dups ```{r colored-by-dups} # Gene Expression PCA colored by % Duplicates -g <- ggplot(multiQC, aes(PC1, PC2, color = percent_duplication), xlab) + geom_point(size = multiQC$TissueType) + theme_minimal() + - labs(color = "% Dups", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = percent_duplication), xlab) + + geom_point(size = multiQC$TissueType) + + theme_minimal() + + labs(color = "% Dups", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -454,9 +469,11 @@ g ```{r colored-by-alignment} # Gene Expression PCA colored by % Aligned -g <- ggplot(multiQC, aes(PC1, PC2, color = percent_aligned), xlab) + geom_point(size = multiQC$TissueType) + theme_minimal() + - labs(color = "% Aligned", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = percent_aligned), xlab) + + geom_point(size = multiQC$TissueType) + + theme_minimal() + + labs(color = "% Aligned", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -465,9 +482,11 @@ g ```{r colored-by-utr-bases} # Gene Expression PCA colored by % UTR -g <- ggplot(multiQC, aes(PC1, PC2, color = pct_utr_bases), xlab) + geom_point(size = multiQC$TissueType) + theme_minimal() + - labs(color = "% UTR", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = pct_utr_bases), xlab) + + geom_point(size = multiQC$TissueType) + + theme_minimal() + + labs(color = "% UTR", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -480,9 +499,11 @@ Row ```{r colored-by-inner-distance-maxima} # Gene Expression PCA colored by Inner Distance -g <- ggplot(multiQC, aes(PC1, PC2, color = inner_distance_maxima), xlab) + geom_point(size = multiQC$TissueType) + theme_minimal() 
+ - labs(color = "Inner Distance Maxima", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = inner_distance_maxima), xlab) + + geom_point(size = multiQC$TissueType) + + theme_minimal() + + labs(color = "Inner Distance Maxima", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -491,20 +512,24 @@ g ```{r colored-by-cv-coverage} # Gene Expression PCA colored by CV Coverage -g <- ggplot(multiQC, aes(PC1, PC2, color = median_cv_coverage), xlab) + geom_point(size = multiQC$TissueType) + theme_minimal() + - labs(color = "CV Coverage", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = median_cv_coverage), xlab) + + geom_point(size = multiQC$TissueType) + + theme_minimal() + + labs(color = "CV Coverage", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g -``` +``` ### 3' Prime Bias ```{r colored-by-3-prime-coverage} # Gene Expression PCA colored by 3' Prime Coverage -g <- ggplot(multiQC, aes(PC1, PC2, color = median_3prime_bias), xlab) + geom_point(size = multiQC$TissueType) + theme_minimal() + - labs(color = "3' Prime Coverage", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = median_3prime_bias), xlab) + + geom_point(size = multiQC$TissueType) + + theme_minimal() + + labs(color = "3' Prime Coverage", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -513,9 +538,11 @@ g ```{r colored-by-insert-size} # Gene Expression PCA colored by Insert Size -g <- ggplot(multiQC, aes(PC1, PC2, color = median_insert_size), xlab) + geom_point(size = multiQC$TissueType) + theme_minimal() + - labs(color = "Insert Size", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = median_insert_size), xlab) + + geom_point(size = multiQC$TissueType) + + theme_minimal() + + labs(color = "Insert Size", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -524,9 +551,11 @@ g ```{r colored-by-inner-distance} # Gene Expression PCA colored by GC Content -g <- ggplot(multiQC, aes(PC1, PC2, color = gc_content), xlab) + geom_point(size = multiQC$TissueType) + theme_minimal() + - labs(color = "GC Content", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = gc_content), xlab) + + geom_point(size = multiQC$TissueType) + + theme_minimal() + + labs(color = "GC Content", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -538,9 +567,11 @@ Row ```{r colored-by-medtin} # Gene Expression PCA colored by medTIN -g <- ggplot(multiQC, aes(PC1, PC2, color = median_tin), xlab) + geom_point(size = multiQC$TissueType) + theme_minimal() + - labs(color = "medTIN", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, 
aes(PC1, PC2, color = median_tin), xlab) + + geom_point(size = multiQC$TissueType) + + theme_minimal() + + labs(color = "medTIN", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -549,8 +580,10 @@ g ```{r colored-by-flowcell-lanes} # Gene Expression PCA colored by Flowcell Lanes -g <- ggplot(multiQC, aes(PC1, PC2, color = flowcell_lanes), xlab) + geom_point(size = multiQC$TissueType) + theme_minimal() + - labs(color = "Flowcell Lanes", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) +g <- ggplot(multiQC, aes(PC1, PC2, color = flowcell_lanes), xlab) + + geom_point(size = multiQC$TissueType) + + theme_minimal() + + labs(color = "Flowcell Lanes", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) g ``` @@ -559,20 +592,24 @@ g ```{r colored-by-coding} # Gene Expression PCA colored by % Coding -g <- ggplot(multiQC, aes(PC1, PC2, color = pct_coding_bases), xlab) + geom_point(size = multiQC$TissueType) + theme_minimal() + - labs(color = "% Coding", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = pct_coding_bases), xlab) + + geom_point(size = multiQC$TissueType) + + theme_minimal() + + labs(color = "% Coding", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g -``` +``` ### % rRNA ```{r colored-by-rrna} # Gene Expression PCA colored by % rRNA -g <- ggplot(multiQC, aes(PC1, PC2, color = rRNA_percent_aligned), xlab) + geom_point(size = multiQC$TissueType) + theme_minimal() + - labs(color = "% rRNA", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = rRNA_percent_aligned), xlab) + + geom_point(size = multiQC$TissueType) + + theme_minimal() + + labs(color = "% rRNA", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -581,9 +618,11 @@ g ```{r colored-by-anti-sense} # Gene Expression PCA colored by % Anti-sense -g <- ggplot(multiQC, aes(PC1, PC2, color = percent_antisense_strand), xlab) + geom_point(size = multiQC$TissueType) + theme_minimal() + - labs(color = "% Anti-sense", x = paste0("PC1 (",pc1,"%)"), y = paste0("PC2 (",pc2,"%)")) + - scale_colour_gradientn(colours = viridis::viridis(100)) +g <- ggplot(multiQC, aes(PC1, PC2, color = percent_antisense_strand), xlab) + + geom_point(size = multiQC$TissueType) + + theme_minimal() + + labs(color = "% Anti-sense", x = paste0("PC1 (", pc1, "%)"), y = paste0("PC2 (", pc2, "%)")) + + scale_colour_gradientn(colours = viridis::viridis(100)) g ``` @@ -600,32 +639,33 @@ Column {data-width=500} ### Heirarchical clustering of pairwise spearman correlation coefficients ```{r heirarchical-correlation-matrix, dpi=300} - # Helper Function -reorder_cormat <- function(cormat){ +reorder_cormat <- function(cormat) { # Use correlation between variables as distance for heirarchincal clustering - dd <- as.dist((1-cormat)/2) + dd <- as.dist((1 - cormat) / 2) hc <- hclust(dd) - cormat <-cormat[hc$order, hc$order] + cormat <- cormat[hc$order, hc$order] } # Remove all columns that are categorical -numericQC = multiQC[,-which(sapply(multiQC, class) == "factor")] +numericQC <- multiQC[, -which(sapply(multiQC, class) == "factor")] # Additional columns to remove -additional_remove <- 
names(numericQC) %in% c("total_read_pairs", "mean_insert_size", "avg_aligned_read_length", "pct_mrna_bases", - "pct_intergenic_bases", "median_5prime_to_3prime_bias", "median_5prime_bias", - "median_3prime_bias", "percent_sense_strand", "mean_mapping_quality", - "PC1_tin", "PC2_tin", "PC3_tin") +additional_remove <- names(numericQC) %in% c( + "total_read_pairs", "mean_insert_size", "avg_aligned_read_length", "pct_mrna_bases", + "pct_intergenic_bases", "median_5prime_to_3prime_bias", "median_5prime_bias", + "median_3prime_bias", "percent_sense_strand", "mean_mapping_quality", + "PC1_tin", "PC2_tin", "PC3_tin" +) # Cleaned numerical QC dataframe -numericQC = numericQC[!additional_remove] +numericQC <- numericQC[!additional_remove] # Remove zero-variance columns to prevent any hlclust() errors -#numericQC = numericQC[,-which(apply(numericQC, 2, var) == 0)] -numericQC = numericQC[, apply(numericQC, 2, var) != 0] +# numericQC = numericQC[,-which(apply(numericQC, 2, var) == 0)] +numericQC <- numericQC[, apply(numericQC, 2, var) != 0] # Pair-wise spearman correlation matrix -cormatrix = round(cor(numericQC, method = "spearman"),2) +cormatrix <- round(cor(numericQC, method = "spearman"), 2) # Reorder the correlation matrix based on hierarchical clustering of the correlation coeff cormat <- reorder_cormat(cormatrix) @@ -637,19 +677,24 @@ cormat[lower.tri(cormat)] <- NA cormat <- melt(cormat, na.rm = TRUE) # Correlation ggheatmap -ggheatmap <- ggplot(cormat, aes(Var2, Var1, fill = value)) + geom_tile(color = "white") + - scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1), space = "Lab", name="Spearman\nCorrelation") + - theme_minimal() + theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 9, hjust = 1)) + coord_fixed() + - theme(axis.title.x = element_blank(), - axis.title.y = element_blank(), - panel.grid.major = element_blank(), - panel.border = element_blank(), - panel.background = element_blank(), - axis.ticks = element_blank(), - legend.justification = c(1, 0), - legend.position = c(0.6, 0.7), - legend.direction = "horizontal") + - guides(fill = guide_colorbar(barwidth = 8, barheight = 1, title.position = "top", title.hjust = 0.5)) +ggheatmap <- ggplot(cormat, aes(Var2, Var1, fill = value)) + + geom_tile(color = "white") + + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1, 1), space = "Lab", name = "Spearman\nCorrelation") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 9, hjust = 1)) + + coord_fixed() + + theme( + axis.title.x = element_blank(), + axis.title.y = element_blank(), + panel.grid.major = element_blank(), + panel.border = element_blank(), + panel.background = element_blank(), + axis.ticks = element_blank(), + legend.justification = c(1, 0), + legend.position = c(0.6, 0.7), + legend.direction = "horizontal" + ) + + guides(fill = guide_colorbar(barwidth = 8, barheight = 1, title.position = "top", title.hjust = 0.5)) ggheatmap ``` @@ -660,14 +705,12 @@ Column {data-width=500} ### Complete linkage clustering of PC loadings with QC annotations ```{r loadings-heatmap, dpi=300, fig.height=6} - # Principal Components Analysis: 5 PCs as heatmap input # Try to use first five Expression PCs -pca_exp = prcomp(t(as.matrix(deg_voom$E)), scale.=T) -pca_exp <- tryCatch(pca_exp$x[,1:5], error = function(e){ - pca_exp$x[,1:3] - } -) +pca_exp <- prcomp(t(as.matrix(deg_voom$E)), scale. 
= T) +pca_exp <- tryCatch(pca_exp$x[, 1:5], error = function(e) { + pca_exp$x[, 1:3] +}) # Input for heatmap hm_data <- as.matrix(t(pca_exp)) @@ -676,27 +719,33 @@ hm_data <- as.matrix(t(pca_exp)) additional_remove <- names(numericQC) %in% c("PC1", "PC2", "PC3") # Cleaned numerical QC dataframe with matched rownames -numericQC = numericQC[match(colnames(hm_data), rownames(numericQC), nomatch=0), !additional_remove] - -column_annotations = HeatmapAnnotation(df = numericQC) - -if (params$annot){ - cheatmap <- ComplexHeatmap::Heatmap(hm_data, - col=colorRamp2(seq(-max(abs(pca_exp), na.rm = T), max(abs(pca_exp), na.rm = T), length.out = 20), - rev(colorRampPalette(brewer.pal(9, "PuOr"))(20))), - bottom_annotation = column_annotations, - show_column_names=T, - column_names_rot = 45, - cluster_rows = FALSE, - show_heatmap_legend = F) # Turning off to control the placement -} else{ - cheatmap <- ComplexHeatmap::Heatmap(hm_data, - col=colorRamp2(seq(-max(abs(pca_exp), na.rm = T), max(abs(pca_exp), na.rm = T), length.out = 20), - rev(colorRampPalette(brewer.pal(9, "PuOr"))(20))), - bottom_annotation = column_annotations, - show_column_names=F, - cluster_rows = FALSE, - show_heatmap_legend = F) # Turning off to control the placement +numericQC <- numericQC[match(colnames(hm_data), rownames(numericQC), nomatch = 0), !additional_remove] + +column_annotations <- HeatmapAnnotation(df = numericQC) + +if (params$annot) { + cheatmap <- ComplexHeatmap::Heatmap(hm_data, + col = colorRamp2( + seq(-max(abs(pca_exp), na.rm = T), max(abs(pca_exp), na.rm = T), length.out = 20), + rev(colorRampPalette(brewer.pal(9, "PuOr"))(20)) + ), + bottom_annotation = column_annotations, + show_column_names = T, + column_names_rot = 45, + cluster_rows = FALSE, + show_heatmap_legend = F + ) # Turning off to control the placement +} else { + cheatmap <- ComplexHeatmap::Heatmap(hm_data, + col = colorRamp2( + seq(-max(abs(pca_exp), na.rm = T), max(abs(pca_exp), na.rm = T), length.out = 20), + rev(colorRampPalette(brewer.pal(9, "PuOr"))(20)) + ), + bottom_annotation = column_annotations, + show_column_names = F, + cluster_rows = FALSE, + show_heatmap_legend = F + ) # Turning off to control the placement } draw(cheatmap, show_annotation_legend = FALSE) @@ -745,25 +794,25 @@ Here is a set of generalized guidelines for different QC metrics. Some of these **References** -**1.** Daley, T. and A.D. Smith, Predicting the molecular complexity of sequencing libraries. Nat Methods, 2013. 10(4): p. 325-7. -**2.** Andrews, S. (2010). FastQC: a quality control tool for high throughput sequence data. -**3.** Martin, M. (2011). "Cutadapt removes adapter sequences from high-throughput sequencing reads." EMBnet 17(1): 10-12. -**4.** Dobin, A., et al., STAR: ultrafast universal RNA-seq aligner. Bioinformatics, 2013. 29(1): p. 15-21. -**5.** Li, B. and C.N. Dewey, RSEM: accurate transcript quantification from RNA-Seq data with or without a reference genome. BMC Bioinformatics, 2011. 12: p. 323. -**6.** Harrow, J., et al., GENCODE: the reference human genome annotation for The ENCODE Project. Genome Res, 2012. 22(9): p. 1760-74. -**7.** Law, C.W., et al., voom: Precision weights unlock linear model analysis tools for RNA-seq read counts. Genome Biol, 2014. 15(2): p. R29. -**8.** Smyth, G.K., Linear models and empirical bayes methods for assessing differential expression in microarray experiments. Stat Appl Genet Mol Biol, 2004. 3: p. Article3. -**9.** Wang, L., et al. (2012). "RSeQC: quality control of RNA-seq experiments." 
Bioinformatics 28(16): 2184-2185. -**10.** The Picard toolkit. https://broadinstitute.github.io/picard/. -**11.** Ewels, P., et al. (2016). "MultiQC: summarize analysis results for multiple tools and samples in a single report." Bioinformatics 32(19): 3047-3048. -**12.** R Core Team (2018). R: A Language and Environment for Statistical Computing. Vienna, Austria, R Foundation for Statistical Computing. -**13.** Li, H., et al. (2009). "The Sequence Alignment/Map format and SAMtools." Bioinformatics 25(16): 2078-2079. -**14.** Wood, D. E. and S. L. Salzberg (2014). "Kraken: ultrafast metagenomic sequence classification using exact alignments." Genome Biol 15(3): R46. -**15.** Ondov, B. D., et al. (2011). "Interactive metagenomic visualization in a Web browser." BMC Bioinformatics 12(1): 385. -**16.** Okonechnikov, K., et al. (2015). "Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data." Bioinformatics 32(2): 292-294. -**17.** Wingett, S. and S. Andrews (2018). "FastQ Screen: A tool for multi-genome mapping and quality control." F1000Research 7(2): 1338. -**18.** Robinson, M. D., et al. (2009). "edgeR: a Bioconductor package for differential expression analysis of digital gene expression data." Bioinformatics 26(1): 139-140. -**9.** Wang, L., et al. (2012). "RSeQC: quality control of RNA-seq experiments." Bioinformatics 28(16): 2184-2185. +**1.** Daley, T. and A.D. Smith, Predicting the molecular complexity of sequencing libraries. Nat Methods, 2013. 10(4): p. 325-7. +**2.** Andrews, S. (2010). FastQC: a quality control tool for high throughput sequence data. +**3.** Martin, M. (2011). "Cutadapt removes adapter sequences from high-throughput sequencing reads." EMBnet 17(1): 10-12. +**4.** Dobin, A., et al., STAR: ultrafast universal RNA-seq aligner. Bioinformatics, 2013. 29(1): p. 15-21. +**5.** Li, B. and C.N. Dewey, RSEM: accurate transcript quantification from RNA-Seq data with or without a reference genome. BMC Bioinformatics, 2011. 12: p. 323. +**6.** Harrow, J., et al., GENCODE: the reference human genome annotation for The ENCODE Project. Genome Res, 2012. 22(9): p. 1760-74. +**7.** Law, C.W., et al., voom: Precision weights unlock linear model analysis tools for RNA-seq read counts. Genome Biol, 2014. 15(2): p. R29. +**8.** Smyth, G.K., Linear models and empirical bayes methods for assessing differential expression in microarray experiments. Stat Appl Genet Mol Biol, 2004. 3: p. Article3. +**9.** Wang, L., et al. (2012). "RSeQC: quality control of RNA-seq experiments." Bioinformatics 28(16): 2184-2185. +**10.** The Picard toolkit. https://broadinstitute.github.io/picard/. +**11.** Ewels, P., et al. (2016). "MultiQC: summarize analysis results for multiple tools and samples in a single report." Bioinformatics 32(19): 3047-3048. +**12.** R Core Team (2018). R: A Language and Environment for Statistical Computing. Vienna, Austria, R Foundation for Statistical Computing. +**13.** Li, H., et al. (2009). "The Sequence Alignment/Map format and SAMtools." Bioinformatics 25(16): 2078-2079. +**14.** Wood, D. E. and S. L. Salzberg (2014). "Kraken: ultrafast metagenomic sequence classification using exact alignments." Genome Biol 15(3): R46. +**15.** Ondov, B. D., et al. (2011). "Interactive metagenomic visualization in a Web browser." BMC Bioinformatics 12(1): 385. +**16.** Okonechnikov, K., et al. (2015). "Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data." Bioinformatics 32(2): 292-294. +**17.** Wingett, S. and S. 
Andrews (2018). "FastQ Screen: A tool for multi-genome mapping and quality control." F1000Research 7(2): 1338. +**18.** Robinson, M. D., et al. (2009). "edgeR: a Bioconductor package for differential expression analysis of digital gene expression data." Bioinformatics 26(1): 139-140. +**9.** Wang, L., et al. (2012). "RSeQC: quality control of RNA-seq experiments." Bioinformatics 28(16): 2184-2185. Column {data-width=400} ------------------------------------- diff --git a/workflow/scripts/rsemcounts.R b/workflow/scripts/rsemcounts.R index 5937cbe..736e75a 100644 --- a/workflow/scripts/rsemcounts.R +++ b/workflow/scripts/rsemcounts.R @@ -1,88 +1,86 @@ -library('reshape') -library('ggplot2') -library('edgeR') -library('DESeq2') -library('tidyverse') +library("reshape") +library("ggplot2") +library("edgeR") +library("DESeq2") +library("tidyverse") -writegzfile <- function(m,f) { - m=as.data.frame(m) - m$id=rownames(m) - m=separate(data=m,col=id,into=c('ensID','geneName'),sep="\\|",remove=TRUE) - m=m %>% select('ensID','geneName',everything()) - write.table(m,file=gzfile(f),sep="\t",row.names = FALSE,quote=F) -} +writegzfile <- function(m, f) { + m <- as.data.frame(m) + m$id <- rownames(m) + m <- separate(data = m, col = id, into = c("ensID", "geneName"), sep = "\\|", remove = TRUE) + m <- m %>% select("ensID", "geneName", everything()) + write.table(m, file = gzfile(f), sep = "\t", row.names = FALSE, quote = F) +} args <- commandArgs(trailingOnly = TRUE) DIR <- args[1] FILES <- args[2] ANNOTATE <- args[3] SAMPLETABLE <- args[4] -MINCOUNT=0.5 -MINSAMPLES=0.5 +MINCOUNT <- 0.5 +MINSAMPLES <- 0.5 setwd(DIR) -x=read.table(SAMPLETABLE,header = T,sep="\t") -myfiles=as.character(unlist(strsplit(FILES, split=" "))) -res=read.delim(myfiles[1],header=T)[,c(1,5)] -colnames(res)[2]=as.character(myfiles[1]) -for(i in seq(2, length(myfiles), by = 1)) -{{ -temp=read.delim(myfiles[i],header=T)[,c(1,5)] -colnames(temp)[2]=as.character(myfiles[i]) -res=merge(res,temp) -}} +x <- read.table(SAMPLETABLE, header = T, sep = "\t") +myfiles <- as.character(unlist(strsplit(FILES, split = " "))) +res <- read.delim(myfiles[1], header = T)[, c(1, 5)] +colnames(res)[2] <- as.character(myfiles[1]) +for (i in seq(2, length(myfiles), by = 1)) +{{ temp <- read.delim(myfiles[i], header = T)[, c(1, 5)] + colnames(temp)[2] <- as.character(myfiles[i]) + res <- merge(res, temp) }} -gene_name=read.delim(ANNOTATE,header=F,sep=" ") -res2=merge(gene_name,res,by.x=1,by.y=1) -res3=cbind(symbol=paste(res2[,1],"|",res2[,3],sep=""),res2[,-c(1,2,3,4,5)]) -colnames(res3)=gsub('\\..*$','',colnames(res3)) -colnames(res3)=gsub('.*/','',colnames(res3)) -write.table(as.data.frame(res3),file="RawCountFile_RSEM_genes.txt",sep="\t",row.names=F,quote = F) -rownames(res3)=res3$symbol -mydata=res3[,-c(1)] -mydata=ceiling(mydata) -writegzfile(cpm(mydata),"RSEM_CPM_counts.txt.gz") +gene_name <- read.delim(ANNOTATE, header = F, sep = " ") +res2 <- merge(gene_name, res, by.x = 1, by.y = 1) +res3 <- cbind(symbol = paste(res2[, 1], "|", res2[, 3], sep = ""), res2[, -c(1, 2, 3, 4, 5)]) +colnames(res3) <- gsub("\\..*$", "", colnames(res3)) +colnames(res3) <- gsub(".*/", "", colnames(res3)) +write.table(as.data.frame(res3), file = "RawCountFile_RSEM_genes.txt", sep = "\t", row.names = F, quote = F) +rownames(res3) <- res3$symbol +mydata <- res3[, -c(1)] +mydata <- ceiling(mydata) +writegzfile(cpm(mydata), "RSEM_CPM_counts.txt.gz") -groups=levels(x$condition) -G1=groups[1] -g1_samples=(x$condition==G1) 
-ng1=max(1,floor(length(g1_samples[g1_samples==TRUE])*MINSAMPLES)) -CPM_CUTOFF=MINCOUNT -mydata1=mydata[,g1_samples] -k_g1=rowSums(cpm(mydata1)>CPM_CUTOFF)>=ng1 -k=k_g1 +groups <- levels(x$condition) +G1 <- groups[1] +g1_samples <- (x$condition == G1) +ng1 <- max(1, floor(length(g1_samples[g1_samples == TRUE]) * MINSAMPLES)) +CPM_CUTOFF <- MINCOUNT +mydata1 <- mydata[, g1_samples] +k_g1 <- rowSums(cpm(mydata1) > CPM_CUTOFF) >= ng1 +k <- k_g1 table(k) -if (length(groups)>1) { - for(i in seq(2,length(levels(x$condition)))){ - Gi=groups[i] - gi_samples=(x$condition==Gi) - ngi=max(1,floor(length(gi_samples[gi_samples==TRUE])*MINSAMPLES)) - mydatai=mydata[,gi_samples] - k_gi=rowSums(cpm(mydatai)>CPM_CUTOFF)>=ngi - k=k|k_gi +if (length(groups) > 1) { + for (i in seq(2, length(levels(x$condition)))) { + Gi <- groups[i] + gi_samples <- (x$condition == Gi) + ngi <- max(1, floor(length(gi_samples[gi_samples == TRUE]) * MINSAMPLES)) + mydatai <- mydata[, gi_samples] + k_gi <- rowSums(cpm(mydatai) > CPM_CUTOFF) >= ngi + k <- k | k_gi print(table(k)) } } -res=mydata[k,] -res2=res -res2$symbol=rownames(res2) -res2=res2 %>% select('symbol',everything()) -write.table(res2,file="RawCountFile_RSEM_genes_filtered.txt",row.names = F,quote = F,sep="\t") -y = DGEList(counts=res) +res <- mydata[k, ] +res2 <- res +res2$symbol <- rownames(res2) +res2 <- res2 %>% select("symbol", everything()) +write.table(res2, file = "RawCountFile_RSEM_genes_filtered.txt", row.names = F, quote = F, sep = "\t") +y <- DGEList(counts = res) ## Normalization TMM ------------------------------------------------------------ ## method = =c("TMM","RLE","upperquartile","none") -y <- calcNormFactors(y,method="TMM") -ndata= cpm(y,log=FALSE,normalized.lib.sizes=TRUE) -writegzfile(ndata,"RSEM_CPM_TMM_counts.txt.gz") - ## unfiltered normalization -y2 = DGEList(counts=mydata) -y2 <- calcNormFactors(y2,method="TMM") -ndata2= cpm(y2,log=FALSE,normalized.lib.sizes=TRUE) -## save it -writegzfile(ndata2,"RSEM_CPM_TMM_unfiltered_counts.txt.gz") -rlogres=rlog(as.matrix(res),blind=TRUE) -rownames(rlogres)=rownames(res) -writegzfile(rlogres,"RSEM_rlog_counts.txt.gz") +y <- calcNormFactors(y, method = "TMM") +ndata <- cpm(y, log = FALSE, normalized.lib.sizes = TRUE) +writegzfile(ndata, "RSEM_CPM_TMM_counts.txt.gz") +## unfiltered normalization +y2 <- DGEList(counts = mydata) +y2 <- calcNormFactors(y2, method = "TMM") +ndata2 <- cpm(y2, log = FALSE, normalized.lib.sizes = TRUE) +## save it +writegzfile(ndata2, "RSEM_CPM_TMM_unfiltered_counts.txt.gz") +rlogres <- rlog(as.matrix(res), blind = TRUE) +rownames(rlogres) <- rownames(res) +writegzfile(rlogres, "RSEM_rlog_counts.txt.gz") From 6300381a47b656c85917c498d588c4f31b68b9ec Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 23 Jan 2024 15:26:13 -0500 Subject: [PATCH 2/3] docs: fix spelling errors --- README.md | 2 +- docs/RNA-seq/Resources.md | 2 +- docs/RNA-seq/TLDR-RNA-seq.md | 14 +++--- docs/RNA-seq/build.md | 16 +++---- docs/RNA-seq/run.md | 12 ++--- docs/index.md | 8 ++-- docs/troubleshooting.md | 2 +- resources/biowulf/fastq_screen.conf | 2 +- resources/biowulf/fastq_screen_2.conf | 2 +- resources/builder | 8 ++-- resources/cacher | 8 ++-- resources/clean_gtf.py | 8 ++-- resources/download_dme_files | 24 +++++----- resources/frce/fastq_screen.conf | 2 +- resources/frce/fastq_screen_2.conf | 2 +- resources/jobby | 10 ++--- resources/upload_to_nidap | 14 +++--- workflow/rules/build.smk | 8 ++-- workflow/rules/single-end.smk | 12 ++--- .../builder/gtf2protein_coding_genes.py | 4 +- 
workflow/scripts/common.py | 2 +- workflow/scripts/do_run_rMATS | 12 ++--- workflow/scripts/files2spreadsheet.py | 2 +- workflow/scripts/get_flowcell_lanes.py | 12 ++--- workflow/scripts/merge_rsem_results.py | 2 +- workflow/scripts/pyparser.py | 44 ++++++++++--------- workflow/scripts/rNA_flowcells.Rmd | 8 ++-- workflow/scripts/rNA_groups.Rmd | 8 ++-- 28 files changed, 126 insertions(+), 124 deletions(-) diff --git a/README.md b/README.md index bd9bbff..960501b 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,7 @@ The accuracy of the downstream interpretations made from transcriptomic data are [_RSeQC_9](http://rseqc.sourceforge.net/) is another particularity useful package that is tailored for RNA-seq data. It is used to calculate the inner distance between paired-end reads and calculate TIN values for a set of canonical protein-coding transcripts. A median TIN value is calucated for each sample, which analogous to a computationally derived RIN. -[MultiQC11](https://multiqc.info/) is used to aggreate the results of each tool into a single interactive report. +[MultiQC11](https://multiqc.info/) is used to aggregate the results of each tool into a single interactive report. **Quantification** [_Cutadapt_3](https://cutadapt.readthedocs.io/en/stable/) is used to remove adapter sequences, perform quality trimming, and remove very short sequences that would otherwise multi-map all over the genome prior to alignment. diff --git a/docs/RNA-seq/Resources.md b/docs/RNA-seq/Resources.md index a00d4b9..1c864a6 100644 --- a/docs/RNA-seq/Resources.md +++ b/docs/RNA-seq/Resources.md @@ -43,7 +43,7 @@ If you do not have access to Biowulf or you are looking for a reference genome a | RSeQC13 | 4.0.0 | [nciccbr/ccbr_rseqc_4.0.0](https://hub.docker.com/repository/docker/nciccbr/ccbr_rseqc_4.0.0) | **Quality-control step** to infer stranded-ness and read distributions over specific genomic features | | RSEM14 | 1.3.3 | [nciccbr/ccbr_rsem_1.3.3](https://hub.docker.com/repository/docker/nciccbr/ccbr_rsem_1.3.3) | **Data processing step** to quantify gene and isoform counts | | Arriba15 | 2.0.0 | [nciccbr/ccbr_arriba_2.0.0](https://hub.docker.com/repository/docker/nciccbr/ccbr_arriba_2.0.0) | **Data processing step** to quantify gene-fusions | -| RNA Report | [custom](https://github.com/CCBR/rNA) | [nciccbr/ccbr_rna](https://hub.docker.com/repository/docker/nciccbr/ccbr_rna) | **Summarization step** to identify outliers and assess techincal sources of variation | +| RNA Report | [custom](https://github.com/CCBR/rNA) | [nciccbr/ccbr_rna](https://hub.docker.com/repository/docker/nciccbr/ccbr_rna) | **Summarization step** to identify outliers and assess technical sources of variation | | MultiQC16 | 1.12 | [skchronicles/multiqc](https://hub.docker.com/repository/docker/skchronicles/multiqc/) | **Reporting step** to aggregate sample statistics and quality-control information across all sample | ## 3. Acknowledgements diff --git a/docs/RNA-seq/TLDR-RNA-seq.md b/docs/RNA-seq/TLDR-RNA-seq.md index c2baaad..0de1c77 100644 --- a/docs/RNA-seq/TLDR-RNA-seq.md +++ b/docs/RNA-seq/TLDR-RNA-seq.md @@ -1,6 +1,6 @@ ## 1. Introduction -When processing RNA-sequencing data, there are often many steps that we must repeat. These are usually steps like removing adapter sequences, aligning reads against a reference genome, checking the quality of the data, and quantifying counts. RENEE is composed of several sub commands or convience functions to automate these repetitive steps. 
+When processing RNA-sequencing data, there are often many steps that we must repeat. These are usually steps like removing adapter sequences, aligning reads against a reference genome, checking the quality of the data, and quantifying counts. RENEE is composed of several sub commands or convenience functions to automate these repetitive steps. With RENEE, you can run your samples through our highly-reproducible pipeline, build resources for new reference genomes, and more! @@ -35,7 +35,7 @@ ssh -Y $USER@biowulf.nih.gov srun -N 1 -n 1 --time=12:00:00 -p interactive --mem=8gb --cpus-per-task=4 --pty bash ``` -### 2.3 Load dependecies +### 2.3 Load dependencies ```bash # Setup Step 2.) Add singularity and snakemake executables to $PATH @@ -47,7 +47,7 @@ module load ccbrpipeliner In this example, we will start off by building reference files downloaded from [GENCODE](https://www.gencodegenes.org/). We recommend downloading the `PRI` Genome FASTA file and annotation from [GENCODE](https://www.gencodegenes.org/). These `PRI` reference files contain the primary chromosomes and scaffolds. We **do not** recommend downloading the `CHR` reference files! -Checkout [this](./Resources.md) list for currently avaiable resources on Biowulf. If your required **genome + annotation combination** is NOT available, only then proceed to building your own reference files. Also, if you think that your **genome + annotation combination** may be beneficial for other Biowulf users of RENEE as well, then please request it to be added to RENEE's default resources by [opening an issue on Github](https://github.com/CCBR/RENEE/issues). +Checkout [this](./Resources.md) list for currently available resources on Biowulf. If your required **genome + annotation combination** is NOT available, only then proceed to building your own reference files. Also, if you think that your **genome + annotation combination** may be beneficial for other Biowulf users of RENEE as well, then please request it to be added to RENEE's default resources by [opening an issue on Github](https://github.com/CCBR/RENEE/issues). ### 3.1 Download References from GENCODE @@ -87,15 +87,15 @@ renee build --ref-fa GRCh38.primary_assembly.genome.fa \ --gtf-ver 36 --output /data/$USER/hg38_36 ``` -An email notification will be sent out when the pipeline starts and ends. Once the build pipeline completes, you can run RENEE with the provided test dataset. Please see the intructions below for more information. +An email notification will be sent out when the pipeline starts and ends. Once the build pipeline completes, you can run RENEE with the provided test dataset. Please see the instructions below for more information. ## 4. Running RENEE -Run RENEE with the reference files we built above using hg38 (GRCh38.p13) Genome FASTA file and GENCODE release 36 annotation (GTF). For more information about how the reference files we generated, please see the intructions above. You can use those instructions as a guide for building any new reference genomes in the future. +Run RENEE with the reference files we built above using hg38 (GRCh38.p13) Genome FASTA file and GENCODE release 36 annotation (GTF). For more information about how the reference files we generated, please see the instructions above. You can use those instructions as a guide for building any new reference genomes in the future. ### 4.1 Dry-run pipeline -Dry-run the pipeline prior to submiting the pipeline's master job. 
Please note that if you wish to run RENEE with a new dataset, you will only need to update the values provided to the `--input` and `--output` arguments (and maybe `--genome`). The `--input` argument supports globbing. If this is the first time running RENEE with for given dataset, the `--output` directory should _**not**_ exist on your local filesystem. It will be created automatically during runtime. +Dry-run the pipeline prior to submitting the pipeline's master job. Please note that if you wish to run RENEE with a new dataset, you will only need to update the values provided to the `--input` and `--output` arguments (and maybe `--genome`). The `--input` argument supports globbing. If this is the first time running RENEE with for given dataset, the `--output` directory should _**not**_ exist on your local filesystem. It will be created automatically during runtime. ```bash # Run Step 0.) Please do not run RENEE on the head node! @@ -121,7 +121,7 @@ renee run \ ### 4.2 Run pipeline -Kick off the pipeline by submiting the master job to the cluster. It is essentially the same command above without the `--dry-run` flag. +Kick off the pipeline by submitting the master job to the cluster. It is essentially the same command above without the `--dry-run` flag. ```bash # Run Step 3.) Submit the master job diff --git a/docs/RNA-seq/build.md b/docs/RNA-seq/build.md index a60f36a..b7d7ac2 100644 --- a/docs/RNA-seq/build.md +++ b/docs/RNA-seq/build.md @@ -41,7 +41,7 @@ Each of the following arguments are required. Failure to provide a required argu > **Genomic FASTA file of the reference genome.** > _type: file_ > -> This file represents the genome sequence of the reference assembly in FASTA format. If you are downloading this from GENCODE, you should select the _PRI_ genomic FASTA file. This file will contain the primary genomic assembly (contains chromosomes and scaffolds). This input file should not be compressed. Sequence identifers in this file must match with sequence identifers in the GTF file provided to `--ref-gtf`. +> This file represents the genome sequence of the reference assembly in FASTA format. If you are downloading this from GENCODE, you should select the _PRI_ genomic FASTA file. This file will contain the primary genomic assembly (contains chromosomes and scaffolds). This input file should not be compressed. Sequence identifiers in this file must match with sequence identifiers in the GTF file provided to `--ref-gtf`. > > **_Example:_** > `--ref-fa GRCh38.primary_assembly.genome.fa` @@ -63,7 +63,7 @@ Each of the following arguments are required. Failure to provide a required argu > **Gene annotation or GTF file for the reference genome.** > _type: file_ > -> This file represents the reference genome's gene annotation in GTF format. If you are downloading this from GENCODE, you should select the 'PRI' GTF file. This file contains gene annotations for the primary assembly (contains chromosomes and scaffolds). This input file should not be compressed. Sequence identifers (column 1) in this file must match with sequence identifers in the FASTA file provided to `--ref-fa`. +> This file represents the reference genome's gene annotation in GTF format. If you are downloading this from GENCODE, you should select the 'PRI' GTF file. This file contains gene annotations for the primary assembly (contains chromosomes and scaffolds). This input file should not be compressed. 
Sequence identifiers (column 1) in this file must match with sequence identifiers in the FASTA file provided to `--ref-fa`. > **_Example:_** `--ref-gtf gencode.v36.primary_assembly.annotation.gtf` --- @@ -95,7 +95,7 @@ Each of the following arguments are optional and do not need to be provided. If > **Local path to shared resources.** > _type: path_ > -> The pipeline uses a set of shared reference files that can be re-used across reference genomes. These currently include reference files for kraken and FQScreen. These reference files can be downloaded with the build sub command's `--shared-resources` option. With that being said, these files only need to be downloaded once. We recommend storing this files in a shared location on the filesystem that other people can access. If you are running the pipeline on Biowulf, you do NOT need to download these reference files! They already exist on the filesystem in a location that anyone can acceess; however, if you are running the pipeline on another cluster or target system, you will need to download the shared resources with the build sub command, and you will need to provide this option every time you run the pipeline. Please provide the same path that was provided to the build sub command's --shared-resources option. Again, if you are running the pipeline on Biowulf, you do NOT need to provide this option. For more information about how to download shared resources, please reference the build sub command's `--shared-resources` option. +> The pipeline uses a set of shared reference files that can be re-used across reference genomes. These currently include reference files for kraken and FQScreen. These reference files can be downloaded with the build sub command's `--shared-resources` option. With that being said, these files only need to be downloaded once. We recommend storing this files in a shared location on the filesystem that other people can access. If you are running the pipeline on Biowulf, you do NOT need to download these reference files! They already exist on the filesystem in a location that anyone can access; however, if you are running the pipeline on another cluster or target system, you will need to download the shared resources with the build sub command, and you will need to provide this option every time you run the pipeline. Please provide the same path that was provided to the build sub command's --shared-resources option. Again, if you are running the pipeline on Biowulf, you do NOT need to provide this option. For more information about how to download shared resources, please reference the build sub command's `--shared-resources` option. > > **_Example:_** `--shared-resources /data/shared/renee` @@ -106,7 +106,7 @@ Each of the following arguments are optional and do not need to be provided. If > **Builds a small genome index.** > _type: boolean_ > -> For small genomes, it is recommeded running STAR with a scaled down `--genomeSAindexNbases` value. This option runs the build pipeline in a mode where it dynamically finds the optimal value for this option using the following formula: `min(14, log2(GenomeSize)/2 - 1)`. Generally speaking, this option is not really applicable for most mammalian reference genomes, i.e. human and mouse; however, researcher working with very small reference genomes, like S. cerevisiae ~ 12Mb, should provide this option. +> For small genomes, it is recommended running STAR with a scaled down `--genomeSAindexNbases` value. 
This option runs the build pipeline in a mode where it dynamically finds the optimal value for this option using the following formula: `min(14, log2(GenomeSize)/2 - 1)`. Generally speaking, this option is not really applicable for most mammalian reference genomes, i.e. human and mouse; however, researcher working with very small reference genomes, like S. cerevisiae ~ 12Mb, should provide this option. > > When in doubt feel free to provide this option, as the optimal value will be found based on your input. > @@ -175,13 +175,13 @@ Each of the following arguments are optional and do not need to be provided. If you have two GTF files, e.g. hybrid genomes (host + virus), then you need to create one genomic FASTA file and one GTF file for the hybrid genome prior to running the renee build command. -We recommend creating an artifical chromosome for the non-host sequence. The sequence identifer in the FASTA file must match the sequence identifer in the GTF file (column 1). Generally speaking, since the host annotation is usually downloaded from Ensembl or GENCODE, it will be correctly formatted; however, that may not be the case for the non-host sequence! +We recommend creating an artificial chromosome for the non-host sequence. The sequence identifier in the FASTA file must match the sequence identifier in the GTF file (column 1). Generally speaking, since the host annotation is usually downloaded from Ensembl or GENCODE, it will be correctly formatted; however, that may not be the case for the non-host sequence! Please ensure the non-host annotation contains the following features and/or constraints: - for a given `gene` feature - each `gene` entry has at least one `transcript` feature - - and each `transcript` entry has atleast one `exon` feature + - and each `transcript` entry has at least one `exon` feature - `gene_id`, `gene_name` and `gene_biotype` are required - for a given `transcipt` feature - along with `gene_id`, `gene_name` and `gene_biotype` ... `transcript_id` is also required @@ -214,7 +214,7 @@ It is worth noting that RENEE comes bundled with a script to convert GFF3 files Please note that this script has only been tested with GFF3 files downloaded from NCBI, and _it is **not** recommended to use with GFF3 files originating from other sources like Ensembl or GENCODE_. If you are selecting an annotation from Ensembl or GENCODE, please download the GTF file option. -The only dependecy of the script is the python package argparse, which comes bundled with the following python2/3 distributions: `python>=2.7.18` or `python>=3.2`. If argparse is not installed, it can be downloaded with pip by running the following command: +The only dependency of the script is the python package argparse, which comes bundled with the following python2/3 distributions: `python>=2.7.18` or `python>=3.2`. If argparse is not installed, it can be downloaded with pip by running the following command: ```bash pip install --upgrade pip @@ -276,7 +276,7 @@ module purge # snakemake, and # singularity # before running renee -# Also, ensure that the `renee` execulable is in PATH +# Also, ensure that the `renee` executable is in PATH # Step 1.) Dry run the Build pipeline renee build --ref-fa GRCm39.primary_assembly.genome.fa \ diff --git a/docs/RNA-seq/run.md b/docs/RNA-seq/run.md index c8f181b..6350315 100644 --- a/docs/RNA-seq/run.md +++ b/docs/RNA-seq/run.md @@ -39,7 +39,7 @@ Each of the following arguments are required. 
Failure to provide a required argu > **Input FastQ file(s) to process.** > _type: file_ > -> One or more FastQ files can be provided. From the command-line, each FastQ file should seperated by a space. Globbing is supported! This makes selecting FastQ files easier. Input FastQ files should be gzipp-ed. The pipeline supports single-end and pair-end RNA-seq data; however, the pipeline will not process a mixture of single-end and paired-end samples together. If you have a mixture of single-end and pair-end samples to process, please process them as two seperate instances of the RENEE pipeline (with two seperate output directories). +> One or more FastQ files can be provided. From the command-line, each FastQ file should separated by a space. Globbing is supported! This makes selecting FastQ files easier. Input FastQ files should be gzipp-ed. The pipeline supports single-end and pair-end RNA-seq data; however, the pipeline will not process a mixture of single-end and paired-end samples together. If you have a mixture of single-end and pair-end samples to process, please process them as two separate instances of the RENEE pipeline (with two separate output directories). > > **_Example:_** `--input .tests/*.R?.fastq.gz` @@ -64,7 +64,7 @@ Each of the following arguments are required. Failure to provide a required argu > This option defines the reference genome for your set of samples. On Biowulf, RENEE does comes bundled with pre built reference files for human and mouse samples; however, it is worth noting that the pipeline does accept a custom reference genome built with the build sub command. Building a new reference genome is easy! You can create a custom reference genome with a single command. This is extremely useful when working with non-model organisms. New users can reference the documentation's [getting started](../TLDR-RNA-seq/#3-building-reference-files) section to see how a reference genome is built. > > **_Pre built Option_** -> Pre build genomes are avaiable with RENEE. Please see the [resources page](../Resources/#1.-Reference-genomes) for more information about each pre built option. +> Pre build genomes are available with RENEE. Please see the [resources page](../Resources/#1.-Reference-genomes) for more information about each pre built option. > > **_Custom Option_** > A user can also supply a custom reference genome built with the build sub command. Please supply the custom reference JSON file that was generated by the build sub command. The name of this custom reference JSON file is dependent on the values provided to the following _renee build_ args, `--ref-name REF_NAME` and `--gtf-ver GTF_VER`, where the name of the provided custom reference JSON file would be: `{REF_NAME}_{GTF_VER}.json`. @@ -75,7 +75,7 @@ Each of the following arguments are required. Failure to provide a required argu `--small-rna` -> **Run STAR using ENCODE's recomendations for small RNA.** +> **Run STAR using ENCODE's recommendations for small RNA.** > _type: boolean_ > > This option should only be used with small RNA libraries. These are rRNA-depleted libraries that have been size selected to contain fragments shorter than 200bp. Size selection enriches for small RNA species such as miRNAs, siRNAs, or piRNAs. Also, this option should not be combined with the star 2-pass basic option. If the two options are combined, STAR will run in pass basic mode. This means that STAR will not run with ENCODE's recommendations for small RNA alignment. As so, please take caution not to combine both options together. 
@@ -117,7 +117,7 @@ Each of the following arguments are optional and do not need to be provided. > **Execution Method.** > _type: string_ > _default: slurm_ > -> Execution Method. Defines the mode or method of execution. Vaild mode options include: slurm or local. +> Execution Method. Defines the mode or method of execution. Valid mode options include: slurm or local. > > **_local_** > Local executions will run serially on compute instance. This is useful for testing, debugging, or when a users does not have access to a high performance computing environment. If this option is not provided, it will default to a local execution mode. @@ -134,7 +134,7 @@ Each of the following arguments are optional and do not need to be provided. > **Local path to shared resources.** > _type: path_ > -> The pipeline uses a set of shared reference files that can be re-used across reference genomes. These currently include reference files for kraken and FQScreen. These reference files can be downloaded with the build sub command's `--shared-resources` option. With that being said, these files only need to be downloaded once. We recommend storing this files in a shared location on the filesystem that other people can access. If you are running the pipeline on Biowulf, you do NOT need to download these reference files! They already exist on the filesystem in a location that anyone can acceess; however, if you are running the pipeline on another cluster or target system, you will need to download the shared resources with the build sub command, and you will need to provide this option every time you run the pipeline. Please provide the same path that was provided to the build sub command's --shared-resources option. Again, if you are running the pipeline on Biowulf, you do NOT need to provide this option. For more information about how to download shared resources, please reference the build sub command's `--shared-resources` option. +> The pipeline uses a set of shared reference files that can be re-used across reference genomes. These currently include reference files for kraken and FQScreen. These reference files can be downloaded with the build sub command's `--shared-resources` option. With that being said, these files only need to be downloaded once. We recommend storing this files in a shared location on the filesystem that other people can access. If you are running the pipeline on Biowulf, you do NOT need to download these reference files! They already exist on the filesystem in a location that anyone can access; however, if you are running the pipeline on another cluster or target system, you will need to download the shared resources with the build sub command, and you will need to provide this option every time you run the pipeline. Please provide the same path that was provided to the build sub command's --shared-resources option. Again, if you are running the pipeline on Biowulf, you do NOT need to provide this option. For more information about how to download shared resources, please reference the build sub command's `--shared-resources` option. > > **_Example:_** `--shared-resources /data/shared/renee` @@ -181,7 +181,7 @@ Each of the following arguments are optional and do not need to be provided. > _type: int_ > _default: 2_ > -> Max number of threads for each process. This option is more applicable when running the pipeline with `--mode local`. It is recommended setting this vaule to the maximum number of CPUs available on the host machine. +> Max number of threads for each process. 
This option is more applicable when running the pipeline with `--mode local`. It is recommended setting this value to the maximum number of CPUs available on the host machine. > > **_Example:_** `--threads 12` diff --git a/docs/index.md b/docs/index.md index 3329611..7ff3646 100644 --- a/docs/index.md +++ b/docs/index.md @@ -14,15 +14,15 @@ Welcome to RENEE's documentation! This guide is the main source of documentation for users that are getting started with the [RENEE pipeline](https://github.com/CCBR/RENEE). If you are not familiar with RNA-sequencing, please checkout our [theory and practical guide](RNA-seq/Theory.md). That section provides a conceptual overview to RNA-seq analysis and as well as a set of generalized guidelines to interpret different quality-control metrics. If you are a new user, we highly recommend reading through our [getting started](RNA-seq/TLDR-RNA-seq.md) section. This page contains information needed to quickly build new reference files and setup the pipeline for running in your compute environment. -RENEE is composed several inter-related sub commands to faciliate the analysis of RNA-sequencing data. For more information about each available sub command, please see the [usage section](RNA-seq/run.md). To help out new users, an example of each command is also provided. The [resources page](RNA-seq/Resources.md) contains more information about the pipeline's default reference genomes along with every tool and Docker image the pipeline employs. +RENEE is composed several inter-related sub commands to facilitate the analysis of RNA-sequencing data. For more information about each available sub command, please see the [usage section](RNA-seq/run.md). To help out new users, an example of each command is also provided. The [resources page](RNA-seq/Resources.md) contains more information about the pipeline's default reference genomes along with every tool and Docker image the pipeline employs. For more information about issues or trouble-shooting a problem, please checkout our [FAQ](troubleshooting.md) prior to [opening an issue on Github](https://github.com/CCBR/RENEE/issues). ## 2. Overview -**RENEE** is a comprehensive, open-source RNA-seq pipeline that relies on technologies like [Docker1](https://www.docker.com/why-docker) and [Singularity2... now called Apptainer](https://apptainer.org/docs/) to maintain the highest-level of reproducibility. The pipeline consists of a series of data processing and quality-control steps orchestrated by [Snakemake3](https://snakemake.readthedocs.io/en/stable/), a flexible and scalable workflow management system, to submit jobs to a cluster or cloud provider (comming soon!). +**RENEE** is a comprehensive, open-source RNA-seq pipeline that relies on technologies like [Docker1](https://www.docker.com/why-docker) and [Singularity2... now called Apptainer](https://apptainer.org/docs/) to maintain the highest-level of reproducibility. The pipeline consists of a series of data processing and quality-control steps orchestrated by [Snakemake3](https://snakemake.readthedocs.io/en/stable/), a flexible and scalable workflow management system, to submit jobs to a cluster or cloud provider (coming soon!). -RENEE can be run locally on a compute instance, on-premise using a cluster, or on the cloud using AWS (comming soon!). A user can define the method or mode of execution. The pipeline can submit jobs to a cluster using a job scheduler like SLURM, or run on AWS using Tibanna (feature coming soon!). 
A hybrid approach ensures the pipeline is accessible to all users. +RENEE can be run locally on a compute instance, on-premise using a cluster, or on the cloud using AWS (coming soon!). A user can define the method or mode of execution. The pipeline can submit jobs to a cluster using a job scheduler like SLURM, or run on AWS using Tibanna (feature coming soon!). A hybrid approach ensures the pipeline is accessible to all users. A bioinformatics pipeline is more than the sum of its data processing steps. A pipeline without quality-control steps provides a myopic view of the potential sources of variation within your data (i.e., biological verses technical sources of variation). RENEE pipeline is composed of a series of quality-control and data processing steps. @@ -47,7 +47,7 @@ In addition to generating a MultiQC report, the RENEE pipeline also generates a [_RSeQC_10](http://rseqc.sourceforge.net/) is another particularity useful package that is tailored for RNA-seq data. It is used to calculate the inner distance between paired-end reads and calculate TIN values for a set of canonical protein-coding transcripts. A median TIN value is calucated for each sample, which analogous to a computationally derived RIN. -[MultiQC11](https://multiqc.info/) is used to aggreate the results of each tool into a single interactive report. +[MultiQC11](https://multiqc.info/) is used to aggregate the results of each tool into a single interactive report. ### 3.2 Data Processing diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 5f0576a..346a1b7 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -116,7 +116,7 @@ Once you've ensured that all running jobs have been stopped, you need to unlock ```bash # Load Dependencies -modue load ccbrpipeliner +module load ccbrpipeliner # Unlock the working directory rna-seek unlock --output /path/to/working/dir diff --git a/resources/biowulf/fastq_screen.conf b/resources/biowulf/fastq_screen.conf index d02f490..ff7e4cf 100644 --- a/resources/biowulf/fastq_screen.conf +++ b/resources/biowulf/fastq_screen.conf @@ -68,7 +68,7 @@ THREADS 24 ## are found in a folder named 'GRCh37'. ## ## If the bowtie AND bowtie2 indices of a given genome reside in the SAME FOLDER, -## a SINLGE path may be provided to BOTH sets of indices. +## a SINGLE path may be provided to BOTH sets of indices. ## ## Human - sequences available from ## ftp://ftp.ensembl.org/pub/current/fasta/homo_sapiens/dna/ diff --git a/resources/biowulf/fastq_screen_2.conf b/resources/biowulf/fastq_screen_2.conf index 25554ac..65a2503 100644 --- a/resources/biowulf/fastq_screen_2.conf +++ b/resources/biowulf/fastq_screen_2.conf @@ -68,7 +68,7 @@ THREADS 24 ## are found in a folder named 'GRCh37'. ## ## If the bowtie AND bowtie2 indices of a given genome reside in the SAME FOLDER, -## a SINLGE path may be provided to BOTH sets of indices. +## a SINGLE path may be provided to BOTH sets of indices. ## ## Human - sequences available from ## ftp://ftp.ensembl.org/pub/current/fasta/homo_sapiens/dna/ diff --git a/resources/builder b/resources/builder index a484dfa..862b937 100755 --- a/resources/builder +++ b/resources/builder @@ -29,10 +29,10 @@ Required Positional Argument: Required Arguments: -j, --job-name [Type: Str] Name of pipeline's master job. -b, --bind-paths [Type:Path] Singularity bind paths. The RENEE pipeline uses - singaularity images for exection. Bind paths are + singularity images for execution. Bind paths are used to mount the host filesystem to the container's filesystem. 
Multiple bind paths can be provided - as a comma seperated list. The main entry point + as a comma-separated list. The main entry point of the pipeline internally collects and aggregates bindpaths to mount to the container's filesystem. If you are manually running this script or by-passing @@ -157,9 +157,9 @@ function submit(){ # Snakemake executor executor=${1} - # Goto Pipeline Ouput directory + # Goto Pipeline Output directory # Create a local singularity cache in output directory - # cache can be re-used instead of re-pulling from DockerHub everytime + # cache can be re-used instead of re-pulling from DockerHub every time cd "$3" && export SINGULARITY_CACHEDIR="${5}" # unsetting XDG_RUNTIME_DIR to avoid some unsighly but harmless warnings diff --git a/resources/cacher b/resources/cacher index 2f0ee6a..f5e4d46 100755 --- a/resources/cacher +++ b/resources/cacher @@ -26,11 +26,11 @@ Required Positional Argument: Required Arguments: -s, --sif-cache [Type: Path] Path to output directory to cache remote resources. -i, --image-uris [Type: Str] Image(s) to pull from Dockerhub. Multiple images - are seperated by a comma. + are separated by a comma. OPTIONS: -t, --tmp-dir [Type: Path] Path to tmp singularity dir. Singularity uses this directory when images are pulled from DockerHub - and coverted into SIFs. If not provided, the + and converted into SIFs. If not provided, the location to the temp dir will default to the following "/tmp/$USER/cacher/.singularity/" directory. @@ -147,9 +147,9 @@ function _pull(){ # cache executor executor=${1} - # Goto Pipeline Ouput directory + # Goto Pipeline Output directory # Create a local singularity cache in output directory - # cache can be re-used instead of re-pulling from DockerHub everytime + # cache can be re-used instead of re-pulling from DockerHub every time cd "$2" && export SINGULARITY_CACHEDIR="${3}" # unsetting XDG_RUNTIME_DIR to avoid some unsighly but harmless warnings diff --git a/resources/clean_gtf.py b/resources/clean_gtf.py index 2150ce3..4039bb8 100755 --- a/resources/clean_gtf.py +++ b/resources/clean_gtf.py @@ -35,7 +35,7 @@ def replace_nested_quotes(line, find_char='"', replace_char=""): """ Assumes the quote character in the 9th column is a double quote or <"> character. This is the correct character to - use based on the speficiation. + use based on the specification. """ # Normal: # protein_id "XP_040355194.1"; @@ -72,7 +72,7 @@ def replace_nested_quotes(line, find_char='"', replace_char=""): if inside_quotes: # Fix evil mistakes of the past, - # replace reserved delimeter with + # replace reserved delimiter with # another character, let's use a # url encoding of the character if curr_char == find_char and quote_count > 1: @@ -90,7 +90,7 @@ def url_escape_inside_quotes(line, delimiter=";", url_encoding="%3B"): Assumes the quote character in the 9th column is a double quote or <"> character. This is the correct character to - use based on the speficiation. + use based on the specification. 
""" quote_count = 0 inside_quotes = False @@ -111,7 +111,7 @@ def url_escape_inside_quotes(line, delimiter=";", url_encoding="%3B"): if inside_quotes: # Fix evil mistakes of the past, - # replace reserved delimeter with + # replace reserved delimiter with # another character, let's use a # url encoding of the character if c == delimiter: diff --git a/resources/download_dme_files b/resources/download_dme_files index 0fb4d75..f56a3fc 100755 --- a/resources/download_dme_files +++ b/resources/download_dme_files @@ -4,7 +4,7 @@ set -euo pipefail __VERSION__="1.3.0" function usage() { cat << EOF -download_dme_files: a utility for dowloading file(s) from HPC DME. +download_dme_files: a utility for downloading file(s) from HPC DME. Usage: $ download_dme_files [-h] [-v] \\ @@ -35,11 +35,11 @@ Required Arguments: can be provided, or the API token can be provided as a string. Options: - -r, --rid RID [Type: Str] Request Identifer. This is an optional + -r, --rid RID [Type: Str] Request identifier. This is an optional string to help track a given request. - This identifer is appended to any log - files. If a request identifer is not - provided, a request identifer will be + This identifier is appended to any log + files. If a request identifier is not + provided, a request identifier will be generated from an MD5 checksum based on the time of the request and the other user provided options. @@ -196,24 +196,24 @@ function require(){ function _id(){ - # Generates a default request identifer + # Generates a default request identifier # if the -r or --rid option is not provided. - # The default identifer is generated by generating + # The default identifier is generated by generating # an md5 checksum of the required user inputs and # the timestamp # List of required arguments local features - local identifer + local identifier local required=("f" "o" "t" "$(timestamp)") for arg in "${required[@]}"; do value=${Arguments[${arg}]:-} features+="${value}" done - identifer=$(md5sum <<< "${features}" | awk '{print $1}') + identifier=$(md5sum <<< "${features}" | awk '{print $1}') - echo "${identifer}" + echo "${identifier}" } @@ -224,7 +224,7 @@ function _download(){ # INPUT $3 = DME API token # INPUT $4 = Request ID, see _id() for default # INPUT $5 = HTTPS Proxy, defaults to no proxy set - # @CALLS require() to enfore cURL installation + # @CALLS require() to enforce cURL installation # @CALLS timestamp() to log time of file download # @CALLS fatal() if curl returns a non-200 http response @@ -282,7 +282,7 @@ function main(){ # INPUT $1 = Files to download from DME # INPUT $2 = Local output directory # INPUT $3 = DME API token - # INPUT $4 = Request Identifer, see _id() for default + # INPUT $4 = Request identifier, see _id() for default # INPUT $5 = Optional HTTPS proxy server _download "${dmefiles}" "${outdir}" "${token}" "${requestid}" "${proxy}" diff --git a/resources/frce/fastq_screen.conf b/resources/frce/fastq_screen.conf index e396832..5bbc666 100644 --- a/resources/frce/fastq_screen.conf +++ b/resources/frce/fastq_screen.conf @@ -68,7 +68,7 @@ THREADS 24 ## are found in a folder named 'GRCh37'. ## ## If the bowtie AND bowtie2 indices of a given genome reside in the SAME FOLDER, -## a SINLGE path may be provided to BOTH sets of indices. +## a SINGLE path may be provided to BOTH sets of indices. 
## ## Human - sequences available from ## ftp://ftp.ensembl.org/pub/current/fasta/homo_sapiens/dna/ diff --git a/resources/frce/fastq_screen_2.conf b/resources/frce/fastq_screen_2.conf index 8fc9a91..f0aa346 100644 --- a/resources/frce/fastq_screen_2.conf +++ b/resources/frce/fastq_screen_2.conf @@ -68,7 +68,7 @@ THREADS 24 ## are found in a folder named 'GRCh37'. ## ## If the bowtie AND bowtie2 indices of a given genome reside in the SAME FOLDER, -## a SINLGE path may be provided to BOTH sets of indices. +## a SINGLE path may be provided to BOTH sets of indices. ## ## Human - sequences available from ## ftp://ftp.ensembl.org/pub/current/fasta/homo_sapiens/dna/ diff --git a/resources/jobby b/resources/jobby index 747183e..c75ed5d 100755 --- a/resources/jobby +++ b/resources/jobby @@ -63,7 +63,7 @@ _description = "Will take your job(s)... and display their information!" # Classes class Colors: - """Class encoding for ANSI escape sequeces for styling terminal text. + """Class encoding for ANSI escape sequences for styling terminal text. Any string that is formatting with these styles must be terminated with the escape sequence, i.e. `Colors.end`. """ @@ -75,7 +75,7 @@ class Colors: italic = "\33[3m" url = "\33[4m" blink = "\33[5m" - higlighted = "\33[7m" + highlighted = "\33[7m" # Text Colors black = "\33[30m" red = "\33[31m" @@ -181,7 +181,7 @@ def add_missing(linelist, insertion_dict): Dictionary used to insert missing information to a given index, where the keys are indices of the `linelist` and the values are information to add. Please note that the indices - should be zero based. Note that multiple consequetive values + should be zero based. Note that multiple consecutive values should be inserted at once as a list, see example below: Example: add_missing([0,1,2,3,4], {3:['+','++'], 1:'-', 4:'@'}) @@ -584,7 +584,7 @@ def parsed_arguments(name, description): Identiers of past jobs. One or more JOB_IDs can be provided. Multiple JOB_IDs should be - seperated by a space. Information for each + separated by a space. Information for each of the JOB_IDs will be displayed to standard output. Please see example section below for more information. @@ -653,7 +653,7 @@ def parsed_arguments(name, description): parser.add_argument("JOB_ID", nargs="+", help=argparse.SUPPRESS) # Options - # Adding verison information + # Adding version information parser.add_argument( "-v", "--version", diff --git a/resources/upload_to_nidap b/resources/upload_to_nidap index 70c2fec..0746a22 100755 --- a/resources/upload_to_nidap +++ b/resources/upload_to_nidap @@ -17,7 +17,7 @@ Usage: Synopsis: This script provides a high level wrapper to the NIDAP API. Given a list of local file paths, a NIDAP API token, and a NIDAP -dataset identifer, it will upload those files to a dataset on NIDAP. +dataset identifier, it will upload those files to a dataset on NIDAP. Required Arguments: -f, --files FILE [Type: Str] Files to upload to NIDAP. @@ -26,12 +26,12 @@ Required Arguments: be uploaded at once by providing a quoted space separated list of local files. - -d, --dataid DATAID [Type: Path] Dataset Identifer for NIDAP upload. - Identifer to a dataset on NIDAP + -d, --dataid DATAID [Type: Path] Dataset identifier for NIDAP upload. + identifier to a dataset on NIDAP where file(s) will be uploaded. - -r, --rid RID [Type: Str] Request Identifer. This transaction - identifer is used to help track a - given request. This identifer is + -r, --rid RID [Type: Str] Request identifier. 
This transaction + identifier is used to help track a + given request. This identifier is also appended to any log files. -t, --token TOKEN [Type: Str] API token for NIDAP. A text file containing an API token for NIDAP @@ -235,7 +235,7 @@ function _upload(){ # INPUT $3 = NIDAP API token # INPUT $4 = NIDAP Request or transaction ID # INPUT $5 = HTTPS Proxy, defaults to no proxy set - # @CALLS require() to enfore cURL installation + # @CALLS require() to enforce cURL installation # @CALLS timestamp() to log time of file uploads # @CALLS fatal() if curl returns a non-200 http response # @CALLS _commit() to close an open upload transaction and commit the files to dataset diff --git a/workflow/rules/build.smk b/workflow/rules/build.smk index cc91c35..e3957a6 100644 --- a/workflow/rules/build.smk +++ b/workflow/rules/build.smk @@ -273,7 +273,7 @@ if SMALL_GENOME == "True": """ Builds STAR Index to align reads against reference genome without the GTF or readlength provided. This index only contain information pertaining - to the asssembly or reference genome in FASTA format. This indice represents a + to the assembly or reference genome in FASTA format. This indice represents a base index from which processing annotations from a GTF and insert junctions on the fly. This has the advantage of saving diskspace as an index will not be created for a list of predefined readlengths. This rule replaces star_rl above. @@ -332,7 +332,7 @@ else: """ Builds STAR Index to align reads against reference genome without the GTF or readlength provided. This index only contain information pertaining - to the asssembly or reference genome in FASTA format. This indice represents a + to the assembly or reference genome in FASTA format. This indice represents a base index from which processing annotations from a GTF and insert junctions on the fly. This has the advantage of saving diskspace as an index will not be created for a list of predefined readlengths. This rule replaces star_rl above. @@ -379,7 +379,7 @@ rule rRNA_list: estimate rRNA content or abundance. rRNA can make up a significant proportion of an RNA-seq library if not properly depleted either through poly-selection or ribosomal depletion. Samples with very high rRNA content could signal an issue - occured with library prepartion. + occurred with library preparation. @Input: Genomic FASTA file Annotation file in GTF format @@ -450,7 +450,7 @@ rule karyo_beds: rule tin_ref: """ - Builds RSeQC tin.py reference file containing all canocical protein coding genes. + Builds RSeQC tin.py reference file containing all canonical protein coding genes. @Input: Annotation file in GTF format @Output: diff --git a/workflow/rules/single-end.smk b/workflow/rules/single-end.smk index 7a1ef1d..8433102 100644 --- a/workflow/rules/single-end.smk +++ b/workflow/rules/single-end.smk @@ -58,7 +58,7 @@ rule rawfastqc: if config['options']['small_rna']: # Run STAR with ENCODE's recommendations for small RNA sequencing. - # Set the min read legth to + # Set the min read length to rule trim_se: """ Data-processing step to remove adapter sequences and perform quality trimming @@ -127,7 +127,7 @@ rule fastqc: """ Quality-control step to assess sequencing quality of the raw data after removing adapter sequences. This step is run after trim_pe rule. FastQC is run after adapter - trimming to evalute if the adapter sequences were properly removed. + trimming to evaluate if the adapter sequences were properly removed. 
@Input: List of Trimmed FastQ files (gather) @Output: @@ -204,7 +204,7 @@ rule kraken_se: @Input: Trimmed FastQ files (scatter) @Output: - Kraken logfile and interative krona report + Kraken logfile and interactive krona report """ input: fq=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"), @@ -231,7 +231,7 @@ rule kraken_se: trap 'rm -rf "${{tmp}}"' EXIT # Copy kraken2 db to /lscratch or temp - # location to reduce filesytem strain + # location to reduce filesystem strain cp -rv {params.bacdb}/* ${{tmp}}/ kraken2 --db ${{tmp}} \ --threads {threads} --report {output.krakentaxa} \ @@ -253,7 +253,7 @@ if config['options']['star_2_pass_basic']: Data processing step to align reads against reference genome using STAR in per sample two-pass basic mode. STAR will perform the 1st pass mapping, then it will automatically extract splice junctions, insert them into the genome - index, and, finally, re-map all reads in the 2nd mapping pass. Agian, Splice + index, and, finally, re-map all reads in the 2nd mapping pass. Again, Splice junctions are detected at a per sample level. @Input: Trimmed FastQ files (scatter) @@ -724,7 +724,7 @@ rule rnaseq_multiqc: Reporting step to aggregate sample statistics and quality-control information across all samples. This will be one of the last steps of the pipeline. The inputs listed here are to ensure that this step runs last. During runtime, MultiQC will - recurively crawl through the working directory and parse files that it supports. + recursively crawl through the working directory and parse files that it supports. @Input: List of files to ensure this step runs last (gather) @Output: diff --git a/workflow/scripts/builder/gtf2protein_coding_genes.py b/workflow/scripts/builder/gtf2protein_coding_genes.py index 1f0fcc2..927c9ef 100644 --- a/workflow/scripts/builder/gtf2protein_coding_genes.py +++ b/workflow/scripts/builder/gtf2protein_coding_genes.py @@ -16,7 +16,7 @@ def get_value(mykey, lookup): return myvalue.strip('"').strip("'") -def seperated(pairslist): +def separated(pairslist): for kv in pairslist: k = kv.split(" ")[0] v = " ".join(kv.split(" ")[1:]).rstrip(";") @@ -27,7 +27,7 @@ def get_id_and_type(last_column): pairs = {} kv_pairs_list = last_column.strip().split("; ") - for k, v in seperated(kv_pairs_list): + for k, v in separated(kv_pairs_list): pairs[k] = v gene_id = get_value("gene_id", pairs) diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py index c0cd7b6..3569da4 100644 --- a/workflow/scripts/common.py +++ b/workflow/scripts/common.py @@ -147,7 +147,7 @@ def abstract_location(file_address, *args, **kwargs): # If botocore cannot find credentials, try connecting unsigned. # This will work for anonymous S3 resources if the resources in the # s3 bucket are configured correctly. - # If a file in provieded as input to a Snakemake rule, only read + # If a file in provided as input to a Snakemake rule, only read # access is needed to access the remote S3 object. remote_provider = snakemake.remote.S3.RemoteProvider( config=botocore.client.Config(signature_version=botocore.UNSIGNED) diff --git a/workflow/scripts/do_run_rMATS b/workflow/scripts/do_run_rMATS index d10fee4..693b9a5 100755 --- a/workflow/scripts/do_run_rMATS +++ b/workflow/scripts/do_run_rMATS @@ -11,10 +11,10 @@ USAGE: $ ./do_run_rMATS [-h] \\ [--skip-index] SYNOPSIS: - Convience script to run rMATS with an RENEE output directory. A + Convenience script to run rMATS with an RENEE output directory. 
A user just needs to create a 'groups.tab' and 'contrasts.tab' file in the RENEE output directory of interest. This script will run rMATS Turbo - for each comparsion defined in 'contrasts.tab' file. The '--skip-index' + for each comparison defined in 'contrasts.tab' file. The '--skip-index' option can be provided if this script has already been run to generate an STAR index for rMATS. OPTIONS: @@ -69,8 +69,8 @@ function create_groups(){ } -function initalize(){ - # Creates output directory heirarchy +function initialize(){ + # Creates output directory hierarchy # $1 = output directory local wd="$1" @@ -234,7 +234,7 @@ function main(){ done # Step 1. Create a base output directory hierarchy - initalize "${PWD}" + initialize "${PWD}" # Step 2. Create samples sheet for each contrast # from a groups.tab file and contrasts.tab file @@ -250,7 +250,7 @@ function main(){ | awk -v wd="$PWD" \ '{print wd"/rMATS/STAR/2.7.6a/genes-"$1"/"}') while read g1 g2; do - # Run rMATS for each constrast + # Run rMATS for each contrast # $1 = Group 1 # $2 = Group 2 # $3 = STAR Index diff --git a/workflow/scripts/files2spreadsheet.py b/workflow/scripts/files2spreadsheet.py index 0fb46a2..76c9821 100644 --- a/workflow/scripts/files2spreadsheet.py +++ b/workflow/scripts/files2spreadsheet.py @@ -37,7 +37,7 @@ def reader(filename, subset=[], skip="#", **kwargs): return csv(filename, subset, skip, **kwargs) else: # Default to reading in as an TSV file - # Tab is the normal delimeter for MAF or VCF files + # Tab is the normal delimiter for MAF or VCF files # MAF files usually have one of the following # extensions: '.tsv', '.txt', '.text', '.vcf', '.maf' return tsv(filename, subset, skip, **kwargs) diff --git a/workflow/scripts/get_flowcell_lanes.py b/workflow/scripts/get_flowcell_lanes.py index 678df9a..51b97a0 100644 --- a/workflow/scripts/get_flowcell_lanes.py +++ b/workflow/scripts/get_flowcell_lanes.py @@ -17,7 +17,7 @@ # + # AAAFFJJFJJJJJJFJJJJJJJJJJFJAJJJJJFJJJJJFFJJAJJJJ7JJ -# Input 2 (SRA doesn't store FC ID, use intrument name instead) +# Input 2 (SRA doesn't store FC ID, use instrument name instead) # @SRR5351039.1 SN608:8:1101:31.20:96.50 length=51 # NTTTANNNNNNGNGCNCTGNNNNNNNNGNNNNNAAGGGNTNNNNNNNNNNN # +SRR5351039.1 SN608:8:1101:31.20:96.50 length=51 @@ -58,20 +58,20 @@ def reader(fname): return open -def get_flowcell_lane(sequence_identifer): +def get_flowcell_lane(sequence_identifier): """Returns flowcell and lane information for different fastq formats. FastQ files generated with older versions of Casava or downloaded from SRA have a different format than newer FastQ files generated with the current version of Casava. It is worth noting that FastQ files downloaded from SRA or FastQ files generated with Casava version < 1.8 do not have Flowcell - IDs in its sequence indentifer. + IDs in its sequence identifier. 
     For more information visit: https://en.wikipedia.org/wiki/FASTQ_format
     """
-    id_list = sequence_identifer.strip().split(":")
+    id_list = sequence_identifier.strip().split(":")
     if len(id_list) < 7:
         # No Flowcell IDs in this format
         # Return next instrument id instead (next best thing)
-        if sequence_identifer.startswith("@SRR"):
+        if sequence_identifier.startswith("@SRR"):
             # SRA format or downloaded SRA FastQ file
             # SRA format 1: contains machine and lane information
             # @SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36
@@ -146,7 +146,7 @@ def md5sum(filename, blocksize=65536):
         )
         for line in file:
             line = line.strip()
-            if i % 4 == 0:  # read id or sequence identifer
+            if i % 4 == 0:  # read id or sequence identifier
                 fc, lane = get_flowcell_lane(line)
                 fc = fc.lstrip("@")
                 fc_lane = "{}_{}".format(fc, lane)
diff --git a/workflow/scripts/merge_rsem_results.py b/workflow/scripts/merge_rsem_results.py
index f40dedd..a1484e0 100644
--- a/workflow/scripts/merge_rsem_results.py
+++ b/workflow/scripts/merge_rsem_results.py
@@ -7,7 +7,7 @@ def Counts(fpattern, searchpath, anno, ftype, mycols):
     """
-    Get each samples FPKM vaules from RSEMs *.RSEM.genes.results and *.RSEM.isoform.results
+    Get each samples FPKM values from RSEMs *.RSEM.genes.results and *.RSEM.isoform.results
     """
     # Collect RSEM Results
     files = sorted(list(filter(lambda x: fpattern in x, os.listdir(searchpath))))
diff --git a/workflow/scripts/pyparser.py b/workflow/scripts/pyparser.py
index f045ad5..6f12d57 100644
--- a/workflow/scripts/pyparser.py
+++ b/workflow/scripts/pyparser.py
@@ -45,7 +45,7 @@
         }
     },
     "multiqc_cutadapt.txt": {
-        "delimeter": "\t",
+        "delimiter": "\t",
         "clean_sample_name": ["\.R1$", "\.R2$"],
         "parse_column": ["Sample", "pairs_processed", "r_processed"],
         "rename_field": {
@@ -55,7 +55,7 @@
         "typecast": {"total_read_pairs": int},
     },
     "multiqc_fastqc.txt": {
-        "delimeter": "\t",
+        "delimiter": "\t",
         "clean_sample_name": ["^QC \\| ", "^rawQC \\| ", "\.trim$", "\.R1$", "\.R2$"],
         "collapse": True,
         "parse_column": [
@@ -74,7 +74,7 @@
         "typecast": {"trimmed_read_pairs": int, "avg_sequence_length": float},
     },
     "multiqc_fastq_screen.txt": {
-        "delimeter": "\t",
+        "delimiter": "\t",
         "clean_sample_name": [
             "^FQscreen \\| ",
             "^FQscreen2 \\| ",
@@ -115,7 +115,7 @@
         },
     },
     "multiqc_picard_dups.txt": {
-        "delimeter": "\t",
+        "delimiter": "\t",
         "clean_sample_name": ["\.p2$"],
         "parse_column": ["Sample", "PERCENT_DUPLICATION"],
         "rename_field": {"PERCENT_DUPLICATION": "percent_duplication"},
@@ -123,7 +123,7 @@
         "scaling_factor": {"percent_duplication": 100.0},
     },
     "multiqc_picard_RnaSeqMetrics.txt": {
-        "delimeter": "\t",
+        "delimiter": "\t",
         "clean_sample_name": ["\.p2$"],
         "parse_column": [
             "Sample",
@@ -161,7 +161,7 @@
         },
     },
     "multiqc_rseqc_infer_experiment.txt": {
-        "delimeter": "\t",
+        "delimiter": "\t",
         "clean_sample_name": [
             "^RSeQC \\| ",
             "\.strand\.info$",
@@ -189,30 +189,30 @@
         },
     },
     "rseqc_inner_distances.txt": {
-        "delimeter": "\t",
+        "delimiter": "\t",
         "clean_sample_name": ["\.inner_distance_freq\.txt$"],
         "parse_column": ["Sample", "Inner_Dist_Maxima"],
         "rename_field": {"Inner_Dist_Maxima": "inner_distance_maxima"},
         "typecast": {"inner_distance_maxima": float},
     },
     "rseqc_median_tin.txt": {
-        "delimeter": "\t",
+        "delimiter": "\t",
         "clean_sample_name": ["\.star_rg_added\.sorted\.dmark\.bam$"],
         "parse_column": ["Sample", "median_tin"],
         "typecast": {"median_tin": float},
     },
     "sample_group.txt": {
-        "delimeter": "\t",
+        "delimiter": "\t",
         "clean_sample_name": [],
         "parse_column": ["Sample", "TissueType"],
     },
     "fastq_flowcell_lanes.txt": {
-        "delimeter": "\t",
+        "delimiter": "\t",
         "clean_sample_name": [],
         "parse_column": ["Sample", "flowcell_lanes"],
     },
     "multiqc_star.txt": {
-        "delimeter": "\t",
+        "delimiter": "\t",
         "clean_sample_name": ["\.p2$"],
         "parse_column": ["Sample", "uniquely_mapped_percent", "avg_input_read_length"],
         "rename_field": {
@@ -222,7 +222,7 @@
         "typecast": {"percent_aligned": float, "avg_aligned_read_length": int},
     },
     "multiqc_qualimap_bamqc_genome_results.txt": {
-        "delimeter": "\t",
+        "delimiter": "\t",
         "clean_sample_name": ["\.p2$"],
         "parse_column": [
             "Sample",
@@ -254,7 +254,7 @@ def help():
     output files generated from MultiQC. Each provided file is parsed,
     and information is aggregated across all samples into a single tab-seperated
-    ouput file: multiqc_matrix.tsv
+    output file: multiqc_matrix.tsv

     Currently Supported MultiQC Files:
     multiqc_cutadapt.txt, multiqc_star.txt, multiqc_picard_dups.txt,
@@ -272,7 +272,7 @@ def help():
     # Creates QC table: multiqc_matrix.tsv in users working directory
     $ python pyparser.py multiqc_cutadapt.txt multiqc_fastqc.txt multiqc_fastq_screen.txt $PWD

     # Supports globbing
-    $ python pyparser.py /path/to/MultiQC/ouput/folder/*.txt $PWD
+    $ python pyparser.py /path/to/MultiQC/output/folder/*.txt $PWD

     Requirements:
     multiqc == 1.9
@@ -439,7 +439,9 @@ def scaled(value, column, filename):
     filename = os.path.basename(filename)
     try:
         # Get the scaling factor
-        scaling_unit = config[filename]["scaling_factor"][column]  # KeyError if DNE
+        scaling_unit = config[filename]["scaling_factor"][
+            column
+        ]  # KeyError if does not exist
         value = value * scaling_unit  # TypeError if string
         value = round(value, 3)
     except TypeError:
@@ -481,7 +483,7 @@ def populate_table(parsed_header, parsed_line, file, data_dict):
     return data_dict


-def parsed(file, delimeter="\t"):
+def parsed(file, delimiter="\t"):
     """Parses columns of file according to specification in config[filename]['parse_column'].
     Column names are renamed according to specification in config[filename]['rename_field'].
     Sample names are cleaned to removed any prefixes or suffixes specified in config[filename]['clean_sample_name'].
@@ -491,7 +493,7 @@ def parsed(file, delimeter="\t"):
     # print('\nBeginning to parse {}'.format(file))
     with open(file, "r") as fh:
         # Parse header
-        header = next(fh).strip().split(delimeter)  # Get file header
+        header = next(fh).strip().split(delimiter)  # Get file header
         indexes = column_indexes(header, file)  # Indexes of columns to parse
         header = [header[i] for i in indexes]  # Parse each column of interest
         header = rename(header, file)  # Rename columns
@@ -500,7 +502,7 @@ def parsed(file, delimeter="\t"):
     sample_index = header.index("Sample")
     for line in fh:
         # linelist = line.strip().split(delimiter)
-        linelist = line.rstrip("\n").split(delimeter)
+        linelist = line.rstrip("\n").split(delimiter)
         parsed_line = [linelist[i] for i in indexes]
         parsed_line = clean(
             parsed_line, sample_index, file
         )
@@ -514,7 +516,7 @@ def main():
     # 1. Get rid of pandas dependency (add transpose function and loop through dict to print table)
     # 2. Add more advanced argument parsing, make path to config an arg

-    # Check for usage and optional arguements, get list of files to parse and output directory
+    # Check for usage and optional arguments, get list of files to parse and output directory
     ifiles, outdir = args(sys.argv)

     # Check if files are supported, see config specification, and if file is readable
@@ -528,12 +530,12 @@ def main():

     df = pd.DataFrame(QC).transpose()

-    # Get default output peference
+    # Get default output preference
     try:
         output_preference = config[".rnaseq"][".default"][".output_preference"]
         df = df.reindex(columns=output_preference)
     except KeyError:
-        # Output peference is not defined in config
+        # Output preference is not defined in config
         pass

     # Write to file
diff --git a/workflow/scripts/rNA_flowcells.Rmd b/workflow/scripts/rNA_flowcells.Rmd
index bd8c4af..c8142b1 100755
--- a/workflow/scripts/rNA_flowcells.Rmd
+++ b/workflow/scripts/rNA_flowcells.Rmd
@@ -623,12 +623,12 @@ Corr plots {data-orientation=columns data-icon="ion-stats-bars"}
 Column {data-width=500}
 -------------------------------------
-### Heirarchical clustering of pairwise spearman correlation coefficients
+### Hierarchical clustering of pairwise spearman correlation coefficients

-```{r heirarchical-correlation-matrix, dpi=300}
+```{r hierarchical-correlation-matrix, dpi=300}
 # Helper Function
 reorder_cormat <- function(cormat) {
-  # Use correlation between variables as distance for heirarchincal clustering
   dd <- as.dist((1 - cormat) / 2)
   hc <- hclust(dd)
   cormat <- cormat[hc$order, hc$order]
@@ -660,7 +660,7 @@ cormat <- reorder_cormat(cormatrix)
 # Get upper triangle of the correlation matrix
 cormat[lower.tri(cormat)] <- NA

-# Remove lower trinagle and reshape from wide to long format
+# Remove lower triangle and reshape from wide to long format
 cormat <- melt(cormat, na.rm = TRUE)

 # Correlation ggheatmap
diff --git a/workflow/scripts/rNA_groups.Rmd b/workflow/scripts/rNA_groups.Rmd
index 80d901f..d30f565 100755
--- a/workflow/scripts/rNA_groups.Rmd
+++ b/workflow/scripts/rNA_groups.Rmd
@@ -636,12 +636,12 @@ Corr plots {data-orientation=columns data-icon="ion-stats-bars"}
 Column {data-width=500}
 -------------------------------------
-### Heirarchical clustering of pairwise spearman correlation coefficients
+### Hierarchical clustering of pairwise spearman correlation coefficients

-```{r heirarchical-correlation-matrix, dpi=300}
+```{r hierarchical-correlation-matrix, dpi=300}
 # Helper Function
 reorder_cormat <- function(cormat) {
-  # Use correlation between variables as distance for heirarchincal clustering
   dd <- as.dist((1 - cormat) / 2)
   hc <- hclust(dd)
   cormat <- cormat[hc$order, hc$order]
@@ -673,7 +673,7 @@ cormat <- reorder_cormat(cormatrix)
 # Get upper triangle of the correlation matrix
 cormat[lower.tri(cormat)] <- NA

-# Remove lower trinagle and reshape from wide to long format
+# Remove lower triangle and reshape from wide to long format
 cormat <- melt(cormat, na.rm = TRUE)

 # Correlation ggheatmap

From a04c6f4b2cf4ec7509bde9f370c66d91779970f1 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool
Date: Tue, 23 Jan 2024 15:32:08 -0500
Subject: [PATCH 3/3] chore: update CHANGELOG.md

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 15ff4b6..1fb8b41 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,7 @@
 ## RENEE development version

+- Minor documentation improvements. (#100, @kelly-sovacool)
+
 ## RENEE 2.5.11

 - Create a citation file to describe how to cite RENEE. (#86, @kelly-sovacool)