From 92791b3f7c65481840ac6b1c898a9258d1310a1c Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Tue, 19 May 2020 17:47:10 -0400
Subject: [PATCH 01/10] add vcf merging task & workflow

two tasks: one bcftools-based for simple VCFs, and one GATK3-based for
merging VCFs created by UnifiedGenotyper
---
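A quick local smoke test for the new tasks might look like this (a sketch,
assuming a local miniwdl install; the VCF and fasta file names are
placeholders):

    # hypothetical inputs; repeat in_vcfs_gz to build up the array input
    miniwdl run pipes/WDL/workflows/merge_vcfs.wdl \
        merge_vcfs_gatk.in_vcfs_gz=sampleA.vcf.gz \
        merge_vcfs_gatk.in_vcfs_gz=sampleB.vcf.gz \
        merge_vcfs_gatk.ref_fasta=ref.fasta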
 pipes/WDL/tasks/tasks_interhost.wdl | 116 ++++++++++++++++++++++++++++
 pipes/WDL/workflows/merge_vcfs.wdl  |  11 +++
 2 files changed, 127 insertions(+)
 create mode 100644 pipes/WDL/workflows/merge_vcfs.wdl

diff --git a/pipes/WDL/tasks/tasks_interhost.wdl b/pipes/WDL/tasks/tasks_interhost.wdl
index 916a24871..fdc5f6640 100644
--- a/pipes/WDL/tasks/tasks_interhost.wdl
+++ b/pipes/WDL/tasks/tasks_interhost.wdl
@@ -188,4 +188,120 @@ task trimal_clean_msa {
   }
 }
 
+task merge_vcfs_bcftools {
+  input {
+    Array[File] in_vcfs_gz
+
+    Int?   machine_mem_gb
+    String docker="quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0"
+
+    String output_prefix = "merged"
+  }
+
+  parameter_meta {
+    in_vcfs_gz: {
+      description: "VCF files to be merged; should be (b)gzipped.",
+      patterns: ["*.vcf.gz"]
+    }
+  }
+
+  command {
+
+    # tabix index input vcfs (must be gzipped)
+    parallel -I ,, \
+      "tabix -p vcf ,," \
+      ::: ${sep=' ' in_vcfs}
+
+    # see: https://samtools.github.io/bcftools/bcftools.html#merge
+    # --merge snps allows SNPs to be merged into multi-allelic (multi-ALT) records; all other records are kept separate
+    bcftools merge \
+      --missing-to-ref \
+      --force-samples \
+      --merge snps \
+      --output ${output_prefix}.vcf.gz \
+      --output-type z \
+      --threads "$(nproc --all)" \
+      ${sep=' ' in_vcfs}
+
+    # currently unused additional params
+    # ${'--regions=' + regions}
+    # ${'--filter-logic=' + filter_logic}
+    # ${'--info-rules=' + info_rules}
+    # ${'--apply-filters=' + apply_filters}
+
+    # tabix index the vcf to create .tbi file
+    tabix -p vcf ${output_prefix}.vcf.gz
+  }
+
+  output {
+    File merged_vcf_gz     = "${output_prefix}.vcf.gz"
+    File merged_vcf_gz_tbi = "${output_prefix}.vcf.gz.tbi"
+  }
+
+  runtime {
+    docker: "${docker}"
+    memory: select_first([machine_mem_gb, 3]) + " GB"
+    cpu: 2
+    dx_instance_type: "mem1_ssd1_v2_x2"
+  }
+}
+
+task merge_vcfs_gatk {
+  input {
+    Array[File] in_vcfs_gz
+    File        ref_fasta
+
+    Int?   machine_mem_gb
+    String docker="quay.io/broadinstitute/viral-phylo"
+
+    String output_prefix = "merged"
+  }
+
+  parameter_meta {
+    in_vcfs_gz: {
+      description: "VCF files to be merged; should be (b)gzipped.",
+      patterns: ["*.vcf.gz"]
+    }
+    ref_fasta: {
+      description: "fasta file of reference genome relative to which the input VCF sites were called",
+      patterns: ["*.fasta","*.fa"]
+    }
+  }
+
+  command {
+
+    # tabix index input vcfs (must be gzipped)
+    parallel -I ,, \
+      "tabix -p vcf ,," \
+      ::: ${sep=' ' in_vcfs}
+
+    # index reference to create .fai and .dict indices
+    samtools faidx "${in_ref_fasta}"
+    picard CreateSequenceDictionary R="${in_ref_fasta}" O=$(basename $(basename "${in_ref_fasta}" .fasta) .fa).dict
+
+    # store input vcf file paths in file
+    for invcf in $(echo "${sep=' ' in_vcfs}"); do
+      echo "$invcf" >> input_vcfs.list
+    done
+
+    # merge
+    gatk3 -T CombineVariants -R "${ref_fasta}" -V input_vcfs.list -o "${output_prefix}.vcf" -genotypeMergeOptions UNIQUIFY
+
+    # bgzip output
+    bgzip "${output_prefix}.vcf"
+
+    # tabix index the vcf to create .tbi file
+    tabix -p vcf "${output_prefix}.vcf.gz"
+  }
+
+  output {
+    File merged_vcf_gz     = "${output_prefix}.vcf.gz"
+    File merged_vcf_gz_tbi = "${output_prefix}.vcf.gz.tbi"
+  }
+
+  runtime {
+    docker: "${docker}"
+    memory: select_first([machine_mem_gb, 3]) + " GB"
+    cpu: 2
+    dx_instance_type: "mem1_ssd1_v2_x2"
+  }
+}
diff --git a/pipes/WDL/workflows/merge_vcfs.wdl b/pipes/WDL/workflows/merge_vcfs.wdl
new file mode 100644
index 000000000..a4ee06fd9
--- /dev/null
+++ b/pipes/WDL/workflows/merge_vcfs.wdl
@@ -0,0 +1,11 @@
+version 1.0
+
+import "../tasks/tasks_interhost.wdl" as interhost
+
+workflow mafft {
+  call interhost.merge_vcfs_gatk
+  output {
+    File merged_vcf_gz     = merge_vcfs_gatk.merged_vcf_gz
+    File merged_vcf_gz_tbi = merge_vcfs_gatk.merged_vcf_gz_tbi
+  }
+}

From 58add4837069a3015c052f4efe929603db469d6e Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Tue, 19 May 2020 17:52:40 -0400
Subject: [PATCH 02/10] s/in_vcfs/in_vcfs_gz/g

---
 pipes/WDL/tasks/tasks_interhost.wdl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pipes/WDL/tasks/tasks_interhost.wdl b/pipes/WDL/tasks/tasks_interhost.wdl
index fdc5f6640..8aea9238e 100644
--- a/pipes/WDL/tasks/tasks_interhost.wdl
+++ b/pipes/WDL/tasks/tasks_interhost.wdl
@@ -209,7 +209,7 @@ task merge_vcfs_bcftools {
     # tabix index input vcfs (must be gzipped)
     parallel -I ,, \
       "tabix -p vcf ,," \
-      ::: ${sep=' ' in_vcfs}
+      ::: ${sep=' ' in_vcfs_gz}
 
     # see: https://samtools.github.io/bcftools/bcftools.html#merge
     # --merge snps allows SNPs to be merged into multi-allelic (multi-ALT) records; all other records are kept separate
     bcftools merge \
@@ -272,14 +272,14 @@ task merge_vcfs_gatk {
     # tabix index input vcfs (must be gzipped)
     parallel -I ,, \
       "tabix -p vcf ,," \
-      ::: ${sep=' ' in_vcfs}
+      ::: ${sep=' ' in_vcfs_gz}
 
     # index reference to create .fai and .dict indices
     samtools faidx "${in_ref_fasta}"
     picard CreateSequenceDictionary R="${in_ref_fasta}" O=$(basename $(basename "${in_ref_fasta}" .fasta) .fa).dict
 
     # store input vcf file paths in file
-    for invcf in $(echo "${sep=' ' in_vcfs}"); do
+    for invcf in $(echo "${sep=' ' in_vcfs_gz}"); do
       echo "$invcf" >> input_vcfs.list
     done

From e130a68649e982730e45bcfefdc33691d9a3c4ea Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Tue, 19 May 2020 18:01:54 -0400
Subject: [PATCH 03/10] s/in_ref_fasta/ref_fasta/g

---
 pipes/WDL/tasks/tasks_interhost.wdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pipes/WDL/tasks/tasks_interhost.wdl b/pipes/WDL/tasks/tasks_interhost.wdl
index 8aea9238e..f87937a4f 100644
--- a/pipes/WDL/tasks/tasks_interhost.wdl
+++ b/pipes/WDL/tasks/tasks_interhost.wdl
@@ -275,8 +275,8 @@ task merge_vcfs_gatk {
       ::: ${sep=' ' in_vcfs_gz}
 
     # index reference to create .fai and .dict indices
-    samtools faidx "${in_ref_fasta}"
-    picard CreateSequenceDictionary R="${in_ref_fasta}" O=$(basename $(basename "${in_ref_fasta}" .fasta) .fa).dict
+    samtools faidx "${ref_fasta}"
+    picard CreateSequenceDictionary R="${ref_fasta}" O=$(basename $(basename "${ref_fasta}" .fasta) .fa).dict
 
     # store input vcf file paths in file
     for invcf in $(echo "${sep=' ' in_vcfs_gz}"); do

From 98153962c0cc66792ff227b60f4ce61c31206680 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Tue, 19 May 2020 18:15:59 -0400
Subject: [PATCH 04/10] replace backticks for shell with $()

---
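Background for this change: backticks and $( ) both perform command
substitution, but $( ) nests without escaping and is less error-prone; an
illustrative comparison:

    # backtick form requires escaped inner backticks to nest:
    parent=`basename \`pwd\``
    # $() form nests cleanly and reads left to right:
    parent=$(basename $(pwd))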
 pipes/WDL/tasks/tasks_assembly.wdl     | 20 ++++++++++----------
 pipes/WDL/tasks/tasks_demux.wdl        |  4 ++--
 pipes/WDL/tasks/tasks_metagenomics.wdl |  8 ++++----
 pipes/WDL/tasks/tasks_read_utils.wdl   |  4 ++--
 pipes/WDL/tasks/tasks_reports.wdl      |  2 +-
 pipes/WDL/tasks/tasks_taxon_filter.wdl |  8 ++++----
 6 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl
index adfda3ba6..4c9408f04 100644
--- a/pipes/WDL/tasks/tasks_assembly.wdl
+++ b/pipes/WDL/tasks/tasks_assembly.wdl
@@ -23,8 +23,8 @@ task assemble {
     set -ex -o pipefail
 
     # find 90% memory
-    mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
-    mem_in_gb=`/opt/viral-ngs/source/docker/calc_mem.py gb 90`
+    mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)
+    mem_in_gb=$(/opt/viral-ngs/source/docker/calc_mem.py gb 90)
 
     assembly.py --version | tee VERSION
@@ -124,7 +124,7 @@ task scaffold {
     set -ex -o pipefail
 
     # find 90% memory
-    mem_in_gb=`/opt/viral-ngs/source/docker/calc_mem.py gb 90`
+    mem_in_gb=$(/opt/viral-ngs/source/docker/calc_mem.py gb 90)
 
     assembly.py --version | tee VERSION
@@ -223,7 +223,7 @@ task ivar_trim {
       ${'-s ' + sliding_window} \
       ${'-q ' + min_quality} \
       -i ${aligned_bam} -p trim
-    samtools sort -@ `nproc` -m 1000M -o ${bam_basename}.trimmed.bam trim.bam
+    samtools sort -@ $(nproc) -m 1000M -o ${bam_basename}.trimmed.bam trim.bam
   }
 
   output {
@@ -311,7 +311,7 @@ task align_reads {
     samtools view -h -F 260 ${sample_name}.all.bam | samtools flagstat - | tee ${sample_name}.all.bam.flagstat.txt
     grep properly ${sample_name}.all.bam.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned
     samtools view ${sample_name}.mapped.bam | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned
-    python -c "print (float("`cat bases_aligned`")/"`cat assembly_length_unambiguous`") if "`cat assembly_length_unambiguous`">0 else 0" > mean_coverage
+    python -c "print (float("$(cat bases_aligned)")/"$(cat assembly_length_unambiguous)") if "$(cat assembly_length_unambiguous)">0 else 0" > mean_coverage
 
     # fastqc mapped bam
     reports.py fastqc ${sample_name}.mapped.bam ${sample_name}.mapped_fastqc.html --out_zip ${sample_name}.mapped_fastqc.zip
@@ -365,7 +365,7 @@ task refine_assembly_with_aligned_reads {
     set -ex -o pipefail
 
     # find 90% memory
-    mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+    mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)
 
     assembly.py --version | tee VERSION
@@ -378,7 +378,7 @@ task refine_assembly_with_aligned_reads {
     else
       ln -s ${reads_aligned_bam} temp_markdup.bam
     fi
-    samtools index -@ `nproc` temp_markdup.bam temp_markdup.bai
+    samtools index -@ $(nproc) temp_markdup.bam temp_markdup.bai
 
     ln -s ${reference_fasta} assembly.fasta
     assembly.py refine_assembly \
@@ -442,7 +442,7 @@ task refine {
     set -ex -o pipefail
 
     # find 90% memory
-    mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+    mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)
 
     assembly.py --version | tee VERSION
@@ -513,7 +513,7 @@ task refine_2x_and_plot {
     set -ex -o pipefail
 
     # find 90% memory
-    mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+    mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)
 
     assembly.py --version | tee VERSION
@@ -569,7 +569,7 @@ task refine_2x_and_plot {
     grep properly ${sample_name}.all.bam.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned
     samtools view ${sample_name}.mapped.bam | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned
     #echo $(( $(cat bases_aligned) / $(cat assembly_length) )) | tee mean_coverage
-    python -c "print (float("`cat bases_aligned`")/"`cat assembly_length`") if "`cat assembly_length`">0 else 0" > mean_coverage
+    python -c "print (float("$(cat bases_aligned)")/"$(cat assembly_length)") if "$(cat assembly_length)">0 else 0" > mean_coverage
 
     # fastqc mapped bam
     reports.py fastqc ${sample_name}.mapped.bam ${sample_name}.mapped_fastqc.html --out_zip ${sample_name}.mapped_fastqc.zip
diff --git a/pipes/WDL/tasks/tasks_demux.wdl b/pipes/WDL/tasks/tasks_demux.wdl
index 3036f5952..12f0c425b 100644
--- a/pipes/WDL/tasks/tasks_demux.wdl
+++ b/pipes/WDL/tasks/tasks_demux.wdl
@@ -67,7 +67,7 @@ task illumina_demux {
     set -ex -o pipefail
 
     # find N% memory
-    mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 85`
+    mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 85)
 
     if [ -z "$TMPDIR" ]; then
       export TMPDIR=$(pwd)
@@ -237,7 +237,7 @@ task illumina_demux {
         ,,_fastqc.html \
         --out_zip ,,_fastqc.zip \
         --threads $num_fastqc_threads" \
-      ::: `cat $OUT_BASENAMES`
+      ::: $(cat $OUT_BASENAMES)
   }
 
   output {
diff --git a/pipes/WDL/tasks/tasks_metagenomics.wdl b/pipes/WDL/tasks/tasks_metagenomics.wdl
index bf1b041df..3c99b0bd2 100644
--- a/pipes/WDL/tasks/tasks_metagenomics.wdl
+++ b/pipes/WDL/tasks/tasks_metagenomics.wdl
@@ -80,8 +80,8 @@ task krakenuniq {
     metagenomics.py krakenuniq \
       $DB_DIR/krakenuniq \
       ${sep=' ' reads_unmapped_bam} \
-      --outReads `cat $OUT_READS` \
-      --outReport `cat $OUT_REPORTS` \
+      --outReads $(cat $OUT_READS) \
+      --outReport $(cat $OUT_REPORTS) \
       --loglevel=DEBUG
 
     wait # for krona_taxonomy_db_tgz to download and extract
@@ -99,7 +99,7 @@ task krakenuniq {
         --sample_name ,, \
         --noRank --noHits --inputType krakenuniq \
         --loglevel=DEBUG" \
-      ::: `cat $OUT_BASENAME`
+      ::: $(cat $OUT_BASENAME)
 
     # merge all krona reports
     ktImportKrona -o krakenuniq.krona.combined.html *.krakenuniq-krona.html
@@ -506,7 +506,7 @@ task blastx {
       -db $DB_DIR/blast/nr \
       -out "${out_basename}.blastx.contigs.txt" \
       -outfmt 7 \
-      -num_threads `nproc`
+      -num_threads $(nproc)
 
     wait # for krona_taxonomy_db_tgz to download and extract
diff --git a/pipes/WDL/tasks/tasks_read_utils.wdl b/pipes/WDL/tasks/tasks_read_utils.wdl
index e39c0b4d9..4dd59d4e2 100644
--- a/pipes/WDL/tasks/tasks_read_utils.wdl
+++ b/pipes/WDL/tasks/tasks_read_utils.wdl
@@ -132,7 +132,7 @@ task downsample_bams {
     set -ex -o pipefail
 
     # find 90% memory
-    mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+    mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)
 
     if [[ "${deduplicateBefore}" == "true" ]]; then
       DEDUP_OPTION="--deduplicateBefore"
@@ -196,7 +196,7 @@ task FastqToUBAM {
     set -ex -o pipefail
 
     # find 90% memory
-    mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+    mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)
 
     read_utils.py --version | tee VERSION
diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl
index c3c9221aa..aec5cf967 100644
--- a/pipes/WDL/tasks/tasks_reports.wdl
+++ b/pipes/WDL/tasks/tasks_reports.wdl
@@ -54,7 +54,7 @@ task plot_coverage {
     samtools view -h -F 260 ${aligned_reads_bam} | samtools flagstat - | tee ${sample_name}.flagstat.txt
     grep properly ${sample_name}.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned
     samtools view ${aligned_reads_bam} | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned
-    python -c "print (float("`cat bases_aligned`")/"`cat assembly_length`") if "`cat assembly_length`">0 else 0" > mean_coverage
+    python -c "print (float("$(cat bases_aligned)")/"$(cat assembly_length)") if "$(cat assembly_length)">0 else 0" > mean_coverage
   }
 
   output {
diff --git a/pipes/WDL/tasks/tasks_taxon_filter.wdl b/pipes/WDL/tasks/tasks_taxon_filter.wdl
index 060a470e1..9b25b3e0d 100644
--- a/pipes/WDL/tasks/tasks_taxon_filter.wdl
+++ b/pipes/WDL/tasks/tasks_taxon_filter.wdl
@@ -44,8 +44,8 @@ task deplete_taxa {
     fi
 
     # find memory thresholds
-    mem_in_mb_50=`/opt/viral-ngs/source/docker/calc_mem.py mb 50`
-    mem_in_mb_75=`/opt/viral-ngs/source/docker/calc_mem.py mb 75`
+    mem_in_mb_50=$(/opt/viral-ngs/source/docker/calc_mem.py mb 50)
+    mem_in_mb_75=$(/opt/viral-ngs/source/docker/calc_mem.py mb 75)
 
     # bmtagger and blast db args
     DBS_BMTAGGER="${sep=' ' bmtaggerDbs}"
@@ -121,7 +121,7 @@ task filter_to_taxon {
     taxon_filter.py --version | tee VERSION
 
     # find 90% memory
-    mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+    mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)
 
     if [[ "${error_on_reads_in_neg_control}" == "true" ]]; then
       ERROR_ON_NEG_CONTROL_ARGS="--errorOnReadsInNegControl"
@@ -210,7 +210,7 @@ task merge_one_per_sample {
     read_utils.py --version | tee VERSION
 
     # find 90% memory
-    mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+    mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)
 
     read_utils.py merge_bams \
       "${sep=' ' inputBams}" \

From 872646d81245fa20e5776c646be99b980bae29db Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Tue, 19 May 2020 18:17:00 -0400
Subject: [PATCH 05/10] remove unused params from bcftools merge task

---
 pipes/WDL/tasks/tasks_interhost.wdl | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/pipes/WDL/tasks/tasks_interhost.wdl b/pipes/WDL/tasks/tasks_interhost.wdl
index f87937a4f..26deaee53 100644
--- a/pipes/WDL/tasks/tasks_interhost.wdl
+++ b/pipes/WDL/tasks/tasks_interhost.wdl
@@ -220,13 +220,7 @@ task merge_vcfs_bcftools {
       --output ${output_prefix}.vcf.gz \
       --output-type z \
       --threads "$(nproc --all)" \
-      ${sep=' ' in_vcfs}
-
-    # currently unused additional params
-    # ${'--regions=' + regions}
-    # ${'--filter-logic=' + filter_logic}
-    # ${'--info-rules=' + info_rules}
-    # ${'--apply-filters=' + apply_filters}
+      ${sep=' ' in_vcfs_gz}
 
     # tabix index the vcf to create .tbi file
     tabix -p vcf ${output_prefix}.vcf.gz

From 856946de2d3c829caffb06108d389a95d46122d6 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Wed, 20 May 2020 13:37:42 -0400
Subject: [PATCH 06/10] correct merge_vcfs workflow name so it matches file name

correct merge_vcfs workflow name so it matches the file name
(copy+paste typo)
---
 pipes/WDL/workflows/merge_vcfs.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipes/WDL/workflows/merge_vcfs.wdl b/pipes/WDL/workflows/merge_vcfs.wdl
index a4ee06fd9..d9ddf024b 100644
--- a/pipes/WDL/workflows/merge_vcfs.wdl
+++ b/pipes/WDL/workflows/merge_vcfs.wdl
@@ -2,7 +2,7 @@ version 1.0
 
 import "../tasks/tasks_interhost.wdl" as interhost
 
-workflow mafft {
+workflow merge_vcfs {
   call interhost.merge_vcfs_gatk
   output {
     File merged_vcf_gz = merge_vcfs_gatk.merged_vcf_gz

From c831dafb17c2271457b3dacced60bff629479f0e Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Thu, 21 May 2020 16:03:42 -0400
Subject: [PATCH 07/10] add annotate_vcf_snpeff task

---
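Background on the accession auto-detection below: when snpEffRef is not
supplied, the task reads accessions from the reference fasta headers with
Biopython, e.g. (hypothetical reference file):

    $ grep '>' ref.fasta
    >KJ660346.2 Zaire ebolavirus isolate ...
    $ python -c "from Bio import SeqIO; print(' '.join(s.id for s in SeqIO.parse('ref.fasta', 'fasta')))"
    KJ660346.2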
 pipes/WDL/tasks/tasks_intrahost.wdl | 67 +++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/pipes/WDL/tasks/tasks_intrahost.wdl b/pipes/WDL/tasks/tasks_intrahost.wdl
index 8827a1ad3..32058472b 100644
--- a/pipes/WDL/tasks/tasks_intrahost.wdl
+++ b/pipes/WDL/tasks/tasks_intrahost.wdl
@@ -115,4 +115,71 @@ task isnvs_vcf {
   }
 }
 
+task annotate_vcf_snpeff {
+  input {
+    File in_vcf
+    File ref_fasta
+
+    Array[String]? snpEffRef
+    String?        emailAddress
+
+    Int?   machine_mem_gb
+    String docker="quay.io/broadinstitute/viral-phylo"
+
+    String output_basename = basename(basename(in_vcf, ".gz"), ".vcf")
+  }
+
+  parameter_meta {
+    in_vcf:       { description: "input VCF to annotate with snpEff", patterns: ["*.vcf","*.vcf.gz"] }
+    ref_fasta:    { description: "The sequence containing the accession to use for annotation; only used if snpEffRef is not provided.", patterns: ["*.fasta","*.fa"] }
+    snpEffRef:    { description: "list of accessions used to build/find the snpEff database. If this is not provided, the ID from the reference fasta will be used (it must be a GenBank accession)" }
+    emailAddress: { description: "email address passed to NCBI if we need to download reference sequences" }
+  }
+
+  command {
+    set -ex -o pipefail
+
+    intrahost.py --version | tee VERSION
+    providedSnpRefAccessions="${sep=' ' snpEffRef}"
+    if [ -n "$providedSnpRefAccessions" ]; then
+      snpRefAccessions="$providedSnpRefAccessions";
+    else
+      snpRefAccessions="$(python -c "from Bio import SeqIO; print(' '.join(list(s.id for s in SeqIO.parse('${ref_fasta}', 'fasta'))))")"
+    fi
+    echo "snpRefAccessions: $snpRefAccessions"
+
+    if (file "${in_vcf}" | grep -q "gzip" ) ; then
+      ln -s "${in_vcf}" input.vcf.gz
+    else
+      echo "${in_vcf} is not compressed; bgzipping..."
+      bgzip -c "${in_vcf}" > input.vcf.gz
+    fi
+    echo "Creating vcf index"
+    tabix -p vcf input.vcf.gz
+
+    interhost.py snpEff \
+      input.vcf.gz \
+      $snpRefAccessions \
+      "${output_basename}.annot.vcf.gz" \
+      ${'--emailAddress=' + emailAddress}
+
+    intrahost.py iSNV_table \
+      "${output_basename}.annot.vcf.gz" \
+      "${output_basename}.annot.txt.gz"
+
+    tabix -p vcf "${output_basename}.annot.vcf.gz"
+  }
+
+  output {
+    File   annot_vcf_gz     = "${output_basename}.annot.vcf.gz"
+    File   annot_vcf_gz_tbi = "${output_basename}.annot.vcf.gz.tbi"
+    File   annot_txt_gz     = "${output_basename}.annot.txt.gz"
+    String viralngs_version = read_string("VERSION")
+  }
+
+  runtime {
+    docker: "${docker}"
+    memory: select_first([machine_mem_gb, 4]) + " GB"
+    dx_instance_type: "mem1_ssd1_v2_x4"
+  }
+}

From ebcd3c20e1eeb3e2831ab6e86a8b373230791771 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Thu, 21 May 2020 16:03:51 -0400
Subject: [PATCH 08/10] spacing

---
 pipes/WDL/workflows/isnvs_one_sample.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipes/WDL/workflows/isnvs_one_sample.wdl b/pipes/WDL/workflows/isnvs_one_sample.wdl
index 68601b5c7..12727fb38 100644
--- a/pipes/WDL/workflows/isnvs_one_sample.wdl
+++ b/pipes/WDL/workflows/isnvs_one_sample.wdl
@@ -7,6 +7,6 @@ workflow isnvs_one_sample {
 
   output {
     File   isnvsFile = isnvs_per_sample.isnvsFile
-    String isnvs_viral_phylo_version =  isnvs_per_sample.viralngs_version
+    String isnvs_viral_phylo_version = isnvs_per_sample.viralngs_version
   }
 }

From c62236e466ad90b546917130747ebec320f5b167 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Thu, 21 May 2020 16:04:03 -0400
Subject: [PATCH 09/10] add merge_vcfs_and_annotate workflow

---
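A sketch of how this workflow might be invoked (hypothetical file names;
addressing inputs to the call aliases merge_vcfs/annotate_vcf assumes a
runner, such as Cromwell or miniwdl, that permits call-level inputs):

    # write a minimal inputs file, then run the workflow
    cat > inputs.json <<'EOF'
    {
      "merge_vcfs_and_annotate.reference_fasta": "ref.fasta",
      "merge_vcfs_and_annotate.merge_vcfs.in_vcfs_gz": ["sampleA.vcf.gz", "sampleB.vcf.gz"]
    }
    EOF
    miniwdl run pipes/WDL/workflows/merge_vcfs_and_annotate.wdl -i inputs.json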
 .../WDL/workflows/merge_vcfs_and_annotate.wdl | 38 +++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 pipes/WDL/workflows/merge_vcfs_and_annotate.wdl

diff --git a/pipes/WDL/workflows/merge_vcfs_and_annotate.wdl b/pipes/WDL/workflows/merge_vcfs_and_annotate.wdl
new file mode 100644
index 000000000..80acac771
--- /dev/null
+++ b/pipes/WDL/workflows/merge_vcfs_and_annotate.wdl
@@ -0,0 +1,38 @@
+version 1.0
+
+import "../tasks/tasks_interhost.wdl" as interhost
+import "../tasks/tasks_intrahost.wdl" as intrahost
+
+workflow merge_vcfs_and_annotate {
+  meta {
+    description: "Merge VCFs emitted by GATK UnifiedGenotyper and annotate with snpEff."
+  }
+
+  input {
+    File reference_fasta
+  }
+
+  parameter_meta {
+    reference_fasta: {
+      description: "Reference genome, all segments/chromosomes in one fasta file. Headers must be GenBank accessions.",
+      patterns: ["*.fasta"]
+    }
+  }
+
+  call interhost.merge_vcfs_gatk as merge_vcfs {
+    input:
+      ref_fasta = reference_fasta
+  }
+  call intrahost.annotate_vcf_snpeff as annotate_vcf {
+    input:
+      ref_fasta = reference_fasta
+      in_vcf    = merge_vcfs.merged_vcf_gz
+  }
+  output {
+    File merged_vcf_gz           = merge_vcfs.merged_vcf_gz
+    File merged_vcf_gz_tbi       = merge_vcfs.merged_vcf_gz_tbi
+    File merged_annot_vcf_gz     = annotate_vcf.annot_vcf_gz
+    File merged_annot_vcf_gz_tbi = annotate_vcf.annot_vcf_gz_tbi
+    File merged_annot_txt_gz     = annotate_vcf.annot_txt_gz
+  }
+}

From ca81fbfac825f8eb5db57aff45620022fce88ae5 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Thu, 21 May 2020 16:22:38 -0400
Subject: [PATCH 10/10] add missing comma

---
 pipes/WDL/workflows/merge_vcfs_and_annotate.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipes/WDL/workflows/merge_vcfs_and_annotate.wdl b/pipes/WDL/workflows/merge_vcfs_and_annotate.wdl
index 80acac771..4b6af8518 100644
--- a/pipes/WDL/workflows/merge_vcfs_and_annotate.wdl
+++ b/pipes/WDL/workflows/merge_vcfs_and_annotate.wdl
@@ -25,7 +25,7 @@ workflow merge_vcfs_and_annotate {
   }
   call intrahost.annotate_vcf_snpeff as annotate_vcf {
     input:
-      ref_fasta = reference_fasta
+      ref_fasta = reference_fasta,
       in_vcf    = merge_vcfs.merged_vcf_gz
   }
   output {