
Merge pull request #77 from broadinstitute/ct-add-merge-vcf-task
add vcf merging task & workflow
dpark01 committed May 21, 2020
2 parents 2e1666f + ca81fbf commit 833006b
Showing 11 changed files with 250 additions and 24 deletions.
20 changes: 10 additions & 10 deletions pipes/WDL/tasks/tasks_assembly.wdl
@@ -23,8 +23,8 @@ task assemble {
set -ex -o pipefail

# find 90% memory
-mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
-mem_in_gb=`/opt/viral-ngs/source/docker/calc_mem.py gb 90`
+mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)
+mem_in_gb=$(/opt/viral-ngs/source/docker/calc_mem.py gb 90)

assembly.py --version | tee VERSION

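The recurring change in this file replaces legacy backtick command substitution with the POSIX `$( … )` form. The two are equivalent in simple cases, but `$( … )` nests without escaping and is easier to read; a quick illustrative sketch (not part of the diff):

# backticks need escaped backticks to nest:
dir=`basename \`pwd\``
# $() nests cleanly and reads left to right:
dir=$(basename $(pwd))
echo "$dir"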
@@ -124,7 +124,7 @@ task scaffold {
set -ex -o pipefail

# find 90% memory
-mem_in_gb=`/opt/viral-ngs/source/docker/calc_mem.py gb 90`
+mem_in_gb=$(/opt/viral-ngs/source/docker/calc_mem.py gb 90)

assembly.py --version | tee VERSION

@@ -223,7 +223,7 @@ task ivar_trim {
${'-s ' + sliding_window} \
${'-q ' + min_quality} \
-i ${aligned_bam} -p trim
-samtools sort -@ `nproc` -m 1000M -o ${bam_basename}.trimmed.bam trim.bam
+samtools sort -@ $(nproc) -m 1000M -o ${bam_basename}.trimmed.bam trim.bam
}

output {
@@ -311,7 +311,7 @@ task align_reads {
samtools view -h -F 260 ${sample_name}.all.bam | samtools flagstat - | tee ${sample_name}.all.bam.flagstat.txt
grep properly ${sample_name}.all.bam.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned
samtools view ${sample_name}.mapped.bam | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned
-python -c "print (float("`cat bases_aligned`")/"`cat assembly_length_unambiguous`") if "`cat assembly_length_unambiguous`">0 else 0" > mean_coverage
+python -c "print (float("$(cat bases_aligned)")/"$(cat assembly_length_unambiguous)") if "$(cat assembly_length_unambiguous)">0 else 0" > mean_coverage

# fastqc mapped bam
reports.py fastqc ${sample_name}.mapped.bam ${sample_name}.mapped_fastqc.html --out_zip ${sample_name}.mapped_fastqc.zip
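The mean-coverage one-liner above splices `$(cat …)` output between quoted fragments to assemble a Python expression. An equivalent, easier-to-audit formulation, shown only as a sketch under the same assumptions (each counts file holds a single integer):

bases=$(cat bases_aligned)
length=$(cat assembly_length_unambiguous)
if [ "$length" -gt 0 ]; then
python -c "print(float($bases) / $length)" > mean_coverage
else
echo 0 > mean_coverage
fi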
@@ -365,7 +365,7 @@ task refine_assembly_with_aligned_reads {
set -ex -o pipefail

# find 90% memory
-mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

assembly.py --version | tee VERSION

@@ -378,7 +378,7 @@ task refine_assembly_with_aligned_reads {
else
ln -s ${reads_aligned_bam} temp_markdup.bam
fi
-samtools index -@ `nproc` temp_markdup.bam temp_markdup.bai
+samtools index -@ $(nproc) temp_markdup.bam temp_markdup.bai

ln -s ${reference_fasta} assembly.fasta
assembly.py refine_assembly \
@@ -442,7 +442,7 @@ task refine {
set -ex -o pipefail

# find 90% memory
-mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

assembly.py --version | tee VERSION

@@ -513,7 +513,7 @@ task refine_2x_and_plot {
set -ex -o pipefail

# find 90% memory
-mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

assembly.py --version | tee VERSION

@@ -569,7 +569,7 @@ task refine_2x_and_plot {
grep properly ${sample_name}.all.bam.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned
samtools view ${sample_name}.mapped.bam | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned
#echo $(( $(cat bases_aligned) / $(cat assembly_length) )) | tee mean_coverage
-python -c "print (float("`cat bases_aligned`")/"`cat assembly_length`") if "`cat assembly_length`">0 else 0" > mean_coverage
+python -c "print (float("$(cat bases_aligned)")/"$(cat assembly_length)") if "$(cat assembly_length)">0 else 0" > mean_coverage

# fastqc mapped bam
reports.py fastqc ${sample_name}.mapped.bam ${sample_name}.mapped_fastqc.html --out_zip ${sample_name}.mapped_fastqc.zip
4 changes: 2 additions & 2 deletions pipes/WDL/tasks/tasks_demux.wdl
@@ -67,7 +67,7 @@ task illumina_demux {
set -ex -o pipefail

# find N% memory
-mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 85`
+mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 85)

if [ -z "$TMPDIR" ]; then
export TMPDIR=$(pwd)
@@ -237,7 +237,7 @@ task illumina_demux {
,,_fastqc.html \
--out_zip ,,_fastqc.zip \
--threads $num_fastqc_threads" \
-::: `cat $OUT_BASENAMES`
+::: $(cat $OUT_BASENAMES)
}

output {
110 changes: 110 additions & 0 deletions pipes/WDL/tasks/tasks_interhost.wdl
@@ -188,4 +188,114 @@ task trimal_clean_msa {
}
}

task merge_vcfs_bcftools {
input {
Array[File] in_vcfs_gz

Int? machine_mem_gb
String docker="quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0"

String output_prefix = "merged"
}

parameter_meta {
in_vcfs_gz: {
description: "VCF files to merged; should be (b)gzipped.",
patterns: ["*.vcf.gz"] }
}

command {
set -ex -o pipefail

# tabix index input vcfs (must be gzipped)
parallel -I ,, \
"tabix -p vcf ,," \
::: ${sep=' ' in_vcfs_gz}

# see: https://samtools.github.io/bcftools/bcftools.html#merge
# --merge snps allows SNPs to be merged into multi-allelic (multi-ALT) records;
# all other record types are listed separately (see the sketch after this task)
bcftools merge \
--missing-to-ref \
--force-samples \
--merge snps \
--output ${output_prefix}.vcf.gz \
--output-type z \
--threads "$(nproc --all)" \
${sep=' ' in_vcfs_gz}

# tabix index the vcf to create .tbi file
tabix -p vcf ${output_prefix}.vcf.gz
}

output {
File merged_vcf_gz = "${output_prefix}.vcf.gz"
File merged_vcf_gz_tbi = "${output_prefix}.vcf.gz.tbi"
}

runtime {
docker: "${docker}"
memory: select_first([machine_mem_gb, 3]) + " GB"
cpu: 2
dx_instance_type: "mem1_ssd1_v2_x2"
}
}
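As referenced in the comment above, a worked illustration of `--merge snps` using hypothetical records (not data from the repository):

# sampleA.vcf.gz:  chr1  100  .  A  G   ...
# sampleB.vcf.gz:  chr1  100  .  A  T   ...
#
# bcftools merge --merge snps --missing-to-ref sampleA.vcf.gz sampleB.vcf.gz
# yields a single multi-allelic record:
#
#   chr1  100  .  A  G,T   ...
#
# Indels and other record types at the same site stay on separate lines, and
# --missing-to-ref reports samples absent from an input as 0/0 rather than ./.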

task merge_vcfs_gatk {
input {
Array[File] in_vcfs_gz
File ref_fasta

Int? machine_mem_gb
String docker="quay.io/broadinstitute/viral-phylo"

String output_prefix = "merged"
}

parameter_meta {
in_vcfs_gz: {
description: "VCF files to merged; should be (b)gzipped.",
patterns: ["*.vcf.gz"]
}
ref_fasta: {
description: "fasta file of reference genome relative to which the input VCF sites were called",
patterns: ["*.fasta",".fa"]
}
}

command {
set -ex -o pipefail

# tabix index input vcfs (must be gzipped)
parallel -I ,, \
"tabix -p vcf ,," \
::: ${sep=' ' in_vcfs_gz}

# index reference to create .fai and .dict indices
samtools faidx "${ref_fasta}"
picard CreateSequenceDictionary R="${ref_fasta}" O=$(basename $(basename "${ref_fasta}" .fasta) .fa).dict

# store input vcf file paths in file
for invcf in $(echo "${sep=' ' in_vcfs_gz}"); do
echo "$invcf" > input_vcfs.list
done

# merge
gatk3 -T CombineVariants -R "${ref_fasta}" -V input_vcfs.list -o "${output_prefix}.vcf" -genotypeMergeOptions UNIQUIFY

# bgzip output
bgzip "${output_prefix}.vcf"

# tabix index the vcf to create .tbi file
tabix -p vcf "${output_prefix}.vcf.gz"
}

output {
File merged_vcf_gz = "${output_prefix}.vcf.gz"
File merged_vcf_gz_tbi = "${output_prefix}.vcf.gz.tbi"
}

runtime {
docker: "${docker}"
memory: select_first([machine_mem_gb, 3]) + " GB"
cpu: 2
dx_instance_type: "mem1_ssd1_v2_x2"
}
}
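One detail worth noting in the task above: GATK3 interprets a `-V` argument whose file name ends in `.list` as a newline-delimited list of variant files, which is why the loop writes one path per line. A minimal standalone sketch with hypothetical file names:

printf '%s\n' sampleA.vcf.gz sampleB.vcf.gz > input_vcfs.list
gatk3 -T CombineVariants \
-R reference.fasta \
-V input_vcfs.list \
-o merged.vcf \
-genotypeMergeOptions UNIQUIFY   # UNIQUIFY renames colliding sample names instead of failing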
67 changes: 67 additions & 0 deletions pipes/WDL/tasks/tasks_intrahost.wdl
Original file line number Diff line number Diff line change
@@ -115,4 +115,71 @@ task isnvs_vcf {
}
}

task annotate_vcf_snpeff {
input {
File in_vcf
File ref_fasta

Array[String]? snpEffRef
String? emailAddress

Int? machine_mem_gb
String docker="quay.io/broadinstitute/viral-phylo"

String output_basename = basename(basename(in_vcf, ".gz"), ".vcf")
}

parameter_meta {
in_vcf: { description: "input VCF to annotate with snpEff", patterns: ["*.vcf","*.vcf.gz"]}
ref_fasta: { description: "The sequence containing the accession to use for annotation; only used if snpEffRef is not provided.", patterns: ["*.fasta","*.fa"] }
snpEffRef: { description: "list of accessions used to build or locate the snpEff database; if not provided, the ID from the reference FASTA is used (it must be a GenBank accession)" }
emailAddress: { description: "email address passed to NCBI if we need to download reference sequences" }
}

command {
set -ex -o pipefail

intrahost.py --version | tee VERSION

providedSnpRefAccessions="${sep=' ' snpEffRef}"
if [ -n "$providedSnpRefAccessions" ]; then
snpRefAccessions="$providedSnpRefAccessions";
else
snpRefAccessions="$(python -c "from Bio import SeqIO; print(' '.join(list(s.id for s in SeqIO.parse('${ref_fasta}', 'fasta'))))")"
fi
echo "snpRefAccessions: $snpRefAccessions"

VCF_IN="${in_vcf}"
if (file "$VCF_IN" | grep -q "gzip") ; then
echo "$VCF_IN is already compressed"
else
echo "$VCF_IN is not compressed; bgzipping..."
bgzip "$VCF_IN"
VCF_IN="$VCF_IN.gz"
fi
echo "Creating vcf index"
tabix -p vcf "$VCF_IN"

interhost.py snpEff \
"${in_vcf}" \
$snpRefAccessions \
"${output_basename}.annot.vcf.gz" \
${'--emailAddress=' + emailAddress}

intrahost.py iSNV_table \
"${output_basename}.annot.vcf.gz" \
"${output_basename}.annot.txt.gz"

tabix -p vcf "${output_basename}.annot.vcf.gz"
}

output {
File annot_vcf_gz = "${output_basename}.annot.vcf.gz"
File annot_vcf_gz_tbi = "${output_basename}.annot.vcf.gz.tbi"
File annot_txt_gz = "${output_basename}.annot.txt.gz"
String viralngs_version = read_string("VERSION")
}
runtime {
docker: "${docker}"
memory: select_first([machine_mem_gb, 4]) + " GB"
dx_instance_type: "mem1_ssd1_v2_x4"
}
}
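The `file … | grep -q gzip` test above keys off `file`'s human-readable description. An alternative sketch that checks the gzip magic bytes directly (an illustration under stated assumptions, not what the task uses):

vcf="in.vcf"   # hypothetical input path
# gzip and bgzip output both begin with the two bytes 0x1f 0x8b
if [ "$(head -c 2 "$vcf" | od -An -tx1 | tr -d ' \n')" = "1f8b" ]; then
echo "$vcf is already compressed"
else
bgzip "$vcf" && vcf="$vcf.gz"
fi
tabix -p vcf "$vcf"

Either way, tabix requires BGZF specifically: a plain-gzipped VCF passes both checks but fails at indexing.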
8 changes: 4 additions & 4 deletions pipes/WDL/tasks/tasks_metagenomics.wdl
@@ -80,8 +80,8 @@ task krakenuniq {
metagenomics.py krakenuniq \
$DB_DIR/krakenuniq \
${sep=' ' reads_unmapped_bam} \
---outReads `cat $OUT_READS` \
---outReport `cat $OUT_REPORTS` \
+--outReads $(cat $OUT_READS) \
+--outReport $(cat $OUT_REPORTS) \
--loglevel=DEBUG

wait # for krona_taxonomy_db_tgz to download and extract
@@ -99,7 +99,7 @@ task krakenuniq {
--sample_name ,, \
--noRank --noHits --inputType krakenuniq \
--loglevel=DEBUG" \
-::: `cat $OUT_BASENAME`
+::: $(cat $OUT_BASENAME)

# merge all krona reports
ktImportKrona -o krakenuniq.krona.combined.html *.krakenuniq-krona.html
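The `parallel -I ,,` idiom used here (and in the new VCF tasks) changes GNU parallel's replacement string from its default `{}` to `,,`, presumably to keep literal braces out of the WDL `command { … }` block; each whitespace-separated word after `:::` becomes one job. A minimal sketch:

# runs three jobs: "echo got a", "echo got b", "echo got c"
parallel -I ,, "echo got ,," ::: a b c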
@@ -506,7 +506,7 @@ task blastx {
-db $DB_DIR/blast/nr \
-out "${out_basename}.blastx.contigs.txt" \
-outfmt 7 \
--num_threads `nproc`
+-num_threads $(nproc)

wait # for krona_taxonomy_db_tgz to download and extract

4 changes: 2 additions & 2 deletions pipes/WDL/tasks/tasks_read_utils.wdl
@@ -132,7 +132,7 @@ task downsample_bams {
set -ex -o pipefail

# find 90% memory
-mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

if [[ "${deduplicateBefore}" == "true" ]]; then
DEDUP_OPTION="--deduplicateBefore"
@@ -196,7 +196,7 @@ task FastqToUBAM {
set -ex -o pipefail

# find 90% memory
-mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

read_utils.py --version | tee VERSION

2 changes: 1 addition & 1 deletion pipes/WDL/tasks/tasks_reports.wdl
@@ -54,7 +54,7 @@ task plot_coverage {
samtools view -h -F 260 ${aligned_reads_bam} | samtools flagstat - | tee ${sample_name}.flagstat.txt
grep properly ${sample_name}.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned
samtools view ${aligned_reads_bam} | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned
-python -c "print (float("`cat bases_aligned`")/"`cat assembly_length`") if "`cat assembly_length`">0 else 0" > mean_coverage
+python -c "print (float("$(cat bases_aligned)")/"$(cat assembly_length)") if "$(cat assembly_length)">0 else 0" > mean_coverage
}

output {
8 changes: 4 additions & 4 deletions pipes/WDL/tasks/tasks_taxon_filter.wdl
@@ -44,8 +44,8 @@ task deplete_taxa {
fi

# find memory thresholds
-mem_in_mb_50=`/opt/viral-ngs/source/docker/calc_mem.py mb 50`
-mem_in_mb_75=`/opt/viral-ngs/source/docker/calc_mem.py mb 75`
+mem_in_mb_50=$(/opt/viral-ngs/source/docker/calc_mem.py mb 50)
+mem_in_mb_75=$(/opt/viral-ngs/source/docker/calc_mem.py mb 75)

# bmtagger and blast db args
DBS_BMTAGGER="${sep=' ' bmtaggerDbs}"
@@ -121,7 +121,7 @@ task filter_to_taxon {
taxon_filter.py --version | tee VERSION

# find 90% memory
-mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

if [[ "${error_on_reads_in_neg_control}" == "true" ]]; then
ERROR_ON_NEG_CONTROL_ARGS="--errorOnReadsInNegControl"
@@ -210,7 +210,7 @@ task merge_one_per_sample {
read_utils.py --version | tee VERSION

# find 90% memory
-mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

read_utils.py merge_bams \
"${sep=' ' inputBams}" \
Expand Down
2 changes: 1 addition & 1 deletion pipes/WDL/workflows/isnvs_one_sample.wdl
@@ -7,6 +7,6 @@ workflow isnvs_one_sample {

output {
File isnvsFile = isnvs_per_sample.isnvsFile
-String isnvs_viral_phylo_version = isnvs_per_sample.viralngs_version
+String isnvs_viral_phylo_version = isnvs_per_sample.viralngs_version
}
}
11 changes: 11 additions & 0 deletions pipes/WDL/workflows/merge_vcfs.wdl
@@ -0,0 +1,11 @@
version 1.0

import "../tasks/tasks_interhost.wdl" as interhost

workflow merge_vcfs {
call interhost.merge_vcfs_gatk
output {
File merged_vcf_gz = merge_vcfs_gatk.merged_vcf_gz
File merged_vcf_gz_tbi = merge_vcfs_gatk.merged_vcf_gz_tbi
}
}
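For orientation, a hedged sketch of invoking this workflow locally; the miniwdl runner and the file names are assumptions for illustration, not part of the commit. Because the call leaves its inputs unbound, they are addressed through the call's fully-qualified names:

cat > inputs.json <<'EOF'
{
"merge_vcfs.merge_vcfs_gatk.in_vcfs_gz": ["sampleA.vcf.gz", "sampleB.vcf.gz"],
"merge_vcfs.merge_vcfs_gatk.ref_fasta": "reference.fasta"
}
EOF
miniwdl run pipes/WDL/workflows/merge_vcfs.wdl -i inputs.json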
