
Merge pull request #77 from broadinstitute/ct-add-merge-vcf-task
add vcf merging task & workflow
dpark01 committed May 21, 2020
2 parents 2e1666f + ca81fbf commit 833006b
Showing 11 changed files with 250 additions and 24 deletions.
20 changes: 10 additions & 10 deletions pipes/WDL/tasks/tasks_assembly.wdl
@@ -23,8 +23,8 @@ task assemble {
set -ex -o pipefail

# find 90% memory
-mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
-mem_in_gb=`/opt/viral-ngs/source/docker/calc_mem.py gb 90`
+mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)
+mem_in_gb=$(/opt/viral-ngs/source/docker/calc_mem.py gb 90)

assembly.py --version | tee VERSION

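The recurring change in this file replaces legacy backtick command substitution with the POSIX `$( … )` form. The two are equivalent in simple cases, but `$( … )` nests without escaping and is easier to read; a quick illustrative sketch (not part of the diff):

# backticks need escaped backticks to nest:
dir=`basename \`pwd\``
# $() nests cleanly and reads left to right:
dir=$(basename $(pwd))
echo "$dir"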
@@ -124,7 +124,7 @@ task scaffold {
set -ex -o pipefail

# find 90% memory
-mem_in_gb=`/opt/viral-ngs/source/docker/calc_mem.py gb 90`
+mem_in_gb=$(/opt/viral-ngs/source/docker/calc_mem.py gb 90)

assembly.py --version | tee VERSION

@@ -223,7 +223,7 @@ task ivar_trim {
${'-s ' + sliding_window} \
${'-q ' + min_quality} \
-i ${aligned_bam} -p trim
-samtools sort -@ `nproc` -m 1000M -o ${bam_basename}.trimmed.bam trim.bam
+samtools sort -@ $(nproc) -m 1000M -o ${bam_basename}.trimmed.bam trim.bam
}

output {
@@ -311,7 +311,7 @@ task align_reads {
samtools view -h -F 260 ${sample_name}.all.bam | samtools flagstat - | tee ${sample_name}.all.bam.flagstat.txt
grep properly ${sample_name}.all.bam.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned
samtools view ${sample_name}.mapped.bam | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned
-python -c "print (float("`cat bases_aligned`")/"`cat assembly_length_unambiguous`") if "`cat assembly_length_unambiguous`">0 else 0" > mean_coverage
+python -c "print (float("$(cat bases_aligned)")/"$(cat assembly_length_unambiguous)") if "$(cat assembly_length_unambiguous)">0 else 0" > mean_coverage

# fastqc mapped bam
reports.py fastqc ${sample_name}.mapped.bam ${sample_name}.mapped_fastqc.html --out_zip ${sample_name}.mapped_fastqc.zip
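The mean-coverage one-liner above splices `$(cat …)` output between quoted fragments to assemble a Python expression. An equivalent, easier-to-audit formulation, shown only as a sketch under the same assumptions (each counts file holds a single integer):

bases=$(cat bases_aligned)
length=$(cat assembly_length_unambiguous)
if [ "$length" -gt 0 ]; then
python -c "print(float($bases) / $length)" > mean_coverage
else
echo 0 > mean_coverage
fi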
@@ -365,7 +365,7 @@ task refine_assembly_with_aligned_reads {
set -ex -o pipefail

# find 90% memory
-mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

assembly.py --version | tee VERSION

@@ -378,7 +378,7 @@ task refine_assembly_with_aligned_reads {
else
ln -s ${reads_aligned_bam} temp_markdup.bam
fi
-samtools index -@ `nproc` temp_markdup.bam temp_markdup.bai
+samtools index -@ $(nproc) temp_markdup.bam temp_markdup.bai

ln -s ${reference_fasta} assembly.fasta
assembly.py refine_assembly \
@@ -442,7 +442,7 @@ task refine {
set -ex -o pipefail

# find 90% memory
-mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

assembly.py --version | tee VERSION

@@ -513,7 +513,7 @@ task refine_2x_and_plot {
set -ex -o pipefail

# find 90% memory
-mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

assembly.py --version | tee VERSION

@@ -569,7 +569,7 @@ task refine_2x_and_plot {
grep properly ${sample_name}.all.bam.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned
samtools view ${sample_name}.mapped.bam | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned
#echo $(( $(cat bases_aligned) / $(cat assembly_length) )) | tee mean_coverage
-python -c "print (float("`cat bases_aligned`")/"`cat assembly_length`") if "`cat assembly_length`">0 else 0" > mean_coverage
+python -c "print (float("$(cat bases_aligned)")/"$(cat assembly_length)") if "$(cat assembly_length)">0 else 0" > mean_coverage

# fastqc mapped bam
reports.py fastqc ${sample_name}.mapped.bam ${sample_name}.mapped_fastqc.html --out_zip ${sample_name}.mapped_fastqc.zip
4 changes: 2 additions & 2 deletions pipes/WDL/tasks/tasks_demux.wdl
@@ -67,7 +67,7 @@ task illumina_demux {
set -ex -o pipefail

# find N% memory
-mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 85`
+mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 85)

if [ -z "$TMPDIR" ]; then
export TMPDIR=$(pwd)
@@ -237,7 +237,7 @@ task illumina_demux {
,,_fastqc.html \
--out_zip ,,_fastqc.zip \
--threads $num_fastqc_threads" \
-::: `cat $OUT_BASENAMES`
+::: $(cat $OUT_BASENAMES)
}

output {
110 changes: 110 additions & 0 deletions pipes/WDL/tasks/tasks_interhost.wdl
@@ -188,4 +188,114 @@ task trimal_clean_msa {
}
}

task merge_vcfs_bcftools {
input {
Array[File] in_vcfs_gz

Int? machine_mem_gb
String docker="quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0"

String output_prefix = "merged"
}

parameter_meta {
in_vcfs_gz: {
description: "VCF files to merged; should be (b)gzipped.",
patterns: ["*.vcf.gz"] }
}

command {
set -ex -o pipefail

# tabix index input vcfs (must be gzipped)
parallel -I ,, \
"tabix -p vcf ,," \
::: ${sep=' ' in_vcfs_gz}

# see: https://samtools.github.io/bcftools/bcftools.html#merge
# --merge snps allows SNPs to be merged into multi-allelic (multi-ALT) records;
# all other record types are listed separately (see the sketch after this task)
bcftools merge \
--missing-to-ref \
--force-samples \
--merge snps \
--output ${output_prefix}.vcf.gz \
--output-type z \
--threads "$(nproc --all)" \
${sep=' ' in_vcfs_gz}

# tabix index the vcf to create .tbi file
tabix -p vcf ${output_prefix}.vcf.gz
}

output {
File merged_vcf_gz = "${output_prefix}.vcf.gz"
File merged_vcf_gz_tbi = "${output_prefix}.vcf.gz.tbi"
}

runtime {
docker: "${docker}"
memory: select_first([machine_mem_gb, 3]) + " GB"
cpu: 2
dx_instance_type: "mem1_ssd1_v2_x2"
}
}
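As referenced in the comment above, a worked illustration of `--merge snps` using hypothetical records (not data from the repository):

# sampleA.vcf.gz:  chr1  100  .  A  G   ...
# sampleB.vcf.gz:  chr1  100  .  A  T   ...
#
# bcftools merge --merge snps --missing-to-ref sampleA.vcf.gz sampleB.vcf.gz
# yields a single multi-allelic record:
#
#   chr1  100  .  A  G,T   ...
#
# Indels and other record types at the same site stay on separate lines, and
# --missing-to-ref reports samples absent from an input as 0/0 rather than ./.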

task merge_vcfs_gatk {
input {
Array[File] in_vcfs_gz
File ref_fasta

Int? machine_mem_gb
String docker="quay.io/broadinstitute/viral-phylo"

String output_prefix = "merged"
}

parameter_meta {
in_vcfs_gz: {
description: "VCF files to merged; should be (b)gzipped.",
patterns: ["*.vcf.gz"]
}
ref_fasta: {
description: "fasta file of reference genome relative to which the input VCF sites were called",
patterns: ["*.fasta",".fa"]
}
}

command {
set -ex -o pipefail

# tabix index input vcfs (must be gzipped)
parallel -I ,, \
"tabix -p vcf ,," \
::: ${sep=' ' in_vcfs_gz}

# index reference to create .fai and .dict indices
samtools faidx "${ref_fasta}"
picard CreateSequenceDictionary R="${ref_fasta}" O=$(basename $(basename "${ref_fasta}" .fasta) .fa).dict

# store input vcf file paths in file
for invcf in $(echo "${sep=' ' in_vcfs_gz}"); do
echo "$invcf" > input_vcfs.list
done

# merge
gatk3 -T CombineVariants -R "${ref_fasta}" -V input_vcfs.list -o "${output_prefix}.vcf" -genotypeMergeOptions UNIQUIFY

# bgzip output
bgzip "${output_prefix}.vcf"

# tabix index the vcf to create .tbi file
tabix -p vcf "${output_prefix}.vcf.gz"
}

output {
File merged_vcf_gz = "${output_prefix}.vcf.gz"
File merged_vcf_gz_tbi = "${output_prefix}.vcf.gz.tbi"
}

runtime {
docker: "${docker}"
memory: select_first([machine_mem_gb, 3]) + " GB"
cpu: 2
dx_instance_type: "mem1_ssd1_v2_x2"
}
}
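One detail worth noting in the task above: GATK3 interprets a `-V` argument whose file name ends in `.list` as a newline-delimited list of variant files, which is why the loop writes one path per line. A minimal standalone sketch with hypothetical file names:

printf '%s\n' sampleA.vcf.gz sampleB.vcf.gz > input_vcfs.list
gatk3 -T CombineVariants \
-R reference.fasta \
-V input_vcfs.list \
-o merged.vcf \
-genotypeMergeOptions UNIQUIFY   # UNIQUIFY renames colliding sample names instead of failing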
67 changes: 67 additions & 0 deletions pipes/WDL/tasks/tasks_intrahost.wdl
Original file line number Diff line number Diff line change
@@ -115,4 +115,71 @@ task isnvs_vcf {
}
}

task annotate_vcf_snpeff {
input {
File in_vcf
File ref_fasta

Array[String]? snpEffRef
String? emailAddress

Int? machine_mem_gb
String docker="quay.io/broadinstitute/viral-phylo"

String output_basename = basename(basename(in_vcf, ".gz"), ".vcf")
}

parameter_meta {
in_vcf: { description: "input VCF to annotate with snpEff", patterns: ["*.vcf","*.vcf.gz"]}
ref_fasta: { description: "The sequence containing the accession to use for annotation; only used if snpEffRef is not provided.", patterns: ["*.fasta","*.fa"] }
snpEffRef: { description: "list of accessions used to build or locate the snpEff database; if not provided, the ID from the reference FASTA is used (it must be a GenBank accession)" }
emailAddress: { description: "email address passed to NCBI if we need to download reference sequences" }
}

command {
set -ex -o pipefail

intrahost.py --version | tee VERSION

providedSnpRefAccessions="${sep=' ' snpEffRef}"
if [ -n "$providedSnpRefAccessions" ]; then
snpRefAccessions="$providedSnpRefAccessions";
else
snpRefAccessions="$(python -c "from Bio import SeqIO; print(' '.join(list(s.id for s in SeqIO.parse('${ref_fasta}', 'fasta'))))")"
fi
echo "snpRefAccessions: $snpRefAccessions"

VCF_IN="${in_vcf}"
if (file "$VCF_IN" | grep -q "gzip") ; then
echo "$VCF_IN is already compressed"
else
echo "$VCF_IN is not compressed; bgzipping..."
bgzip "$VCF_IN"
VCF_IN="$VCF_IN.gz"
fi
echo "Creating vcf index"
tabix -p vcf "$VCF_IN"

interhost.py snpEff \
"${in_vcf}" \
$snpRefAccessions \
"${output_basename}.annot.vcf.gz" \
${'--emailAddress=' + emailAddress}

intrahost.py iSNV_table \
"${output_basename}.annot.vcf.gz" \
"${output_basename}.annot.txt.gz"

tabix -p vcf "${output_basename}.annot.vcf.gz"
}

output {
File annot_vcf_gz = "${output_basename}.annot.vcf.gz"
File annot_vcf_gz_tbi = "${output_basename}.annot.vcf.gz.tbi"
File annot_txt_gz = "${output_basename}.annot.txt.gz"
String viralngs_version = read_string("VERSION")
}
runtime {
docker: "${docker}"
memory: select_first([machine_mem_gb, 4]) + " GB"
dx_instance_type: "mem1_ssd1_v2_x4"
}
}
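The `file … | grep -q gzip` test above keys off `file`'s human-readable description. An alternative sketch that checks the gzip magic bytes directly (an illustration under stated assumptions, not what the task uses):

vcf="in.vcf"   # hypothetical input path
# gzip and bgzip output both begin with the two bytes 0x1f 0x8b
if [ "$(head -c 2 "$vcf" | od -An -tx1 | tr -d ' \n')" = "1f8b" ]; then
echo "$vcf is already compressed"
else
bgzip "$vcf" && vcf="$vcf.gz"
fi
tabix -p vcf "$vcf"

Either way, tabix requires BGZF specifically: a plain-gzipped VCF passes both checks but fails at indexing.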
8 changes: 4 additions & 4 deletions pipes/WDL/tasks/tasks_metagenomics.wdl
@@ -80,8 +80,8 @@ task krakenuniq {
metagenomics.py krakenuniq \
$DB_DIR/krakenuniq \
${sep=' ' reads_unmapped_bam} \
---outReads `cat $OUT_READS` \
---outReport `cat $OUT_REPORTS` \
+--outReads $(cat $OUT_READS) \
+--outReport $(cat $OUT_REPORTS) \
--loglevel=DEBUG

wait # for krona_taxonomy_db_tgz to download and extract
@@ -99,7 +99,7 @@ task krakenuniq {
--sample_name ,, \
--noRank --noHits --inputType krakenuniq \
--loglevel=DEBUG" \
-::: `cat $OUT_BASENAME`
+::: $(cat $OUT_BASENAME)

# merge all krona reports
ktImportKrona -o krakenuniq.krona.combined.html *.krakenuniq-krona.html
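The `parallel -I ,,` idiom used here (and in the new VCF tasks) changes GNU parallel's replacement string from its default `{}` to `,,`, presumably to keep literal braces out of the WDL `command { … }` block; each whitespace-separated word after `:::` becomes one job. A minimal sketch:

# runs three jobs: "echo got a", "echo got b", "echo got c"
parallel -I ,, "echo got ,," ::: a b c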
@@ -506,7 +506,7 @@ task blastx {
-db $DB_DIR/blast/nr \
-out "${out_basename}.blastx.contigs.txt" \
-outfmt 7 \
--num_threads `nproc`
+-num_threads $(nproc)

wait # for krona_taxonomy_db_tgz to download and extract

4 changes: 2 additions & 2 deletions pipes/WDL/tasks/tasks_read_utils.wdl
@@ -132,7 +132,7 @@ task downsample_bams {
set -ex -o pipefail

# find 90% memory
-mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

if [[ "${deduplicateBefore}" == "true" ]]; then
DEDUP_OPTION="--deduplicateBefore"
@@ -196,7 +196,7 @@ task FastqToUBAM {
set -ex -o pipefail

# find 90% memory
-mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

read_utils.py --version | tee VERSION

2 changes: 1 addition & 1 deletion pipes/WDL/tasks/tasks_reports.wdl
@@ -54,7 +54,7 @@ task plot_coverage {
samtools view -h -F 260 ${aligned_reads_bam} | samtools flagstat - | tee ${sample_name}.flagstat.txt
grep properly ${sample_name}.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned
samtools view ${aligned_reads_bam} | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned
-python -c "print (float("`cat bases_aligned`")/"`cat assembly_length`") if "`cat assembly_length`">0 else 0" > mean_coverage
+python -c "print (float("$(cat bases_aligned)")/"$(cat assembly_length)") if "$(cat assembly_length)">0 else 0" > mean_coverage
}

output {
8 changes: 4 additions & 4 deletions pipes/WDL/tasks/tasks_taxon_filter.wdl
@@ -44,8 +44,8 @@ task deplete_taxa {
fi

# find memory thresholds
-mem_in_mb_50=`/opt/viral-ngs/source/docker/calc_mem.py mb 50`
-mem_in_mb_75=`/opt/viral-ngs/source/docker/calc_mem.py mb 75`
+mem_in_mb_50=$(/opt/viral-ngs/source/docker/calc_mem.py mb 50)
+mem_in_mb_75=$(/opt/viral-ngs/source/docker/calc_mem.py mb 75)

# bmtagger and blast db args
DBS_BMTAGGER="${sep=' ' bmtaggerDbs}"
@@ -121,7 +121,7 @@ task filter_to_taxon {
taxon_filter.py --version | tee VERSION

# find 90% memory
-mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

if [[ "${error_on_reads_in_neg_control}" == "true" ]]; then
ERROR_ON_NEG_CONTROL_ARGS="--errorOnReadsInNegControl"
@@ -210,7 +210,7 @@ task merge_one_per_sample {
read_utils.py --version | tee VERSION

# find 90% memory
-mem_in_mb=`/opt/viral-ngs/source/docker/calc_mem.py mb 90`
+mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

read_utils.py merge_bams \
"${sep=' ' inputBams}" \
Expand Down
2 changes: 1 addition & 1 deletion pipes/WDL/workflows/isnvs_one_sample.wdl
@@ -7,6 +7,6 @@ workflow isnvs_one_sample {

output {
File isnvsFile = isnvs_per_sample.isnvsFile
-String isnvs_viral_phylo_version = isnvs_per_sample.viralngs_version
+String isnvs_viral_phylo_version = isnvs_per_sample.viralngs_version
}
}
11 changes: 11 additions & 0 deletions pipes/WDL/workflows/merge_vcfs.wdl
@@ -0,0 +1,11 @@
version 1.0

import "../tasks/tasks_interhost.wdl" as interhost

workflow merge_vcfs {
call interhost.merge_vcfs_gatk
output {
File merged_vcf_gz = merge_vcfs_gatk.merged_vcf_gz
File merged_vcf_gz_tbi = merge_vcfs_gatk.merged_vcf_gz_tbi
}
}
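For orientation, a hedged sketch of invoking this workflow locally; the miniwdl runner and the file names are assumptions for illustration, not part of the commit. Because the call leaves its inputs unbound, they are addressed through the call's fully-qualified names:

cat > inputs.json <<'EOF'
{
"merge_vcfs.merge_vcfs_gatk.in_vcfs_gz": ["sampleA.vcf.gz", "sampleB.vcf.gz"],
"merge_vcfs.merge_vcfs_gatk.ref_fasta": "reference.fasta"
}
EOF
miniwdl run pipes/WDL/workflows/merge_vcfs.wdl -i inputs.json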
