Skip to content

Commit

Permalink
Merge pull request #124 from broadinstitute/dp-assembly
Browse files Browse the repository at this point in the history
add run_discordance task
  • Loading branch information
dpark01 committed Jun 21, 2020
2 parents fd7bc67 + 68e8900 commit 101325d
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 12 deletions.
67 changes: 67 additions & 0 deletions pipes/WDL/tasks/tasks_assembly.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -648,3 +648,70 @@ task refine_2x_and_plot {
dx_instance_type: "mem1_ssd1_v2_x8"
}
}

task run_discordance {
  meta {
    description: "This step evaluates discordance between sequencing runs of the same sample. The input is a merged, aligned BAM file for a single sample. If multiple runs (read groups) exist, we split the aligned reads by read group and separately evaluate consensus calls per read group using bcftools mpileup and call. A VCF is emitted that describes variation between runs."
  }

  input {
    # Merged, aligned BAM for a single sample; its @RG header lines define the runs that get compared.
    File    reads_aligned_bam
    # Reference FASTA handed to `bcftools mpileup -f`.
    File    reference_fasta
    # Prefix for the emitted "<out_basename>.discordant.vcf".
    String  out_basename = "run"
    # Per-read-group genotype calls with FMT/DP below this value are masked to missing
    # before concordance is tallied. The default of 3 reproduces the previously hard-coded threshold.
    Int     min_coverage = 3

    String  docker="quay.io/broadinstitute/viral-core"
  }

  command {
    set -ex -o pipefail

    read_utils.py --version | tee VERSION

    # create 2-col table with read group ids in both cols
    # (consumed below by `bcftools mpileup -G`, so every read group is reported under its own id)
    python3 <<CODE
    import tools.samtools
    header = tools.samtools.SamtoolsTool().getHeader("${reads_aligned_bam}")
    # for each @RG header line, keep the value of its ID: field
    rgids = [[x[3:] for x in h if x.startswith('ID:')][0] for h in header if h[0]=='@RG']
    with open('readgroups.txt', 'wt') as outf:
        for rg in rgids:
            outf.write(rg+'\t'+rg+'\n')
    CODE

    # bcftools call snps while treating each RG as a separate sample
    bcftools mpileup \
      -G readgroups.txt -d 10000 -a "FORMAT/DP,FORMAT/AD" \
      -q 1 -m 2 -Ou \
      -f "${reference_fasta}" "${reads_aligned_bam}" \
      | bcftools call \
      -P 0 -m --ploidy 1 \
      --threads $(nproc) \
      -Ov -o everything.vcf

    # mask all GT calls supported by fewer than min_coverage reads
    # (-S . sets the failing genotypes to missing rather than dropping the site)
    cat everything.vcf | bcftools filter -e 'FMT/DP<${min_coverage}' -S . > filtered.vcf
    # keep only sites where the remaining calls disagree (minor allele count above zero)
    cat filtered.vcf | bcftools filter -i 'MAC>0' > "${out_basename}.discordant.vcf"

    # tally outputs
    set +o pipefail # to handle empty grep
    cat filtered.vcf | bcftools filter -i 'MAC=0' | grep -v '^#' | wc -l | tee num_concordant
    cat "${out_basename}.discordant.vcf" | bcftools filter -i 'TYPE="snp"' | grep -v '^#' | wc -l | tee num_discordant_snps
    # note: everything that is not TYPE="snp" is counted under the "indels" tally
    cat "${out_basename}.discordant.vcf" | bcftools filter -i 'TYPE!="snp"' | grep -v '^#' | wc -l | tee num_discordant_indels
  }

  output {
    # VCF restricted to the sites that passed the MAC>0 filter
    File   discordant_sites_vcf = "${out_basename}.discordant.vcf"
    # Record counts written by the tally commands at the end of the command section
    Int    concordant_sites     = read_int("num_concordant")
    Int    discordant_snps      = read_int("num_discordant_snps")
    Int    discordant_indels    = read_int("num_discordant_indels")
    # Output of `read_utils.py --version`
    String viralngs_version     = read_string("VERSION")
  }

  runtime {
    docker: "${docker}"
    memory: "3 GB"
    cpu: 2
    disks: "local-disk 100 HDD"
    dx_instance_type: "mem1_ssd1_v2_x2"
    preemptible: 1
  }
}
10 changes: 4 additions & 6 deletions pipes/WDL/tasks/tasks_reports.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,6 @@ task align_and_count {
input {
File reads_bam
File ref_db
Int? minScoreToFilter
Int? topNHits = 3

Int? machine_mem_gb
Expand All @@ -161,11 +160,10 @@ task align_and_count {
read_utils.py --version | tee VERSION

ln -s ${reads_bam} ${reads_basename}.bam
read_utils.py bwamem_idxstats \
read_utils.py minimap2_idxstats \
${reads_basename}.bam \
${ref_db} \
--outStats ${reads_basename}.count.${ref_basename}.txt.unsorted \
${'--minScoreToFilter=' + minScoreToFilter} \
--loglevel=DEBUG

sort -b -r -n -k3 ${reads_basename}.count.${ref_basename}.txt.unsorted > ${reads_basename}.count.${ref_basename}.txt
Expand All @@ -179,7 +177,7 @@ task align_and_count {
}

runtime {
memory: select_first([machine_mem_gb, 7]) + " GB"
memory: select_first([machine_mem_gb, 3]) + " GB"
cpu: 4
docker: "${docker}"
disks: "local-disk 375 LOCAL"
Expand Down Expand Up @@ -367,7 +365,7 @@ task tsv_join {
String join_type="inner"
String out_basename

String docker="stratdat/csvkit"
String docker="quay.io/broadinstitute/viral-core"
}

command {
Expand Down Expand Up @@ -408,7 +406,7 @@ task tsv_stack {
input {
Array[File]+ input_tsvs
String out_basename
String docker="stratdat/csvkit"
String docker="quay.io/broadinstitute/viral-core"
}

command {
Expand Down
12 changes: 12 additions & 0 deletions pipes/WDL/workflows/assemble_refbased.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,13 @@ workflow assemble_refbased {
out_basename = "${sample_name}.align_to_ref.trimmed"
}

call assembly.run_discordance {
input:
reads_aligned_bam = merge_align_to_ref.out_bam,
reference_fasta = reference_fasta,
out_basename = sample_name
}

call reports.plot_coverage as plot_ref_coverage {
input:
aligned_reads_bam = merge_align_to_ref.out_bam,
Expand Down Expand Up @@ -140,6 +147,11 @@ workflow assemble_refbased {
Int reference_genome_length = plot_ref_coverage.assembly_length
Float assembly_mean_coverage = plot_ref_coverage.mean_coverage

Int replicate_concordant_sites = run_discordance.concordant_sites
Int replicate_discordant_snps = run_discordance.discordant_snps
Int replicate_discordant_indels = run_discordance.discordant_indels
File replicate_discordant_vcf = run_discordance.discordant_sites_vcf

Array[File] align_to_ref_per_input_aligned_flagstat = align_to_ref.aligned_bam_flagstat
Array[Int] align_to_ref_per_input_reads_provided = align_to_ref.reads_provided
Array[Int] align_to_ref_per_input_reads_aligned = align_to_ref.reads_aligned
Expand Down
2 changes: 1 addition & 1 deletion pipes/WDL/workflows/diff_genome_sets.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ workflow diff_genome_sets {
call reports.tsv_stack {
input:
input_tsvs = compare_two_genomes.comparison_table,
out_basename = "diff_genome_sets.txt"
out_basename = "diff_genome_sets"
}

output {
Expand Down
8 changes: 4 additions & 4 deletions requirements-modules.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
broadinstitute/viral-core=2.1.3
broadinstitute/viral-assemble=2.1.3.1
broadinstitute/viral-classify=2.1.3.1
broadinstitute/viral-phylo=2.1.3.1
broadinstitute/viral-core=2.1.4
broadinstitute/viral-assemble=2.1.4.0
broadinstitute/viral-classify=2.1.4.0
broadinstitute/viral-phylo=2.1.4.0
broadinstitute/beast-beagle-cuda=1.10.5pre
broadinstitute/ncbi-tools=2.10.7.0
nextstrain/base=build-20200608T223413Z
Expand Down
4 changes: 3 additions & 1 deletion test/input/WDL/test_outputs-assemble_refbased-local.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,7 @@
"assemble_refbased.align_to_self_merged_reads_aligned": 18409,
"assemble_refbased.reference_genome_length": 18959,
"assemble_refbased.assembly_length_unambiguous": 18889,
"assemble_refbased.assembly_length": 18889
"assemble_refbased.assembly_length": 18889,
"assemble_refbased.replicate_discordant_indels": 0,
"assemble_refbased.replicate_discordant_snps": 0
}

0 comments on commit 101325d

Please sign in to comment.