Skip to content

Commit

Permalink
Merge pull request #125 from broadinstitute/dp-assembly
Browse files Browse the repository at this point in the history
updates to run_discordance task
  • Loading branch information
dpark01 committed Jun 21, 2020
2 parents 101325d + 29243c3 commit eacfcc4
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 7 deletions.
27 changes: 21 additions & 6 deletions pipes/WDL/tasks/tasks_assembly.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,11 @@ task refine_assembly_with_aligned_reads {
file_utils.py rename_fasta_sequences \
refined.fasta "${sample_name}.fasta" "${sample_name}"

# collect variant counts
bcftools filter -e "FMT/DP<${min_coverage}" -S . "${sample_name}.sites.vcf.gz" -Ou | bcftools filter -i "AC>1" -Ou > "${sample_name}.diffs.vcf"
bcftools filter -i 'TYPE="snp"' "${sample_name}.diffs.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_snps
bcftools filter -i 'TYPE!="snp"' "${sample_name}.diffs.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_indels

# collect figures of merit
set +o pipefail # grep will exit 1 if it fails to find the pattern
grep -v '^>' refined.fasta | tr -d '\n' | wc -c | tee assembly_length
Expand All @@ -430,6 +435,8 @@ task refine_assembly_with_aligned_reads {
File sites_vcf_gz = "${sample_name}.sites.vcf.gz"
Int assembly_length = read_int("assembly_length")
Int assembly_length_unambiguous = read_int("assembly_length_unambiguous")
Int dist_to_ref_snps = read_int("num_snps")
Int dist_to_ref_indels = read_int("num_indels")
String viralngs_version = read_string("VERSION")
}

Expand Down Expand Up @@ -658,6 +665,7 @@ task run_discordance {
File reads_aligned_bam
File reference_fasta
String out_basename = "run"
Int min_coverage=4

String docker="quay.io/broadinstitute/viral-core"
}
Expand All @@ -672,9 +680,15 @@ task run_discordance {
import tools.samtools
header = tools.samtools.SamtoolsTool().getHeader("${reads_aligned_bam}")
rgids = [[x[3:] for x in h if x.startswith('ID:')][0] for h in header if h[0]=='@RG']
n_rgs = len(rgids)
with open('readgroups.txt', 'wt') as outf:
for rg in rgids:
outf.write(rg+'\t'+rg+'\n')
n_lbs = len(set([[x[3:] for x in h if x.startswith('LB:')][0] for h in header if h[0]=='@RG']))
with open('num_read_groups', 'wt') as outf:
outf.write(str(n_rgs)+'\n')
with open('num_libraries', 'wt') as outf:
outf.write(str(n_lbs)+'\n')
CODE
# bcftools call snps while treating each RG as a separate sample
Expand All @@ -688,21 +702,22 @@ task run_discordance {
-Ov -o everything.vcf
# mask all GT calls with fewer than min_coverage reads
cat everything.vcf | bcftools filter -e 'FMT/DP<3' -S . > filtered.vcf
cat filtered.vcf | bcftools filter -i 'MAC>0' > "${out_basename}.discordant.vcf"
cat everything.vcf | bcftools filter -e "FMT/DP<${min_coverage}" -S . > filtered.vcf
cat filtered.vcf | bcftools filter -i "MAC>0" > "${out_basename}.discordant.vcf"
# tally outputs
set +o pipefail # to handle empty grep
cat filtered.vcf | bcftools filter -i 'MAC=0' | grep -v '^#' | wc -l | tee num_concordant
cat "${out_basename}.discordant.vcf" | bcftools filter -i 'TYPE="snp"' | grep -v '^#' | wc -l | tee num_discordant_snps
cat "${out_basename}.discordant.vcf" | bcftools filter -i 'TYPE!="snp"' | grep -v '^#' | wc -l | tee num_discordant_indels
bcftools filter -i 'MAC=0' filtered.vcf | bcftools query -f '%POS\n' | wc -l | tee num_concordant
bcftools filter -i 'TYPE="snp"' "${out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_snps
bcftools filter -i 'TYPE!="snp"' "${out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_indels
}
output {
File discordant_sites_vcf = "${out_basename}.discordant.vcf"
Int concordant_sites = read_int("num_concordant")
Int discordant_snps = read_int("num_discordant_snps")
Int discordant_indels = read_int("num_discordant_indels")
Int num_read_groups = read_int("num_read_groups")
Int num_libraries = read_int("num_libraries")
String viralngs_version = read_string("VERSION")
}
Expand Down
5 changes: 5 additions & 0 deletions pipes/WDL/workflows/assemble_refbased.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -147,9 +147,14 @@ workflow assemble_refbased {
Int reference_genome_length = plot_ref_coverage.assembly_length
Float assembly_mean_coverage = plot_ref_coverage.mean_coverage

Int dist_to_ref_snps = call_consensus.dist_to_ref_snps
Int dist_to_ref_indels = call_consensus.dist_to_ref_indels

Int replicate_concordant_sites = run_discordance.concordant_sites
Int replicate_discordant_snps = run_discordance.discordant_snps
Int replicate_discordant_indels = run_discordance.discordant_indels
Int num_read_groups = run_discordance.num_read_groups
Int num_libraries = run_discordance.num_libraries
File replicate_discordant_vcf = run_discordance.discordant_sites_vcf

Array[File] align_to_ref_per_input_aligned_flagstat = align_to_ref.aligned_bam_flagstat
Expand Down
6 changes: 5 additions & 1 deletion test/input/WDL/test_outputs-assemble_refbased-local.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,9 @@
"assemble_refbased.assembly_length_unambiguous": 18889,
"assemble_refbased.assembly_length": 18889,
"assemble_refbased.replicate_discordant_indels": 0,
"assemble_refbased.replicate_discordant_snps": 0
"assemble_refbased.replicate_discordant_snps": 0,
"assemble_refbased.dist_to_ref_snps": 13,
"assemble_refbased.dist_to_ref_indels": 2,
"assemble_refbased.num_read_groups": 8,
"assemble_refbased.num_libraries": 2
}

0 comments on commit eacfcc4

Please sign in to comment.