Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updates to run_discordance task #125

Merged
merged 3 commits into from
Jun 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 21 additions & 6 deletions pipes/WDL/tasks/tasks_assembly.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,11 @@ task refine_assembly_with_aligned_reads {
file_utils.py rename_fasta_sequences \
refined.fasta "${sample_name}.fasta" "${sample_name}"

# collect variant counts
bcftools filter -e "FMT/DP<${min_coverage}" -S . "${sample_name}.sites.vcf.gz" -Ou | bcftools filter -i "AC>1" -Ou > "${sample_name}.diffs.vcf"
bcftools filter -i 'TYPE="snp"' "${sample_name}.diffs.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_snps
bcftools filter -i 'TYPE!="snp"' "${sample_name}.diffs.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_indels

# collect figures of merit
set +o pipefail # grep will exit 1 if it fails to find the pattern
grep -v '^>' refined.fasta | tr -d '\n' | wc -c | tee assembly_length
Expand All @@ -430,6 +435,8 @@ task refine_assembly_with_aligned_reads {
File sites_vcf_gz = "${sample_name}.sites.vcf.gz"
Int assembly_length = read_int("assembly_length")
Int assembly_length_unambiguous = read_int("assembly_length_unambiguous")
Int dist_to_ref_snps = read_int("num_snps")
Int dist_to_ref_indels = read_int("num_indels")
String viralngs_version = read_string("VERSION")
}

Expand Down Expand Up @@ -658,6 +665,7 @@ task run_discordance {
File reads_aligned_bam
File reference_fasta
String out_basename = "run"
Int min_coverage=4

String docker="quay.io/broadinstitute/viral-core"
}
Expand All @@ -672,9 +680,15 @@ task run_discordance {
import tools.samtools

# Read the BAM header and keep only the read-group (@RG) records. Each record
# is a sequence of fields such as 'ID:...' and 'LB:...'.
header = tools.samtools.SamtoolsTool().getHeader("${reads_aligned_bam}")
rg_records = [h for h in header if h[0]=='@RG']

# Read group IDs (the text after the 'ID:' prefix). A record without an ID
# field still raises IndexError, exactly as before.
rgids = [[x[3:] for x in h if x.startswith('ID:')][0] for h in rg_records]
n_rgs = len(rgids)

# Two-column, tab-separated file that maps every read group to itself, so that
# each read group can be treated as a separate sample downstream.
with open('readgroups.txt', 'wt') as outf:
    for rg in rgids:
        outf.write(rg+'\t'+rg+'\n')

# Number of distinct library names across the read groups. A read group may
# lack an LB field; instead of crashing with IndexError, every such read group
# is counted together as a single unnamed library (represented by None).
n_lbs = len(set(
    next((x[3:] for x in h if x.startswith('LB:')), None)
    for h in rg_records
))

# Plain-text tallies, one integer per file, read back by the task outputs.
with open('num_read_groups', 'wt') as outf:
    outf.write(str(n_rgs)+'\n')
with open('num_libraries', 'wt') as outf:
    outf.write(str(n_lbs)+'\n')
CODE

# bcftools call snps while treating each RG as a separate sample
Expand All @@ -688,21 +702,22 @@ task run_discordance {
-Ov -o everything.vcf

# mask all GT calls with fewer than min_coverage reads
cat everything.vcf | bcftools filter -e 'FMT/DP<3' -S . > filtered.vcf
cat filtered.vcf | bcftools filter -i 'MAC>0' > "${out_basename}.discordant.vcf"
cat everything.vcf | bcftools filter -e "FMT/DP<${min_coverage}" -S . > filtered.vcf
cat filtered.vcf | bcftools filter -i "MAC>0" > "${out_basename}.discordant.vcf"

# tally outputs
set +o pipefail # to handle empty grep
cat filtered.vcf | bcftools filter -i 'MAC=0' | grep -v '^#' | wc -l | tee num_concordant
cat "${out_basename}.discordant.vcf" | bcftools filter -i 'TYPE="snp"' | grep -v '^#' | wc -l | tee num_discordant_snps
cat "${out_basename}.discordant.vcf" | bcftools filter -i 'TYPE!="snp"' | grep -v '^#' | wc -l | tee num_discordant_indels
bcftools filter -i 'MAC=0' filtered.vcf | bcftools query -f '%POS\n' | wc -l | tee num_concordant
bcftools filter -i 'TYPE="snp"' "${out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_snps
bcftools filter -i 'TYPE!="snp"' "${out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_indels
}

output {
File discordant_sites_vcf = "${out_basename}.discordant.vcf"
Int concordant_sites = read_int("num_concordant")
Int discordant_snps = read_int("num_discordant_snps")
Int discordant_indels = read_int("num_discordant_indels")
Int num_read_groups = read_int("num_read_groups")
Int num_libraries = read_int("num_libraries")
String viralngs_version = read_string("VERSION")
}

Expand Down
5 changes: 5 additions & 0 deletions pipes/WDL/workflows/assemble_refbased.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -147,9 +147,14 @@ workflow assemble_refbased {
Int reference_genome_length = plot_ref_coverage.assembly_length
Float assembly_mean_coverage = plot_ref_coverage.mean_coverage

Int dist_to_ref_snps = call_consensus.dist_to_ref_snps
Int dist_to_ref_indels = call_consensus.dist_to_ref_indels

Int replicate_concordant_sites = run_discordance.concordant_sites
Int replicate_discordant_snps = run_discordance.discordant_snps
Int replicate_discordant_indels = run_discordance.discordant_indels
Int num_read_groups = run_discordance.num_read_groups
Int num_libraries = run_discordance.num_libraries
File replicate_discordant_vcf = run_discordance.discordant_sites_vcf

Array[File] align_to_ref_per_input_aligned_flagstat = align_to_ref.aligned_bam_flagstat
Expand Down
6 changes: 5 additions & 1 deletion test/input/WDL/test_outputs-assemble_refbased-local.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,9 @@
"assemble_refbased.assembly_length_unambiguous": 18889,
"assemble_refbased.assembly_length": 18889,
"assemble_refbased.replicate_discordant_indels": 0,
"assemble_refbased.replicate_discordant_snps": 0
"assemble_refbased.replicate_discordant_snps": 0,
"assemble_refbased.dist_to_ref_snps": 13,
"assemble_refbased.dist_to_ref_indels": 2,
"assemble_refbased.num_read_groups": 8,
"assemble_refbased.num_libraries": 2
}