diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index ae5715e40..f6c36ad14 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -419,6 +419,11 @@ task refine_assembly_with_aligned_reads { file_utils.py rename_fasta_sequences \ refined.fasta "${sample_name}.fasta" "${sample_name}" + # collect variant counts + bcftools filter -e "FMT/DP<${min_coverage}" -S . "${sample_name}.sites.vcf.gz" -Ou | bcftools filter -i "AC>1" -Ou > "${sample_name}.diffs.vcf" + bcftools filter -i 'TYPE="snp"' "${sample_name}.diffs.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_snps + bcftools filter -i 'TYPE!="snp"' "${sample_name}.diffs.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_indels + # collect figures of merit set +o pipefail # grep will exit 1 if it fails to find the pattern grep -v '^>' refined.fasta | tr -d '\n' | wc -c | tee assembly_length @@ -430,6 +435,8 @@ task refine_assembly_with_aligned_reads { File sites_vcf_gz = "${sample_name}.sites.vcf.gz" Int assembly_length = read_int("assembly_length") Int assembly_length_unambiguous = read_int("assembly_length_unambiguous") + Int dist_to_ref_snps = read_int("num_snps") + Int dist_to_ref_indels = read_int("num_indels") String viralngs_version = read_string("VERSION") } @@ -658,6 +665,7 @@ task run_discordance { File reads_aligned_bam File reference_fasta String out_basename = "run" + Int min_coverage=4 String docker="quay.io/broadinstitute/viral-core" } @@ -672,9 +680,15 @@ task run_discordance { import tools.samtools header = tools.samtools.SamtoolsTool().getHeader("${reads_aligned_bam}") rgids = [[x[3:] for x in h if x.startswith('ID:')][0] for h in header if h[0]=='@RG'] + n_rgs = len(rgids) with open('readgroups.txt', 'wt') as outf: for rg in rgids: outf.write(rg+'\t'+rg+'\n') + n_lbs = len(set([[x[3:] for x in h if x.startswith('LB:')][0] for h in header if h[0]=='@RG'])) + with open('num_read_groups', 'wt') as outf: + outf.write(str(n_rgs)+'\n') + with open('num_libraries', 'wt') as outf: + outf.write(str(n_lbs)+'\n') CODE # bcftools call snps while treating each RG as a separate sample @@ -688,14 +702,13 @@ task run_discordance { -Ov -o everything.vcf # mask all GT calls when less than 3 reads - cat everything.vcf | bcftools filter -e 'FMT/DP<3' -S . > filtered.vcf - cat filtered.vcf | bcftools filter -i 'MAC>0' > "${out_basename}.discordant.vcf" + cat everything.vcf | bcftools filter -e "FMT/DP<${min_coverage}" -S . > filtered.vcf + cat filtered.vcf | bcftools filter -i "MAC>0" > "${out_basename}.discordant.vcf" # tally outputs - set +o pipefail # to handle empty grep - cat filtered.vcf | bcftools filter -i 'MAC=0' | grep -v '^#' | wc -l | tee num_concordant - cat "${out_basename}.discordant.vcf" | bcftools filter -i 'TYPE="snp"' | grep -v '^#' | wc -l | tee num_discordant_snps - cat "${out_basename}.discordant.vcf" | bcftools filter -i 'TYPE!="snp"' | grep -v '^#' | wc -l | tee num_discordant_indels + bcftools filter -i 'MAC=0' filtered.vcf | bcftools query -f '%POS\n' | wc -l | tee num_concordant + bcftools filter -i 'TYPE="snp"' "${out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_snps + bcftools filter -i 'TYPE!="snp"' "${out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_indels } output { @@ -703,6 +716,8 @@ task run_discordance { Int concordant_sites = read_int("num_concordant") Int discordant_snps = read_int("num_discordant_snps") Int discordant_indels = read_int("num_discordant_indels") + Int num_read_groups = read_int("num_read_groups") + Int num_libraries = read_int("num_libraries") String viralngs_version = read_string("VERSION") } diff --git a/pipes/WDL/workflows/assemble_refbased.wdl b/pipes/WDL/workflows/assemble_refbased.wdl index 46ad34a05..237742cc8 100644 --- a/pipes/WDL/workflows/assemble_refbased.wdl +++ b/pipes/WDL/workflows/assemble_refbased.wdl @@ -147,9 +147,14 @@ workflow assemble_refbased { Int reference_genome_length = plot_ref_coverage.assembly_length Float assembly_mean_coverage = plot_ref_coverage.mean_coverage + Int dist_to_ref_snps = call_consensus.dist_to_ref_snps + Int dist_to_ref_indels = call_consensus.dist_to_ref_indels + Int replicate_concordant_sites = run_discordance.concordant_sites Int replicate_discordant_snps = run_discordance.discordant_snps Int replicate_discordant_indels = run_discordance.discordant_indels + Int num_read_groups = run_discordance.num_read_groups + Int num_libraries = run_discordance.num_libraries File replicate_discordant_vcf = run_discordance.discordant_sites_vcf Array[File] align_to_ref_per_input_aligned_flagstat = align_to_ref.aligned_bam_flagstat diff --git a/test/input/WDL/test_outputs-assemble_refbased-local.json b/test/input/WDL/test_outputs-assemble_refbased-local.json index 829701192..80656cc54 100644 --- a/test/input/WDL/test_outputs-assemble_refbased-local.json +++ b/test/input/WDL/test_outputs-assemble_refbased-local.json @@ -9,5 +9,9 @@ "assemble_refbased.assembly_length_unambiguous": 18889, "assemble_refbased.assembly_length": 18889, "assemble_refbased.replicate_discordant_indels": 0, - "assemble_refbased.replicate_discordant_snps": 0 + "assemble_refbased.replicate_discordant_snps": 0, + "assemble_refbased.dist_to_ref_snps": 13, + "assemble_refbased.dist_to_ref_indels": 2, + "assemble_refbased.num_read_groups": 8, + "assemble_refbased.num_libraries": 2 }