From e95399a221f774aecb19ed291debfeebd2dd5300 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sun, 21 Jun 2020 00:07:53 -0400 Subject: [PATCH 1/3] run_discordance: parameterize min_coverage and default to 4, add num_read_groups and num_libraries outputs, remove use of grep and switch to bcftools query instead --- pipes/WDL/tasks/tasks_assembly.wdl | 22 +++++++++++++++------- pipes/WDL/workflows/assemble_refbased.wdl | 2 ++ .../test_outputs-assemble_refbased-local.json | 4 +++- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index ae5715e40..ff44e8890 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -658,6 +658,7 @@ task run_discordance { File reads_aligned_bam File reference_fasta String out_basename = "run" + Int min_coverage=4 String docker="quay.io/broadinstitute/viral-core" } @@ -672,9 +673,15 @@ task run_discordance { import tools.samtools header = tools.samtools.SamtoolsTool().getHeader("${reads_aligned_bam}") rgids = [[x[3:] for x in h if x.startswith('ID:')][0] for h in header if h[0]=='@RG'] + n_rgs = len(rgids) with open('readgroups.txt', 'wt') as outf: for rg in rgids: outf.write(rg+'\t'+rg+'\n') + n_lbs = len(set(x[3:] for h in header if h[0]=='@RG' for x in h if x.startswith('LB:'))) # LB is optional in @RG lines: read groups without it are skipped instead of raising IndexError + with open('num_read_groups', 'wt') as outf: + outf.write(str(n_rgs)+'\n') + with open('num_libraries', 'wt') as outf: + outf.write(str(n_lbs)+'\n') CODE # bcftools call snps while treating each RG as a separate sample @@ -688,14 +695,13 @@ task run_discordance { -Ov -o everything.vcf - # mask all GT calls when less than 3 reads - cat everything.vcf | bcftools filter -e 'FMT/DP<3' -S . > filtered.vcf - cat filtered.vcf | bcftools filter -i 'MAC>0' > "${out_basename}.discordant.vcf" + # mask all GT calls with fewer than min_coverage reads + cat everything.vcf | bcftools filter -e "FMT/DP<${min_coverage}" -S . 
> filtered.vcf + cat filtered.vcf | bcftools filter -i "MAC>0" > "${out_basename}.discordant.vcf" # tally outputs - set +o pipefail # to handle empty grep - cat filtered.vcf | bcftools filter -i 'MAC=0' | grep -v '^#' | wc -l | tee num_concordant - cat "${out_basename}.discordant.vcf" | bcftools filter -i 'TYPE="snp"' | grep -v '^#' | wc -l | tee num_discordant_snps - cat "${out_basename}.discordant.vcf" | bcftools filter -i 'TYPE!="snp"' | grep -v '^#' | wc -l | tee num_discordant_indels + bcftools filter -i 'MAC=0' filtered.vcf | bcftools query -f '%POS\n' | wc -l | tee num_concordant + bcftools filter -i 'TYPE="snp"' "${out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_snps + bcftools filter -i 'TYPE!="snp"' "${out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_indels } output { @@ -703,6 +709,8 @@ task run_discordance { Int concordant_sites = read_int("num_concordant") Int discordant_snps = read_int("num_discordant_snps") Int discordant_indels = read_int("num_discordant_indels") + Int num_read_groups = read_int("num_read_groups") + Int num_libraries = read_int("num_libraries") String viralngs_version = read_string("VERSION") } diff --git a/pipes/WDL/workflows/assemble_refbased.wdl b/pipes/WDL/workflows/assemble_refbased.wdl index 46ad34a05..e39876b16 100644 --- a/pipes/WDL/workflows/assemble_refbased.wdl +++ b/pipes/WDL/workflows/assemble_refbased.wdl @@ -150,6 +150,8 @@ workflow assemble_refbased { Int replicate_concordant_sites = run_discordance.concordant_sites Int replicate_discordant_snps = run_discordance.discordant_snps Int replicate_discordant_indels = run_discordance.discordant_indels + Int num_read_groups = run_discordance.num_read_groups + Int num_libraries = run_discordance.num_libraries File replicate_discordant_vcf = run_discordance.discordant_sites_vcf Array[File] align_to_ref_per_input_aligned_flagstat = align_to_ref.aligned_bam_flagstat diff --git 
a/test/input/WDL/test_outputs-assemble_refbased-local.json b/test/input/WDL/test_outputs-assemble_refbased-local.json index 829701192..d9a77d0fe 100644 --- a/test/input/WDL/test_outputs-assemble_refbased-local.json +++ b/test/input/WDL/test_outputs-assemble_refbased-local.json @@ -9,5 +9,7 @@ "assemble_refbased.assembly_length_unambiguous": 18889, "assemble_refbased.assembly_length": 18889, "assemble_refbased.replicate_discordant_indels": 0, - "assemble_refbased.replicate_discordant_snps": 0 + "assemble_refbased.replicate_discordant_snps": 0, + "assemble_refbased.num_read_groups": 8, + "assemble_refbased.num_libraries": 2 } From 7d510b5ae09d12dc17ffeba03fd6fc9aa577dfc8 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sun, 21 Jun 2020 09:35:10 -0400 Subject: [PATCH 2/3] more figures of merit: emit #snps and indels against ref genome --- pipes/WDL/tasks/tasks_assembly.wdl | 7 +++++++ pipes/WDL/workflows/assemble_refbased.wdl | 3 +++ 2 files changed, 10 insertions(+) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index ff44e8890..f6c36ad14 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -419,6 +419,11 @@ task refine_assembly_with_aligned_reads { file_utils.py rename_fasta_sequences \ refined.fasta "${sample_name}.fasta" "${sample_name}" + # collect variant counts + bcftools filter -e "FMT/DP<${min_coverage}" -S . 
"${sample_name}.sites.vcf.gz" -Ou | bcftools filter -i "AC>1" -Ov > "${sample_name}.diffs.vcf" + bcftools filter -i 'TYPE="snp"' "${sample_name}.diffs.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_snps + bcftools filter -i 'TYPE!="snp"' "${sample_name}.diffs.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_indels + # collect figures of merit set +o pipefail # grep will exit 1 if it fails to find the pattern grep -v '^>' refined.fasta | tr -d '\n' | wc -c | tee assembly_length @@ -430,6 +435,8 @@ task refine_assembly_with_aligned_reads { File sites_vcf_gz = "${sample_name}.sites.vcf.gz" Int assembly_length = read_int("assembly_length") Int assembly_length_unambiguous = read_int("assembly_length_unambiguous") + Int dist_to_ref_snps = read_int("num_snps") + Int dist_to_ref_indels = read_int("num_indels") String viralngs_version = read_string("VERSION") } diff --git a/pipes/WDL/workflows/assemble_refbased.wdl b/pipes/WDL/workflows/assemble_refbased.wdl index e39876b16..237742cc8 100644 --- a/pipes/WDL/workflows/assemble_refbased.wdl +++ b/pipes/WDL/workflows/assemble_refbased.wdl @@ -147,6 +147,9 @@ workflow assemble_refbased { Int reference_genome_length = plot_ref_coverage.assembly_length Float assembly_mean_coverage = plot_ref_coverage.mean_coverage + Int dist_to_ref_snps = call_consensus.dist_to_ref_snps + Int dist_to_ref_indels = call_consensus.dist_to_ref_indels + Int replicate_concordant_sites = run_discordance.concordant_sites Int replicate_discordant_snps = run_discordance.discordant_snps Int replicate_discordant_indels = run_discordance.discordant_indels From 29243c319d2b2a840b549cbc2732642104ef8f0c Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sun, 21 Jun 2020 09:50:51 -0400 Subject: [PATCH 3/3] add test outputs --- test/input/WDL/test_outputs-assemble_refbased-local.json | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/input/WDL/test_outputs-assemble_refbased-local.json b/test/input/WDL/test_outputs-assemble_refbased-local.json index 
d9a77d0fe..80656cc54 100644 --- a/test/input/WDL/test_outputs-assemble_refbased-local.json +++ b/test/input/WDL/test_outputs-assemble_refbased-local.json @@ -10,6 +10,8 @@ "assemble_refbased.assembly_length": 18889, "assemble_refbased.replicate_discordant_indels": 0, "assemble_refbased.replicate_discordant_snps": 0, + "assemble_refbased.dist_to_ref_snps": 13, + "assemble_refbased.dist_to_ref_indels": 2, "assemble_refbased.num_read_groups": 8, "assemble_refbased.num_libraries": 2 }