more scaffolding updates #511

Merged: 24 commits merged into master from dp-scaffold on Feb 17, 2024

Commits (24)
7064708  defend against rather common empty output scenario (dpark01, Feb 5, 2024)
0593083  more compliant wdl (dpark01, Feb 5, 2024)
0bedc96  add new wdl task report_primary_kraken_taxa (dpark01, Feb 7, 2024)
98f9bbd  add report_primary_kraken_taxa wdl task and add to classify_single (dpark01, Feb 7, 2024)
e39919b  add a few more outputs (dpark01, Feb 7, 2024)
a85d7c9  Merge remote-tracking branch 'origin/master' into dp-scaffold (dpark01, Feb 8, 2024)
9e12088  try wdl 1.1 and see what happens (dpark01, Feb 8, 2024)
02cf671  try wdl development and see what happens (dpark01, Feb 8, 2024)
d824518  update to take tsv instead of json input for reference/tax map (dpark01, Feb 8, 2024)
fa07252  attempt to not fail in scaffolding when some but not all segments of … (dpark01, Feb 13, 2024)
031a294  forgot $ (dpark01, Feb 13, 2024)
8a9b26f  remove random empty newline introduced in this branch (dpark01, Feb 13, 2024)
165eb66  fix bash logical construction (dpark01, Feb 14, 2024)
8c898c9  Merge remote-tracking branch 'origin/master' into dp-scaffold (dpark01, Feb 14, 2024)
1080d49  initial draft of task for filtering reference list (dpark01, Feb 14, 2024)
1a77bf7  pre-extract taxdump tarball (dpark01, Feb 14, 2024)
d31c14a  add optional kraken-based reference selection to multitaxa (dpark01, Feb 15, 2024)
526cece  why cromwell do you behave poorly on edge cases (dpark01, Feb 16, 2024)
f02a58b  more stats and outputs, revert to refbased if cant denovo, dont polis… (dpark01, Feb 16, 2024)
ca24b2d  Merge remote-tracking branch 'origin/master' into dp-scaffold (dpark01, Feb 16, 2024)
bc6bee7  simplify cromwell fix (dpark01, Feb 16, 2024)
6a71e1a  Merge branch 'master' into dp-scaffold (dpark01, Feb 16, 2024)
93d455f  bump viral-classify 2.2.3.0 to 2.2.4.1 (dpark01, Feb 16, 2024)
88ca4d1  revert version (dpark01, Feb 16, 2024)

Files changed
4 changes: 2 additions & 2 deletions github_actions_ci/install-wdl.sh
@@ -12,8 +12,8 @@ fetch_jar_from_github () {
ln -s $_jar_fname $_tool_name.jar
}

-fetch_jar_from_github broadinstitute cromwell womtool 61
-fetch_jar_from_github broadinstitute cromwell cromwell 61
+fetch_jar_from_github broadinstitute cromwell womtool 86
+fetch_jar_from_github broadinstitute cromwell cromwell 86
fetch_jar_from_github dnanexus dxWDL dxWDL v1.50

TGZ=dx-toolkit-v0.311.0-ubuntu-20.04-amd64.tar.gz
37 changes: 26 additions & 11 deletions pipes/WDL/tasks/tasks_assembly.wdl
@@ -231,19 +231,31 @@ task scaffold {
set +e +o pipefail
grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\n' | wc -c | tee assembly_preimpute_length
grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\nNn' | wc -c | tee assembly_preimpute_length_unambiguous
+grep '^>' ~{sample_name}.intermediate_gapfill.fasta | wc -l | tee assembly_num_segments_recovered
+grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | wc -l | tee reference_num_segments_required
+grep -v '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | tr -d '\n' | wc -c | tee reference_length
set -e -o pipefail

-#Input assembly/contigs, FASTA, already ordered oriented and merged with the reference gneome (FASTA)
-assembly.py impute_from_reference \
-~{sample_name}.intermediate_gapfill.fasta \
-~{sample_name}.scaffolding_chosen_ref.fasta \
-~{sample_name}.scaffolded_imputed.fasta \
---newName ~{sample_name} \
-~{'--replaceLength=' + replace_length} \
-~{'--minLengthFraction=' + min_length_fraction} \
-~{'--minUnambig=' + min_unambig} \
-~{'--aligner=' + aligner} \
---loglevel=DEBUG
+if ~{true='true' false='false' allow_incomplete_output} && ! cmp -s assembly_num_segments_recovered reference_num_segments_required
+then
+# draft assembly does not have enough segments--and that's okay (allow_incomplete_output=true)
+file_utils.py rename_fasta_sequences \
+~{sample_name}.intermediate_gapfill.fasta \
+~{sample_name}.scaffolded_imputed.fasta \
+"~{sample_name}" --suffix_always --loglevel=DEBUG
+else
+# draft assembly must have the right number of segments (fail if not)
+assembly.py impute_from_reference \
+~{sample_name}.intermediate_gapfill.fasta \
+~{sample_name}.scaffolding_chosen_ref.fasta \
+~{sample_name}.scaffolded_imputed.fasta \
+--newName ~{sample_name} \
+~{'--replaceLength=' + replace_length} \
+~{'--minLengthFraction=' + min_length_fraction} \
+~{'--minUnambig=' + min_unambig} \
+~{'--aligner=' + aligner} \
+--loglevel=DEBUG
+fi
}

output {
@@ -252,6 +264,9 @@ task scaffold {
File intermediate_gapfill_fasta = "~{sample_name}.intermediate_gapfill.fasta"
Int assembly_preimpute_length = read_int("assembly_preimpute_length")
Int assembly_preimpute_length_unambiguous = read_int("assembly_preimpute_length_unambiguous")
+Int assembly_num_segments_recovered = read_int("assembly_num_segments_recovered")
+Int reference_num_segments_required = read_int("reference_num_segments_required")
+Int reference_length = read_int("reference_length")
Array[String] scaffolding_chosen_ref_names = read_lines("~{sample_name}.scaffolding_chosen_refs.txt")
File scaffolding_chosen_ref = "~{sample_name}.scaffolding_chosen_ref.fasta"
File scaffolding_stats = "~{sample_name}.scaffolding_stats.txt"
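
Note on the new fallback path above: each grep '^>' ... | wc -l | tee FILE writes a one-line segment count to a file, and cmp -s exits zero only if the two count files are byte-for-byte identical. A minimal sketch of the guard in isolation (hypothetical task and file names, not the repository's actual code):

    task segment_count_guard_sketch {
        input {
            Boolean allow_incomplete_output = false
        }
        command <<<
            # count FASTA records: one '>' header per segment
            grep -c '^>' draft.fasta > num_recovered
            grep -c '^>' ref.fasta > num_required
            # cmp -s is silent; it exits nonzero iff the files differ
            if ~{true='true' false='false' allow_incomplete_output} && ! cmp -s num_recovered num_required
            then
                echo "segment counts differ but incomplete output is allowed: pass the renamed draft through"
            else
                echo "imputing from reference: a count mismatch on this path is a hard failure"
            fi
        >>>
    }

The ~{true='true' false='false' ...} interpolation renders the WDL Boolean as a literal bash true or false builtin, which is why the if needs no string comparison.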
104 changes: 96 additions & 8 deletions pipes/WDL/tasks/tasks_metagenomics.wdl
@@ -11,7 +11,7 @@ task krakenuniq {
File krona_taxonomy_db_tgz # taxonomy.tab

Int? machine_mem_gb
-String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" #skip-global-version-pin
+String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" #skip-global-version-pin
}

Int disk_size = 750
@@ -143,7 +143,7 @@ task build_krakenuniq_db {
Int? zstd_compression_level

Int? machine_mem_gb
-String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" #skip-global-version-pin
+String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" #skip-global-version-pin
}

Int disk_size = 750
@@ -213,7 +213,7 @@ task kraken2 {
Int? min_base_qual

Int machine_mem_gb = 72
-String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

parameter_meta {
@@ -326,6 +326,94 @@ task kraken2 {
}
}

+task report_primary_kraken_taxa {
+meta {
+description: "Interprets a kraken (or kraken2 or krakenuniq) summary report file and emits the primary contributing taxa under a focal taxon of interest."
+}
+input {
+File kraken_summary_report
+String focal_taxon = "Viruses"
+
+String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
+}
+String out_basename = basename(kraken_summary_report, '.txt')
+Int disk_size = 50
+Int machine_mem_gb = 2
+
+command <<<
+set -e
+metagenomics.py taxlevel_plurality "~{kraken_summary_report}" "~{focal_taxon}" "~{out_basename}.ranked_focal_report.tsv"
+cat "~{out_basename}.ranked_focal_report.tsv" | head -2 | tail +2 > TOPROW
+cut -f 2 TOPROW > NUM_FOCAL
+cut -f 4 TOPROW > PCT_OF_FOCAL
+cut -f 7 TOPROW > NUM_READS
+cut -f 8 TOPROW > TAX_RANK
+cut -f 9 TOPROW > TAX_ID
+cut -f 10 TOPROW > TAX_NAME
+>>>
+
+output {
+String focal_tax_name = focal_taxon
+File ranked_focal_report = "~{out_basename}.ranked_focal_report.tsv"
+Int total_focal_reads = read_int("NUM_FOCAL")
+Float percent_of_focal = read_float("PCT_OF_FOCAL")
+Int num_reads = read_int("NUM_READS")
+String tax_rank = read_string("TAX_RANK")
+String tax_id = read_string("TAX_ID")
+String tax_name = read_string("TAX_NAME")
+}
+
+runtime {
+docker: docker
+memory: machine_mem_gb + " GB"
+cpu: 1
+disks: "local-disk " + disk_size + " LOCAL"
+disk: disk_size + " GB" # TESs
+dx_instance_type: "mem1_ssd1_v2_x2"
+preemptible: 2
+maxRetries: 2
+}
+}
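
A note on the TOPROW extraction above: head -2 | tail +2 selects line 2 of the ranked report, i.e. the single top-ranked taxon beneath the focal taxon, assuming line 1 is a header row. The cut calls then imply this column layout (inferred from the code above, not from metagenomics.py documentation):

    column 2  -> NUM_FOCAL     total reads under the focal taxon
    column 4  -> PCT_OF_FOCAL  this taxon's share of the focal reads (parsed as a Float)
    column 7  -> NUM_READS     reads assigned to this taxon
    column 8  -> TAX_RANK      taxonomic rank
    column 9  -> TAX_ID        taxon id
    column 10 -> TAX_NAME      taxon name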

+task filter_refs_to_found_taxa {
+meta {
+description: "Filters a taxid_to_ref_accessions_tsv to the set of taxa found in a focal_report."
+}
+input {
+File taxid_to_ref_accessions_tsv
+File focal_report_tsv
+File taxdump_tgz
+Int min_read_count = 100
+
+String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
+}
+String ref_basename = basename(taxid_to_ref_accessions_tsv, '.tsv')
+String hits_basename = basename(focal_report_tsv, '.tsv')
+Int disk_size = 50
+
+command <<<
+set -e
+mkdir -p taxdump
+read_utils.py extract_tarball "~{taxdump_tgz}" taxdump
+metagenomics.py filter_taxids_to_focal_hits "~{taxid_to_ref_accessions_tsv}" "~{focal_report_tsv}" taxdump ~{min_read_count} "~{ref_basename}-~{hits_basename}.tsv"
+>>>
+
+output {
+File filtered_taxid_to_ref_accessions_tsv = "~{ref_basename}-~{hits_basename}.tsv"
+}
+
+runtime {
+docker: docker
+memory: "2 GB"
+cpu: 1
+disks: "local-disk " + disk_size + " LOCAL"
+disk: disk_size + " GB" # TESs
+dx_instance_type: "mem1_ssd1_v2_x2"
+preemptible: 2
+maxRetries: 2
+}
+}
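
Together, the two tasks above implement the optional kraken-based reference selection mentioned in commit d31c14a. A hypothetical wiring sketch (taxid_to_ref_accessions_tsv and ncbi_taxdump_tgz are invented workflow inputs for illustration; the real wiring lives in the multitaxa workflow, which is not part of this diff):

    call metagenomics.report_primary_kraken_taxa {
        input:
            kraken_summary_report = kraken2.kraken2_summary_report
    }
    call metagenomics.filter_refs_to_found_taxa {
        input:
            taxid_to_ref_accessions_tsv = taxid_to_ref_accessions_tsv,
            focal_report_tsv = report_primary_kraken_taxa.ranked_focal_report,
            taxdump_tgz = ncbi_taxdump_tgz
    }
    # downstream scaffolding would then consume filtered_taxid_to_ref_accessions_tsv,
    # presumably keeping only taxa with at least min_read_count (default 100) reads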

task build_kraken2_db {
meta {
description: "Builds a custom kraken2 database. Outputs tar.zst tarballs of kraken2 database, associated krona taxonomy db, and an ncbi taxdump.tar.gz. Requires live internet access if any standard_libraries are specified or if taxonomy_db_tgz is absent."
@@ -348,7 +436,7 @@ task build_kraken2_db {
Int? zstd_compression_level

Int? machine_mem_gb
-String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

Int disk_size = 750
@@ -490,7 +578,7 @@ task blastx {
File krona_taxonomy_db_tgz

Int? machine_mem_gb
-String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

parameter_meta {
@@ -580,7 +668,7 @@ task krona {
Int? magnitude_column

Int? machine_mem_gb
-String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

Int disk_size = 50
@@ -687,7 +775,7 @@ task filter_bam_to_taxa {
String out_filename_suffix = "filtered"

Int? machine_mem_gb
-String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

String out_basename = basename(classified_bam, ".bam") + "." + out_filename_suffix
@@ -774,7 +862,7 @@ task kaiju {
File krona_taxonomy_db_tgz # taxonomy/taxonomy.tab

Int? machine_mem_gb
-String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

String input_basename = basename(reads_unmapped_bam, ".bam")
2 changes: 1 addition & 1 deletion pipes/WDL/tasks/tasks_reports.wdl
@@ -488,7 +488,7 @@ task aggregate_metagenomics_reports {
String aggregate_taxlevel_focus = "species"
Int aggregate_top_N_hits = 5

-String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

parameter_meta {
6 changes: 3 additions & 3 deletions pipes/WDL/tasks/tasks_taxon_filter.wdl
@@ -14,7 +14,7 @@ task deplete_taxa {

Int? cpu=8
Int? machine_mem_gb
-String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

parameter_meta {
@@ -113,7 +113,7 @@ task filter_to_taxon {
String? neg_control_prefixes_space_separated = "neg water NTC"

Int? machine_mem_gb
-String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

# do this in two steps in case the input doesn't actually have "cleaned" in the name
@@ -172,7 +172,7 @@ task build_lastal_db {
File sequences_fasta

Int? machine_mem_gb
-String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

String db_name = basename(sequences_fasta, ".fasta")
29 changes: 29 additions & 0 deletions pipes/WDL/tasks/tasks_utils.wdl
@@ -712,6 +712,35 @@ task s3_copy {
}
}

+task string_split {
+meta {
+description: "split a string by a delimiter"
+}
+input {
+String joined_string
+String delimiter
+}
+command <<<
+set -e
+python3<<CODE
+with open('TOKENS', 'wt') as outf:
+    for token in "~{joined_string}".split("~{delimiter}"):
+        outf.write(token + '\n')
+CODE
+>>>
+output {
+Array[String] tokens = read_lines("TOKENS")
+}
+runtime {
+docker: "python:slim"
+memory: "1 GB"
+cpu: 1
+disks: "local-disk 50 SSD"
+disk: "50 GB" # TES
+maxRetries: 2
+}
+}
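
A usage sketch for the new utility (hypothetical values, and assuming the file is imported as utils):

    call utils.string_split {
        input:
            joined_string = "seg1,seg2,seg3",
            delimiter = ","
    }
    # string_split.tokens evaluates to ["seg1", "seg2", "seg3"]

WDL 1.0 has no string-split in its standard library, which is presumably why this task shells out to python; note that python's str.split treats the delimiter as a literal string, not a regex.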

task filter_sequences_by_length {
meta {
description: "Filter sequences in a fasta file to enforce a minimum count of non-N bases."
13 changes: 13 additions & 0 deletions pipes/WDL/workflows/classify_single.wdl
@@ -121,6 +121,10 @@ workflow classify_single {
trim_clip_db = trim_clip_db,
always_succeed = true
}
+call metagenomics.report_primary_kraken_taxa {
+input:
+kraken_summary_report = kraken2.kraken2_summary_report
+}

output {
File cleaned_reads_unaligned_bam = deplete.bam_filtered_to_taxa
@@ -134,6 +138,15 @@ workflow classify_single {

File kraken2_summary_report = kraken2.kraken2_summary_report
File kraken2_krona_plot = kraken2.krona_report_html
+File kraken2_top_taxa_report = report_primary_kraken_taxa.ranked_focal_report
+String kraken2_focal_taxon_name = report_primary_kraken_taxa.focal_tax_name
+Int kraken2_focal_total_reads = report_primary_kraken_taxa.total_focal_reads
+String kraken2_top_taxon_id = report_primary_kraken_taxa.tax_id
+String kraken2_top_taxon_name = report_primary_kraken_taxa.tax_name
+String kraken2_top_taxon_rank = report_primary_kraken_taxa.tax_rank
+Int kraken2_top_taxon_num_reads = report_primary_kraken_taxa.num_reads
+Float kraken2_top_taxon_pct_of_focal = report_primary_kraken_taxa.percent_of_focal
+
File raw_fastqc = merge_raw_reads.fastqc
File cleaned_fastqc = fastqc_cleaned.fastqc_html
File spikein_report = spikein.report