biowdl · rhpvorderman · Feb 5, 2025 · Nov 8, 2024 · Nov 8, 2024 · Nov 8, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,12 @@ that users understand how the changes affect the new version.
 
 version 6.0.0-dev
 ---------------------------
++ Add Sequali task.
++ Add Clair3 task.
++ Add Modkit task.
++ Modify minimap2 task to accept ubam input, including transfer of methylation
+  tags. Also sort the BAM output file by coordinate.
++ Update DeepVariant container and update resource requirements.
 + rtg Format and VcfEval tasks now handle reference as an array of files to enable caching.
 + Added --select-genotype and --exclude-filtered flags to GATK SelectVariants
 + Use softlinks to localise the database for centrifuge.

diff --git a/clair3.wdl b/clair3.wdl
@@ -0,0 +1,94 @@
+version 1.0
+
+# Copyright (c) 2024 Leiden University Medical Center
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+task Clair3 {
+    # Run run_clair3.sh on a BAM file and move the merged VCF and its index to outputPrefix.
+    input {
+        File bam
+        File bamIndex
+        File referenceFasta
+        File referenceFastaFai
+        String outputPrefix
+        String? sampleName
+        File? modelTar
+        String? builtinModel
+        String platform
+        Int threads = 8
+        Boolean includeAllCtgs = false
+        String memory = "~{threads + 16}GiB"
+        Int timeMinutes = 10 + ceil(size(bam, "G") * 200 / threads)
+        String dockerImage = "quay.io/biocontainers/clair3:1.0.10--py39h46983ab_0"
+    }
+    # Model location: modelTar's basename without '.tar.gz' (assumes the archive unpacks to that directory), else builtinModel.
+    String modelArg = "~{if defined(modelTar) then basename(select_first([modelTar]), '.tar.gz') else builtinModel}"
+
+    command <<<
+        set -e
+        ~{if defined(modelTar) then "tar -xvf " + modelTar else "" }
+        mkdir -p "$(dirname ~{outputPrefix})"
+        run_clair3.sh \
+        --model_path=~{modelArg} \
+        --ref_fn=~{referenceFasta} \
+        --bam_fn=~{bam} \
+        --output=out \
+        --threads=~{threads} \
+        --platform=~{platform} \
+        ~{"--sample_name=" + sampleName} \
+        ~{true="--include_all_ctgs" false="" includeAllCtgs}
+        mv out/merge_output.vcf.gz ~{outputPrefix}.vcf.gz
+        mv out/merge_output.vcf.gz.tbi ~{outputPrefix}.vcf.gz.tbi
+    >>>
+
+    output {
+        File vcf = "~{outputPrefix}.vcf.gz"
+        File vcfIndex = "~{outputPrefix}.vcf.gz.tbi"
+    }
+
+    runtime {
+        cpu: threads
+        memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        # input
+        bam: {description: "The input alignment file", category: "required"}
+        bamIndex: {description: "The index for the input alignment file", category: "required"}
+        referenceFasta: {description: "The reference fasta file.", category: "required"}
+        referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
+        outputPrefix: {description: "The output prefix where the data should be placed.", category: "required"}
+        modelTar: {description: "The TAR file with the model", category: "common"}
+        builtinModel: {description: "The builtin model name (in case a tar file is not used)", category: "common"}
+        sampleName: {description: "The name of the sample in the VCF", category: "common"}
+        platform: {description: "platform setting for clair3.", category: "required"}
+        includeAllCtgs: {description: "whether or not to call all contigs in the reference", category: "advanced"}
+        threads: {description: "The number of threads to use for variant calling.", category: "advanced"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # output
+        vcf: {description: "Output VCF file."}
+        vcfIndex: {description: "Output VCF index."}
+    }
+}
diff --git a/deepvariant.wdl b/deepvariant.wdl
@@ -27,19 +27,19 @@ task RunDeepVariant {
         File inputBam
         File inputBamIndex
         String modelType
-        String outputVcf
+        String outputVcf = "sample.vcf.gz"
         String? postprocessVariantsExtraArgs
         File? customizedModel
-        Int? numShards
+        Int numShards = 4
         String? outputGVcf
         String? outputGVcfIndex
         File? regions
         String? sampleName
         Boolean? VCFStatsReport = true
 
-        String memory = "3GiB"
+        String memory = "48GiB"
         Int timeMinutes = 5000
-        String dockerImage = "google/deepvariant:1.0.0"
+        String dockerImage = "google/deepvariant:1.6.1"
     }
 
     command {
@@ -62,6 +62,7 @@ task RunDeepVariant {
         memory: memory
         time_minutes: timeMinutes
         docker: dockerImage
+        cpu: numShards
     }
 
     output {

diff --git a/minimap2.wdl b/minimap2.wdl
@@ -81,52 +81,82 @@ task Indexing {
 task Mapping {
     input {
         String presetOption
-        Int kmerSize = 15
-        Boolean skipSelfAndDualMappings = false
-        Boolean outputSam = false
         String outputPrefix
-        Boolean addMDTagToSam = false
-        Boolean secondaryAlignment = false
         File referenceFile
         File queryFile
+
+        Int compressionLevel = 1 
+        Int additionalSortThreads = 1
+        Int sortMemoryGb = 1
+        Boolean nameSorted = false
+        # MM, ML, MN -> Methylation flags
+        # Also keep the following flags for Sequali to be able to run on the mapped bam file and get ONT information.
+        # ch -> channel
+        # st -> start time
+        # du -> duration
+        # dx -> Whether read was duplex
+        # pi -> Parent ID for split read
+
+        String tagsToKeep = "MM,ML,MN,ch,st,du,dx,pi"
+
+        Boolean skipSelfAndDualMappings = false
+        Boolean addMDTagToSam = false
+        Boolean secondaryAlignment = true
 
+        Int? kmerSize
         Int? maxIntronLength
         Int? maxFragmentLength
         Int? retainMaxSecondaryAlignments
         Int? matchingScore
         Int? mismatchPenalty
         String? howToFindGTAG
+        String? readgroup
 
-        Int cores = 4
-        String memory = "30GiB"
+        Int cores = 8
+        String memory = "24GiB"
         Int timeMinutes = 1 + ceil(size(queryFile, "G") * 200 / cores)
-        String dockerImage = "quay.io/biocontainers/minimap2:2.20--h5bf99c6_0"
+        # Container provides minimap2 2.28 and samtools 1.20.
+        String dockerImage = "quay.io/biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:3161f532a5ea6f1dec9be5667c9efc2afdac6104-0"
     }
 
-    command {
-        set -e
+    # Always run data through samtools fastq. This supports both FASTQ and uBAM 
+    # files. It does remove any existing FASTQ comments, but this should not be
+    # problematic for most files.
+
+    command <<<
+        set -e -o pipefail
         mkdir -p "$(dirname ~{outputPrefix})"
+        samtools fastq -T "~{tagsToKeep}" ~{queryFile} | \
         minimap2 \
+        -a \
         -x ~{presetOption} \
-        -k ~{kmerSize} \
         ~{true="-X" false="" skipSelfAndDualMappings} \
-        ~{true="-a" false="" outputSam} \
-        -o ~{outputPrefix} \
         ~{true="--MD" false="" addMDTagToSam} \
         --secondary=~{true="yes" false="no" secondaryAlignment} \
+        -y \
         -t ~{cores} \
+        ~{"-k " + kmerSize} \
         ~{"-G " + maxIntronLength} \
         ~{"-F " + maxFragmentLength} \
         ~{"-N " + retainMaxSecondaryAlignments} \
         ~{"-A " + matchingScore} \
         ~{"-B " + mismatchPenalty} \
         ~{"-u " + howToFindGTAG} \
+        ~{"-R '" + readgroup}~{false="" true="'" defined(readgroup)} \
         ~{referenceFile} \
-        ~{queryFile}
-    }
+        - \
+        | samtools sort \
+        ~{true="-N" false="" nameSorted} \
+        -@ ~{additionalSortThreads} \
+        -l ~{compressionLevel} \
+        -m ~{sortMemoryGb}G \
+        -o ~{outputPrefix}.bam 
+        samtools index ~{outputPrefix}.bam
+    >>>
 
     output {
-        File alignmentFile = outputPrefix
+        File bam = "~{outputPrefix}.bam"
+        File bamIndex = "~{outputPrefix}.bam.bai"
     }
 
     runtime {
@@ -141,7 +171,6 @@ task Mapping {
         presetOption: {description: "This option applies multiple options at the same time.", category: "common"}
         kmerSize: {description: "K-mer size (no larger than 28).", category: "advanced"}
         skipSelfAndDualMappings: {description: "Skip self and dual mappings (for the all-vs-all mode).", category: "advanced"}
-        outputSam: {description: "Output in the sam format.", category: "common"}
         outputPrefix: {description: "Output directory path + output file prefix.", category: "required"}
         addMDTagToSam: {description: "Adds a MD tag to the sam output file.", category: "common"}
         secondaryAlignment: {description: "Whether to output secondary alignments.", category: "advanced"}
@@ -152,13 +181,15 @@ task Mapping {
         retainMaxSecondaryAlignments: {description: "Retain at most N secondary alignments.", category: "advanced"}
         matchingScore: {description: "Matching score.", category: "advanced"}
         mismatchPenalty: {description: "Mismatch penalty.", category: "advanced"}
+        tagsToKeep: {description: "Tags to keep from the input unaligned BAM file.", category: "Advanced"}
         howToFindGTAG: {description: "How to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG.", category: "common"}
         cores: {description: "The number of cores to be used.", category: "advanced"}
         memory: {description: "The amount of memory available to the job.", category: "advanced"}
         timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
 
         # outputs
-        alignmentFile: {description: "Mapping and alignment between collections of dna sequences file."}
+        bam: {description: "Mapping and alignment between collections of dna sequences file in BAM format."}
+        bamIndex: {description: "Accompanying index file for the BAM file."}
     }
 }
diff --git a/modkit.wdl b/modkit.wdl
@@ -0,0 +1,107 @@
+version 1.0
+
+# Copyright (c) 2025 Leiden University Medical Center
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+task Pileup {
+    # Run `modkit pileup` on a BAM file; writes outputBed (a directory of files when bedgraph = true) and a log file.
+    input {
+        File bam
+        File bamIndex
+        String outputBed = "output.bedMethyl"
+        File referenceFasta
+        File referenceFastaFai
+
+        Int? intervalSize
+        File? includeBed
+
+        Boolean cpg = false
+        Boolean combineMods = false
+        Boolean combineStrands = false
+        Boolean bedgraph = false
+        String? ignore
+        String logFilePath = "modkit.log"
+
+        Int threads = 8
+        String memory = "4GiB"
+        Int timeMinutes = 2880 / threads  # 2 Days / threads
+        String dockerImage = "quay.io/biocontainers/ont-modkit:0.4.2--hcdda2d0_0"
+    }
+
+    command <<<
+        set -e
+        mkdir -p "$(dirname ~{outputBed})"
+        mkdir -p "$(dirname ~{logFilePath})"
+        modkit pileup \
+        --threads ~{threads} \
+        ~{"--interval-size " + intervalSize} \
+        ~{"--include-bed " + includeBed} \
+        ~{"--ignore " + ignore} \
+        --ref ~{referenceFasta} \
+        ~{true="--cpg" false="" cpg} \
+        ~{true="--combine-mods" false="" combineMods} \
+        ~{true="--combine-strands" false="" combineStrands} \
+        ~{true="--bedgraph" false="" bedgraph} \
+        --log-filepath ~{logFilePath} \
+        ~{bam} \
+        ~{outputBed}
+    >>>
+
+    output {
+        File? out = outputBed  # Normal mode
+        Array[File] outFiles = glob(outputBed + "/*")  # Bedgraph mode
+        File logFile = logFilePath
+    }
+
+    runtime {
+        docker: dockerImage
+        cpu: threads
+        memory: memory
+        time_minutes: timeMinutes
+    }
+
+    parameter_meta {
+        # input
+        bam: {description: "The input alignment file", category: "required"}
+        bamIndex: {description: "The index for the input alignment file", category: "required"}
+        referenceFasta: {description: "The reference fasta file.", category: "required"}
+        referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
+        outputBed: {description: "The output name where the data should be placed.", category: "common"}
+
+        intervalSize: {description: "Sets the interval size", category: "advanced"}
+        includeBed: {description: "Bed file with regions to include", category: "advanced"}
+        cpg: {description: "Whether to call only at cpg sites", category: "advanced"}
+        combineMods: {description: "Whether to combine modifications in the output", category: "advanced"}
+        combineStrands: {description: "Whether to combine strands in the output", category: "advanced"}
+        bedgraph: {description: "Whether to create a folder instead with a bedgraph file", category: "advanced"}
+        ignore: {description: "Modification type to ignore. For example 'h'.", category: "advanced"}
+        logFilePath: {description: "Path where the log file should be written.", category: "advanced"}
+
+        threads: {description: "The number of threads modkit pileup will use.", category: "advanced"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # output
+        out: {description: "The output bed files. Not available when bedgraph = true."}
+        outFiles: {description: "Output files when bedgraph = true."}
+        logFile: {description: "The generated log file."}
+    }
+}
diff --git a/multiqc.wdl b/multiqc.wdl
@@ -58,7 +58,7 @@ task MultiQC {
 
         String? memory
         Int timeMinutes = 10 + ceil(size(reports, "GiB") * 8)
-        String dockerImage = "quay.io/biocontainers/multiqc:1.9--py_1"
+        String dockerImage = "quay.io/biocontainers/multiqc:1.25.1--pyhdfd78af_0"
     }
 
     Int memoryGb = 2 + ceil(size(reports, "GiB"))