Sbwf impute glimpse2 (#3349)

* Separate map file in second channel for glimpse2 chunk * Change samples infos files of place Add input, output region and map as optional Add output region as prefix * Change samples infos files in glimpse_phase * Change samples_infos file way to combine * Remove view() * Update vcf_impute_glimpse to respect previous change in glimpse process * Add new sbwf for glimpse2 * Update file name * Change input1 to input * Remove md5 sum of bin file * Correct test glimpse2_phase * Small changes * Add keyword to glimpse_chunk * Update tests/modules/nf-core/glimpse/concordance/main.nf Co-authored-by: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com> --------- Co-authored-by: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>
nf-core · May 2, 2023 · c7ddd48 · c7ddd48
1 parent d68b2e6
commit c7ddd48
Show file tree

Hide file tree

Showing 30 changed files with 610 additions and 222 deletions.
diff --git a/modules/nf-core/glimpse/chunk/meta.yml b/modules/nf-core/glimpse/chunk/meta.yml
@@ -3,6 +3,7 @@ description: Defines chunks where to run imputation
 keywords:
   - chunk
   - imputation
+  - low-coverage
 tools:
   - "glimpse":
       description: "GLIMPSE is a phasing and imputation method for large-scale low-coverage sequencing studies."

diff --git a/modules/nf-core/glimpse/phase/main.nf b/modules/nf-core/glimpse/phase/main.nf
@@ -8,7 +8,7 @@ process GLIMPSE_PHASE {
         'quay.io/biocontainers/glimpse-bio:1.1.1--hce55b13_1' }"
 
     input:
-        tuple val(meta) , path(input), path(input_index), val(input_region), val(output_region), path(reference), path(reference_index), path(map), path(samples_file)
+        tuple val(meta) , path(input), path(input_index), path(samples_file), val(input_region), val(output_region), path(reference), path(reference_index), path(map)
 
     output:
         tuple val(meta), path("*.{vcf,bcf,vcf.gz,bcf.gz}"), emit: phased_variant

diff --git a/modules/nf-core/glimpse/phase/meta.yml b/modules/nf-core/glimpse/phase/meta.yml
@@ -3,6 +3,9 @@ description: main GLIMPSE algorithm, performs phasing and imputation refining ge
 keywords:
   - phase
   - imputation
+  - low-coverage
+  - glimpse
+
 tools:
   - "glimpse":
       description: "GLIMPSE is a phasing and imputation method for large-scale low-coverage sequencing studies."
@@ -29,6 +32,15 @@ input:
       description: Index file of the input VCF/BCF file containing genotype likelihoods.
       pattern: "*.{vcf.gz.csi,bcf.gz.csi}"
 
+  - samples_file:
+      type: file
+      description: |
+        File with sample names and ploidy information.
+        One sample per line with a mandatory second column indicating ploidy (1 or 2).
+        Sample names that are not present are assumed to have ploidy 2 (diploids).
+        GLIMPSE does NOT handle the use of sex (M/F) instead of ploidy.
+      pattern: "*.{txt,tsv}"
+
   - input_region:
       type: string
       description: Target region used for imputation, including left and right buffers (e.g. chr20:1000000-2000000).
@@ -54,15 +66,6 @@ input:
       description: File containing the genetic map.
       pattern: "*.gmap"
 
-  - samples_file:
-      type: file
-      description: |
-        File with sample names and ploidy information.
-        One sample per line with a mandatory second column indicating ploidy (1 or 2).
-        Sample names that are not present are assumed to have ploidy 2 (diploids).
-        GLIMPSE does NOT handle the use of sex (M/F) instead of ploidy.
-      pattern: "*.{txt,tsv}"
-
 output:
   - meta:
       type: map

diff --git a/modules/nf-core/glimpse2/chunk/main.nf b/modules/nf-core/glimpse2/chunk/main.nf
@@ -17,7 +17,8 @@ process GLIMPSE2_CHUNK {
         'quay.io/biocontainers/glimpse-bio:2.0.0--hf340a29_0' }"
 
     input:
-    tuple val(meta), path(input), path(input_index), val(region), path(map)
+    tuple val(meta) , path(input), path(input_index), val(region)
+    tuple val(meta2), path(map)
     val(model)
 
     output:

diff --git a/modules/nf-core/glimpse2/phase/main.nf b/modules/nf-core/glimpse2/phase/main.nf
@@ -18,8 +18,9 @@ process GLIMPSE2_PHASE {
         'quay.io/biocontainers/glimpse-bio:2.0.0--hf340a29_0' }"
 
     input:
-        tuple val(meta), path(input), path(input_index), val(input_region), val(output_region), path(reference), path(reference_index), path(map), path(samples_file)
+        tuple val(meta) , path(input), path(input_index), path(samples_file), val(input_region), val(output_region), path(reference), path(reference_index), path(map)
         tuple val(meta2), path(fasta_reference), path(fasta_reference_index)
+
     output:
         tuple val(meta), path("*.{vcf,bcf,bgen}"), emit: phased_variant
         tuple val(meta), path("*.txt.gz")        , emit: stats_coverage, optional: true
@@ -29,13 +30,17 @@ process GLIMPSE2_PHASE {
         task.ext.when == null || task.ext.when
 
     script:
+    def region = input_region    ? "${output_region.replace(":","_")}" : "${reference}"
     def args   = task.ext.args   ?: ""
-    def prefix = task.ext.prefix ?: "${meta.id}_${input_region.replace(":","_")}"
+    def prefix = task.ext.prefix ?: "${meta.id}_${region}"
     def suffix = task.ext.suffix ?: "bcf"
 
     def map_command           = map                 ? "--map $map"                    : ""
     def samples_file_command  = samples_file        ? "--samples-file $samples_file"  : ""
     def fasta_command         = fasta_reference     ? "--fasta $fasta_reference"      : ""
+    def input_region_cmd      = input_region        ? "--input-region $input_region"  : ""
+    def output_region_cmd     = output_region       ? "--output-region $output_region": ""
+
     def input_bam             = input.any { it.extension in ["cram","bam"]}
 
     """
@@ -54,14 +59,14 @@ process GLIMPSE2_PHASE {
         $map_command \\
         $fasta_command \\
         $samples_file_command \\
-        --input-region $input_region \\
-        --output-region $output_region \\
+        $input_region_cmd \\
+        $output_region_cmd \\
         --thread $task.cpus \\
         --output ${prefix}.${suffix}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        glimpse2: "\$(GLIMPSE2_split_reference --help | sed -nr '/Version/p' | grep -o -E '([0-9]+.){1,2}[0-9]' | head -1)"
+        glimpse2: "\$(GLIMPSE2_phase --help | sed -nr '/Version/p' | grep -o -E '([0-9]+.){1,2}[0-9]' | head -1)"
     END_VERSIONS
     """
 }
diff --git a/modules/nf-core/glimpse2/phase/meta.yml b/modules/nf-core/glimpse2/phase/meta.yml
@@ -22,24 +22,38 @@ input:
         e.g. [ id:'test', single_end:false ]
 
   - input:
-      type: files
+      type: file
       description: |
-        Either multiple BAM/CRAM files containing low-coverage sequencing reads or one VCF/BCF file containing the genotype likelihoods. When using BAM/CRAM the name of the file is used as samples name.
+        Either multiple BAM/CRAM files containing low-coverage sequencing reads or one VCF/BCF file containing the genotype likelihoods.
+        When using BAM/CRAM the name of the file is used as samples name.
       pattern: "*.{bam,cram,vcf,vcf.gz,bcf,bcf.gz}"
 
   - input_index:
       type: file
       description: Index file of the input BAM/CRAM/VCF/BCF file.
       pattern: "*.{bam.bai,cram.crai,vcf.gz.csi,bcf.gz.csi}"
 
+  - samples_file:
+      type: file
+      description: |
+        File with sample names and ploidy information.
+        One sample per line with a mandatory second column indicating ploidy (1 or 2).
+        Sample names that are not present are assumed to have ploidy 2 (diploids).
+        GLIMPSE does NOT handle the use of sex (M/F) instead of ploidy.
+      pattern: "*.{txt,tsv}"
+
   - input_region:
       type: string
-      description: Target region used for imputation, including left and right buffers (e.g. chr20:1000000-2000000).
+      description: |
+        Target region used for imputation, including left and right buffers (e.g. chr20:1000000-2000000).
+        Optional if reference panel is in bin format.
       pattern: "chrXX:leftBufferPosition-rightBufferPosition"
 
   - output_region:
       type: string
-      description: Target imputed region, excluding left and right buffers (e.g. chr20:1000000-2000000).
+      description: |
+        Target imputed region, excluding left and right buffers (e.g. chr20:1000000-2000000).
+        Optional if reference panel is in bin format.
       pattern: "chrXX:leftBufferPosition-rightBufferPosition"
 
   - reference:
@@ -53,27 +67,24 @@ input:
       pattern: "*.{vcf.gz.csi,bcf.gz.csi}"
 
   - map:
-      type: file
-      description: File containing the genetic map.
-      pattern: "*.gmap"
-
-  - samples_file:
       type: file
       description: |
-        File with sample names and ploidy information. One sample per line with a mandatory second column indicating ploidy (1 or 2). Sample names that are not present are assumed to have ploidy 2 (diploids). GLIMPSE does NOT handle the use of sex (M/F) instead of ploidy.
-      pattern: "*.{txt,tsv}"
+        File containing the genetic map.
+        Optional if reference panel is in bin format.
+      pattern: "*.gmap"
 
   - fasta_reference:
       type: file
       description: |
         Faidx-indexed reference sequence file in the appropriate genome build.
-        Necessary for CRAM files
+        Necessary for CRAM files.
       pattern: "*.fasta"
 
   - fasta_reference_index:
       type: file
       description: |
         Faidx index of the reference sequence file in the appropriate genome build.
+        Necessary for CRAM files.
       pattern: "*.fai"
 
 output:

diff --git a/modules/nf-core/glimpse2/splitreference/main.nf b/modules/nf-core/glimpse2/splitreference/main.nf
@@ -31,7 +31,7 @@ process GLIMPSE2_SPLITREFERENCE {
 
     script:
     def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}"
+    def prefix = task.ext.prefix ?: "${meta.id}_${output_region.replace(":","_")}"
     def map_command = map ? "--map $map" : ""
 
     """

diff --git a/subworkflows/nf-core/multiple_impute_glimpse2/main.nf b/subworkflows/nf-core/multiple_impute_glimpse2/main.nf
@@ -0,0 +1,73 @@
+include { GLIMPSE2_CHUNK                 } from '../../../modules/nf-core/glimpse2/chunk/main'
+include { GLIMPSE2_SPLITREFERENCE        } from '../../../modules/nf-core/glimpse2/splitreference/main'
+include { GLIMPSE2_PHASE                 } from '../../../modules/nf-core/glimpse2/phase/main'
+include { GLIMPSE2_LIGATE                } from '../../../modules/nf-core/glimpse2/ligate/main'
+include { BCFTOOLS_INDEX as INDEX_PHASE  } from '../../../modules/nf-core/bcftools/index/main.nf'
+include { BCFTOOLS_INDEX as INDEX_LIGATE } from '../../../modules/nf-core/bcftools/index/main.nf'
+
+workflow MULTIPLE_IMPUTE_GLIMPSE2 {
+    // Chunk the reference panel, build one bin panel per chunk, phase every input against every bin, then ligate per meta.
+    take:
+    ch_input    // channel (mandatory): [ meta, vcf, csi, infos ]
+    ch_ref      // channel (mandatory): [ meta, vcf, csi, region ]
+    ch_map      // channel  (optional): [ meta, map ]
+    ch_fasta    // channel  (optional): [ meta, fasta, index ]
+    chunk_model // string: model used to chunk the reference panel
+
+    main:
+
+    ch_versions = Channel.empty()
+
+    // Chunk reference panel
+    GLIMPSE2_CHUNK ( ch_ref, ch_map, chunk_model )
+    ch_versions = ch_versions.mix( GLIMPSE2_CHUNK.out.versions.first() )
+    // One item per row of the chunk file: [ meta, RegionBuf, RegionCnk ]
+    chunk_output = GLIMPSE2_CHUNK.out.chunk_chr
+                                .splitCsv(header: ['ID', 'Chr', 'RegionBuf', 'RegionCnk', 'WindowCm',
+                                            'WindowMb', 'NbTotVariants', 'NbComVariants'],
+                                        sep: "\t", skip: 0)
+                                .map { meta, row -> [meta, row["RegionBuf"], row["RegionCnk"]]}
+
+    // Split reference panel in bin files (region from ch_ref is replaced by the per-chunk regions)
+    split_input = ch_ref.map{ meta, ref, index, region -> [meta, ref, index]}
+                        .combine(chunk_output, by: 0)
+
+    GLIMPSE2_SPLITREFERENCE( split_input, ch_map )
+    ch_versions = ch_versions.mix( GLIMPSE2_SPLITREFERENCE.out.versions.first() )
+
+    phase_input = ch_input.combine( GLIMPSE2_SPLITREFERENCE.out.bin_ref )
+                        .map{ input_meta, input_file, input_index, input_infos,
+                            panel_meta, panel_bin ->
+                            [input_meta, input_file, input_index, input_infos,
+                            [], [], panel_bin, [], []]
+                    }/* Drop the panel meta map; input/output regions, the reference
+                        index and the map are passed as empty ([]) since they are
+                        optional when the reference panel is a bin file */
+
+    // Phase input files for each reference bin files + indexing
+    GLIMPSE2_PHASE ( phase_input, ch_fasta ) // [meta, vcf, index, sample_infos, regionin, regionout, ref, ref_index, map], [ meta, fasta, index ]
+    ch_versions = ch_versions.mix( GLIMPSE2_PHASE.out.versions.first() )
+
+    INDEX_PHASE ( GLIMPSE2_PHASE.out.phased_variant )
+    ch_versions = ch_versions.mix( INDEX_PHASE.out.versions.first() )
+
+    // Ligate all phased files in one and index it
+    // Group per meta on both sides; no collect(), which would merge every meta's indexes into a single item
+    ligate_input = GLIMPSE2_PHASE.out.phased_variant
+                                    .groupTuple()
+                                    .combine( INDEX_PHASE.out.csi
+                                            .groupTuple(), by: 0 )
+
+    GLIMPSE2_LIGATE ( ligate_input )
+    ch_versions = ch_versions.mix( GLIMPSE2_LIGATE.out.versions.first() )
+
+    INDEX_LIGATE ( GLIMPSE2_LIGATE.out.merged_variants )
+    ch_versions = ch_versions.mix( INDEX_LIGATE.out.versions.first() )
+
+    emit:
+    chunk_chr              = GLIMPSE2_CHUNK.out.chunk_chr           // channel: [ val(meta), txt ]
+    merged_variants        = GLIMPSE2_LIGATE.out.merged_variants    // channel: [ val(meta), bcf ]
+    merged_variants_index  = INDEX_LIGATE.out.csi                   // channel: [ val(meta), csi ]
+
+    versions               = ch_versions                            // channel: [ versions.yml ]
+}
diff --git a/subworkflows/nf-core/multiple_impute_glimpse2/meta.yml b/subworkflows/nf-core/multiple_impute_glimpse2/meta.yml
@@ -0,0 +1,72 @@
+name: "multiple_impute_glimpse2"
+description: Impute VCF/BCF files, but also CRAM and BAM files with Glimpse2
+keywords:
+  - glimpse
+  - chunk
+  - phase
+  - ligate
+  - split_reference
+
+modules: # module paths under modules/nf-core/, matching the includes in main.nf
+  - glimpse2/chunk
+  - glimpse2/phase
+  - glimpse2/ligate
+  - glimpse2/splitreference
+  - bcftools/index
+
+input:
+  - ch_input:
+      type: file
+      description: |
+        Target dataset in CRAM, BAM or VCF/BCF format.
+        Index file of the input file.
+        File with sample names and ploidy information.
+        Structure: [ meta, file, index, txt ]
+
+  - ch_ref:
+      type: file
+      description: |
+        Reference panel of haplotypes in VCF/BCF format.
+        Index file of the Reference panel file.
+        Target region, usually a full chromosome (e.g. chr20:1000000-2000000 or chr20).
+        The file could possibly be without GT field (for efficiency reasons a file containing only the positions is recommended).
+        Structure: [ meta, vcf, csi, region ]
+
+  - ch_map:
+      type: file
+      description: |
+        File containing the genetic map.
+        Structure: [ meta, gmap ]
+
+  - ch_fasta:
+      type: file
+      description: |
+        Reference genome in fasta format.
+        Reference genome index in fai format.
+        Structure: [ meta, fasta, fai ]
+
+output:
+  - chunk_chr:
+      type: file
+      description: |
+        Tab delimited output txt file containing buffer and imputation regions.
+        Structure: [meta, txt]
+
+  - merged_variants:
+      type: file
+      description: |
+        Output VCF/BCF file for the merged regions.
+        Phased information (HS field) is updated accordingly for the full region.
+        Structure: [ val(meta), bcf ]
+
+  - merged_variants_index:
+      type: file
+      description: Index file of the ligated phased variants files.
+
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@LouisLeNezet"