Skip to content

Commit

Permalink
Sbwf impute glimpse2 (#3349)
Browse files Browse the repository at this point in the history
* Separate map file in second channel for glimpse2 chunk

* Move the samples info file to a different position in the input tuple

Add input, output region and map as optional

Add output region as prefix

* Change samples infos files in glimpse_phase

* Change the way the samples_infos file is combined

* Remove view()

* Update vcf_impute_glimpse to respect previous change in glimpse process

* Add new subworkflow for glimpse2

* Update file name

* Change input1 to input

* Remove md5 sum of bin file

* Correct test glimpse2_phase

* Small changes

* Add keyword to glimpse_chunk

* Update tests/modules/nf-core/glimpse/concordance/main.nf

Co-authored-by: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>

---------

Co-authored-by: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>
  • Loading branch information
LouisLeNezet and adamrtalbot authored May 2, 2023
1 parent d68b2e6 commit c7ddd48
Show file tree
Hide file tree
Showing 30 changed files with 610 additions and 222 deletions.
1 change: 1 addition & 0 deletions modules/nf-core/glimpse/chunk/meta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ description: Defines chunks where to run imputation
keywords:
- chunk
- imputation
- low coverage
tools:
- "glimpse":
description: "GLIMPSE is a phasing and imputation method for large-scale low-coverage sequencing studies."
Expand Down
2 changes: 1 addition & 1 deletion modules/nf-core/glimpse/phase/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ process GLIMPSE_PHASE {
'quay.io/biocontainers/glimpse-bio:1.1.1--hce55b13_1' }"

input:
tuple val(meta) , path(input), path(input_index), val(input_region), val(output_region), path(reference), path(reference_index), path(map), path(samples_file)
tuple val(meta) , path(input), path(input_index), path(samples_file), val(input_region), val(output_region), path(reference), path(reference_index), path(map)

output:
tuple val(meta), path("*.{vcf,bcf,vcf.gz,bcf.gz}"), emit: phased_variant
Expand Down
21 changes: 12 additions & 9 deletions modules/nf-core/glimpse/phase/meta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ description: main GLIMPSE algorithm, performs phasing and imputation refining ge
keywords:
- phase
- imputation
- low-coverage
- glimpse

tools:
- "glimpse":
description: "GLIMPSE is a phasing and imputation method for large-scale low-coverage sequencing studies."
Expand All @@ -29,6 +32,15 @@ input:
description: Index file of the input VCF/BCF file containing genotype likelihoods.
pattern: "*.{vcf.gz.csi,bcf.gz.csi}"

- samples_file:
type: file
description: |
File with sample names and ploidy information.
One sample per line with a mandatory second column indicating ploidy (1 or 2).
Sample names that are not present are assumed to have ploidy 2 (diploids).
GLIMPSE does NOT handle the use of sex (M/F) instead of ploidy.
pattern: "*.{txt,tsv}"

- input_region:
type: string
description: Target region used for imputation, including left and right buffers (e.g. chr20:1000000-2000000).
Expand All @@ -54,15 +66,6 @@ input:
description: File containing the genetic map.
pattern: "*.gmap"

- samples_file:
type: file
description: |
File with sample names and ploidy information.
One sample per line with a mandatory second column indicating ploidy (1 or 2).
Sample names that are not present are assumed to have ploidy 2 (diploids).
GLIMPSE does NOT handle the use of sex (M/F) instead of ploidy.
pattern: "*.{txt,tsv}"

output:
- meta:
type: map
Expand Down
3 changes: 2 additions & 1 deletion modules/nf-core/glimpse2/chunk/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ process GLIMPSE2_CHUNK {
'quay.io/biocontainers/glimpse-bio:2.0.0--hf340a29_0' }"

input:
tuple val(meta), path(input), path(input_index), val(region), path(map)
tuple val(meta) , path(input), path(input_index), val(region)
tuple val(meta2), path(map)
val(model)

output:
Expand Down
15 changes: 10 additions & 5 deletions modules/nf-core/glimpse2/phase/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ process GLIMPSE2_PHASE {
'quay.io/biocontainers/glimpse-bio:2.0.0--hf340a29_0' }"

input:
tuple val(meta), path(input), path(input_index), val(input_region), val(output_region), path(reference), path(reference_index), path(map), path(samples_file)
tuple val(meta) , path(input), path(input_index), path(samples_file), val(input_region), val(output_region), path(reference), path(reference_index), path(map)
tuple val(meta2), path(fasta_reference), path(fasta_reference_index)

output:
tuple val(meta), path("*.{vcf,bcf,bgen}"), emit: phased_variant
tuple val(meta), path("*.txt.gz") , emit: stats_coverage, optional: true
Expand All @@ -29,13 +30,17 @@ process GLIMPSE2_PHASE {
task.ext.when == null || task.ext.when

script:
def region = input_region ? "${output_region.replace(":","_")}" : "${reference}"
def args = task.ext.args ?: ""
def prefix = task.ext.prefix ?: "${meta.id}_${input_region.replace(":","_")}"
def prefix = task.ext.prefix ?: "${meta.id}_${region}"
def suffix = task.ext.suffix ?: "bcf"

def map_command = map ? "--map $map" : ""
def samples_file_command = samples_file ? "--samples-file $samples_file" : ""
def fasta_command = fasta_reference ? "--fasta $fasta_reference" : ""
def input_region_cmd = input_region ? "--input-region $input_region" : ""
def output_region_cmd = output_region ? "--output-region $output_region": ""

def input_bam = input.any { it.extension in ["cram","bam"]}

"""
Expand All @@ -54,14 +59,14 @@ process GLIMPSE2_PHASE {
$map_command \\
$fasta_command \\
$samples_file_command \\
--input-region $input_region \\
--output-region $output_region \\
$input_region_cmd \\
$output_region_cmd \\
--thread $task.cpus \\
--output ${prefix}.${suffix}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
glimpse2: "\$(GLIMPSE2_split_reference --help | sed -nr '/Version/p' | grep -o -E '([0-9]+.){1,2}[0-9]' | head -1)"
glimpse2: "\$(GLIMPSE2_phase --help | sed -nr '/Version/p' | grep -o -E '([0-9]+.){1,2}[0-9]' | head -1)"
END_VERSIONS
"""
}
35 changes: 23 additions & 12 deletions modules/nf-core/glimpse2/phase/meta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,24 +22,38 @@ input:
e.g. [ id:'test', single_end:false ]
- input:
type: files
type: file
description: |
Either multiple BAM/CRAM files containing low-coverage sequencing reads or one VCF/BCF file containing the genotype likelihoods. When using BAM/CRAM the name of the file is used as samples name.
Either multiple BAM/CRAM files containing low-coverage sequencing reads or one VCF/BCF file containing the genotype likelihoods.
When using BAM/CRAM the name of the file is used as samples name.
pattern: "*.{bam,cram,vcf,vcf.gz,bcf,bcf.gz}"

- input_index:
type: file
description: Index file of the input BAM/CRAM/VCF/BCF file.
pattern: "*.{bam.bai,cram.crai,vcf.gz.csi,bcf.gz.csi}"

- samples_file:
type: file
description: |
File with sample names and ploidy information.
One sample per line with a mandatory second column indicating ploidy (1 or 2).
Sample names that are not present are assumed to have ploidy 2 (diploids).
GLIMPSE does NOT handle the use of sex (M/F) instead of ploidy.
pattern: "*.{txt,tsv}"

- input_region:
type: string
description: Target region used for imputation, including left and right buffers (e.g. chr20:1000000-2000000).
description: |
Target region used for imputation, including left and right buffers (e.g. chr20:1000000-2000000).
Optional if reference panel is in bin format.
pattern: "chrXX:leftBufferPosition-rightBufferPosition"

- output_region:
type: string
description: Target imputed region, excluding left and right buffers (e.g. chr20:1000000-2000000).
description: |
Target imputed region, excluding left and right buffers (e.g. chr20:1000000-2000000).
Optional if reference panel is in bin format.
pattern: "chrXX:leftBufferPosition-rightBufferPosition"

- reference:
Expand All @@ -53,27 +67,24 @@ input:
pattern: "*.{vcf.gz.csi,bcf.gz.csi}"

- map:
type: file
description: File containing the genetic map.
pattern: "*.gmap"

- samples_file:
type: file
description: |
File with sample names and ploidy information. One sample per line with a mandatory second column indicating ploidy (1 or 2). Sample names that are not present are assumed to have ploidy 2 (diploids). GLIMPSE does NOT handle the use of sex (M/F) instead of ploidy.
pattern: "*.{txt,tsv}"
File containing the genetic map.
Optional if reference panel is in bin format.
pattern: "*.gmap"

- fasta_reference:
type: file
description: |
Faidx-indexed reference sequence file in the appropriate genome build.
Necessary for CRAM files
Necessary for CRAM files.
pattern: "*.fasta"

- fasta_reference_index:
type: file
description: |
Faidx index of the reference sequence file in the appropriate genome build.
Necessary for CRAM files.
pattern: "*.fai"

output:
Expand Down
2 changes: 1 addition & 1 deletion modules/nf-core/glimpse2/splitreference/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ process GLIMPSE2_SPLITREFERENCE {

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def prefix = task.ext.prefix ?: "${meta.id}_${output_region.replace(":","_")}"
def map_command = map ? "--map $map" : ""

"""
Expand Down
73 changes: 73 additions & 0 deletions subworkflows/nf-core/multiple_impute_glimpse2/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
include { GLIMPSE2_CHUNK } from '../../../modules/nf-core/glimpse2/chunk/main'
include { GLIMPSE2_SPLITREFERENCE } from '../../../modules/nf-core/glimpse2/splitreference/main'
include { GLIMPSE2_PHASE } from '../../../modules/nf-core/glimpse2/phase/main'
include { GLIMPSE2_LIGATE } from '../../../modules/nf-core/glimpse2/ligate/main'
include { BCFTOOLS_INDEX as INDEX_PHASE } from '../../../modules/nf-core/bcftools/index/main.nf'
include { BCFTOOLS_INDEX as INDEX_LIGATE } from '../../../modules/nf-core/bcftools/index/main.nf'

//
// Impute target files with GLIMPSE2:
//   1. chunk the reference panel                    (GLIMPSE2_CHUNK)
//   2. split the reference panel into bin files     (GLIMPSE2_SPLITREFERENCE)
//   3. phase every input against every bin file     (GLIMPSE2_PHASE + INDEX_PHASE)
//   4. ligate the phased chunks per input and index (GLIMPSE2_LIGATE + INDEX_LIGATE)
//
workflow MULTIPLE_IMPUTE_GLIMPSE2 {

    take:
    ch_input    // channel (mandatory): [ meta, vcf, csi, infos ]
    ch_ref      // channel (mandatory): [ meta, vcf, csi, region ]
    ch_map      // channel  (optional): [ meta, map ]
    ch_fasta    // channel  (optional): [ meta, fasta, index ]
    chunk_model // string: model used to chunk the reference panel

    main:

    ch_versions = Channel.empty()

    // Chunk reference panel
    GLIMPSE2_CHUNK ( ch_ref, ch_map, chunk_model )
    ch_versions = ch_versions.mix( GLIMPSE2_CHUNK.out.versions.first() )

    // Read the tab-separated chunk table and keep, for every chunk,
    // the buffered region (RegionBuf) and the chunk region (RegionCnk)
    chunk_output = GLIMPSE2_CHUNK.out.chunk_chr
        .splitCsv(
            header: ['ID', 'Chr', 'RegionBuf', 'RegionCnk', 'WindowCm',
                     'WindowMb', 'NbTotVariants', 'NbComVariants'],
            sep: "\t", skip: 0
        )
        .map { meta, it -> [meta, it["RegionBuf"], it["RegionCnk"]] }

    // Split reference panel in bin files:
    // drop the region from ch_ref and attach every chunk sharing the same meta
    split_input = ch_ref
        .map { meta, ref, index, region -> [meta, ref, index] }
        .combine(chunk_output, by: 0)

    GLIMPSE2_SPLITREFERENCE( split_input, ch_map )
    ch_versions = ch_versions.mix( GLIMPSE2_SPLITREFERENCE.out.versions.first() )

    // Pair every input with every reference bin file.
    // The panel meta map is dropped; input region, output region,
    // reference index and map are left empty because a bin file is used.
    phase_input = ch_input
        .combine( GLIMPSE2_SPLITREFERENCE.out.bin_ref )
        .map { input_meta, input_file, input_index, input_infos,
               panel_meta, panel_bin ->
            [input_meta, input_file, input_index, input_infos,
             [], [], panel_bin, [], []]
        }

    // Phase input files for each reference bin files + indexing
    // [ meta, vcf, index, sample_infos, regionin, regionout, ref, ref_index, map ], [ meta, fasta, index ]
    GLIMPSE2_PHASE ( phase_input, ch_fasta )
    ch_versions = ch_versions.mix( GLIMPSE2_PHASE.out.versions.first() )

    INDEX_PHASE ( GLIMPSE2_PHASE.out.phased_variant )
    ch_versions = ch_versions.mix( INDEX_PHASE.out.versions.first() )

    // Ligate all phased files in one and index it.
    // Both channels are grouped by meta and matched on that key. The grouped
    // index channel must not be collect()-ed: collecting would merge the
    // entries of every sample into a single list, so only the first meta
    // could be matched by combine(by: 0).
    ligate_input = GLIMPSE2_PHASE.out.phased_variant
        .groupTuple()
        .combine( INDEX_PHASE.out.csi.groupTuple(), by: 0 )

    GLIMPSE2_LIGATE ( ligate_input )
    ch_versions = ch_versions.mix( GLIMPSE2_LIGATE.out.versions.first() )

    INDEX_LIGATE ( GLIMPSE2_LIGATE.out.merged_variants )
    ch_versions = ch_versions.mix( INDEX_LIGATE.out.versions.first() )

    emit:
    chunk_chr             = GLIMPSE2_CHUNK.out.chunk_chr           // channel: [ val(meta), txt ]
    merged_variants       = GLIMPSE2_LIGATE.out.merged_variants    // channel: [ val(meta), bcf ]
    merged_variants_index = INDEX_LIGATE.out.csi                   // channel: [ val(meta), csi ]

    versions              = ch_versions                            // channel: [ versions.yml ]
}
72 changes: 72 additions & 0 deletions subworkflows/nf-core/multiple_impute_glimpse2/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Metadata of the MULTIPLE_IMPUTE_GLIMPSE2 subworkflow.
# The name matches the subworkflow directory; the module list matches
# the modules included by the subworkflow's main.nf.
name: "multiple_impute_glimpse2"
description: Impute VCF/BCF files, but also CRAM and BAM files with Glimpse2
keywords:
  - glimpse
  - chunk
  - phase
  - ligate
  - split_reference

modules:
  - glimpse2/chunk
  - glimpse2/phase
  - glimpse2/ligate
  - glimpse2/splitreference
  - bcftools/index

input:
  - ch_input:
      type: file
      description: |
        Target dataset in CRAM, BAM or VCF/BCF format.
        Index file of the input file.
        File with sample names and ploidy information.
        Structure: [ meta, file, index, txt ]
  - ch_ref:
      type: file
      description: |
        Reference panel of haplotypes in VCF/BCF format.
        Index file of the Reference panel file.
        Target region, usually a full chromosome (e.g. chr20:1000000-2000000 or chr20).
        The file could possibly be without GT field (for efficiency reasons a file containing only the positions is recommended).
        Structure: [ meta, vcf, csi, region ]
  - ch_map:
      type: file
      description: |
        File containing the genetic map.
        Structure: [ meta, gmap ]
  - ch_fasta:
      type: file
      description: |
        Reference genome in fasta format.
        Reference genome index in fai format.
        Structure: [ meta, fasta, fai ]
  # Fifth input taken by the workflow, passed on to the chunking step.
  - chunk_model:
      type: string
      description: Model used to chunk the reference panel.
output:
  - chunk_chr:
      type: file
      description: |
        Tab delimited output txt file containing buffer and imputation regions.
        Structure: [meta, txt]
  - merged_variants:
      type: file
      description: |
        Output VCF/BCF file for the merged regions.
        Phased information (HS field) is updated accordingly for the full region.
        Structure: [ val(meta), bcf ]
  - merged_variants_index:
      type: file
      description: |
        Index file of the ligated phased variants files.
        Structure: [ val(meta), csi ]

  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"

authors:
  - "@LouisLeNezet"
Loading

0 comments on commit c7ddd48

Please sign in to comment.