diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cd55e636..d5324d9a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,13 +29,20 @@ jobs: - "22.10.5" - "latest-everything" test: - - "all" + - "all_annotate" + - "all_no_annotate" - "smoove" - "delly" - "manta" - - "whamg" - - "gridss" + steps: + - name: Free some space + run: | + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + - name: Check out pipeline code uses: actions/checkout@v3 diff --git a/README.md b/README.md index 885c719b..04a814c1 100644 --- a/README.md +++ b/README.md @@ -8,22 +8,13 @@ ## Introduction - - -**CenterForMedicalGeneticsGhent/nf-cmgg-structural** is a bioinformatics best-practice analysis pipeline for calling structural variants. +**CenterForMedicalGeneticsGhent/nf-cmgg-structural** is a bioinformatics best-practice analysis pipeline for calling germline structural variants from short reads. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! - - -On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources.The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/centerformedicalgeneticsghent-nf-cmgg-structural/results). - ## Pipeline summary - - -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +![metro map](docs/images/metro_map.png) ## Quick Start @@ -46,30 +37,22 @@ On release, automated continuous integration tests run the pipeline on a full-si 4. Start running your own analysis! - - ```bash - nextflow run CenterForMedicalGeneticsGhent/nf-cmgg-structural --input samplesheet.csv --outdir --genome GRCh37 -profile + nextflow run CenterForMedicalGeneticsGhent/nf-cmgg-structural --input samplesheet.csv --outdir --genome GRCh38 -profile ``` ## Documentation -The nf-core/centerformedicalgeneticsghent-nf-cmgg-structural pipeline comes with documentation about the pipeline [usage](https://nf-co.re/centerformedicalgeneticsghent-nf-cmgg-structural/usage), [parameters](https://nf-co.re/centerformedicalgeneticsghent-nf-cmgg-structural/parameters) and [output](https://nf-co.re/centerformedicalgeneticsghent-nf-cmgg-structural/output). 
+The CenterForMedicalGenetics/nf-cmgg-structural pipeline comes with documentation about the pipeline [usage](https://github.com/CenterForMedicalGeneticsGhent/nf-cmgg-structural/tree/master/docs/usage.md) and [output](https://github.com/CenterForMedicalGeneticsGhent/nf-cmgg-structural/tree/master/docs/output.md). ## Credits CenterForMedicalGeneticsGhent/nf-cmgg-structural was originally written by Nicolas Vannieuwkerke and Mattias Van Heetvelde. -We thank the following people for their extensive assistance in the development of this pipeline: - - - ## Contributions and Support If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). -For further information or help, don't hesitate to get in touch on the [Slack `#centerformedicalgeneticsghent-nf-cmgg-structural` channel](https://nfcore.slack.com/channels/centerformedicalgeneticsghent-nf-cmgg-structural) (you can join with [this invite](https://nf-co.re/join/slack)). - ## Citations diff --git a/assets/header.txt b/assets/header.txt index a46b3ca1..bd92458c 100644 --- a/assets/header.txt +++ b/assets/header.txt @@ -154,6 +154,8 @@ ##INFO= ##INFO= ##INFO= +##INFO= +##INFO= ##FORMAT= ##FORMAT= ##FORMAT= diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index f4d3331e..c150ec6b 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,6 +1,5 @@ sample,family,cram,crai,bed PosCon1,family1,s3://test-data/genomics/homo_sapiens/illumina/cram/SVcontrol/small/PosCon1.cram,s3://test-data/genomics/homo_sapiens/illumina/cram/SVcontrol/small/PosCon1.cram.crai,s3://test-data/genomics/homo_sapiens/illumina/regions/SVcontrol/PosCon1and2.roi.bed PosCon2,family1,s3://test-data/genomics/homo_sapiens/illumina/cram/SVcontrol/small/PosCon2.cram,s3://test-data/genomics/homo_sapiens/illumina/cram/SVcontrol/small/PosCon2.cram.crai,s3://test-data/genomics/homo_sapiens/illumina/regions/SVcontrol/PosCon1and2.roi.bed -PosCon3,,s3://test-data/genomics/homo_sapiens/illumina/cram/SVcontrol/small/PosCon3.cram,s3://test-data/genomics/homo_sapiens/illumina/cram/SVcontrol/small/PosCon3.cram.crai,s3://test-data/genomics/homo_sapiens/illumina/regions/SVcontrol/PosCon3.roi.bed +PosCon3,,s3://test-data/genomics/homo_sapiens/illumina/cram/SVcontrol/small/PosCon3.cram,s3://test-data/genomics/homo_sapiens/illumina/cram/SVcontrol/small/PosCon3.cram.crai, PosCon4,,s3://test-data/genomics/homo_sapiens/illumina/cram/SVcontrol/small/PosCon4.cram,s3://test-data/genomics/homo_sapiens/illumina/cram/SVcontrol/small/PosCon4.cram.crai,s3://test-data/genomics/homo_sapiens/illumina/regions/SVcontrol/PosCon4.roi.bed -PosCon5,,s3://test-data/genomics/homo_sapiens/illumina/cram/SVcontrol/small/PosCon5.cram,s3://test-data/genomics/homo_sapiens/illumina/cram/SVcontrol/small/PosCon5.cram.crai,s3://test-data/genomics/homo_sapiens/illumina/regions/SVcontrol/PosCon5.roi.bed diff --git a/bin/simple-event-annotation.R b/bin/simple-event-annotation.R new file mode 100755 index 00000000..2c77d884 --- /dev/null +++ b/bin/simple-event-annotation.R @@ -0,0 +1,55 @@ +#!/usr/local/bin/Rscript +# Fetched from https://github.com/PapenfussLab/gridss/blob/master/example/simple-event-annotation.R +# Although it's been slightly adjusted + +# if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") +#BiocManager::install("StructuralVariantAnnotation") +#install.packages("stringr") +library(VariantAnnotation) +library(StructuralVariantAnnotation) +library(stringr) + +args <- commandArgs() +input_vcf <- args[1] 
+output_vcf <- args[2] + +#' Simple SV type classifier +simpleEventType <- function(gr) { + pgr = partner(gr) + return(ifelse(seqnames(gr) != seqnames(pgr), "CTX", # inter-chromosomosal + ifelse(strand(gr) == strand(pgr), "INV", + ifelse(gr$insLen >= abs(gr$svLen) * 0.7, "INS", # TODO: improve classification of complex events + ifelse(xor(start(gr) < start(pgr), strand(gr) == "-"), "DEL", + "DUP"))))) +} + +vcf <- readVcf(input_vcf, "hg38") +info(header(vcf)) = unique(as(rbind(as.data.frame(info(header(vcf))), data.frame( + row.names=c("SIMPLE_TYPE"), + Number=c("1"), + Type=c("String"), + Description=c("Simple event type annotation based purely on breakend position and orientation."))), "DataFrame")) +gr <- breakpointRanges(vcf) +svtype <- simpleEventType(gr) +info(vcf)$SIMPLE_TYPE <- NA_character_ +info(vcf[gr$sourceId])$SIMPLE_TYPE <- svtype +info(vcf[gr$sourceId])$SVLEN <- gr$svLen +writeVcf(vcf, output_vcf) # generated by example/gridss.sh + +# # TODO: perform event filtering here +# # By default, GRIDSS is very sensitive but this comes at the cost of a high false discovery rate +# gr <- gr[gr$FILTER == "PASS" & partner(gr)$FILTER == "PASS"] # Remove low confidence calls + +# simplegr <- gr[simpleEventType(gr) %in% c("INS", "INV", "DEL", "DUP")] +# simplebed <- data.frame( +# chrom=seqnames(simplegr), +# # call the centre of the homology/inexact interval +# start=as.integer((start(simplegr) + end(simplegr)) / 2), +# end=as.integer((start(partner(simplegr)) + end(partner(simplegr))) / 2), +# name=simpleEventType(simplegr), +# score=simplegr$QUAL, +# strand="." +# ) +# # Just the lower of the two breakends so we don't output everything twice +# simplebed <- simplebed[simplebed$start < simplebed$end,] +# write.table(simplebed, "chr12.1527326.DEL1024.simple.bed", quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE) diff --git a/bin/viola_standardize.py b/bin/viola_standardize.py new file mode 100755 index 00000000..1fe651d4 --- /dev/null +++ b/bin/viola_standardize.py @@ -0,0 +1,37 @@ +#!/usr/local/bin/python + +import argparse +import os + +import viola + +if __name__ == "__main__": + # Setting up argparser + parser = argparse.ArgumentParser(description="A script to standardize VCFs using Viola-SV") + parser.add_argument('vcf', metavar='FILE', type=str, help="The called VCF") + parser.add_argument('caller', metavar='STRING', type=str, help="The caller used to call the VCF") + parser.add_argument('out_file', metavar='FILE', type=str, help="The standardized VCF") + parser.add_argument('patient_name', metavar='STRING', type=str, help="The name of the patient in the VCF file") + + args = parser.parse_args() + + vcf = args.vcf + caller = args.caller + out_file = args.out_file + patient_name = args.patient_name + + if caller == "smoove": caller = "lumpy" + + if caller == "gridss": + svlen_not_added = True + old_vcf = f'old_{vcf}' + os.rename(vcf, old_vcf) + with open(old_vcf, 'r') as old: + with open(vcf, 'w') as new: + for line in old.readlines(): + if line.startswith("##INFO") and svlen_not_added: + svlen_not_added = False + new.write("##INFO=\n") + new.write(line.replace("CIRPOS", "CIEND")) + + viola.read_vcf(vcf, variant_caller=caller, patient_name=patient_name).breakend2breakpoint().to_vcf(out_file) diff --git a/conf/modules.config b/conf/modules.config index 184da6e2..eeb35553 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -34,10 +34,6 @@ process { ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ - withName: COLLECTREADCOUNTS 
{ - ext.args = "--format TSV --interval-merging-rule OVERLAPPING_ONLY --disable-read-filter MappingQualityReadFilter" - } - // // Delly // @@ -46,28 +42,36 @@ process { withName: '^.*:BAM_VARIANT_CALLING_DELLY:DELLY_CALL\$' { ext.args = {"--svtype ${params.delly_sv_types} --map-qual ${params.delly_map_qual} --min-clique-size ${params.delly_min_clique_size}"} ext.suffix = "vcf" - } - - withName: '^.*:BAM_VARIANT_CALLING_DELLY:BCFTOOLS_SORT\$' { ext.prefix = {"${meta.id}_delly"} publishDir = [ enabled: callers.size() == 1 || output_callers, overwrite: true, path: { callers.size() > 1 ? "${params.outdir}/${meta.id}/delly" : "${params.outdir}/${meta.id}" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename ==~ /^.*_delly.vcf.gz$/ ? filename.replace('_delly', '') : null } + saveAs: { filename -> filename ==~ /^.*_delly.vcf.gz(.tbi)?$/ ? filename.replace('_delly', '') : null } ] } - withName: '^.*:BAM_VARIANT_CALLING_DELLY:TABIX_TABIX\$' { - publishDir = [ - enabled: callers.size() == 1 || output_callers, - overwrite: true, - path: { callers.size() > 1 ? "${params.outdir}/${meta.id}/delly" : "${params.outdir}/${meta.id}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename ==~ /^.*_delly.vcf.gz.tbi$/ ? filename.replace('_delly', '') : null } - ] - } + // withName: '^.*:BAM_VARIANT_CALLING_DELLY:BCFTOOLS_SORT\$' { + // ext.prefix = {"${meta.id}_delly"} + // publishDir = [ + // enabled: callers.size() == 1 || output_callers, + // overwrite: true, + // path: { callers.size() > 1 ? "${params.outdir}/${meta.id}/delly" : "${params.outdir}/${meta.id}" }, + // mode: params.publish_dir_mode, + // saveAs: { filename -> filename ==~ /^.*_delly.vcf.gz$/ ? filename.replace('_delly', '') : null } + // ] + // } + + // withName: '^.*:BAM_VARIANT_CALLING_DELLY:TABIX_TABIX\$' { + // publishDir = [ + // enabled: callers.size() == 1 || output_callers, + // overwrite: true, + // path: { callers.size() > 1 ? "${params.outdir}/${meta.id}/delly" : "${params.outdir}/${meta.id}" }, + // mode: params.publish_dir_mode, + // saveAs: { filename -> filename ==~ /^.*_delly.vcf.gz.tbi$/ ? filename.replace('_delly', '') : null } + // ] + // } } // @@ -152,6 +156,10 @@ process { if("gridss" in callers){ withName: '^.*:BAM_VARIANT_CALLING_GRIDSS:GRIDSS_GRIDSS\$' { + ext.args = "--steps preprocess,assemble,call" + } + + withName: '^.*:BAM_VARIANT_CALLING_GRIDSS:SIMPLE_EVENT_ANNOTATION\$' { ext.prefix = { "${meta.id}_gridss" } publishDir = [ enabled: callers.size() == 1 || output_callers, @@ -160,7 +168,6 @@ process { mode: params.publish_dir_mode, saveAs: { filename -> filename ==~ /^.*_gridss.vcf.gz$/ ? filename.replace("_gridss", "") : null } ] - ext.args = "--steps preprocess,assemble,call" } withName: '^.*:BAM_VARIANT_CALLING_GRIDSS:TABIX_TABIX\$' { @@ -175,27 +182,11 @@ process { } // - // Gather metrics + // Standardize VCFs // - if(params.run_module_metrics){ - withName: COLLECTSVEVIDENCE { - ext.args = {"--sample-name ${meta.id}"} - } - - withName: SVTK_STANDARDIZE { - ext.args = {[ - args : '', - caller : "${meta.caller != "whamg" ? 
meta.caller : "wham"}" - ]} - } - - withName: SVTEST_VCF { - ext.args = {[ - args : '', - types : "DEL,DUP,INS,INV,BND" - ]} - } + withName: VIOLA { + ext.prefix = {"${meta.id}_${meta.caller}_standardized"} } // @@ -204,7 +195,7 @@ process { if(callers.size() > 1){ withName: "^.*:VCF_MERGE_JASMINE:JASMINESV\$" { - ext.args = "min_support=1 --allow_intrasample" + ext.args = "min_support=${params.callers_support} --allow_intrasample" } withName: "^.*:VCF_MERGE_JASMINE:REHEADER_CALLED_VCFS\$" { @@ -242,7 +233,7 @@ process { ext.prefix = {"${meta.id}_genotyped"} ext.args = "--ins-info-key SVINSSEQ" publishDir = [ - enabled: true, + enabled: !params.annotate, overwrite: true, path: { "${params.outdir}/ready/${meta.family}" }, mode: params.publish_dir_mode, @@ -251,9 +242,8 @@ process { } withName: "^.*:VCF_GENOTYPE_SV_PARAGRAPH:BCFTOOLS_MERGE\$" { - ext.prefix = {"${meta.id}"} publishDir = [ - enabled: true, + enabled: !params.annotate, overwrite: true, path: { "${params.outdir}/ready/${meta.family}" }, mode: params.publish_dir_mode, @@ -261,6 +251,70 @@ process { ] } + if(!params.annotate) { + withName: "^.*:VCF_GENOTYPE_SV_PARAGRAPH:TABIX_FAMILY\$" { + publishDir = [ + enabled: true, + overwrite: true, + path: { "${params.outdir}/ready/${meta.family}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename ==~ /^.*.vcf.gz.tbi$/ ? filename.replace("_genotyped", "") : null } + ] + } + } + + /* + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ANNOTATION + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + + if(params.annotate){ + withName: "^.*:ENSEMBLVEP_VEP\$" { + container = "nfcore/vep:${params.vep_version}.${params.genome}" + ext.prefix = {"${meta.id}_annotated"} + ext.args = {[ + // specify we use VCF files + '--format vcf', + // don't contact external db + '--offline', + // increase buffer_size to speed up analysis + '--buffer_size 100000', + // output format options + '--vcf --compress_output bgzip --force_overwrite', + // annotation options + '--variant_class --sift b --polyphen b --humdiv --allele_number --numbers --total_length --gene_phenotype --ccds --regulatory', + // identifiers + '--hgvs --hgvsg --shift_hgvs 1 --protein --symbol --ccds --uniprot --tsl --appris --canonical --mane --biotype --domains', + // co-located variant info + '--check_existing --clin_sig_allele 1 --af --max_af --af_1kg --af_gnomad --pubmed --var_synonyms', + // specific options for structural variants + '--overlaps', // TODO define the best configuration for --max_sv_size and --batch_size + // plugins + (params.vep_structuralvariantoverlap && params.gnomad_sv) ? "--plugin StructuralVariantOverlap,file=${params.gnomad_sv.split('/')[-1]}" : "", + (params.vep_structuralvariantoverlap && params.genomes1000_sv) ? "--plugin StructuralVariantOverlap,file=${params.genomes1000_sv.split('/')[-1]}": "", + (params.vep_phenotypes) ? "--plugin Phenotypes,file=${params.phenotypes.split('/')[-1]}": "" + ].join(' ').trim()} + publishDir = [ + enabled: true, + overwrite: true, + path: { "${params.outdir}/ready/${meta.family}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename ==~ /^.*.vcf.gz$/ ? filename.replace("_annotated", "") : null } + ] + } + + withName: "^.*:TABIX_ANNOTATED\$" { + publishDir = [ + enabled: true, + overwrite: true, + path: { "${params.outdir}/ready/${meta.family}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename ==~ /^.*.vcf.gz.tbi$/ ? 
filename.replace("_annotated", "") : null } + ] + } + } + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ REST OF THE PIPELINE diff --git a/conf/nf_test.config b/conf/nf_test.config index d693c196..62138bb8 100644 --- a/conf/nf_test.config +++ b/conf/nf_test.config @@ -16,6 +16,10 @@ params { input = "${params.baseDir}/tests/inputs/samplesheet.csv" outdir = "${params.outputDir}" + vep_cache = null + vep_version = "105.0" + vep_cache_version = "105" + // Fasta references genomes_base = "s3://reference-data/genomes" igenomes_ignore = true diff --git a/conf/test.config b/conf/test.config index dd812a21..ed356f71 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,7 +29,11 @@ params { igenomes_ignore = true genomes_ignore = false + vep_cache = null + + annotate = true + // Pipeline parameters - callers = "delly,manta,smoove,gridss" //manta,whamg,delly,smoove,gridss + callers = "delly,manta,smoove" //manta,delly,smoove output_callers = true } diff --git a/conf/test_big.config b/conf/test_big.config index b133daac..1a205ee2 100644 --- a/conf/test_big.config +++ b/conf/test_big.config @@ -22,7 +22,9 @@ params { igenomes_ignore = true genomes_ignore = false + annotate = true + // Pipeline parameters - callers = "delly,manta,smoove,gridss" //manta,whamg,delly,smoove,gridss + callers = "delly,manta,smoove" //manta,delly,smoove output_callers = true } diff --git a/docs/images/metro_map.png b/docs/images/metro_map.png new file mode 100644 index 00000000..73a6a76b Binary files /dev/null and b/docs/images/metro_map.png differ diff --git a/docs/images/metro_map.svg b/docs/images/metro_map.svg new file mode 100644 index 00000000..bf75b002 --- /dev/null +++ b/docs/images/metro_map.svg @@ -0,0 +1,1146 @@ + + + +cramcraibedOptionalOptionalMandatorybedtools sortsort, bgzip and indexVariant callingbcftools sortdelly callmantagermlinemantaconvertinversionsmoove callviolastandardize VCFsjasminemerge VCFs per samplebcftools reheaderparagraph idxdepthcreate manifest for paragraphparagraph multigrmpygenotype the variantsbcftools mergeindividuals -> familyensembl VEPannotate the variantsvcfnf-cmgg-structuralBED flowMain flowProcessLegends diff --git a/main.nf b/main.nf index 7904539d..8442bb6e 100644 --- a/main.nf +++ b/main.nf @@ -15,10 +15,16 @@ nextflow.enable.dsl = 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') -params.fasta_fai = WorkflowMain.getGenomeAttribute(params, 'fai') -params.dict = WorkflowMain.getGenomeAttribute(params, 'dict') -params.bwa = WorkflowMain.getGenomeAttribute(params, 'bwa') +params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') +params.fai = WorkflowMain.getGenomeAttribute(params, 'fai') +params.vep_cache = WorkflowMain.getGenomeAttribute(params, 'vep_cache') +params.bwa = WorkflowMain.getGenomeAttribute(params, 'bwa') +params.gnomad_sv = WorkflowMain.getGenomeAttribute(params, 'gnomad_sv') +params.gnomad_sv_tbi = WorkflowMain.getGenomeAttribute(params, 'gnomad_sv_tbi') +params.genomes1000_sv = WorkflowMain.getGenomeAttribute(params, 'genomes1000_sv') +params.genomes1000_sv_tbi = WorkflowMain.getGenomeAttribute(params, 'genomes1000_sv_tbi') +params.phenotypes = WorkflowMain.getGenomeAttribute(params, 'phenotypes') +params.phenotypes_tbi = WorkflowMain.getGenomeAttribute(params, 'phenotypes_tbi') /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/modules.json 
b/modules.json index 6bb4008c..0f39c3de 100644 --- a/modules.json +++ b/modules.json @@ -17,7 +17,7 @@ }, "bcftools/reheader": { "branch": "master", - "git_sha": "94688be064ebc0fc4d720ba7acb0f1f8eb8d0af0", + "git_sha": "bd4b60c7f9358c7146ac198fd0c4ae6355ddd086", "installed_by": ["modules"] }, "bcftools/sort": { @@ -42,48 +42,28 @@ }, "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "b6d4d476aee074311c89d82a69c1921bd70c8180", "installed_by": ["modules"] }, "delly/call": { "branch": "master", - "git_sha": "40defa2f0eb37163d310530f84bfd92e52adc53d", + "git_sha": "0263ccf45ba68887acf848bae39e13c3f866e57e", "installed_by": ["modules"], "patch": "modules/nf-core/delly/call/delly-call.diff" }, - "gatk4/collectreadcounts": { - "branch": "master", - "git_sha": "1fccb96ee4fc3efcdb7ed257f016a59cecb83dec", - "installed_by": ["modules"] - }, - "gatk4/collectsvevidence": { + "ensemblvep/vep": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] - }, - "gatk4/createsequencedictionary": { - "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] - }, - "gatk4/printsvevidence": { - "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] - }, - "gatk4/sitedepthtobaf": { - "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "bea3ca998816a7f812b1bbbcb27c3a9ffbac0706", "installed_by": ["modules"] }, "gridss/gridss": { "branch": "master", - "git_sha": "f159f90b4bfa06b64bf86650fda0eb441dce4b77", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["modules"] }, "jasminesv": { "branch": "master", - "git_sha": "bc65cc279dc5f7716091db364d992eda7beab6c4", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["modules"] }, "manta/convertinversion": { @@ -103,7 +83,7 @@ }, "paragraph/idxdepth": { "branch": "master", - "git_sha": "a6d4e569dc522658b9b47a2c1d3e013b0ae4ba77", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["modules"] }, "paragraph/multigrmpy": { @@ -113,7 +93,7 @@ }, "samtools/convert": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["modules"] }, "samtools/faidx": { @@ -123,25 +103,19 @@ }, "scramble/clusteranalysis": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["modules"] }, "scramble/clusteridentifier": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["modules"] }, "smoove/call": { "branch": "master", - "git_sha": "0a1ee3e389dc9be449a9a2f9fdc5aa753622d4d7", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["modules"] }, - "svtk/standardize": { - "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"], - "patch": "modules/nf-core/svtk/standardize/svtk-standardize.diff" - }, "tabix/bgzip": { "branch": "master", "git_sha": "90294980a903ecebd99ac31d8b6c66af48fa8259", @@ -159,7 +133,7 @@ }, "whamg": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["modules"] } } diff --git 
a/modules/local/gridss/simple_event_annotation/main.nf b/modules/local/gridss/simple_event_annotation/main.nf new file mode 100644 index 00000000..d17df08a --- /dev/null +++ b/modules/local/gridss/simple_event_annotation/main.nf @@ -0,0 +1,46 @@ +process SIMPLE_EVENT_ANNOTATION { + tag "$meta.id" + label 'process_low' + + conda "bioconda::bioconductor-structuralvariantannotation=1.13.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-structuralvariantannotation:1.13.0--r42hdfd78af_0' : + 'quay.io/biocontainers/bioconductor-structuralvariantannotation:1.13.0--r42hdfd78af_0' }" + + input: + tuple val(meta), path(vcf) + + output: + tuple val(meta), path("*.vcf.gz") , emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + simple-event-annotation.R \\ + ${vcf} \\ + ${prefix}.vcf.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n 1 | sed 's/R version //;s/ .*//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.vcf.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n 1 | sed 's/R version //;s/ .*//') + END_VERSIONS + """ +} diff --git a/modules/local/reversebed/main.nf b/modules/local/reversebed/main.nf index d3e48754..b9ad330a 100644 --- a/modules/local/reversebed/main.nf +++ b/modules/local/reversebed/main.nf @@ -37,4 +37,16 @@ process REVERSE_BED { bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") END_VERSIONS """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}_reversed.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ } diff --git a/modules/local/svtest/dockerfile b/modules/local/svtest/dockerfile deleted file mode 100644 index 8c8e96f3..00000000 --- a/modules/local/svtest/dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -FROM ubuntu:20.04 -ENV TZ=Europe/Brussels -ENV PYTHONPATH=/svtest - -RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone -RUN apt update -RUN apt install -y python3-pip libbz2-dev liblzma-dev - -ADD ./svtest /svtest - -RUN pip install ./svtest diff --git a/modules/local/svtest/pe-file/main.nf b/modules/local/svtest/pe-file/main.nf deleted file mode 100644 index 20ccea8b..00000000 --- a/modules/local/svtest/pe-file/main.nf +++ /dev/null @@ -1,50 +0,0 @@ -process SVTEST_PEFILE { - tag "$meta.id" - label 'process_low' - - container "nicolasvnk/svtest:0.1" - - input: - tuple val(meta), path(pe_file) - - output: - tuple val(meta), path("*.tsv") , emit: metrics - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.prefix ?: "${meta.id}" - - def VERSION = "0.1" - - """ - echo "${meta.id}" > samples.txt - - svtest pe-file \\ - ${pe_file} \\ - samples.txt \\ - > ${prefix}.pe-file.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - svtest: ${VERSION} - python: \$(python3 --version | sed -e "s/Python //g") - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - def VERSION = "0.1" - - """ - touch ${prefix}.pe-file.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - svtest: ${VERSION} - python: \$(python3 
--version | sed -e "s/Python //g") - END_VERSIONS - """ -} diff --git a/modules/local/svtest/raw-counts/main.nf b/modules/local/svtest/raw-counts/main.nf deleted file mode 100644 index 65a0427a..00000000 --- a/modules/local/svtest/raw-counts/main.nf +++ /dev/null @@ -1,50 +0,0 @@ -process SVTEST_RAWCOUNTS { - tag "$meta.id" - label 'process_low' - - container "nicolasvnk/svtest:0.1" - - input: - tuple val(meta), path(raw_counts) - - output: - tuple val(meta), path("*.tsv") , emit: metrics - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.prefix ?: "${meta.id}" - - def VERSION = "0.1" - - """ - echo "${meta.id}" > samples.txt - - svtest raw-counts \\ - ${raw_counts} \\ - samples.txt \\ - > ${prefix}.raw-counts.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - svtest: ${VERSION} - python: \$(python3 --version | sed -e "s/Python //g") - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - def VERSION = "0.1" - - """ - touch ${prefix}.raw-counts.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - svtest: ${VERSION} - python: \$(python3 --version | sed -e "s/Python //g") - END_VERSIONS - """ -} diff --git a/modules/local/svtest/sr-file/main.nf b/modules/local/svtest/sr-file/main.nf deleted file mode 100644 index 0f693eec..00000000 --- a/modules/local/svtest/sr-file/main.nf +++ /dev/null @@ -1,50 +0,0 @@ -process SVTEST_SRFILE { - tag "$meta.id" - label 'process_low' - - container "nicolasvnk/svtest:0.1" - - input: - tuple val(meta), path(sr_file) - - output: - tuple val(meta), path("*.tsv") , emit: metrics - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.prefix ?: "${meta.id}" - - def VERSION = "0.1" - - """ - echo "${meta.id}" > samples.txt - - svtest sr-file \\ - ${sr_file} \\ - samples.txt \\ - > ${prefix}.sr-file.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - svtest: ${VERSION} - python: \$(python3 --version | sed -e "s/Python //g") - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - def VERSION = "0.1" - - """ - touch ${prefix}.sr-file.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - svtest: ${VERSION} - python: \$(python3 --version | sed -e "s/Python //g") - END_VERSIONS - """ -} diff --git a/modules/local/svtest/svtest/__init__.py b/modules/local/svtest/svtest/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/modules/local/svtest/svtest/scripts/svtest b/modules/local/svtest/svtest/scripts/svtest deleted file mode 100644 index 5da7fe25..00000000 --- a/modules/local/svtest/svtest/scripts/svtest +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 - -""" -SVTest: A toolkit for testing SV pipeline output files and generating metrics. 
- -usage: svtest [-h] [options] - -subcommands: - baf-file BAF file - bincov-matrix Bincov matrix - gt-cutoffs Genotyping cutoffs file - medcov Median coverage file - merged-depth Merged depth bed file - metrics-file Evidence metrics file - pe-file Discordant read pair file - ped-file Family file - plot-metrics Generates comparison plot and table of two metrics files - raw-counts Count file - rf-cutoffs Random forest cutoffs file - sample-list Sample id list - sr-file Split read file - vcf VCF - -""" - -import argparse -import sys -import svtest.cli as cli - - -def main(): - parser = argparse.ArgumentParser( - description=__doc__, - usage=argparse.SUPPRESS, - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument('subcommand', help=argparse.SUPPRESS) - - if len(sys.argv) < 2: - parser.print_help() - sys.exit(1) - - args = parser.parse_args(sys.argv[1:2]) - command = args.subcommand.replace('-', '_') - - if not hasattr(cli, command): - print('Unrecognized command: {0}'.format(args.subcommand)) - parser.print_help() - sys.exit(1) - - getattr(cli, command)(sys.argv[2:]) - -if __name__ == '__main__': - main() diff --git a/modules/local/svtest/svtest/setup.py b/modules/local/svtest/svtest/setup.py deleted file mode 100644 index c940bfa6..00000000 --- a/modules/local/svtest/svtest/setup.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -from setuptools import setup - -setup( - name="svtest", - version="0.1", - description="Test package for the GATK SV pipeline", - classifiers=[ - "Development Status :: 3 - Alpha", - "Programming Language :: Python :: 2.7", - ], - url="https://github.com/talkowski-lab/gatk-sv-v1", - author="Mark Walker", - author_email="markw@broadinsitute.org", - packages=["svtest"], - include_package_data=True, - zip_safe=False, - scripts=["scripts/svtest"], - install_requires=["numpy", "matplotlib", "pandas", "intervaltree", "pysam"], -) diff --git a/modules/local/svtest/svtest/svtest/__init__.py b/modules/local/svtest/svtest/svtest/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/modules/local/svtest/svtest/svtest/cli/__init__.py b/modules/local/svtest/svtest/svtest/cli/__init__.py deleted file mode 100644 index 1931f464..00000000 --- a/modules/local/svtest/svtest/svtest/cli/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -from .bincov_matrix import main as bincov_matrix -from .gt_cutoffs import main as gt_cutoffs -from .vcf import main as vcf -from .medcov import main as medcov -from .merged_depth import main as merged_depth -from .metrics_file import main as metrics_file -from .baf_file import main as baf_file -from .ped_file import main as ped_file -from .pe_file import main as pe_file -from .plot_metrics import main as plot_metrics -from .rf_cutoffs import main as rf_cutoffs -from .sample_list import main as sample_list -from .sr_file import main as sr_file -from .raw_counts import main as raw_counts diff --git a/modules/local/svtest/svtest/svtest/cli/baf_file.py b/modules/local/svtest/svtest/svtest/cli/baf_file.py deleted file mode 100644 index 9bcaa603..00000000 --- a/modules/local/svtest/svtest/svtest/cli/baf_file.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env python - -""" -Collect BAF file metrics. Writes stats to stdout. - -Metrics: - baf_qN_ : Nth percentile BAF - baf_count_ : total BAF record count - -If the provided sample list has more than one id, will be "_merged". 
- -""" - -import gzip -import argparse -import sys -import numpy as np -import svtest.utils.TestUtils as tu -import svtest.utils.IOUtils as iou - -Q25_KEY = "baf_q25" -Q50_KEY = "baf_q50" -Q75_KEY = "baf_q75" -COUNT_KEY = "baf_count" - -EXPECTED_COLUMNS = 4 - - -def main(argv): - parser = argparse.ArgumentParser( - description=__doc__, prog="svtest raw-baf", formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument("baf_file", type=str) - parser.add_argument("sample_list", type=str) - - # Print help if no arguments specified - if len(argv) == 0: - parser.print_help() - sys.exit(1) - args = parser.parse_args(argv) - - # Read file - with gzip.open(args.baf_file, mode="rb") as fbaf: - metrics = get_metrics(fbaf, args.sample_list) - - # Write metrics - write_metrics(metrics) - - -def write_metrics(metrics): - for key in metrics: - sys.stdout.write("%s\t%s\n" % (key, str(metrics[key]))) - - -def get_metrics(baf_file, sample_list): - samples = iou.read_samples_list(sample_list) - samples_set = set(samples) - - data = [] - for line in baf_file: - tokens = line.decode().strip().split("\t") - test_record(tokens, samples_set) - baf = float(tokens[2]) - data.append(baf) - arr = np.asarray(data) - quantiles = np.quantile(arr, [0.25, 0.50, 0.75]) - - if len(samples) == 1: - metric_suffix = "_" + samples[0] - else: - metric_suffix = "_merged" - - return { - Q25_KEY + metric_suffix: quantiles[0], - Q50_KEY + metric_suffix: quantiles[1], - Q75_KEY + metric_suffix: quantiles[2], - COUNT_KEY + metric_suffix: len(arr), - } - - -def test_record(columns, samples): - tu.test_iterable_size(columns, EXPECTED_COLUMNS) - tu.test_is_float(columns, 2) - tu.test_column_in_iterable(columns, 3, samples) - - -if __name__ == "__main__": - main() diff --git a/modules/local/svtest/svtest/svtest/cli/bincov_matrix.py b/modules/local/svtest/svtest/svtest/cli/bincov_matrix.py deleted file mode 100644 index cb30c3a5..00000000 --- a/modules/local/svtest/svtest/svtest/cli/bincov_matrix.py +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/env python - -""" -Collect bincov matrix metrics. Writes stats to stdout. 
- -Metrics: - bincov_matrix_intervals : number of intervals (rows) - bincov_matrix_intervals_all_samples_zero : number of intervals that were 0 across all samples - bincov_matrix_intervals_at_least_one_zero : number of intervals with 0 in at least one sample - bincov_matrix_q25 : 25th percentile count - bincov_matrix_q50: 50th percentile count - bincov_matrix_q75: 75th percentile count - bincov_matrix_mean_: mean count per sample - -""" - -import argparse -import gzip -import sys -import numpy as np -import svtest.utils.TestUtils as tu -import svtest.utils.IOUtils as iou - -INTERVALS_KEY = "bincov_matrix_intervals" -ALL_ZERO_KEY = "bincov_matrix_intervals_all_samples_zero" -ONE_ZERO_KEY = "bincov_matrix_intervals_at_least_one_zero" -Q25_KEY = "bincov_matrix_q25" -Q50_KEY = "bincov_matrix_q50" -Q75_KEY = "bincov_matrix_q75" -SAMPLE_MEAN_KEY = "bincov_matrix_mean" - - -def main(argv): - parser = argparse.ArgumentParser( - description=__doc__, prog="svtest bincov-matrix", formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument("bincov_matrix", type=str) - parser.add_argument("sample_list", type=str) - parser.add_argument("--low-mem-mode", action="store_true", help="Only validate and calculate number of intervals") - - # Print help if no arguments specified - if len(argv) == 0: - parser.print_help() - sys.exit(1) - args = parser.parse_args(argv) - - # Read file - with gzip.open(args.bincov_matrix, mode="rb") as f: - metrics = get_metrics(f, args.sample_list, args.low_mem_mode) - - # Write metrics - write_metrics(metrics) - - -def write_metrics(metrics): - for key in metrics: - sys.stdout.write("%s\t%s\n" % (key, str(metrics[key]))) - - -def get_metrics(matrix_file, sample_list, low_mem_mode): - samples = iou.read_samples_list(sample_list) - samples_set = set(samples) - - header = matrix_file.readline().decode().strip().split("\t") - header_samples_set = set(header[3:]) - tu.test_sets_equal(header_samples_set, samples_set, item_str="sample", name_a="header", name_b="samples list") - - data = [] - interval_size = None - num_records = 0 - for line in matrix_file: - num_records += 1 - tokens = line.decode().strip().split("\t") - tu.test_is_int(tokens, 1) - tu.test_is_int(tokens, 2) - if interval_size is None: - interval_size = int(tokens[2]) - int(tokens[1]) - else: - if interval_size != int(tokens[2]) - int(tokens[1]): - raise ValueError( - "Interval not of size {:d}: {:s}:{:d}-{:d}".format( - interval_size, tokens[0], int(tokens[1]), int(tokens[2]) - ) - ) - counts = tokens[3:] - test_record(counts, len(samples_set)) - if not low_mem_mode: - data.append([int(x) for x in counts]) - - if not low_mem_mode: - arr = np.asarray(data) - quantiles = np.quantile(arr, [0.25, 0.50, 0.75]) - max_over_samples = arr.max(axis=1) - num_zero_in_all = len(max_over_samples[max_over_samples == 0]) - min_over_samples = arr.min(axis=1) - num_zero_in_one = len(min_over_samples[min_over_samples == 0]) - metrics = { - Q25_KEY: quantiles[0], - Q50_KEY: quantiles[1], - Q75_KEY: quantiles[2], - INTERVALS_KEY: num_records, - ALL_ZERO_KEY: num_zero_in_all, - ONE_ZERO_KEY: num_zero_in_one, - } - column_means = arr.mean(axis=0) - col = 0 - for sample in header[3:]: - metrics[SAMPLE_MEAN_KEY + "_" + sample] = column_means[col] - col += 1 - else: - metrics = {INTERVALS_KEY: num_records} - - return metrics - - -def test_record(columns, n_header_cols): - tu.test_iterable_size(columns, n_header_cols) - for i in range(1, len(columns)): - tu.test_is_int(columns, i) - - -if __name__ == "__main__": - main() diff 
--git a/modules/local/svtest/svtest/svtest/cli/gt_cutoffs.py b/modules/local/svtest/svtest/svtest/cli/gt_cutoffs.py deleted file mode 100644 index b77f0d8e..00000000 --- a/modules/local/svtest/svtest/svtest/cli/gt_cutoffs.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python - -""" -Collect genotyping cutoff metrics. Writes stats to stdout. - -Metric format: gt_cutoffs__ - -""" - -import argparse -import sys -import pandas as pd - -KEY_PREFIX = "gt_cutoffs_" - -MEAN_COL = "mean" -SD_COL = "sd" -CUTOFFS_COL = "cutoffs" - - -def main(argv): - parser = argparse.ArgumentParser( - description=__doc__, prog="svtest gt-cutoffs", formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument("cutoffs", type=str) - parser.add_argument( - "type", type=str, help="Genotyping cutoffs type (e.g. depth_depth, depth_pesr, pesr_pesr, pesr_depth" - ) - - # Print help if no arguments specified - if len(argv) == 0: - parser.print_help() - sys.exit(1) - args = parser.parse_args(argv) - - # Get metrics - df = pd.read_csv(args.cutoffs, sep="\t") - metrics = get_metrics(df, args.type) - - # Write metrics - write_metrics(metrics) - - -def write_metrics(metrics): - for key in metrics: - sys.stdout.write("%s\t%s\n" % (key, str(metrics[key]))) - - -def get_metrics(df, type): - metrics = {} - for i in range(df.shape[0]): - row = df.iloc[i] - name = KEY_PREFIX + type + "_" + str(int(row.copy_state)) - metrics[name + "_mean"] = row[MEAN_COL] - metrics[name + "_sd"] = row[SD_COL] - metrics[name + "_cutoff"] = row[CUTOFFS_COL] - return metrics - - -if __name__ == "__main__": - main() diff --git a/modules/local/svtest/svtest/svtest/cli/medcov.py b/modules/local/svtest/svtest/svtest/cli/medcov.py deleted file mode 100644 index ddfae37c..00000000 --- a/modules/local/svtest/svtest/svtest/cli/medcov.py +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python - -""" -Collect medcov metrics. Writes stats to stdout. 
- -Metrics: - medcov_mean : mean median coverage - medcov_mean_err : mean absolute deviation from baseline profile (if --baseline specified) - medcov_max_err : max absolute deviation from baseline profile (if --baseline specified) - -""" - -import argparse -import sys -import svtest.utils.TestUtils as tu -import svtest.utils.IOUtils as iou - -MEAN_KEY = "medcov_mean" -MEAN_ERROR_KEY = "medcov_mean_abs_err" -MAX_ERROR_KEY = "medcov_max_abs_err" - - -def main(argv): - parser = argparse.ArgumentParser( - description=__doc__, prog="svtest medcov", formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument("test_file", type=str) - parser.add_argument("sample_list", type=str) - parser.add_argument("--baseline-file", type=str, default=None) - - # Print help if no arguments specified - if len(argv) == 0: - parser.print_help() - sys.exit(1) - args = parser.parse_args(argv) - - samples = iou.read_samples_list(args.sample_list) - - # Read file - with open(args.test_file, mode="r") as ftest: - if args.baseline_file is None: - metrics = get_metrics(ftest, None, samples) - else: - with open(args.baseline_file, mode="r") as fbase: - metrics = get_metrics(ftest, fbase, samples) - - # Write metrics - write_metrics(metrics) - - -def write_metrics(metrics): - for key in metrics: - sys.stdout.write("%s\t%s\n" % (key, str(metrics[key]))) - - -def get_metrics(test_file, baseline_file, samples): - test_header, test_data = get_medcov_file_data(test_file) - tu.test_iterable_sizes_equal(test_header, test_data, name_a="test file header", name_b="test file data row") - tu.test_sets_equal(test_header, samples, item_str="sample id", name_a="test file header", name_b="sample list") - metrics = {MEAN_KEY: float(sum(test_data)) / len(test_header)} - if baseline_file is not None: - metrics = get_baseline_metrics(metrics, baseline_file, test_data, samples) - return metrics - - -def get_baseline_metrics(metrics, baseline_file, test_data, samples): - baseline_header, baseline_data = get_medcov_file_data(baseline_file) - tu.test_iterable_sizes_equal( - baseline_header, baseline_data, name_a="baseline file header", name_b="baseline file data row" - ) - tu.test_sets_equal(baseline_header, samples, item_str="sample id", name_a="test file header", name_b="sample list") - n = len(baseline_header) - error_list = [abs(test_data[i] - baseline_data[i]) for i in range(n)] - metrics[MEAN_ERROR_KEY] = float(sum(error_list)) / n - metrics[MAX_ERROR_KEY] = max(error_list) - return metrics - - -def get_medcov_file_data(file): - header = file.readline().strip().split("\t") - data = [int(x) for x in file.readline().strip().split("\t")] - return header, data - - -if __name__ == "__main__": - main() diff --git a/modules/local/svtest/svtest/svtest/cli/merged_depth.py b/modules/local/svtest/svtest/svtest/cli/merged_depth.py deleted file mode 100644 index b3d3efb8..00000000 --- a/modules/local/svtest/svtest/svtest/cli/merged_depth.py +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env python - -""" -Collect merged depth file metrics. Writes SVTYPE counts to stdout. 
- -Note --test-hits and --baseline-hits files can be generated with bedtools intersect: - bedtools intersect -wa -u -f 0.5 -r -a ${test_bed} -b ${baseline_bed} | cut -f4 > overlap.test.list - bedtools intersect -wa -u -f 0.5 -r -b ${test_bed} -a ${baseline_bed} | cut -f4 > overlap.base.list - -Metrics: - merged_depth__count : total variant count - merged_depth__tp : count of variants in test set that had at least one matching variant in the baseline set - (if baseline-bed specified) - merged_depth__fp : count of variants in test set that had no matching variants in the baseline set - (if baseline-bed specified) - merged_depth__tp : count of variants in baseline set that had no matching variants in the test set - (if baseline-bed specified) - merged_depth__X_Y : variants with size >= X and < Y - merged_depth__gte_X : variants with size >= X - -""" - -import gzip -import argparse -import sys -import svtest.utils.TestUtils as tu -import svtest.utils.IOUtils as iou - -# For size distribution -SIZES = [1000, 2000, 3000, 4000, 5000, 10000, 100000] - -KEY_PREFIX = "merged_depth_" - - -def main(argv): - parser = argparse.ArgumentParser( - description=__doc__, prog="svtest merged-depth", formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument("test_bed", type=str) - parser.add_argument("contig_list", type=str) - parser.add_argument("type", type=str) - parser.add_argument("--baseline-bed", type=str, default=None, help="Baseline bed file to evaluate against") - parser.add_argument( - "--test-hits", - type=str, - help="List of test record ids that overlap baseline set (required if using --baseline-bed)", - ) - parser.add_argument( - "--baseline-hits", - type=str, - help="List of baseline record ids that overlap test set (required if using --baseline-bed)", - ) - - # Print help if no arguments specified - if len(argv) == 0: - parser.print_help() - sys.exit(1) - args = parser.parse_args(argv) - - if ( - (bool(args.baseline_bed) ^ bool(args.test_hits)) - or (bool(args.baseline_bed) ^ bool(args.baseline_hits)) - or (bool(args.test_hits) ^ bool(args.baseline_hits)) - ): - raise ValueError( - "Inconsistent arguments specified: --baseline-bed, --test-hits, and --baseline-hits must be specified together." 
- ) - - contigs = iou.read_contig_list(args.contig_list) - - # Read file - with gzip.open(args.test_bed, mode="rb") as ftest: - if args.baseline_bed is None: - metrics = get_metrics(ftest, None, contigs, args.type, args.test_hits, args.baseline_hits) - else: - with gzip.open(args.baseline_bed, mode="rb") as fbase: - metrics = get_metrics(ftest, fbase, contigs, args.type, args.test_hits, args.baseline_hits) - - # Write metrics - write_metrics(metrics) - - -def write_metrics(metrics): - for key in metrics: - sys.stdout.write("%s\t%s\n" % (key, str(metrics[key]))) - - -def get_metrics(ftest, fbase, contigs, type, test_hits_path, base_hits_path): - test_header, test_ids, size_counts, num_test_records = parse_bed(ftest, type, contigs) - metrics = {KEY_PREFIX + type + "_count": num_test_records} - metrics = get_size_metrics(metrics, size_counts, type) - if fbase is not None: - metrics = get_baseline_metrics( - metrics, fbase, test_hits_path, base_hits_path, test_header, test_ids, num_test_records, type, contigs - ) - return metrics - - -def get_size_metrics(metrics, size_counts, type): - for i in range(len(SIZES)): - if i == 0: - lb = 0 - else: - lb = SIZES[i - 1] - key = KEY_PREFIX + type + "_" + str(lb) + "_" + str(SIZES[i]) - metrics[key] = size_counts[i] - key = KEY_PREFIX + type + "_gte_" + str(SIZES[-1]) - metrics[key] = size_counts[-1] - return metrics - - -def get_baseline_metrics( - metrics, fbase, test_hits_path, base_hits_path, test_header, test_ids, num_test_records, type, contigs -): - base_header, base_ids, _, num_baseline_records = parse_bed(fbase, type, contigs) - tu.test_sets_equal( - test_header, base_header, item_str="header column", name_a="test file header", name_b="baseline file header" - ) - if len(base_header) != len(test_header): - raise ValueError("Files have different column header sizes") - - with open(test_hits_path, mode="r") as f: - test_hits = f.read().splitlines() - with open(base_hits_path, mode="r") as f: - base_hits = f.read().splitlines() - check_hit_ids(test_hits, test_ids, "test") - check_hit_ids(base_hits, base_ids, "baseline") - - tp_test = len(test_hits) - fp_test = num_test_records - tp_test - fp_base = num_baseline_records - len(base_hits) - metrics[KEY_PREFIX + type + "_tp"] = tp_test - metrics[KEY_PREFIX + type + "_fp"] = fp_test - metrics[KEY_PREFIX + type + "_fn"] = fp_base - return metrics - - -def parse_bed(file, type, contigs): - header = file.readline().decode().strip().split("\t") - n_cols = len(header) - num_records = 0 - num_sizes = len(SIZES) - size_counts = [0] * (num_sizes + 1) - contigs_set = set(contigs) - ids = [] - for line in file: - num_records += 1 - tokens = line.decode().strip().split("\t") - check_record(tokens, n_cols, type, contigs_set) - s = get_size_distribution_index(tokens, num_sizes) - size_counts[s] += 1 - ids.append(tokens[3]) - return header, ids, size_counts, num_records - - -def check_hit_ids(hits, ids, name): - unknown_ids = set(hits) - set(ids) - if len(unknown_ids) > 0: - raise ValueError("Unknown %s record ids: %s" % (name, str(ids))) - - -def get_size_distribution_index(tokens, num_sizes): - start = int(tokens[1]) - end = int(tokens[2]) - interval_size = end - start - for i in range(num_sizes): - if interval_size < SIZES[i]: - return i - return len(SIZES) - - -def check_record(columns, n_cols, type, contigs): - tu.test_iterable_size(columns, n_cols) - tu.test_column_in_iterable(columns, 0, contigs) - tu.test_is_int(columns, 1) - tu.test_is_int(columns, 2) - tu.test_column_equals(columns, 5, type) - - -def 
sum_counts_over_contigs(tree_counts): - return sum([tree_counts[contig] for contig in tree_counts]) - - -if __name__ == "__main__": - main() diff --git a/modules/local/svtest/svtest/svtest/cli/metrics_file.py b/modules/local/svtest/svtest/svtest/cli/metrics_file.py deleted file mode 100644 index 5cee1a3f..00000000 --- a/modules/local/svtest/svtest/svtest/cli/metrics_file.py +++ /dev/null @@ -1,203 +0,0 @@ -#!/usr/bin/env python - -""" -Collect metric file metrics. Writes stats to stdout. - -Metrics: - metrics__num_records : Number of records - metrics__mean_ : Mean metric value - metrics__num_empty_ : Number of metric N/A entries - -""" - -import argparse -import math -import sys - -from matplotlib.backends.backend_pdf import PdfPages -from matplotlib import pyplot as plt -import pandas as pd -import svtest.utils.TestUtils as tu -import svtest.utils.IOUtils as iou - -KEY_PREFIX = "metrics_" - -EXPECTED_COLUMNS = [ - "name", - "chrom", - "svtype", - "svsize", - "vf", - "poor_region_cov", - "rmsk", - "is_outlier_specific", - "PE_log_pval", - "PE_called_median", - "PE_bg_median", - "PE_bg_frac", - "SR_posA_log_pval", - "SR_posB_log_pval", - "SR_sum_log_pval", - "SR_posA_called_median", - "SR_posB_called_median", - "SR_sum_called_median", - "SR_posA_bg_median", - "SR_posB_bg_median", - "SR_sum_bg_median", - "SR_posA_bg_frac", - "SR_posB_bg_frac", - "SR_sum_bg_frac", - "SR_posA_pos", - "SR_posB_pos", - "PESR_log_pval", - "PESR_called_median", - "PESR_bg_median", - "PESR_bg_frac", - "RD_Median_Power", - "RD_P", - "RD_2ndMaxP", - "RD_Model", - "RD_Median_Rank", - "RD_Median_Separation", - "RD_log_pval", - "RD_log_2ndMaxP", - "BAF_delstat", - "BAF_snp_ratio", - "BAF_del_loglik", - "BAF_dupstat", - "BAF_KS_stat", - "BAF_KS_log_pval", - "BAF_total_case_snps", - "BAF_total_snps", - "BAF_n_nonROH_cases", - "BAF_n_samples", - "BAF_mean_control_snps", - "BAF_n_nonROH_controls", - "BAF_n_controls", -] - -EXPECTED_TYPES = ["DEL", "DUP", "INS", "INV", "BND"] - -# Plotting config -WIDTH = 10.24 -HEIGHT = 16 -MAX_ROWS_PER_PLOT = 7 -BINS = 30 - - -def main(argv): - parser = argparse.ArgumentParser( - description=__doc__, prog="svtest metrics-file", formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument("metrics_file", type=str) - parser.add_argument("contig_list", type=str) - parser.add_argument("--common", action="store_true") - parser.add_argument("--plot", type=str, help="If provided, plots histogram to specified pdf path") - - # Print help if no arguments specified - if len(argv) == 0: - parser.print_help() - sys.exit(1) - args = parser.parse_args(argv) - - contigs = iou.read_contig_list(args.contig_list) - tu.test_is_not_empty(contigs, "contigs") - - # Columns used as features for adjudication in module 03 - feature_cols = [ - "poor_region_cov", - "is_outlier_specific", - "BAF_snp_ratio", - "BAF_del_loglik", - "BAF_KS_stat", - "BAF_KS_log_pval", - "SR_sum_log_pval", - "SR_sum_bg_frac", - "RD_Median_Separation", - "RD_log_pval", - "RD_log_2ndMaxP", - "PE_log_pval", - "PE_bg_frac", - ] - - # Read file - df = pd.read_csv(args.metrics_file, sep="\t") - metrics = get_metrics(df, contigs, feature_cols, args.common) - - # Generate histograms - if args.plot is not None: - plot_nonempty_data(df, feature_cols, args.plot) - - # Write metrics - write_metrics(metrics) - - -def write_metrics(metrics): - for key in metrics: - sys.stdout.write("%s\t%s\n" % (key, str(metrics[key]))) - - -def get_metrics(df, contigs, feature_cols, common): - tu.test_sets_equal(df["chrom"], contigs, 
item_str="contig", name_a="metric file contigs", name_b="contigs list") - tu.test_sets_equal( - df.columns, EXPECTED_COLUMNS, item_str="column", name_a="metric file header", name_b="expected columns" - ) - metric_means = get_column_means(df, feature_cols) - metric_empty_counts = get_columns_num_empty(df, feature_cols) - - if common: - prefix = "{}common_".format(KEY_PREFIX) - else: - prefix = KEY_PREFIX - metrics = {prefix + "num_records": df.size} - for key in metric_means: - metrics[prefix + "mean_" + key] = metric_means[key] - for key in metric_empty_counts: - metrics[prefix + "num_empty_" + key] = metric_empty_counts[key] - return metrics - - -def get_columns_num_empty(df, columns): - counts = {} - for col in columns: - counts[col] = get_column_num_empty(df, col) - return counts - - -def get_column_num_empty(df, column): - col = df[column] - return col[col.isna()].size - - -def get_column_means(df, columns): - means = {} - for col in columns: - means[col] = get_column_mean(df, col) - return means - - -def get_column_mean(df, column): - return df[column].mean(axis=0) - - -def plot_rows(df, cols, pdf): - for i in range(len(cols)): - plt.subplot(len(cols), 1, i + 1) - df[cols[i]].astype("float64").plot.hist(figsize=(WIDTH, HEIGHT), bins=BINS) - plt.xlabel(cols[i]) - plt.tight_layout() - pdf.savefig() - plt.close() - - -def plot_nonempty_data(df, cols, out_path): - num_plots = math.ceil(len(cols) / float(MAX_ROWS_PER_PLOT)) - with PdfPages(out_path) as pdf: - for i in range(num_plots): - start = i * MAX_ROWS_PER_PLOT - end = min((i + 1) * MAX_ROWS_PER_PLOT + 1, len(cols)) - plot_rows(df, cols[start:end], pdf) - - -if __name__ == "__main__": - main() diff --git a/modules/local/svtest/svtest/svtest/cli/pe_file.py b/modules/local/svtest/svtest/svtest/cli/pe_file.py deleted file mode 100644 index 375cbd78..00000000 --- a/modules/local/svtest/svtest/svtest/cli/pe_file.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python - -""" -Collect PE file metrics. Writes stats to stdout. - -Metrics: - pe_LL_ : number of records with LL orientation - pe_RR_ : number of records with RR orientation - pe_RL_ : number of records with RL orientation - pe_LR_ : number of records with LR orientation - -If the provided sample list has more than one id, will be "_merged". 
- -""" - -import gzip -import argparse -import sys -import svtest.utils.TestUtils as tu -import svtest.utils.IOUtils as iou - -PLUS_PLUS_KEY = "pe_LL" -MINUS_MINUS_KEY = "pe_RR" -PLUS_MINUS_KEY = "pe_LR" -MINUS_PLUS_KEY = "pe_RL" - - -def main(argv): - parser = argparse.ArgumentParser( - description=__doc__, prog="svtest pe-file", formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument("pe_file", type=str) - parser.add_argument("sample_list", type=str) - - # Print help if no arguments specified - if len(argv) == 0: - parser.print_help() - sys.exit(1) - args = parser.parse_args(argv) - - # Read file - with gzip.open(args.pe_file, mode="rb") as fpe: - metrics = get_metrics(fpe, args.sample_list) - - # Write metrics - write_metrics(metrics) - - -def write_metrics(metrics): - for key in metrics: - sys.stdout.write("%s\t%s\n" % (key, str(metrics[key]))) - - -def get_metrics(file, sample_list): - samples = iou.read_samples_list(sample_list) - samples_set = set(samples) - - data = [0, 0, 0, 0] # ++, --, +-, -+ - for line in file: - tokens = line.decode().strip().split("\t") - test_record(tokens, samples_set) - first = tokens[2] - second = tokens[5] - val = first + second - if val == "++": - data[0] += 1 - elif val == "--": - data[1] += 1 - elif val == "+-": - data[2] += 1 - elif val == "-+": - data[3] += 1 - else: - raise ValueError("Unrecognized orientation: %s / %s" % (first, second)) - - if len(samples) == 1: - metric_suffix = "_" + samples[0] - else: - metric_suffix = "_merged" - - return { - PLUS_PLUS_KEY + metric_suffix: data[0], - MINUS_MINUS_KEY + metric_suffix: data[1], - PLUS_MINUS_KEY + metric_suffix: data[2], - MINUS_PLUS_KEY + metric_suffix: data[3], - } - - -def test_record(columns, samples): - tu.test_iterable_size(columns, 7) - tu.test_is_int(columns, 1) - valid_strands = set(["+", "-"]) - tu.test_column_in_iterable(columns, 2, valid_strands) - tu.test_is_int(columns, 4) - tu.test_column_in_iterable(columns, 5, valid_strands) - tu.test_column_in_iterable(columns, 6, samples) - - -if __name__ == "__main__": - main() diff --git a/modules/local/svtest/svtest/svtest/cli/ped_file.py b/modules/local/svtest/svtest/svtest/cli/ped_file.py deleted file mode 100644 index 59265a93..00000000 --- a/modules/local/svtest/svtest/svtest/cli/ped_file.py +++ /dev/null @@ -1,132 +0,0 @@ -#!/usr/bin/env python - -""" -Collect ped file metrics. Writes stats to stdout. 
- -Metrics: - ped_file_count : Number of records - ped_file_ : Number of families of each type - -""" - -import argparse -import sys -import pandas as pd -import svtest.utils.IOUtils as iou - -KEY_PREFIX = "ped_file_" - -SINGLETON_STR = "singletons" -DUO_STR = "duos" -TRIO_STR = "family_size_3" -QUAD_STR = "family_size_4" -QUINTET_PLUS_STR = "family_size_5_or_larger" - -MALE = "1" -FEMALE = "2" -MALE_METRIC = "male" -FEMALE_METRIC = "female" -OTHER_METRIC = "other" - - -def main(argv): - parser = argparse.ArgumentParser( - description=__doc__, prog="svtest ped-file", formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument("test_ped_file", type=str) - parser.add_argument( - "--sample-list", type=str, default=None, help="Sample ids not found in this list will cause an error" - ) - parser.add_argument("--prefix", type=str, default=None, help="Prefix to add to metric names") - - # Print help if no arguments specified - if len(argv) == 0: - parser.print_help() - sys.exit(1) - args = parser.parse_args(argv) - - if args.sample_list is not None: - samples = iou.read_samples_list(args.sample_list) - else: - samples = None - - # Get metrics - df = pd.read_csv(args.test_ped_file, sep="\t", names=range(6)) - metrics = get_metrics(df, valid_samples=samples, metric_prefix=args.prefix) - - # Write metrics - write_metrics(metrics) - - -def write_metrics(metrics): - for key in metrics: - sys.stdout.write("%s\t%s\n" % (key, str(metrics[key]))) - - -def get_metrics(df, valid_samples=None, metric_prefix=None): - check_samples(df, valid_samples) - if metric_prefix is None: - pfx = KEY_PREFIX - else: - pfx = metric_prefix + "_" + KEY_PREFIX - metrics = {pfx + "count": df.shape[0]} - metrics = add_family_count_metrics(metrics, df, pfx) - metrics = add_sex_metrics(metrics, df, pfx) - return metrics - - -def add_sex_metrics(metrics, df, prefix): - num_male = count_sex(MALE, df) - num_female = count_sex(FEMALE, df) - num_other = df.shape[0] - num_male - num_female - metrics[prefix + MALE_METRIC] = num_male - metrics[prefix + FEMALE_METRIC] = num_female - metrics[prefix + OTHER_METRIC] = num_other - return metrics - - -def count_sex(type, df): - sex_col = df[4].astype("str") - return sex_col[sex_col == type].size - - -def add_family_count_metrics(metrics, df, prefix): - counts_by_size = get_family_counts(df) - for key in counts_by_size: - metrics[prefix + key] = counts_by_size[key] - return metrics - - -def get_family_counts(df): - family_id_col = df[0] - family_ids = list(set(family_id_col)) - counts_by_id = {} - for id in family_ids: - counts_by_id[id] = family_id_col[family_id_col == id].size - counts_by_size = {SINGLETON_STR: 0, DUO_STR: 0, TRIO_STR: 0, QUAD_STR: 0, QUINTET_PLUS_STR: 0} - for id in counts_by_id: - if counts_by_id[id] == 1: - counts_by_size[SINGLETON_STR] += 1 - elif counts_by_id[id] == 2: - counts_by_size[DUO_STR] += 1 - elif counts_by_id[id] == 3: - counts_by_size[TRIO_STR] += 1 - elif counts_by_id[id] == 4: - counts_by_size[QUAD_STR] += 1 - else: - counts_by_size[QUINTET_PLUS_STR] += 1 - return counts_by_size - - -def check_samples(df, valid_samples): - samples = set(df[1]) - if len(samples) < df.shape[0]: - raise ValueError("There are duplicate sample ids in the ped file") - if valid_samples is not None: - unexpected_samples = samples - set(valid_samples) - if len(unexpected_samples) > 0: - raise ValueError("Unexpected samples: %s" % unexpected_samples) - - -if __name__ == "__main__": - main() diff --git a/modules/local/svtest/svtest/svtest/cli/plot_metrics.py 
b/modules/local/svtest/svtest/svtest/cli/plot_metrics.py deleted file mode 100644 index 4c0a70b9..00000000 --- a/modules/local/svtest/svtest/svtest/cli/plot_metrics.py +++ /dev/null @@ -1,153 +0,0 @@ -#!/usr/bin/env python - -""" -Compare two metrics files. -""" - -import sys -import argparse -import math -import re -import pandas as pd -import matplotlib.pyplot as plt -from matplotlib.backends.backend_pdf import PdfPages -import svtest.utils.IOUtils as iou - -WIDTH = 10.24 -HEIGHT_SCALE = 0.25 -MAX_ROWS_PER_PLOT = 500 - - -def main(argv): - parser = argparse.ArgumentParser( - description=__doc__, prog="svtest plot-metrics", formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument("metrics_a", type=str) - parser.add_argument("metrics_b", type=str) - parser.add_argument("pdf_out", type=str) - parser.add_argument("--a-name", type=str, default="metrics_a") - parser.add_argument("--b-name", type=str, default="metrics_b") - parser.add_argument("--sample-list", type=str, default=None) - parser.add_argument("--changes-only", action="store_true", help="Only plot values that are different") - parser.add_argument("--linear", action="store_true", help="Plot linear scale [default log]") - parser.add_argument("--metrics-out", type=str, help="Write plotted metrics to tsv", default=None) - - # Print help if no arguments specified - if len(argv) == 0: - parser.print_help() - sys.exit(1) - args = parser.parse_args(argv) - - # Read metric tables and join - df_a = get_metrics(args.metrics_a) - df_b = get_metrics(args.metrics_b) - df = df_a.join(df_b, how="outer", lsuffix="_a", rsuffix="_b", sort=True).rename( - columns={"value_a": args.a_name, "value_b": args.b_name} - ) - - # If sample ids are provided, consolidate sample-specific metrics - if args.sample_list is not None: - samples = iou.read_samples_list(args.sample_list) - df = consolidate_sample_metrics(df, samples) - - # Only plot changed metrics - if args.changes_only: - df = df[df["value_a"] != df["value_b"]] - - # Write raw data to file - if args.metrics_out is not None: - df.to_csv(args.metrics_out, sep="\t") - - # Plot - plot_data(df, args.pdf_out, args.linear) - - -def consolidate_sample_metrics(df, samples): - sample_metric_rows = get_sample_metric_rows(df, samples) - sample_df = df.loc[sample_metric_rows] - df = df.loc[~sample_metric_rows] - tags = get_sample_tag_groups(sample_df, samples) - tags_df = pd.DataFrame(tags, index=sample_df.index, columns=["group"]) - sample_df = sample_df.join(tags_df) - grouped_metrics = {} - for tag in set(tags): - compute_metrics(grouped_metrics, sample_df, tag) - grouped_df = pd.DataFrame(grouped_metrics).T.sort_index() - return df.append(grouped_df) - - -def compute_metrics(metrics_dict, metrics_df, group): - metrics_dict[group + ""] = metrics_df[metrics_df["group"] == group].mean(axis=0) - - -def get_sample_metric_rows(df, samples): - pat = "|".join(samples) - return df.index.str.contains(pat) - - -def get_sample_tag_groups(sample_df, samples): - pat = r"({})".format("|".join(samples)) - pat_double_underscore = r"__" - pat_underscore = r"(__|^_|_$)" - tags = [] - for row in sample_df.index: - row2 = re.sub(pat, "", row) - row3 = re.sub(pat_double_underscore, "_", row2) - row4 = re.sub(pat_underscore, "", row3) - tags.append(row4) - return tags - - -def get_metrics(path): - df = pd.read_csv(path, sep="\t", names=["name", "value"]) - return df.set_index("name") - - -def plot_data(df, out_path, linear): - if df.size == 0: - plot_empty_data() - else: - plot_nonempty_data(df, out_path, 
linear) - - -def plot_empty_data(): - fig = plt.figure() - ax = fig.add_axes([0, 0, 1, 1]) - left, width = 0.25, 0.5 - bottom, height = 0.25, 0.5 - right = left + width - top = bottom + height - ax.text( - 0.5 * (left + right), - 0.5 * (bottom + top), - "No data", - horizontalalignment="center", - verticalalignment="center", - fontsize=20, - color="red", - transform=ax.transAxes, - ) - - -def plot_nonempty_data(df, out_path, linear): - num_rows = df.index.size - num_plots = math.ceil(num_rows / float(MAX_ROWS_PER_PLOT)) - with PdfPages(out_path) as pdf: - for i in range(num_plots): - start = i * MAX_ROWS_PER_PLOT - end = min((i + 1) * MAX_ROWS_PER_PLOT + 1, num_rows) - plot_rows(df, start, end, pdf, linear) - - -def plot_rows(df, start, end, pdf, linear): - df.iloc[start:end].iloc[::-1].plot.barh(figsize=(WIDTH, HEIGHT_SCALE * (max(end - start, 15)))) - if not linear: - plt.xscale("log") - plt.legend(bbox_to_anchor=(0, 1.02, 1.0, 0.102), loc="lower left", ncol=2, borderaxespad=0.0) - plt.tight_layout() - pdf.savefig() - plt.close() - - -if __name__ == "__main__": - main() diff --git a/modules/local/svtest/svtest/svtest/cli/raw_counts.py b/modules/local/svtest/svtest/svtest/cli/raw_counts.py deleted file mode 100644 index 155dca8b..00000000 --- a/modules/local/svtest/svtest/svtest/cli/raw_counts.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python - -""" -Collect raw read count metrics. Writes stats to stdout. - -Metrics: - counts_qN : Nth percentile - counts_num_intervals : number of records - counts_intervals_size : sum of interval sizes - -""" - -import gzip -import argparse -import sys -import numpy as np -import svtest.utils.TestUtils as tu - -Q25_KEY = "rd_q25" -Q50_KEY = "rd_q50" -Q75_KEY = "rd_q75" -MEAN_KEY = "rd_mean" -NUM_ZERO = "rd_num_zero" -NUM_INTERVALS = "rd_num_intervals" -INTERVALS_SIZE = "rd_intervals_size" - -EXPECTED_COLUMNS = 4 -HEADER_CHAR = "@" - - -def main(argv): - parser = argparse.ArgumentParser( - description=__doc__, prog="svtest raw-counts", formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument("counts_file", type=str) - parser.add_argument("sample_id", type=str) - - # Print help if no arguments specified - if len(argv) == 0: - parser.print_help() - sys.exit(1) - args = parser.parse_args(argv) - - # Read file - with gzip.open(args.counts_file, mode="rb") as f: - metrics = get_metrics(f, args.sample_id) - - # Write metrics - write_metrics(metrics) - - -def write_metrics(metrics): - for key in metrics: - sys.stdout.write("%s\t%s\n" % (key, str(metrics[key]))) - - -def get_metrics(file, sample_id): - counts = [] - intervals_size = 0 - last_line_was_header = False - for bin in file: - line = bin.decode() - if line.startswith(HEADER_CHAR): - last_line_was_header = True - continue - if last_line_was_header: - last_line_was_header = False - continue # skip columns header line - tokens = line.strip().split("\t") - tu.test_iterable_size(tokens, EXPECTED_COLUMNS) - start = int(tokens[1]) - end = int(tokens[2]) - intervals_size += end - start + 1 - count = int(tokens[3]) - counts.append(int(count)) - counts_arr = np.asarray(counts) - quantiles = np.quantile(counts_arr, [0.25, 0.50, 0.75]) - mean = np.mean(counts_arr) - num_zero = counts_arr[counts_arr == 0].size - return { - Q25_KEY + "_" + sample_id: quantiles[0], - Q50_KEY + "_" + sample_id: quantiles[1], - Q75_KEY + "_" + sample_id: quantiles[2], - MEAN_KEY + "_" + sample_id: mean, - NUM_ZERO + "_" + sample_id: num_zero, - NUM_INTERVALS + "_" + sample_id: len(counts_arr), - INTERVALS_SIZE 
+ "_" + sample_id: intervals_size, - } - - -if __name__ == "__main__": - main() diff --git a/modules/local/svtest/svtest/svtest/cli/rf_cutoffs.py b/modules/local/svtest/svtest/svtest/cli/rf_cutoffs.py deleted file mode 100644 index e8bcf881..00000000 --- a/modules/local/svtest/svtest/svtest/cli/rf_cutoffs.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python - -""" -Collect random forest cutoff metrics. Writes stats to stdout. - -Metric format: rf_cutoff____min_max - -""" - -import argparse -import sys -import pandas as pd - -KEY_PREFIX = "rf_cutoff_" - - -def main(argv): - parser = argparse.ArgumentParser( - description=__doc__, prog="svtest rf-cutoffs", formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument("cutoffs", type=str) - - # Print help if no arguments specified - if len(argv) == 0: - parser.print_help() - sys.exit(1) - args = parser.parse_args(argv) - - # Get metrics - df = pd.read_csv(args.cutoffs, sep="\t") - metrics = get_metrics(df) - - # Write metrics - write_metrics(metrics) - - -def write_metrics(metrics): - for key in metrics: - sys.stdout.write("%s\t%s\n" % (key, str(metrics[key]))) - - -def get_metrics(df): - metrics = {} - for i in range(df.shape[0]): - row = df.iloc[i] - name = KEY_PREFIX + "_".join([row.test, row.svtype, row.metric, row.algtype]) - if not pd.isna(row.min_svsize): - name += "_min" + str(int(row.min_svsize)) - if not pd.isna(row.max_svsize): - name += "_max" + str(int(row.max_svsize)) - metrics[name] = row.cutoff - return metrics - - -if __name__ == "__main__": - main() diff --git a/modules/local/svtest/svtest/svtest/cli/sample_list.py b/modules/local/svtest/svtest/svtest/cli/sample_list.py deleted file mode 100644 index f6074664..00000000 --- a/modules/local/svtest/svtest/svtest/cli/sample_list.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python - -""" -Collect sample list metrics. Writes stats to stdout. 
- -Metrics: - sample_list_count : Number of samples - -""" - -import argparse -import sys -import svtest.utils.IOUtils as iou - -KEY_PREFIX = "sample_list_" - - -def main(argv): - parser = argparse.ArgumentParser( - description=__doc__, prog="svtest sample-list", formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument("test_sample_list", type=str) - parser.add_argument( - "--valid-sample-list", type=str, default=None, help="Sample ids not found in this list will cause an error" - ) - parser.add_argument("--prefix", type=str, default=None, help="Prefix to add to metric names") - - # Print help if no arguments specified - if len(argv) == 0: - parser.print_help() - sys.exit(1) - args = parser.parse_args(argv) - - samples = iou.read_samples_list(args.test_sample_list, fail_on_empty=False) - if args.valid_sample_list is not None: - valid_samples = iou.read_samples_list(args.valid_sample_list) - else: - valid_samples = None - - # Get metrics - metrics = get_metrics(samples, valid_samples, args.prefix) - - # Write metrics - write_metrics(metrics) - - -def write_metrics(metrics): - for key in metrics: - sys.stdout.write("%s\t%s\n" % (key, str(metrics[key]))) - - -def get_metrics(samples, valid_samples, metric_prefix): - if valid_samples is not None: - unexpected_samples = set(samples) - set(valid_samples) - if len(unexpected_samples) > 0: - raise ValueError("Unexpected samples: %s" % unexpected_samples) - - if metric_prefix is None: - pfx = KEY_PREFIX - else: - pfx = metric_prefix + "_" + KEY_PREFIX - metrics = {pfx + "count": len(samples)} - return metrics - - -if __name__ == "__main__": - main() diff --git a/modules/local/svtest/svtest/svtest/cli/sr_file.py b/modules/local/svtest/svtest/svtest/cli/sr_file.py deleted file mode 100644 index 716b2435..00000000 --- a/modules/local/svtest/svtest/svtest/cli/sr_file.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python - -""" -Collect SR file metrics. Writes stats to stdout. - -Metrics: - sr_left_ : number of left records - sr_right_ : number of right records - -If the provided sample list has more than one id, will be "_merged". 
- -""" - -import gzip -import argparse -import sys -import svtest.utils.TestUtils as tu -import svtest.utils.IOUtils as iou - -LEFT_KEY = "sr_left" -RIGHT_KEY = "sr_right" - - -def main(argv): - parser = argparse.ArgumentParser( - description=__doc__, prog="svtest sr-file", formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument("sr_file", type=str) - parser.add_argument("sample_list", type=str) - - # Print help if no arguments specified - if len(argv) == 0: - parser.print_help() - sys.exit(1) - args = parser.parse_args(argv) - - # Read file - with gzip.open(args.sr_file, mode="rb") as fsr: - metrics = get_metrics(fsr, args.sample_list) - - # Write metrics - write_metrics(metrics) - - -def write_metrics(metrics): - for key in metrics: - sys.stdout.write("%s\t%s\n" % (key, str(metrics[key]))) - - -def get_metrics(sr_file, sample_list): - samples = iou.read_samples_list(sample_list) - samples_set = set(samples) - side_metrics = [0, 0] - for line in sr_file: - tokens = line.decode().strip().split("\t") - test_record(tokens, samples_set) - side = tokens[2] - if side == "left": - side_metrics[0] += 1 - elif side == "right": - side_metrics[1] += 1 - else: - raise ValueError("Unrecognized orientation: %s" % side) - - if len(samples) == 1: - metric_suffix = "_" + samples[0] - else: - metric_suffix = "_merged" - - return {LEFT_KEY + metric_suffix: side_metrics[0], RIGHT_KEY + metric_suffix: side_metrics[1]} - - -def test_record(columns, sample_ids): - tu.test_iterable_size(columns, 5) - tu.test_is_int(columns, 1) - valid_strands = set(["right", "left"]) - tu.test_column_in_iterable(columns, 2, valid_strands) - tu.test_is_int(columns, 3) - tu.test_column_in_iterable(columns, 4, sample_ids) - - -if __name__ == "__main__": - main() diff --git a/modules/local/svtest/svtest/svtest/cli/vcf.py b/modules/local/svtest/svtest/svtest/cli/vcf.py deleted file mode 100644 index 78160ddb..00000000 --- a/modules/local/svtest/svtest/svtest/cli/vcf.py +++ /dev/null @@ -1,464 +0,0 @@ -#!/usr/bin/env python - -""" -Collect vcf metrics. Writes metrics to stdout. 
- -Metrics: - _vcf__count : variants of a type in the test set - _vcf__tp : variants with at least one matching variant in the baseline set (if baseline-vcf provided) - _vcf__fp : variants with no matching variants in the baseline set (if baseline-vcf provided) - _vcf__fn : variants in baseline set that had no matching variants in the test set (if baseline-vcf provided) - _vcf__size_X_Y : variants with size >= X and < Y - _vcf__size_gte_X : variants with size >= X - _vcf__vargq_X_Y : variants with varGQ >= X and < Y (if varGQ present) - _vcf__vargq_gte_X : variants with varGQ >= X (if varGQ present) - _vcf__af_X_Y : variants with allele frequency >= X and < Y (if EV present) - _vcf__af_gte_X : variants with allele frequency >= X (if EV present) - _vcf__ac_1 : singleton variants (if EV present) - _vcf__evidence_ : variants supported by evidence type (if EVIDENCE present) - -""" - -import argparse -import sys -import svtest.utils.IntervalUtils as iu -import svtest.utils.TestUtils as tu -import svtest.utils.VCFUtils as vu -import svtest.utils.IOUtils as iou -from pysam import VariantFile -import pandas as pd - -VCF_METRIC_STR = "_vcf_" - -# Size bins -SIZES = [500, 5000, 100000] - -# varGQ bins -VARGQ_BINS = [2, 200, 400, 600, 800, 999] - -# AF bins -AF_BINS = [0.01, 0.1, 0.5] - -# Valid evidence types -EVIDENCE_TYPES = ["RD", "BAF", "PE", "SR"] - -# Accepted "passing" filters -PASSING_FILTERS = ["PASS", "BOTHSIDES_SUPPORT", "MULTIALLELIC", "HIGH_SR_BACKGROUND"] - -INVALID_CHR2_STR = "invalid_chr2" -INVALID_END_STR = "invalid_end" - -BED_FILE_HEADER_CHAR = "#" -BED_FILE_CHROM_COL = "chrom" -BED_FILE_START_COL = "start" -BED_FILE_END_COL = "end" -BED_FILE_SVTYPE_COL = "svtype" - - -def main(argv): - parser = argparse.ArgumentParser( - description=__doc__, prog="svtest vcf", formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument("test_vcf", type=str) - parser.add_argument("contig_list", type=str) - parser.add_argument("sample_list", type=str) - parser.add_argument("types", type=str, help="Comma-delimited list of variant types (case-sensitive)") - parser.add_argument("metric_prefix", type=str) - parser.add_argument("--baseline-vcf", type=str, help="Baseline vcf to provide evaluation metrics against") - parser.add_argument( - "--baseline-bed", - type=str, - help='Baseline bed file to provide evaluation metrics against. 
Must have header beginning with "' - + BED_FILE_HEADER_CHAR - + '" and the following columns: "' - + '", "'.join([BED_FILE_CHROM_COL, BED_FILE_START_COL, BED_FILE_END_COL, BED_FILE_SVTYPE_COL]) - + '"', - ) - parser.add_argument( - "--min-reciprocal-overlap", - type=float, - default=0.5, - help="Minimum reciprocal overlap for validation metrics [0.5]", - ) - parser.add_argument("--padding", type=int, default=50, help="Interval padding for validation metrics [50]") - parser.add_argument( - "--max-warnings", type=int, default=50, help="Maximum number of records to print warnings for [50]" - ) - parser.add_argument("--fp-file", type=str, default=None, help="Write false positives to file") - parser.add_argument("--fn-file", type=str, default=None, help="Write false negatives to file") - parser.add_argument("--fp-pass-file", type=str, default=None, help="Write PASS false positives to file") - parser.add_argument("--fn-pass-file", type=str, default=None, help="Write PASS false negatives to file") - - # Print help if no arguments specified - if len(argv) == 0: - parser.print_help() - sys.exit(1) - args = parser.parse_args(argv) - if (args.baseline_vcf is None and args.baseline_bed is None) and ( - args.fp_file is not None or args.fn_file is not None - ): - raise ValueError("FP and FN files cannot be generated if --baseline-vcf and --baseline-bed aren't specified") - if args.baseline_vcf is not None and args.baseline_bed is not None: - raise ValueError("Cannot specify both --baseline-vcf and --baseline-bed") - types_list = args.types.split(",") - - contigs = iou.read_contig_list(args.contig_list) - samples = iou.read_samples_list(args.sample_list) - metrics, fp_intervals, fn_intervals, fp_intervals_pass, fn_intervals_pass = get_metrics( - args.test_vcf, - args.baseline_vcf, - args.baseline_bed, - contigs, - types_list, - args.min_reciprocal_overlap, - args.padding, - samples, - args.metric_prefix, - args.max_warnings, - ) - - # Write metrics - write_metrics(metrics) - if args.fp_file is not None and fp_intervals is not None: - write_intervals(args.fp_file, fp_intervals) - if args.fn_file is not None and fn_intervals is not None: - write_intervals(args.fn_file, fn_intervals) - if args.fp_pass_file is not None and fp_intervals_pass is not None: - write_intervals(args.fp_pass_file, fp_intervals_pass) - if args.fn_pass_file is not None and fn_intervals_pass is not None: - write_intervals(args.fn_pass_file, fn_intervals_pass) - - -def write_metrics(metrics): - for key in metrics: - sys.stdout.write("%s\t%s\n" % (key, str(metrics[key]))) - - -def write_intervals(path, intervals): - with open(path, "w") as f: - for type in intervals: - for contig in intervals[type]: - for interval in intervals[type][contig]: - line = "\t".join([contig, str(interval[0]), str(interval[1]), type]) + "\n" - f.write(line) - - -def get_metrics( - ftest, fbase_vcf, fbase_bed, contigs, variant_types, min_ro, padding, samples, metric_prefix, max_warnings -): - - test_vcf = VariantFile(ftest) - - check_header(test_vcf, samples) - - genotyped = check_if_genotyped(test_vcf) - has_vargq = check_if_vargq(test_vcf) - collect_evidence = check_if_evidence(test_vcf) - test_records = list(test_vcf.fetch()) - - unfiltered_variant_type_counts = get_count_by_type(test_records, variant_types) - - pass_filter_set = set(PASSING_FILTERS) - pass_records = [r for r in test_records if ("PASS" in r.filter or len(set(r.filter) - pass_filter_set) == 0)] - - error_counts = count_errors(test_records, contigs, max_warnings) - - variant_type_counts = 
get_count_by_type(pass_records, variant_types) - size_counts = get_distributions_by_type(pass_records, variant_types, "SVLEN", SIZES, exclude_types=["BND"]) - - metrics = add_error_count_metrics({}, error_counts, metric_prefix) - - if fbase_vcf is not None: - base_vcf = VariantFile(fbase_vcf) - if genotyped != check_if_genotyped(base_vcf): - raise ValueError("One of the vcfs seems to be genotyped but the other does not") - if has_vargq != check_if_vargq(base_vcf): - raise ValueError("One of the vcfs has the varGQ field but the other does not") - if collect_evidence != check_if_evidence(base_vcf): - raise ValueError("One of the vcfs has the EVIDENCE field but the other does not") - base_records = list(base_vcf.fetch()) - test_tree = iu.create_trees_from_records(test_records, variant_types, contigs) - test_pass_tree = iu.create_trees_from_records(pass_records, variant_types, contigs) - base_tree = iu.create_trees_from_records(base_records, variant_types, contigs) - base_pass_records = [ - r for r in base_records if ("PASS" in r.filter or len(set(r.filter) - pass_filter_set) == 0) - ] - base_pass_tree = iu.create_trees_from_records(base_pass_records, variant_types, contigs) - elif fbase_bed is not None: - base_records = parse_bed_file(fbase_bed) - test_tree = iu.create_trees_from_records(test_records, variant_types, contigs) - test_pass_tree = iu.create_trees_from_records(pass_records, variant_types, contigs) - base_tree = iu.create_trees_from_bed_records(base_records, variant_types, contigs) - base_pass_tree = None - else: - test_tree = None - test_pass_tree = None - base_tree = None - base_pass_tree = None - - if base_tree is not None: - metrics, fp_intervals, fn_intervals = add_evaluation_metrics( - metrics, test_tree, base_tree, variant_types, min_ro, padding, metric_prefix - ) - else: - fp_intervals = None - fn_intervals = None - - if base_pass_tree is not None: - metrics, fp_intervals_pass, fn_intervals_pass = add_evaluation_metrics( - metrics, - test_pass_tree, - base_pass_tree, - variant_types, - min_ro, - padding, - metric_prefix, - metric_suffix="_pass", - ) - else: - fp_intervals_pass = None - fn_intervals_pass = None - - if genotyped: - allele_frequencies, num_singletons = get_allele_frequency_counts(pass_records, test_vcf.header, variant_types) - if has_vargq: - vargq_counts = get_distributions_by_type(pass_records, variant_types, "varGQ", VARGQ_BINS) - if collect_evidence: - evidence_counts = collect_evidence_fields(pass_records, variant_types) - - for type in variant_types: - metrics[metric_prefix + VCF_METRIC_STR + type + "_count"] = unfiltered_variant_type_counts[type] - metrics[metric_prefix + VCF_METRIC_STR + type + "_pass_count"] = variant_type_counts[type] - if type != "BND": - metrics = add_binned_metrics(size_counts, SIZES, type, metrics, metric_prefix, "pass_size") - if genotyped: - metrics = add_binned_metrics(allele_frequencies, AF_BINS, type, metrics, metric_prefix, "pass_af") - if type in num_singletons: - metrics[metric_prefix + VCF_METRIC_STR + type + "_pass_ac_1"] = num_singletons[type] - if has_vargq: - metrics = add_binned_metrics(vargq_counts, VARGQ_BINS, type, metrics, metric_prefix, "pass_vargq") - if collect_evidence: - metrics = add_metrics_from_dict(evidence_counts, type, metrics, metric_prefix, "pass_evidence") - - return metrics, fp_intervals, fn_intervals, fp_intervals_pass, fn_intervals_pass - - -def add_error_count_metrics(metrics, error_counts, metric_prefix): - for key in error_counts: - metrics[metric_prefix + VCF_METRIC_STR + key] = 
error_counts[key] - return metrics - - -def add_evaluation_metrics( - metrics, test_tree, base_tree, variant_types, min_ro, padding, metric_prefix, metric_suffix="" -): - tp_test = {} - fp_test = {} - fp_intervals_test = {} - tp_base = {} - fp_base = {} - fp_intervals_base = {} - for type in variant_types: - tp_test_, fp_test_, fp_intervals_test_ = iu.evaluate_tree( - test_tree[type], base_tree[type], min_ro, padding=padding - ) - tp_test[type] = tp_test_ - fp_test[type] = fp_test_ - fp_intervals_test[type] = fp_intervals_test_ - tp_base_, fp_base_, fp_intervals_base_ = iu.evaluate_tree( - base_tree[type], test_tree[type], min_ro, padding=padding - ) - tp_base[type] = tp_base_ - fp_base[type] = fp_base_ - fp_intervals_base[type] = fp_intervals_base_ - - tp_test_by_type = sum_counts_over_contigs(tp_test) - fp_test_by_type = sum_counts_over_contigs(fp_test) - fp_base_by_type = sum_counts_over_contigs(fp_base) - - for type in variant_types: - metrics[metric_prefix + VCF_METRIC_STR + type + "_tp" + metric_suffix] = tp_test_by_type[type] - metrics[metric_prefix + VCF_METRIC_STR + type + "_fp" + metric_suffix] = fp_test_by_type[type] - metrics[metric_prefix + VCF_METRIC_STR + type + "_fn" + metric_suffix] = fp_base_by_type[type] - return metrics, fp_intervals_test, fp_intervals_base - - -def get_allele_frequency_counts(records, header, variant_types): - num_samples = float(len(header.samples)) - allele_freq = {} - num_singletons = {} - types_set = set(variant_types) - # Don't calculate MCNV AF since non-ref alleles cannot be determined without chromosome ploidy - af_types = types_set - set(["CNV"]) - for type in af_types: - allele_freq[type] = [] - num_singletons[type] = 0 - for record in records: - type = vu.get_sv_type(record, types_set) - if type not in af_types: - continue - af = 0 - for sample in record.samples.values(): - for val in sample["GT"]: - if val is not None and val > 0: - af += 1 - if af == 1: - num_singletons[type] += 1 - allele_freq[type].append(af / num_samples) - allele_freq_counts = {} - num_bins = len(AF_BINS) - for type in af_types: - allele_freq_counts[type] = [0] * (num_bins + 1) - for val in allele_freq[type]: - idx = get_distribution_index(val, AF_BINS, num_bins) - allele_freq_counts[type][idx] += 1 - return allele_freq_counts, num_singletons - - -def count_errors(records, contigs, max_warnings): - contigs_set = set(contigs) - error_counts = {INVALID_CHR2_STR: 0, INVALID_END_STR: 0} - print_warnings = True - for record in records: - warnings_maxed = sum(error_counts.values()) > max_warnings - if warnings_maxed and print_warnings: - sys.stderr.write("Max of %d warnings have been given, the rest will be suppressed.\n" % max_warnings) - print_warnings = False - error_counts = check_record(error_counts, record, contigs_set, print_warnings) - return error_counts - - -def check_record(error_counts, record, contigs_set, warn): - if "CHR2" in record.info and record.info["CHR2"] not in contigs_set: - if warn: - sys.stderr.write("Invalid CHR2 value: %s\n" % record.info["CHR2"]) - error_counts[INVALID_CHR2_STR] += 1 - if not valid_end_field(record): - if warn: - sys.stderr.write("Position was not less than END in record %s\n" % record.id) - error_counts[INVALID_END_STR] += 1 - return error_counts - - -def valid_end_field(record): - return record.pos <= record.stop - - -def check_if_genotyped(vcf): - return check_header_format_field(vcf, "EV") - - -def check_if_vargq(vcf): - return check_header_info_field(vcf, "varGQ") - - -def check_if_evidence(vcf): - return 
check_header_info_field(vcf, "EVIDENCE") - - -def check_header_info_field(vcf, name): - if name in vcf.header.info: - return True - return False - - -def check_header_format_field(vcf, name): - if name in vcf.header.formats: - return True - return False - - -def check_header(vcf, expected_samples): - vcf_samples = vcf.header.samples - tu.test_sets_equal(vcf_samples, expected_samples, item_str="sample", name_a="VCF header", name_b="samples list") - - -def collect_evidence_fields(records, variant_types): - evidence_counts = {} - for variant_type in variant_types: - evidence_counts[variant_type] = {} - for evidence_type in EVIDENCE_TYPES: - evidence_counts[variant_type][evidence_type] = 0 - variant_types_set = set(variant_types) - evidence_types_set = set(EVIDENCE_TYPES) - for record in records: - variant_type = vu.get_sv_type(record, variant_types_set) - evidence_types = vu.get_evidence_types(record, evidence_types_set) - for evidence_type in evidence_types: - evidence_counts[variant_type][evidence_type] += 1 - return evidence_counts - - -def add_binned_metrics(metric_counts, bins, type, metrics, prefix, name): - if type not in metric_counts: - return metrics - for i in range(len(bins)): - if i == 0: - lb = 0 - else: - lb = bins[i - 1] - key = prefix + VCF_METRIC_STR + type + "_" + name + "_" + str(lb) + "_" + str(bins[i]) - metrics[key] = metric_counts[type][i] - key = prefix + VCF_METRIC_STR + type + "_" + name + "_gte_" + str(bins[-1]) - metrics[key] = metric_counts[type][-1] - return metrics - - -def add_metrics_from_dict(dict, type, metrics, prefix, name): - for key in dict[type]: - metric_name = prefix + VCF_METRIC_STR + type + "_" + name + "_" + key - metrics[metric_name] = dict[type][key] - return metrics - - -def get_count_by_type(records, variant_types): - counts = {} - types_set = set(variant_types) - for type in variant_types: - counts[type] = 0 - for record in records: - type = vu.get_sv_type(record, types_set) - counts[type] += 1 - return counts - - -def get_distributions_by_type(records, variant_types, field, bins, exclude_types=[]): - num_bins = len(bins) - counts = {} - types_set = set(variant_types) - for type in variant_types: - counts[type] = [0] * (num_bins + 1) - for record in records: - type = vu.get_sv_type(record, types_set) - if type not in exclude_types: - val = vu.get_info_field(record, field, singularize=True) - idx = get_distribution_index(val, bins, num_bins) - counts[type][idx] += 1 - return counts - - -def get_distribution_index(val, bins, num_bins): - for i in range(num_bins): - if val < bins[i]: - return i - return len(bins) - - -def sum_counts_over_contigs(x): - result = {} - for type in x: - result[type] = 0 - for contig in x[type]: - result[type] += x[type][contig] - return result - - -def parse_bed_file(path): - df = pd.read_csv(path, delimiter="\t") - return df[ - [BED_FILE_HEADER_CHAR + BED_FILE_CHROM_COL, BED_FILE_START_COL, BED_FILE_END_COL, BED_FILE_SVTYPE_COL] - ].values - - -if __name__ == "__main__": - main() diff --git a/modules/local/svtest/svtest/svtest/utils/IOUtils.py b/modules/local/svtest/svtest/svtest/utils/IOUtils.py deleted file mode 100644 index adb10fc9..00000000 --- a/modules/local/svtest/svtest/svtest/utils/IOUtils.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -""" -Useful utilities for IO. 
-""" - - -def read_samples_list(path, fail_on_empty=True): - with open(path, "r") as f: - samples = f.read().splitlines() - if fail_on_empty and len(samples) == 0: - raise ValueError("Samples list empty") - return samples - - -def read_contig_list(path, fail_on_empty=True): - with open(path, "r") as f: - contigs = f.read().splitlines() - if fail_on_empty and len(contigs) == 0: - raise ValueError("Contig list empty") - return contigs diff --git a/modules/local/svtest/svtest/svtest/utils/IntervalUtils.py b/modules/local/svtest/svtest/svtest/utils/IntervalUtils.py deleted file mode 100644 index b127b622..00000000 --- a/modules/local/svtest/svtest/svtest/utils/IntervalUtils.py +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env python - -""" -Useful utilities for intervals and interval trees. -""" - -from intervaltree import IntervalTree, Interval -import svtest.utils.VCFUtils as vu - - -# Creates dictionary of trees[sv_type][contig] from iterable records of VariantRecords -def create_trees_from_records(records, variant_types, contigs): - trees = {} - variant_types_set = set(variant_types) - for type in variant_types: - trees[type] = {} - for contig in contigs: - trees[type][contig] = IntervalTree() - for record in records: - type = vu.get_sv_type(record, variant_types_set) - contig = record.chrom - if type == "INS" or type == "BND": - length = 0 - else: - length = max(0, vu.get_record_length(record)) - trees[type][contig].addi(record.start, record.start + length + 1) - return trees - - -# Creates dictionary of trees[sv_type][contig] from iterable records of VariantRecords -def create_trees_from_bed_records(records, variant_types, contigs): - trees = {} - variant_types_set = set(variant_types) - for type in variant_types: - trees[type] = {} - for contig in contigs: - trees[type][contig] = IntervalTree() - for record in records: - type = record[3] - if type not in variant_types_set: - raise ValueError("Unexpected SVTYPE in bed file: %s" % type) - contig = record[0] - start = record[1] - if type == "INS" or type == "BND": - length = 0 - else: - length = record[2] - record[1] - trees[type][contig].addi(start, start + length) - return trees - - -# Creates dictionary of trees[contig] from gzipped bed file -def create_trees_from_bed(f, contigs, padding): - trees = {} - for contig in contigs: - trees[contig] = IntervalTree() - contigs_set = set(contigs) - for record in f: - line = record.decode() - if line.startswith("#"): - continue - record_tokens = line.strip().split("\t") - contig = record_tokens[0] - if contig not in contigs_set: - continue - start = int(record_tokens[1]) - end = int(record_tokens[2]) - trees[contig].addi(start - padding, end + padding, record_tokens) - return trees - - -# Evaluates test tree[contig] -def evaluate_tree(test_tree, truth_tree, min_ro, padding): - tp = {} - fp = {} - fpi = {} - for contig in test_tree: - if contig in truth_tree: - tp_contig, fp_contig, fpi_contig = evaluate_contig_tree( - test_tree[contig], truth_tree[contig], min_ro, padding - ) - tp[contig] = tp_contig - fp[contig] = fp_contig - fpi[contig] = fpi_contig - else: - tp[contig] = 0 - fp[contig] = len(test_tree[contig]) - fpi[contig] = list(test_tree[contig]) - return tp, fp, fpi - - -# Evaluates test IntervalTree - - -def evaluate_contig_tree(test_tree, truth_tree, min_ro, padding): - tp = 0 - fp_intervals = [] - for interval in test_tree: - padded_interval = Interval(interval[0] - padding, interval[1] + padding) - overlappers = truth_tree.overlap(padded_interval[0], padded_interval[1]) - has_overlapper 
= False - for overlapper in overlappers: - padded_overlapper = Interval(overlapper[0] - padding, overlapper[1] + padding) - if has_reciprocal_overlap(padded_interval, padded_overlapper, min_ro): - has_overlapper = True - tp += 1 - break - if not has_overlapper: - fp_intervals.append(interval) - fp = len(test_tree) - tp - return tp, fp, fp_intervals - - -def has_reciprocal_overlap(interval_a, interval_b, min_ro): - return min_ro <= reciprocal_overlap(interval_a, interval_b) - - -def reciprocal_overlap(interval_a, interval_b): - return float(overlap_size(interval_a, interval_b)) / max(interval_size(interval_a), interval_size(interval_b)) - - -def interval_size(interval): - return interval[1] - interval[0] - - -def overlap_size(interval_a, interval_b): - return max(0, min(interval_a[1], interval_b[1]) - max(interval_a[0], interval_b[0])) - - -# Sum tree sizes over contigs in trees[contig] - - -def tree_size(trees): - return sum([len(t) for t in trees.values()]) diff --git a/modules/local/svtest/svtest/svtest/utils/TestUtils.py b/modules/local/svtest/svtest/svtest/utils/TestUtils.py deleted file mode 100644 index 4ee3257c..00000000 --- a/modules/local/svtest/svtest/svtest/utils/TestUtils.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python - -""" -Useful utilities for testing. -""" - - -def test_sets_equal(iterable_a, iterable_b, item_str="item", name_a="set A", name_b="set B"): - set_a = set(iterable_a) - set_b = set(iterable_b) - a_not_in_b = set_a - set_b - if len(a_not_in_b) > 0: - raise ValueError("One or more %s(s) found in %s but not %s: %s" % (item_str, name_a, name_b, a_not_in_b)) - b_not_in_a = set_b - set_a - if len(b_not_in_a) > 0: - raise ValueError("One or more %s(s) found in %s but not %s: %s" % (item_str, name_b, name_a, b_not_in_a)) - - -def test_iterable_sizes_equal(iterable_a, iterable_b, name_a="iterable A", name_b="iterable B"): - if len(iterable_a) != len(iterable_b): - raise ValueError( - "%s (%d) was not the same size as %s (%d)" % (name_a, len(iterable_a), name_b, len(iterable_b)) - ) - - -def test_iterable_size(iterable, size): - if len(iterable) != size: - raise ValueError("Expected %d values but found %d in: %s" % (size, len(iterable), str(iterable))) - - -def test_column_equals(columns, idx, val): - if columns[idx] != val: - raise ValueError("Expected column %d to equal %s but found %s" % (idx + 1, val, columns[idx])) - - -def test_column_in_iterable(columns, idx, iter, msg=None): - if columns[idx] not in iter: - if msg is None: - raise ValueError("Expected column %d to be one of %s but found %s" % (idx + 1, str(iter), columns[idx])) - else: - raise ValueError(msg) - - -def test_is_not_empty(list, name): - if len(list) == 0: - raise ValueError("List %s was empty" % name) - - -def test_is_float(columns, idx): - if not is_float(columns[idx]): - raise ValueError("Column %d was not a float: %s" % (idx + 1, str(columns))) - - -def test_is_int(columns, idx): - if not is_int(columns[idx]): - raise ValueError("Column %d was not an int: %s" % (idx + 1, str(columns))) - - -def is_float(str): - try: - float(str) - return True - except ValueError: - return False - - -def is_int(str): - try: - int(str) - return True - except ValueError: - return False diff --git a/modules/local/svtest/svtest/svtest/utils/VCFUtils.py b/modules/local/svtest/svtest/svtest/utils/VCFUtils.py deleted file mode 100644 index 78c0e9bb..00000000 --- a/modules/local/svtest/svtest/svtest/utils/VCFUtils.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python - -""" -Useful utilities for working with 
pysam variant objects -""" - - -def get_info_field(record, name, singularize=False): - if name not in record.info: - if name == "SVLEN": - if record.info["SVTYPE"] in ["DEL", "DUP", "INV"]: - record.info["SVLEN"] = record.stop - record.pos - else: - record.info["SVLEN"] = -1 - else: - raise ValueError("%s info field not found: %s" % (name, record.info.keys())) - - # Checks if the value is a tuple (such as for SVLEN) - val = record.info[name] - if singularize and isinstance(val, tuple): - if len(val) == 1: - return val[0] - else: - raise ValueError(f"Encountered value tuple containing multiple entries: {val}") - return val - - -def get_record_length(record): - return get_info_field(record, "SVLEN", singularize=True) - - -def get_sv_type(record, expected_types): - if "SVTYPE" not in record.info: - raise ValueError("SVTYPE info field not found: %s" % record.info.keys()) - type = record.info["SVTYPE"] - if type not in expected_types: - raise ValueError("Unexpected SVTYPE: %s" % type) - return type - - -def get_evidence_types(record, expected_types): - if "EVIDENCE" not in record.info: - raise ValueError("EVIDENCE info field not found: %s" % record.info.keys()) - types = record.info["EVIDENCE"] - for type in types: - if type not in expected_types: - raise ValueError("Unexpected EVIDENCE: %s" % type) - return types diff --git a/modules/local/svtest/svtest/svtest/utils/__init__.py b/modules/local/svtest/svtest/svtest/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/modules/local/svtest/vcf/main.nf b/modules/local/svtest/vcf/main.nf deleted file mode 100644 index 2cea6d51..00000000 --- a/modules/local/svtest/vcf/main.nf +++ /dev/null @@ -1,63 +0,0 @@ -process SVTEST_VCF { - tag "$meta.id" - label 'process_low' - - container "nicolasvnk/svtest:0.1" - - input: - tuple val(meta), path(vcf), path(tbi), path(baseline_vcf) - path fasta_fai - - output: - tuple val(meta), path("*.tsv") , emit: metrics - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - - def VERSION = "0.1" - - def arguments = args.args ?: '' - def types = args.types - def contigs = fasta_fai - - def baseline = baseline_vcf ? "--baseline-vcf ${baseline_vcf}" : "" - - """ - echo "${meta.id}" > samples.txt - - svtest vcf \\ - ${arguments} \\ - ${baseline} \\ - ${vcf} \\ - ${contigs} \\ - samples.txt \\ - ${types} \\ - ${prefix} \\ - > ${prefix}.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - svtest: ${VERSION} - python: \$(python3 --version | sed -e "s/Python //g") - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - def VERSION = "0.1" - - """ - touch ${prefix}.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - svtest: ${VERSION} - python: \$(python3 --version | sed -e "s/Python //g") - END_VERSIONS - """ -} diff --git a/modules/local/viola/Dockerfile b/modules/local/viola/Dockerfile new file mode 100644 index 00000000..28cf1367 --- /dev/null +++ b/modules/local/viola/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.9 + +LABEL version="1.0.2" maintainer="Nicolas Vannieuwkerke " + +RUN apt update && apt install tabix python3-pip -y + +RUN pip install setuptools==57.5.0 + +COPY ./requirements.txt . 
+RUN pip install -r ./requirements.txt + +CMD ["python"] diff --git a/modules/local/viola/main.nf b/modules/local/viola/main.nf new file mode 100644 index 00000000..df0a924e --- /dev/null +++ b/modules/local/viola/main.nf @@ -0,0 +1,73 @@ +process VIOLA { + tag "$meta.id" + label 'process_low' + + container "nicolasvnk/viola:1.0.2" + + input: + tuple val(meta), path(vcf) + + output: + tuple val(meta), path("*_standardized.vcf.gz") , emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = "1.0.2" + + if ("${vcf}" == "${prefix}.vcf.gz") { + error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + } + + def unzipped_vcf = vcf.name.replace(".gz","") + + """ + cp ${vcf} new_${vcf} + bgzip -d new_${vcf} + + variant=\$(cat new_${unzipped_vcf} | awk '/^#/ {next} {print 1;exit}' || echo 0) + + if [ \$variant -eq 1 ] + then + viola_standardize.py \\ + new_${unzipped_vcf} \\ + ${meta.caller} \\ + ${prefix}.vcf \\ + ${meta.id} + bgzip ${prefix}.vcf + echo "Ran the standardization successfully" + else + echo "${vcf} was empty, so the viola_standardize.py process was skipped." + bgzip new_${unzipped_vcf} + cp new_${vcf} ${prefix}.vcf.gz + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + viola-sv: ${VERSION} + python: \$(python3 --version | sed -e "s/Python //g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = "1.0.2" + + if ("${vcf}" == "${prefix}.vcf.gz") { + error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + } + + + """ + touch ${prefix}.vcf.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + viola-sv: ${VERSION} + python: \$(python3 --version | sed -e "s/Python //g") + END_VERSIONS + """ +} diff --git a/modules/local/viola/requirements.txt b/modules/local/viola/requirements.txt new file mode 100644 index 00000000..e956ac22 --- /dev/null +++ b/modules/local/viola/requirements.txt @@ -0,0 +1,14 @@ +biopython==1.81 +intervaltree==3.1.0 +joblib==1.2.0 +numpy==1.24.2 +pandas==1.5.3 +python-dateutil==2.8.2 +pytz==2022.7.1 +PyVCF==0.6.8 +scikit-learn==1.2.2 +scipy==1.10.1 +six==1.16.0 +sortedcontainers==2.4.0 +threadpoolctl==3.1.0 +Viola-SV==1.0.2 diff --git a/modules/nf-core/bcftools/reheader/main.nf b/modules/nf-core/bcftools/reheader/main.nf index 441380c4..57634c07 100644 --- a/modules/nf-core/bcftools/reheader/main.nf +++ b/modules/nf-core/bcftools/reheader/main.nf @@ -12,8 +12,8 @@ process BCFTOOLS_REHEADER { path fai output: - tuple val(meta), path("*.vcf.gz"), emit: vcf - path "versions.yml" , emit: versions + tuple val(meta), path("*.{vcf,vcf.gz,bcf,bcf.gz}"), emit: vcf + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -23,6 +23,13 @@ script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" def update_sequences = fai ? "-f $fai" : "" def new_header = header ? "-h $header" : "" + + def args2 = task.ext.args2 ?: '--output-type z' + def extension = args2.contains("--output-type b") || args2.contains("-Ob") ? "bcf.gz" : + args2.contains("--output-type u") || args2.contains("-Ou") ? "bcf" : + args2.contains("--output-type z") || args2.contains("-Oz") ? "vcf.gz" : + args2.contains("--output-type v") || args2.contains("-Ov") ?
"vcf" : + "vcf" """ bcftools \\ reheader \\ @@ -30,8 +37,10 @@ process BCFTOOLS_REHEADER { $new_header \\ $args \\ --threads $task.cpus \\ - -o ${prefix}.vcf.gz \\ - $vcf + $vcf \\ + | bcftools view \\ + $args2 \\ + --output ${prefix}.${extension} cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -40,10 +49,16 @@ process BCFTOOLS_REHEADER { """ stub: + def args2 = task.ext.args2 ?: '--output-type z' def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args2.contains("--output-type b") || args2.contains("-Ob") ? "bcf.gz" : + args2.contains("--output-type u") || args2.contains("-Ou") ? "bcf" : + args2.contains("--output-type z") || args2.contains("-Oz") ? "vcf.gz" : + args2.contains("--output-type v") || args2.contains("-Ov") ? "vcf" : + "vcf" """ - touch ${prefix}.vcf.gz + touch ${prefix}.${extension} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/bcftools/reheader/meta.yml b/modules/nf-core/bcftools/reheader/meta.yml index 6718f42d..44d75fdf 100644 --- a/modules/nf-core/bcftools/reheader/meta.yml +++ b/modules/nf-core/bcftools/reheader/meta.yml @@ -44,8 +44,9 @@ output: pattern: "versions.yml" - vcf: type: file - description: VCF with updated header - pattern: "*.{vcf.gz}" + description: VCF with updated header, bgzipped per default + pattern: "*.{vcf,vcf.gz,bcf,bcf.gz}" authors: - "@bjohnnyd" + - "@jemten" diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index 3df21765..800a6099 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.13" + conda "bioconda::multiqc=1.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.14--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py index e55b8d43..da033408 100755 --- a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -4,11 +4,10 @@ """Provide functions to merge multiple versions.yml files.""" +import yaml import platform from textwrap import dedent -import yaml - def _make_versions_html(versions): """Generate a tabular HTML output of all versions for MultiQC.""" diff --git a/modules/nf-core/delly/call/delly-call.diff b/modules/nf-core/delly/call/delly-call.diff index 04eaf9f5..87313655 100644 --- a/modules/nf-core/delly/call/delly-call.diff +++ b/modules/nf-core/delly/call/delly-call.diff @@ -1,32 +1,14 @@ Changes in module 'nf-core/delly/call' --- modules/nf-core/delly/call/main.nf +++ modules/nf-core/delly/call/main.nf -@@ -14,7 +14,7 @@ - - output: - tuple val(meta), path("*.{bcf,vcf.gz}") , emit: bcf -- tuple val(meta), path("*.csi") , emit: csi, optional:true -+ tuple val(meta), path("*.{csi,tbi}") , emit: csi, optional:true - path "versions.yml" , emit: versions - - when: @@ -29,7 +29,7 @@ def exclude = exclude_bed ? "--exclude ${exclude_bed}" : "" def bcf_output = suffix == "bcf" ? "--outfile ${prefix}.bcf" : "" -- def vcf_output = suffix == "vcf" ? "| bgzip ${args2} --threads ${task.cpus} --stdout > ${prefix}.vcf.gz" : "" -+ def vcf_output = suffix == "vcf" ? "| sed 's/CONSENSUS/SVINSSEQ/g' | bgzip ${args2} --threads ${task.cpus} --stdout > ${prefix}.vcf.gz" : "" +- def vcf_output = suffix == "vcf" ? "| bgzip ${args2} --threads ${task.cpus} --stdout > ${prefix}.vcf.gz && tabix ${prefix}.vcf.gz" : "" ++ def vcf_output = suffix == "vcf" ? "| sed 's/CONSENSUS/SVINSSEQ/g' | bgzip ${args2} --threads ${task.cpus} --stdout > ${prefix}.vcf.gz && tabix ${prefix}.vcf.gz" : "" def genotype = vcf ? "--vcffile ${vcf}" : "" -@@ -43,6 +43,8 @@ - ${exclude} \\ - ${input} \\ - ${vcf_output} -+ -+ tabix ${prefix}.vcf.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": ************************************************************ diff --git a/modules/nf-core/delly/call/main.nf b/modules/nf-core/delly/call/main.nf index a87cbd36..f78fb1b4 100644 --- a/modules/nf-core/delly/call/main.nf +++ b/modules/nf-core/delly/call/main.nf @@ -14,7 +14,7 @@ process DELLY_CALL { output: tuple val(meta), path("*.{bcf,vcf.gz}") , emit: bcf - tuple val(meta), path("*.{csi,tbi}") , emit: csi, optional:true + tuple val(meta), path("*.{csi,tbi}") , emit: csi path "versions.yml" , emit: versions when: @@ -29,7 +29,7 @@ process DELLY_CALL { def exclude = exclude_bed ? "--exclude ${exclude_bed}" : "" def bcf_output = suffix == "bcf" ? "--outfile ${prefix}.bcf" : "" - def vcf_output = suffix == "vcf" ? "| sed 's/CONSENSUS/SVINSSEQ/g' | bgzip ${args2} --threads ${task.cpus} --stdout > ${prefix}.vcf.gz" : "" + def vcf_output = suffix == "vcf" ? "| sed 's/CONSENSUS/SVINSSEQ/g' | bgzip ${args2} --threads ${task.cpus} --stdout > ${prefix}.vcf.gz && tabix ${prefix}.vcf.gz" : "" def genotype = vcf ? 
"--vcffile ${vcf}" : "" @@ -44,8 +44,6 @@ process DELLY_CALL { ${input} \\ ${vcf_output} - tabix ${prefix}.vcf.gz - cat <<-END_VERSIONS > versions.yml "${task.process}": delly: \$( echo \$(delly --version 2>&1) | sed 's/^.*Delly version: v//; s/ using.*\$//') @@ -56,8 +54,8 @@ process DELLY_CALL { def prefix = task.ext.prefix ?: "${meta.id}" def suffix = task.ext.suffix ?: "bcf" - def bcf_output = suffix == "bcf" ? "touch ${prefix}.bcf" : "" - def vcf_output = suffix == "vcf" ? "touch ${prefix}.vcf.gz" : "" + def bcf_output = suffix == "bcf" ? "touch ${prefix}.bcf && touch ${prefix}.bcf.csi" : "" + def vcf_output = suffix == "vcf" ? "touch ${prefix}.vcf.gz && touch ${prefix}.vcf.gz.tbi" : "" """ ${bcf_output} diff --git a/modules/nf-core/delly/call/meta.yml b/modules/nf-core/delly/call/meta.yml index f8d06643..56a9e46b 100644 --- a/modules/nf-core/delly/call/meta.yml +++ b/modules/nf-core/delly/call/meta.yml @@ -11,8 +11,8 @@ tools: description: Structural variant discovery by integrated paired-end and split-read analysis homepage: https://github.com/dellytools/delly documentation: https://github.com/dellytools/delly/blob/master/README.md - tool_dev_url: None - doi: "DOI:10.1093/bioinformatics/bts378" + + doi: "10.1093/bioinformatics/bts378" licence: ["BSD-3-Clause"] input: @@ -65,9 +65,9 @@ output: description: Called variants in BCF/VCF format. Specify either "bcf" or "vcf" in ext.suffix to define the output type pattern: "*.{bcf,vcf.gz}" - csi: - type: file - description: A generated csi index that matches the bcf output (not generated for vcf files) - pattern: "*.{bcf.csi}" + type: file + description: A generated csi index that matches the bcf output (not generated for vcf files) + pattern: "*.{bcf.csi}" authors: - "@projectoriented" diff --git a/modules/nf-core/ensemblvep/vep/main.nf b/modules/nf-core/ensemblvep/vep/main.nf new file mode 100644 index 00000000..e9d25f3d --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/main.nf @@ -0,0 +1,72 @@ +process ENSEMBLVEP_VEP { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::ensembl-vep=108.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ensembl-vep:108.2--pl5321h4a94de4_0' : + 'quay.io/biocontainers/ensembl-vep:108.2--pl5321h4a94de4_0' }" + + input: + tuple val(meta), path(vcf) + val genome + val species + val cache_version + path cache + path fasta + path extra_files + + output: + tuple val(meta), path("*.vcf.gz") , optional:true, emit: vcf + tuple val(meta), path("*.tab.gz") , optional:true, emit: tab + tuple val(meta), path("*.json.gz") , optional:true, emit: json + path "*.summary.html" , emit: report + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def file_extension = args.contains("--vcf") ? 'vcf' : args.contains("--json")? 'json' : args.contains("--tab")? 'tab' : 'vcf' + def compress_cmd = args.contains("--compress_output") ? '' : '--compress_output bgzip' + def prefix = task.ext.prefix ?: "${meta.id}" + def dir_cache = cache ? "\${PWD}/${cache}" : "/.vep" + def reference = fasta ? 
"--fasta $fasta" : "" + + """ + vep \\ + -i $vcf \\ + -o ${prefix}.${file_extension}.gz \\ + $args \\ + $compress_cmd \\ + $reference \\ + --assembly $genome \\ + --species $species \\ + --cache \\ + --cache_version $cache_version \\ + --dir_cache $dir_cache \\ + --fork $task.cpus \\ + --stats_file ${prefix}.summary.html \\ + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.ann.vcf.gz + touch ${prefix}.ann.tab.gz + touch ${prefix}.ann.json.gz + touch ${prefix}.summary.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/ensemblvep/vep/meta.yml b/modules/nf-core/ensemblvep/vep/meta.yml new file mode 100644 index 00000000..3a4f8d1d --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/meta.yml @@ -0,0 +1,74 @@ +name: ENSEMBLVEP_VEP +description: Ensembl Variant Effect Predictor (VEP). The output-file-format is controlled through `task.ext.args`. +keywords: + - annotation +tools: + - ensemblvep: + description: | + VEP determines the effect of your variants (SNPs, insertions, deletions, CNVs + or structural variants) on genes, transcripts, and protein sequence, as well as regulatory regions. + homepage: https://www.ensembl.org/info/docs/tools/vep/index.html + documentation: https://www.ensembl.org/info/docs/tools/vep/script/index.html + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: | + vcf to annotate + - genome: + type: value + description: | + which genome to annotate with + - species: + type: value + description: | + which species to annotate with + - cache_version: + type: value + description: | + which version of the cache to annotate with + - cache: + type: file + description: | + path to VEP cache (optional) + - fasta: + type: file + description: | + reference FASTA file (optional) + pattern: "*.{fasta,fa}" + - extra_files: + type: tuple + description: | + path to file(s) needed for plugins (optional) +output: + - vcf: + type: file + description: | + annotated vcf (optional) + pattern: "*.ann.vcf.gz" + - tab: + type: file + description: | + tab file with annotated variants (optional) + pattern: "*.ann.tab.gz" + - json: + type: file + description: | + json file with annotated variants (optional) + pattern: "*.ann.json.gz" + - report: + type: file + description: VEP report file + pattern: "*.html" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" + - "@matthdsm" diff --git a/modules/nf-core/gatk4/collectreadcounts/main.nf b/modules/nf-core/gatk4/collectreadcounts/main.nf deleted file mode 100644 index 37a90da0..00000000 --- a/modules/nf-core/gatk4/collectreadcounts/main.nf +++ /dev/null @@ -1,68 +0,0 @@ -process GATK4_COLLECTREADCOUNTS { - tag "$meta.id" - label 'process_medium' - - conda "bioconda::gatk4=4.3.0.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/gatk4:4.3.0.0--py36hdfd78af_0': - 'quay.io/biocontainers/gatk4:4.3.0.0--py36hdfd78af_0' }" - - input: - tuple val(meta), path(input), path(input_index), path(intervals) - path(fasta) - path(fai) - path(dict) - - output: - tuple val(meta), path("*.hdf5"), optional: true, emit: hdf5 - tuple val(meta), path("*.tsv") , optional: true, emit: tsv - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - - def reference = fasta ? "--reference $fasta" : "" - def extension = args.contains("--format HDF5") ? "hdf5" : - args.contains("--format TSV") ? "tsv" : - "hdf5" - - def avail_mem = 3 - if (!task.memory) { - log.info '[GATK COLLECTREADCOUNTS] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' - } else { - avail_mem = task.memory.giga - } - """ - gatk --java-options "-Xmx${avail_mem}g" CollectReadCounts \\ - --input $input \\ - --intervals $intervals \\ - --output ${prefix}.$extension \\ - $reference \\ - --tmp-dir . \\ - $args - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def extension = args.contains("--format HDF5") ? "hdf5" : - args.contains("--format TSV") ? "tsv" : - "hdf5" - """ - touch ${prefix}.${extension} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/gatk4/collectreadcounts/meta.yml b/modules/nf-core/gatk4/collectreadcounts/meta.yml deleted file mode 100644 index 1dbddc59..00000000 --- a/modules/nf-core/gatk4/collectreadcounts/meta.yml +++ /dev/null @@ -1,72 +0,0 @@ -name: "gatk4_collectreadcounts" -description: Collects read counts at specified intervals. The count for each interval is calculated by counting the number of read starts that lie in the interval. -keywords: - - bam - - cram - - CollectReadCounts - - gatk - - gatk4 -tools: - - gatk4: - description: - Genome Analysis Toolkit (GATK4). Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools - with a primary focus on variant discovery and genotyping. Its powerful processing engine - and high-performance computing features make it capable of taking on projects of any size. - homepage: https://gatk.broadinstitute.org/hc/en-us - documentation: https://gatk.broadinstitute.org/hc/en-us/articles/360037593911-CombineGVCFs - tool_dev_url: https://github.com/broadinstitute/gatk - doi: 10.1158/1538-7445.AM2017-3590 - licence: ["Apache-2.0"] - -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - bam: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - - bai: - type: file - description: BAM/CRAM/SAM index file - pattern: "*.{bai,crai,sai}" - - intervals: - type: file - description: A file containing the specified intervals - pattern: "*.{bed,intervals}" - - fasta: - type: file - description: Optional - Reference FASTA - pattern: "*.{fasta,fa}" - - fai: - type: file - description: Optional - Index of the reference FASTA file - pattern: "*.fai" - - dict: - type: file - description: Optional - Sequence dictionary of the reference FASTA file - pattern: "*.dict" - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - hdf5: - type: file - description: The read counts in hdf5 format - pattern: "*.hdf5" - - tsv: - type: file - description: The read counts in TSV format - pattern: "*.tsv" - -authors: - - "@nvnieuwk" diff --git a/modules/nf-core/gatk4/collectsvevidence/main.nf b/modules/nf-core/gatk4/collectsvevidence/main.nf deleted file mode 100644 index 235f019b..00000000 --- a/modules/nf-core/gatk4/collectsvevidence/main.nf +++ /dev/null @@ -1,56 +0,0 @@ -process GATK4_COLLECTSVEVIDENCE { - tag "$meta.id" - label 'process_low' - - conda "bioconda::gatk4=4.3.0.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/gatk4:4.3.0.0--py36hdfd78af_0': - 'quay.io/biocontainers/gatk4:4.3.0.0--py36hdfd78af_0' }" - - input: - tuple val(meta), path(input), path(input_index), path(site_depth_vcf), path(site_depth_vcf_tbi) - path fasta - path fasta_fai - path dict - - output: - tuple val(meta), path("*.sr.txt.gz") , emit: split_read_evidence - tuple val(meta), path("*.sr.txt.gz.tbi"), emit: split_read_evidence_index - tuple val(meta), path("*.pe.txt.gz") , emit: paired_end_evidence - tuple val(meta), path("*.pe.txt.gz.tbi"), emit: paired_end_evidence_index - tuple val(meta), path("*.sd.txt.gz") , emit: site_depths, optional:true - tuple val(meta), path("*.sd.txt.gz.tbi"), emit: site_depths_index, optional:true - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - - def sd_vcf = site_depth_vcf ? "--sd-file ${prefix}.sd.txt.gz --site-depth-locs-vcf ${site_depth_vcf}" : "" - def reference = fasta ? "--reference ${fasta}" : "" - - def avail_mem = 3 - if (!task.memory) { - log.info '[GATK COLLECTSVEVIDENCE] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' - } else { - avail_mem = task.memory.giga - } - """ - gatk --java-options "-Xmx${avail_mem}g" CollectSVEvidence \\ - ${args} \\ - --input ${input} \\ - --sr-file ${prefix}.sr.txt.gz \\ - --pe-file ${prefix}.pe.txt.gz \\ - ${sd_vcf} \\ - ${reference} \\ - --tmp-dir . 
\\ - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/gatk4/collectsvevidence/meta.yml b/modules/nf-core/gatk4/collectsvevidence/meta.yml deleted file mode 100644 index 6e9f1f7a..00000000 --- a/modules/nf-core/gatk4/collectsvevidence/meta.yml +++ /dev/null @@ -1,88 +0,0 @@ -name: "gatk4_collectsvevidence" -description: Gathers paired-end and split read evidence files for use in the GATK-SV pipeline. Output files are a file containing the location of and orientation of read pairs marked as discordant, and a file containing the clipping location of all soft clipped reads and the orientation of the clipping. -keywords: - - gatk4 - - collectsvevidence - - structural variants - - metrics -tools: - - "gatk4": - description: "Genome Analysis Toolkit (GATK4)" - homepage: "https://gatk.broadinstitute.org/hc/en-us" - documentation: "None" - tool_dev_url: "https://github.com/broadinstitute/gatk" - doi: "10.1158/1538-7445.AM2017-3590" - licence: "['BSD-3-clause']" - -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - input: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - - input_index: - type: file - description: Index of the BAM/CRAM/SAM file - pattern: "*.{bai,crai,sai}" - - site_depth_vcf: - type: file - description: Optional - input VCF of SNPs marking loci for site depths, needed for the site depths output - pattern: "*.vcf.gz" - - site_depth_vcf_index: - type: file - description: Optional - index of the VCF file, needed for the site depths output - pattern: "*.tbi" - - fasta: - type: file - description: Optional - reference FASTA file needed when the input is a CRAM file - pattern: "*.{fasta,fa}" - - fasta_fai: - type: file - description: Optional - index of the reference FASTA file needed when the input is a CRAM file - pattern: "*.fai" - - dict: - type: file - description: Optional - sequence dictionary of the reference FASTA file needed when the input is a CRAM file - pattern: "*.dict" - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - split_read_evidence: - type: file - description: Output file for split read evidence - pattern: "*.sr.txt.gz" - - split_read_evidence_index: - type: file - description: Index of the output file for split read evidence - pattern: "*.sr.txt.gz.tbi" - - paired_end_evidence: - type: file - description: Output file for paired end evidence - pattern: "*.pe.txt.gz" - - paired_end_evidence_index: - type: file - description: Index of the output file for paired end evidence - pattern: "*.pe.txt.gz.tbi" - - site_depths: - type: file - description: Output file for site depths - pattern: "*.sd.txt.gz" - - site_depths_index: - type: file - description: Index of the output file for site depths - pattern: "*.sd.txt.gz.tbi" - -authors: - - "@nvnieuwk" diff --git a/modules/nf-core/gatk4/createsequencedictionary/main.nf b/modules/nf-core/gatk4/createsequencedictionary/main.nf deleted file mode 100644 index bc324ada..00000000 --- a/modules/nf-core/gatk4/createsequencedictionary/main.nf +++ /dev/null @@ -1,51 +0,0 @@ -process GATK4_CREATESEQUENCEDICTIONARY { - tag "$fasta" - label 'process_medium' - - conda "bioconda::gatk4=4.3.0.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/gatk4:4.3.0.0--py36hdfd78af_0': - 'quay.io/biocontainers/gatk4:4.3.0.0--py36hdfd78af_0' }" - - input: - path fasta - - output: - path "*.dict" , emit: dict - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - - def avail_mem = 6 - if (!task.memory) { - log.info '[GATK CreateSequenceDictionary] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.' - } else { - avail_mem = task.memory.giga - } - """ - gatk --java-options "-Xmx${avail_mem}g" CreateSequenceDictionary \\ - --REFERENCE $fasta \\ - --URI $fasta \\ - --TMP_DIR . \\ - $args - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') - END_VERSIONS - """ - - stub: - """ - touch ${fasta.baseName}.dict - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/gatk4/createsequencedictionary/meta.yml b/modules/nf-core/gatk4/createsequencedictionary/meta.yml deleted file mode 100644 index bd247888..00000000 --- a/modules/nf-core/gatk4/createsequencedictionary/meta.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: gatk4_createsequencedictionary -description: Creates a sequence dictionary for a reference sequence -keywords: - - dictionary - - fasta -tools: - - gatk: - description: | - Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools - with a primary focus on variant discovery and genotyping. Its powerful processing engine - and high-performance computing features make it capable of taking on projects of any size. 
- homepage: https://gatk.broadinstitute.org/hc/en-us - documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s - doi: 10.1158/1538-7445.AM2017-3590 - licence: ["Apache-2.0"] - -input: - - fasta: - type: file - description: Input fasta file - pattern: "*.{fasta,fa}" -output: - - dict: - type: file - description: gatk dictionary file - pattern: "*.{dict}" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@maxulysse" diff --git a/modules/nf-core/gatk4/printsvevidence/main.nf b/modules/nf-core/gatk4/printsvevidence/main.nf deleted file mode 100644 index 942f4ee7..00000000 --- a/modules/nf-core/gatk4/printsvevidence/main.nf +++ /dev/null @@ -1,66 +0,0 @@ -process GATK4_PRINTSVEVIDENCE { - tag "${meta.id}" - label 'process_single' - - conda "bioconda::gatk4=4.3.0.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/gatk4:4.3.0.0--py36hdfd78af_0': - 'quay.io/biocontainers/gatk4:4.3.0.0--py36hdfd78af_0' }" - - input: - tuple val(meta), path(evidence_files), path(evidence_indices) - path bed - path fasta - path fasta_fai - path dict - - output: - tuple val(meta), path("*.txt.gz") , emit: printed_evidence - tuple val(meta), path("*.txt.gz.tbi") , emit: printed_evidence_index - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def intervals = bed ? "--intervals ${bed}" : "" - def reference = fasta ? "--reference ${fasta}" : "" - def input_files = evidence_files.collect({"--evidence-file $it"}).join(' ') - - def file_name = evidence_files[0].getFileName() - - def file_type = file_name =~ ".sr.txt" ? "sr" : - file_name =~ ".pe.txt" ? "pe" : - file_name =~ ".baf.txt" ? "baf" : - file_name =~ ".rd.txt" ? "rd" : - false - - if(!file_type){ - error("The input file name should contain one of the following: '.sr.txt', '.pe.txt', '.baf.txt', '.rd.txt'") - } - - def avail_mem = 3 - if (!task.memory) { - log.info '[GATK PRINTSVEVIDENCE] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' - } else { - avail_mem = task.memory.giga - } - - """ - gatk --java-options "-Xmx${avail_mem}g" PrintSVEvidence \\ - ${input_files} \\ - --sequence-dictionary ${dict} \\ - ${intervals} \\ - ${reference} \\ - --output ${prefix}.${file_type}.txt.gz \\ - --tmp-dir . \\ - ${args} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/gatk4/printsvevidence/meta.yml b/modules/nf-core/gatk4/printsvevidence/meta.yml deleted file mode 100644 index 08463fa5..00000000 --- a/modules/nf-core/gatk4/printsvevidence/meta.yml +++ /dev/null @@ -1,68 +0,0 @@ -name: "gatk4_printsvevidence" -description: WARNING - this tool is still experimental and shouldn't be used in a production setting. Gathers paired-end and split read evidence files for use in the GATK-SV pipeline. Output files are a file containing the location of and orientation of read pairs marked as discordant, and a file containing the clipping location of all soft clipped reads and the orientation of the clipping. 
-keywords: - - structural variants - - gatk4 -tools: - - gatk4: - description: | - Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools - with a primary focus on variant discovery and genotyping. Its powerful processing engine - and high-performance computing features make it capable of taking on projects of any size. - homepage: https://gatk.broadinstitute.org/hc/en-us - documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s - doi: 10.1158/1538-7445.AM2017-3590 - licence: ["Apache-2.0"] - -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - evidence_files: - type: files - description: The evidence files created by CollectSVEvidence. They all need to be of the same type to print the SV evidence. - pattern: "*.{pe,sr,baf,rd}.txt(.gz)" - - evidence_indices: - type: files - description: The indices of the evidence files created by CollectSVEvidence - pattern: "*.{pe,sr,baf,rd}.txt(.gz).tbi" - - bed: - type: file - description: An optional BED file - pattern: "*.bed" - - fasta: - type: file - description: An optional reference FASTA file - pattern: "*.{fa,fasta}" - - fasta_fai: - type: file - description: An optional reference FASTA file index - pattern: "*.fai" - - dict: - type: file - description: The mandatory sequence dictionary file - pattern: "*.dict" - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - printed_evidence: - type: file - description: The output file containing the discordant read pairs or the soft clipped reads - pattern: "*.{pe,sr,baf,rd}.txt.gz" - - printed_evidence: - type: file - description: The index of the output file containing the discordant read pairs or the soft clipped reads - pattern: "*.{pe,sr,baf,rd}.txt.gz.tbi" - -authors: - - "@nvnieuwk" diff --git a/modules/nf-core/gatk4/sitedepthtobaf/main.nf b/modules/nf-core/gatk4/sitedepthtobaf/main.nf deleted file mode 100644 index 986836a6..00000000 --- a/modules/nf-core/gatk4/sitedepthtobaf/main.nf +++ /dev/null @@ -1,53 +0,0 @@ -process GATK4_SITEDEPTHTOBAF { - tag "$meta.id" - label 'process_single' - - conda "bioconda::gatk4=4.3.0.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/gatk4:4.3.0.0--py36hdfd78af_0': - 'quay.io/biocontainers/gatk4:4.3.0.0--py36hdfd78af_0' }" - - input: - tuple val(meta), path(site_depths), path(site_depths_indices) - tuple path(vcf), path(tbi) - path fasta - path fasta_fai - path dict - - output: - tuple val(meta), path("*.baf.txt.gz") , emit: baf - tuple val(meta), path("*.baf.txt.gz.tbi") , emit: baf_tbi - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - - def site_depth_input = site_depths.collect({"--site-depth ${it}"}).join(" ") - def reference = fasta ? "--reference ${fasta}" : "" - - def avail_mem = 3 - if (!task.memory) { - log.info '[GATK SiteDepthtoBAF] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' 
- } else { - avail_mem = task.memory.giga - } - - """ - gatk --java-options "-Xmx${avail_mem}g" SiteDepthtoBAF \\ - --baf-evidence-output ${prefix}.baf.txt.gz \\ - --baf-sites-vcf ${vcf} \\ - ${site_depth_input} \\ - ${reference} \\ - --tmp-dir . \\ - ${args} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/gatk4/sitedepthtobaf/meta.yml b/modules/nf-core/gatk4/sitedepthtobaf/meta.yml deleted file mode 100644 index 4beb28a6..00000000 --- a/modules/nf-core/gatk4/sitedepthtobaf/meta.yml +++ /dev/null @@ -1,71 +0,0 @@ -name: "gatk4_sitedepthtobaf" -description: EXPERIMENTAL TOOL! Convert SiteDepth to BafEvidence -keywords: - - gatk4 - - site depth - - BAF -tools: - - gatk4: - description: Genome Analysis Toolkit (GATK4) - homepage: https://gatk.broadinstitute.org/hc/en-us - documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s - tool_dev_url: https://github.com/broadinstitute/gatk - doi: "10.1158/1538-7445.AM2017-3590" - licence: ["BSD-3-clause"] - -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - site_depths: - type: file - description: Files containing site depths - pattern: "*.sd.txt.gz" - - site_depths_indices: - type: file - description: The indices of the site depth files - pattern: "*.sd.txt.gz.tbi" - - vcf: - type: file - description: Input VCF of SNPs marking loci for site depths - pattern: "*.vcf.gz" - - tbi: - type: file - description: Index of the input VCF of SNPs marking loci for site depths - pattern: "*.vcf.gz.tbi" - - fasta: - type: file - description: The reference FASTA file - pattern: "*.{fasta,fa}" - - fasta_fai: - type: file - description: The index of the reference FASTA file - pattern: "*.fai" - - dict: - type: file - description: The sequence dictionary of the reference FASTA file - pattern: "*.dict" - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - baf: - type: file - description: The created BAF file - pattern: "*.baf.txt.gz" - - baf_tbi: - type: file - description: The index of the created BAF file - pattern: "*.baf.txt.gz.tbi" - -authors: - - "@nvnieuwk" diff --git a/modules/nf-core/gridss/gridss/meta.yml b/modules/nf-core/gridss/gridss/meta.yml index 55bbd32f..3ac3842e 100644 --- a/modules/nf-core/gridss/gridss/meta.yml +++ b/modules/nf-core/gridss/gridss/meta.yml @@ -9,10 +9,10 @@ keywords: tools: - "gridss": description: "GRIDSS: the Genomic Rearrangement IDentification Software Suite" - homepage: "None" + documentation: "https://github.com/PapenfussLab/gridss/wiki/GRIDSS-Documentation" tool_dev_url: "https://github.com/PapenfussLab/gridss" - doi: "https://doi.org/10.1186/s13059-021-02423-x" + doi: "10.1186/s13059-021-02423-x" licence: "['GPL v3']" input: diff --git a/modules/nf-core/jasminesv/meta.yml b/modules/nf-core/jasminesv/meta.yml index 86b320d4..ec232652 100644 --- a/modules/nf-core/jasminesv/meta.yml +++ b/modules/nf-core/jasminesv/meta.yml @@ -12,7 +12,7 @@ tools: homepage: "https://github.com/mkirsche/Jasmine/wiki/Jasmine-User-Manual" documentation: "https://github.com/mkirsche/Jasmine/wiki/Jasmine-User-Manual" tool_dev_url: "https://github.com/mkirsche/Jasmine" - doi: "" + licence: "['MIT']" input: diff --git a/modules/nf-core/paragraph/idxdepth/meta.yml b/modules/nf-core/paragraph/idxdepth/meta.yml index b3acd15a..26e42323 100644 --- a/modules/nf-core/paragraph/idxdepth/meta.yml +++ b/modules/nf-core/paragraph/idxdepth/meta.yml @@ -11,7 +11,7 @@ tools: homepage: "https://github.com/Illumina/paragraph" documentation: "https://github.com/Illumina/paragraph" tool_dev_url: "https://github.com/Illumina/paragraph" - doi: "https://doi.org/10.1101/635011" + doi: "10.1101/635011" licence: "['Apache License 2.0']" input: diff --git a/modules/nf-core/samtools/convert/meta.yml b/modules/nf-core/samtools/convert/meta.yml index 937b1403..866c228f 100644 --- a/modules/nf-core/samtools/convert/meta.yml +++ b/modules/nf-core/samtools/convert/meta.yml @@ -12,7 +12,7 @@ tools: short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. These files are generated as output by short read aligners like BWA. 
homepage: http://www.htslib.org/ - documentation: hhttp://www.htslib.org/doc/samtools.html + documentation: http://www.htslib.org/doc/samtools.html doi: 10.1093/bioinformatics/btp352 licence: ["MIT"] input: diff --git a/modules/nf-core/scramble/clusteranalysis/meta.yml b/modules/nf-core/scramble/clusteranalysis/meta.yml index 15adf7a6..0be659d0 100644 --- a/modules/nf-core/scramble/clusteranalysis/meta.yml +++ b/modules/nf-core/scramble/clusteranalysis/meta.yml @@ -9,7 +9,7 @@ tools: homepage: "https://github.com/GeneDx/scramble" documentation: "https://github.com/GeneDx/scramble" tool_dev_url: "https://github.com/GeneDx/scramble" - doi: "" + licence: "['CC']" input: diff --git a/modules/nf-core/scramble/clusteridentifier/meta.yml b/modules/nf-core/scramble/clusteridentifier/meta.yml index b1b733fb..f8c0236b 100644 --- a/modules/nf-core/scramble/clusteridentifier/meta.yml +++ b/modules/nf-core/scramble/clusteridentifier/meta.yml @@ -10,7 +10,7 @@ tools: homepage: "https://github.com/GeneDx/scramble" documentation: "https://github.com/GeneDx/scramble" tool_dev_url: "https://github.com/GeneDx/scramble" - doi: "" + licence: "['CC']" input: diff --git a/modules/nf-core/smoove/call/meta.yml b/modules/nf-core/smoove/call/meta.yml index 1f886eca..6b22efe9 100644 --- a/modules/nf-core/smoove/call/meta.yml +++ b/modules/nf-core/smoove/call/meta.yml @@ -11,7 +11,7 @@ tools: homepage: https://github.com/brentp/smoove documentation: https://brentp.github.io/post/smoove/ tool_dev_url: https://github.com/brentp/smoove - doi: "" + licence: "['Apache-2.0']" input: diff --git a/modules/nf-core/svtk/standardize/main.nf b/modules/nf-core/svtk/standardize/main.nf deleted file mode 100644 index 1fab17e7..00000000 --- a/modules/nf-core/svtk/standardize/main.nf +++ /dev/null @@ -1,60 +0,0 @@ -process SVTK_STANDARDIZE { - tag "$meta.id" - label 'process_low' - - conda "bioconda::svtk=0.0.20190615" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/svtk:0.0.20190615--py37h73a75cf_2': - 'quay.io/biocontainers/svtk:0.0.20190615--py37h73a75cf_2' }" - - input: - tuple val(meta), path(vcf) - path fasta_fai - - output: - tuple val(meta), path("*.std.vcf.gz"), emit: standardized_vcf - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - - def arguments = args.args ?: '' - def caller = args.caller ?: 'delly' - - def VERSION = '0.0.20190615' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. - - def contigs = fasta_fai ? "--contigs ${fasta_fai}" : "" - - """ - svtk standardize \\ - ${arguments} \\ - ${contigs} \\ - ${vcf} \\ - ${prefix}.std.vcf.gz \\ - ${caller} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - svtk: ${VERSION} - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - - def VERSION = '0.0.20190615' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. 
- - """ - touch ${prefix}.std.vcf.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - svtk: ${VERSION} - END_VERSIONS - """ -} diff --git a/modules/nf-core/svtk/standardize/meta.yml b/modules/nf-core/svtk/standardize/meta.yml deleted file mode 100644 index 13d551cf..00000000 --- a/modules/nf-core/svtk/standardize/meta.yml +++ /dev/null @@ -1,61 +0,0 @@ -name: "svtk_standardize" -description: Convert SV calls to a standardized format. -keywords: - - svtk - - structural variants - - SV - - vcf - - standardization -tools: - - "svtk": - description: "Utilities for consolidating, filtering, resolving, and annotating structural variants." - homepage: "https://github.com/broadinstitute/gatk-sv/tree/master/src/svtk" - documentation: "https://github.com/broadinstitute/gatk-sv/tree/master/src/svtk" - tool_dev_url: "https://github.com/broadinstitute/gatk-sv/tree/master/src/svtk" - doi: "" - licence: "['MIT']" - -input: - - args: - type: map - description: | - Groovy Map containing tool parameters. MUST follow the structure/keywords below and be provided via modules.config. Parameters must be set between quotes. - ``` - { - [ - "args": "", - "caller": "delly" // Should be either delly, lumpy, manta, wham or melt - ] - } - ``` - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - vcf: - type: file - description: A gzipped VCF file to be standardized - pattern: "*.vcf.gz" - - fasta_fai: - type: file - description: Optional fasta index file that specifies the contigs to be used in the VCF header (defaults to all contigs of GRCh37) - pattern: "*.fai" - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - standardized_vcf: - type: file - description: A gzipped version of the standardized VCF file - pattern: "*.std.vcf.gz" - -authors: - - "@nvnieuwk" diff --git a/modules/nf-core/svtk/standardize/svtk-standardize.diff b/modules/nf-core/svtk/standardize/svtk-standardize.diff deleted file mode 100644 index 46c1465d..00000000 --- a/modules/nf-core/svtk/standardize/svtk-standardize.diff +++ /dev/null @@ -1,25 +0,0 @@ -Changes in module 'nf-core/svtk/standardize' ---- modules/nf-core/svtk/standardize/main.nf -+++ modules/nf-core/svtk/standardize/main.nf -@@ -42,4 +42,19 @@ - svtk: ${VERSION} - END_VERSIONS - """ -+ -+ stub: -+ def args = task.ext.args ?: '' -+ def prefix = task.ext.prefix ?: "${meta.id}" -+ -+ def VERSION = '0.0.20190615' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. 
-+ -+ """ -+ touch ${prefix}.std.vcf.gz -+ -+ cat <<-END_VERSIONS > versions.yml -+ "${task.process}": -+ svtk: ${VERSION} -+ END_VERSIONS -+ """ - } - -************************************************************ diff --git a/modules/nf-core/whamg/meta.yml b/modules/nf-core/whamg/meta.yml index bc9042ac..25a0f169 100644 --- a/modules/nf-core/whamg/meta.yml +++ b/modules/nf-core/whamg/meta.yml @@ -12,7 +12,7 @@ tools: homepage: "https://github.com/zeeev/wham" documentation: "https://github.com/zeeev/wham" tool_dev_url: "https://github.com/zeeev/wham" - doi: "https://doi.org/10.1371/journal.pcbi.1004572" + doi: "10.1371/journal.pcbi.1004572" licence: "['MIT']" input: diff --git a/nextflow.config b/nextflow.config index 307c5f20..b8f1569e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,68 +10,70 @@ params { // Input options - input = null - - // Common options - run_module_metrics = true + input = null // Pipeline parameters - callers = "manta" - allele_loci_vcf = null + callers = "manta" + output_callers = false + callers_support = 1 + annotate = false // Delly parameters - delly_sv_types = "ALL" - delly_scatter_size = 60000000 - delly_map_qual = 1 - delly_min_clique_size = 2 + delly_sv_types = "ALL" + // delly_scatter_size = 60000000 + delly_map_qual = 1 + delly_min_clique_size = 2 - // Settings - output_callers = false + // VEP options + vep_cache_version = "108" + vep_version = "108.2" + species = "homo_sapiens" + vep_structuralvariantoverlap = false + vep_phenotypes = false // MultiQC options - multiqc_config = null - multiqc_title = null - multiqc_logo = null - max_multiqc_email_size = '25.MB' - multiqc_methods_description = null + multiqc_config = null + multiqc_title = null + multiqc_logo = null + max_multiqc_email_size = '25.MB' + multiqc_methods_description = null // Import data options - genome = "GRCh38" - genomes_base = "/references/" - genomes_ignore = false - cmgg_config_base = "/conf/" - igenomes_base = 's3://ngi-igenomes/igenomes' - igenomes_ignore = true + genome = "GRCh38" + genomes_base = "/references/" + genomes_ignore = false + cmgg_config_base = "/conf/" + igenomes_base = 's3://ngi-igenomes/igenomes' + igenomes_ignore = true // Boilerplate options - outdir = null - tracedir = "${params.outdir}/pipeline_info" - publish_dir_mode = 'copy' - email = null - email_on_fail = null - plaintext_email = false - monochrome_logs = false - hook_url = null - help = false - version = false - validate_params = true - show_hidden_params = false - schema_ignore_params = 'genomes,test_data' + outdir = null + tracedir = "${params.outdir}/pipeline_info" + publish_dir_mode = 'copy' + email = null + email_on_fail = null + plaintext_email = false + monochrome_logs = false + hook_url = null + help = false + version = false + validate_params = true + show_hidden_params = false + schema_ignore_params = 'genomes,test_data' // Config options - custom_config_version = 'master' - custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" - config_profile_description = null - config_profile_contact = null - config_profile_url = null - config_profile_name = null - + custom_config_version = 'master' + custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" + config_profile_description = null + config_profile_contact = null + config_profile_url = null + config_profile_name = null // Max resource options // Defaults only, expecting to be overwritten - max_memory = '128.GB' - max_cpus = 16 - max_time = 
'240.h' + max_memory = '128.GB' + max_cpus = 16 + max_time = '240.h' } diff --git a/nextflow_schema.json b/nextflow_schema.json index c08e0c35..3debf2de 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -60,22 +60,17 @@ "mimetype": "text/plain", "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", "description": "Path to FASTA genome file.", - "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", + "help_text": "This parameter is *mandatory* if `--genome` is not specified. ", "fa_icon": "far fa-file-code" }, - "fasta_fai": { + "fai": { "type": "string", - "default": "None", - "description": "Path to the index of the FASTA genome file" - }, - "dict": { - "type": "string", - "default": "None", - "description": "Path to the sequence dictionary of the FASTA genome file" + "default": "None" }, "bwa": { "type": "string", - "default": "None" + "default": "None", + "description": "Path to the BWA index folder" }, "igenomes_base": { "type": "string", @@ -294,47 +289,42 @@ } } }, - "new_group_1": { - "title": "New Group 1", + "pipeline_specific_options": { + "title": "Pipeline specific options", "type": "object", - "description": "", + "description": "Options specific to the execution of this pipeline", "default": "", "properties": { - "run_module_metrics": { - "type": "boolean", - "default": true, - "description": "Set to true if metrics have to be run on the VCFs" - }, "callers": { "type": "string", "default": "manta", - "description": "A comma-seperated list of callers to use" - }, - "allele_loci_vcf": { - "type": "string", - "default": "None", - "description": "Path to the Allele loci VCF" + "description": "A comma-separated list of callers to use. Can be one or more of these: smoove|delly|manta" }, "output_callers": { "type": "boolean", - "description": "Output the output from different callers. Warning: This produces a lot of additional output and should only be used for testing purposes" + "description": "Output the VCF files from different callers. Warning: This produces a lot of additional output and should only be used for testing purposes" + }, + "callers_support": { + "type": "integer", + "default": 1, + "description": "The minimum number of callers that should detect a variant. Variants supported by fewer callers will be removed. 
(Only used when more than one caller is given)" + }, + "annotate": { + "type": "boolean", + "description": "Runs the annotation with Ensembl VEP when true" } - } + }, + "required": ["callers"] }, "delly_parameters": { "title": "Delly parameters", "type": "object", - "description": "", + "description": "Options specific to the Delly execution", "default": "", "properties": { - "delly_scatter_size": { - "type": "integer", - "default": 1000000, - "description": "The approximate size of regions to split for parallelizing the Delly run" - }, "delly_sv_types": { "type": "string", - "default": "DEL,DUP", + "default": "ALL", "description": "Which SV types delly should search for in the variant calling" }, "delly_map_qual": { @@ -348,6 +338,75 @@ "description": "The minimum clique size to use for delly" } } + }, + "vep_options": { + "title": "VEP options", + "type": "object", + "description": "Options for the annotation with VEP", + "default": "", + "properties": { + "vep_cache_version": { + "type": "string", + "default": "108", + "description": "The version of the VEP cache to use.", + "help_text": "This version should be present in the folder supplied by `--vep_cache`. This version should be the same as `--vep_version` when no VEP cache was given with `--vep_cache`" + }, + "vep_cache": { + "type": "string", + "default": "None", + "description": "The path to the VEP cache folder" + }, + "vep_version": { + "type": "string", + "default": "108.2", + "description": "The version of VEP to use" + }, + "species": { + "type": "string", + "default": "homo_sapiens", + "description": "The species used for the analysis. Should be all lowercase, with underscores instead of spaces." + }, + "vep_structuralvariantoverlap": { + "type": "boolean", + "description": "Use the StructuralVariantOverlap VEP plugin", + "help_text": "This requires either `--gnomad_sv` and `--gnomad_sv_tbi` and/or `--genomes1000_sv` and `--genomes1000_sv_tbi`. " + }, + "vep_phenotypes": { + "type": "boolean", + "description": "Use the Phenotypes VEP plugin", + "help_text": "This requires `--phenotypes` and `--phenotypes_tbi`. 
" + }, + "gnomad_sv": { + "type": "string", + "default": "None", + "description": "Path to the Gnomad VCF file" + }, + "gnomad_sv_tbi": { + "type": "string", + "default": "None", + "description": "Path to the Gnomad VCF index file" + }, + "genomes1000_sv": { + "type": "string", + "default": "None", + "description": "Path to the 1000 Genomes VCF file" + }, + "genomes1000_sv_tbi": { + "type": "string", + "default": "None", + "description": "Path to the 1000 Genomes VCF index file" + }, + "phenotypes": { + "type": "string", + "default": "None", + "description": "Path to the phenotypes GFF file" + }, + "phenotypes_tbi": { + "type": "string", + "default": "None", + "description": "Path to the phenotypes GFF index file" + } + } } }, "allOf": [ @@ -367,10 +426,13 @@ "$ref": "#/definitions/generic_options" }, { - "$ref": "#/definitions/new_group_1" + "$ref": "#/definitions/pipeline_specific_options" }, { "$ref": "#/definitions/delly_parameters" + }, + { + "$ref": "#/definitions/vep_options" } ] } diff --git a/subworkflows/local/bam_structural_variant_calling/main.nf b/subworkflows/local/bam_structural_variant_calling/main.nf index ec26f0d1..841a97af 100644 --- a/subworkflows/local/bam_structural_variant_calling/main.nf +++ b/subworkflows/local/bam_structural_variant_calling/main.nf @@ -9,133 +9,110 @@ include { BAM_VARIANT_CALLING_WHAMG } from '../bam_variant_c include { BAM_VARIANT_CALLING_SMOOVE } from '../bam_variant_calling_smoove/main' include { BAM_VARIANT_CALLING_SCRAMBLE } from '../bam_variant_calling_scramble/main' include { BAM_VARIANT_CALLING_GRIDSS } from '../bam_variant_calling_gridss/main' -include { VCF_METRICS_SVTK_SVTEST } from '../vcf_metrics_svtk_svtest/main' include { VCF_MERGE_JASMINE } from '../vcf_merge_jasmine/main' // Import modules -include { GATK4_COLLECTREADCOUNTS as COLLECTREADCOUNTS } from '../../../modules/nf-core/gatk4/collectreadcounts/main' -include { GATK4_COLLECTSVEVIDENCE as COLLECTSVEVIDENCE } from '../../../modules/nf-core/gatk4/collectsvevidence/main' -include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix/main' +include { VIOLA } from '../../../modules/local/viola/main' +include { REHEADER_CALLED_VCFS } from '../../../modules/local/bcftools/reheader_called_vcfs/main' + +include { TABIX_TABIX as TABIX_VCFS } from '../../../modules/nf-core/tabix/tabix/main' workflow BAM_STRUCTURAL_VARIANT_CALLING { take: - crams // channel: [mandatory] [ meta, cram, crai, bed ] => The aligned CRAMs per sample with the regions they should be called on - beds // channel: [optional] [ meta, bed, bed_gz, bed_gz_tbi ] => A channel containing the normal BED, the bgzipped BED and its index file - allele_loci_vcf // channel: [optional] [ vcf ] => A channel containing the VCF and its index for counting the alleles - fasta // channel: [mandatory] [ fasta ] => The fasta reference file - fasta_fai // channel: [mandatory] [ fasta_fai ] => The index of the fasta reference file - dict // channel: [mandatory] [ dict ] => The dictionary of the fasta reference file - bwa_index // channel: [optional] [ index ] => The BWA MEM index + ch_crams // channel: [mandatory] [ meta, cram, crai, bed ] => The aligned CRAMs per sample with the regions they should be called on + ch_beds // channel: [optional] [ meta, bed, bed_gz, bed_gz_tbi ] => A channel containing the normal BED, the bgzipped BED and its index file + ch_fasta // channel: [mandatory] [ fasta ] => The fasta reference file + ch_fai // channel: [mandatory] [ fai ] => The index of the fasta reference file + ch_bwa_index // channel: 
[optional] [ index ] => The BWA MEM index main: - callers = params.callers.tokenize(",") - - ch_versions = Channel.empty() - ch_reports = Channel.empty() - ch_metrics = Channel.empty() - called_vcfs = Channel.empty() - - // - // GATK Collect Read Counts - // - - crams - .combine( - beds.map({meta, bed, bed_gz, bed_gz_tbi -> [meta, bed]}) - , by:0) - .dump(tag: 'collectreadcounts_input', pretty: true) - .set { collectreadcounts_input } - - COLLECTREADCOUNTS( - collectreadcounts_input, - fasta, - fasta_fai, - dict - ) + val_callers = params.callers.tokenize(",") - // ch_versions = ch_versions.mix(COLLECTREADCOUNTS.out.versions) + ch_versions = Channel.empty() + ch_reports = Channel.empty() + ch_called_vcfs = Channel.empty() // // Calling variants using Manta // - if("manta" in callers){ + if("manta" in val_callers){ BAM_VARIANT_CALLING_MANTA( - crams, - beds, - fasta, - fasta_fai + ch_crams, + ch_beds, + ch_fasta, + ch_fai ) - called_vcfs = called_vcfs.mix(BAM_VARIANT_CALLING_MANTA.out.manta_vcfs) - ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_MANTA.out.versions) + ch_called_vcfs = ch_called_vcfs.mix(BAM_VARIANT_CALLING_MANTA.out.manta_vcfs) + ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_MANTA.out.versions) } // // Calling variants using Delly // - if("delly" in callers){ + if("delly" in val_callers){ BAM_VARIANT_CALLING_DELLY( - crams, - beds, - fasta, - fasta_fai + ch_crams, + ch_beds, + ch_fasta, + ch_fai ) - called_vcfs = called_vcfs.mix(BAM_VARIANT_CALLING_DELLY.out.delly_vcfs) - ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_DELLY.out.versions) + ch_called_vcfs = ch_called_vcfs.mix(BAM_VARIANT_CALLING_DELLY.out.delly_vcfs) + ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_DELLY.out.versions) } // - // Calling variants using Whamg + // Calling variants using Whamg (Currently disabled) // // TODO Whamg needs some reheadering (like done in https://github.com/broadinstitute/gatk-sv/blob/90e3e9a221bdfe7ab2cfedeffb704bc6f0e99aa9/wdl/Whamg.wdl#L209) // TODO Add insertions sequence in the info key - Whamg will not work for now - if("whamg" in callers){ + if("whamg" in val_callers){ BAM_VARIANT_CALLING_WHAMG( - crams, - beds, - fasta, - fasta_fai + ch_crams, + ch_beds, + ch_fasta, + ch_fai ) - called_vcfs = called_vcfs.mix(BAM_VARIANT_CALLING_WHAMG.out.whamg_vcfs) - ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_WHAMG.out.versions) + ch_called_vcfs = ch_called_vcfs.mix(BAM_VARIANT_CALLING_WHAMG.out.whamg_vcfs) + ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_WHAMG.out.versions) } // // Calling variants using Smoove // - if("smoove" in callers){ + if("smoove" in val_callers){ BAM_VARIANT_CALLING_SMOOVE( - crams, - beds, - fasta, - fasta_fai + ch_crams, + ch_beds, + ch_fasta, + ch_fai ) - called_vcfs = called_vcfs.mix(BAM_VARIANT_CALLING_SMOOVE.out.smoove_vcfs) - ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_SMOOVE.out.versions) + ch_called_vcfs = ch_called_vcfs.mix(BAM_VARIANT_CALLING_SMOOVE.out.smoove_vcfs) + ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_SMOOVE.out.versions) } // - // Calling variants using Gridss + // Calling variants using Gridss (Currently disabled) // - if("gridss" in callers){ + if("gridss" in val_callers){ BAM_VARIANT_CALLING_GRIDSS( - crams, - fasta, - fasta_fai, - bwa_index + ch_crams, + ch_fasta, + ch_fai, + ch_bwa_index ) - called_vcfs = called_vcfs.mix(BAM_VARIANT_CALLING_GRIDSS.out.gridss_vcfs) - ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_GRIDSS.out.versions) + ch_called_vcfs = 
ch_called_vcfs.mix(BAM_VARIANT_CALLING_GRIDSS.out.gridss_vcfs) + ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_GRIDSS.out.versions) } // @@ -144,100 +121,66 @@ workflow BAM_STRUCTURAL_VARIANT_CALLING { // Scramble is unfinished. It needs a lot of improvements if we were to add it - // if("scramble" in callers){ + // if("scramble" in val_callers){ // BAM_VARIANT_CALLING_SCRAMBLE( - // crams, - // beds, - // fasta + // ch_crams, + // ch_beds, + // ch_fasta // ) - // called_vcfs = called_vcfs.mix(BAM_VARIANT_CALLING_SCRAMBLE.out.scramble_vcfs) - // ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_SCRAMBLE.out.versions) + // ch_called_vcfs = ch_called_vcfs.mix(BAM_VARIANT_CALLING_SCRAMBLE.out.scramble_vcfs) + // ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_SCRAMBLE.out.versions) // } // - // GATK Collect Structural Variant Evidence + // Standardize and merge VCFs per sample for all callers // - if(allele_loci_vcf){ - TABIX_TABIX( - allele_loci_vcf.map{[[id:"allele_loci_vcf"], it]} - ) - - crams - .combine(TABIX_TABIX.out.tbi) - .map( - { meta, cram, crai, tbi -> - [ meta, cram, crai, allele_loci_vcf, tbi ] - } - ) - .set { collectsvevidence_input } - - } else { - crams - .map( - { meta, cram, crai -> - [ meta, cram, crai, [], [] ] - } - ) - .set { collectsvevidence_input } - } - - collectsvevidence_input.dump(tag: 'collectsvevidence_input', pretty: true) - - COLLECTSVEVIDENCE( - collectsvevidence_input, - fasta, - fasta_fai, - dict + VIOLA( + ch_called_vcfs.map{ it[0..1] } ) - ch_versions = ch_versions.mix(COLLECTSVEVIDENCE.out.versions) - - // - // Create the metrics for all produced files - // + ch_versions = ch_versions.mix(VIOLA.out.versions.first()) - if(params.run_module_metrics) { - VCF_METRICS_SVTK_SVTEST( - called_vcfs, - COLLECTSVEVIDENCE.out.split_read_evidence, - COLLECTSVEVIDENCE.out.paired_end_evidence, - COLLECTSVEVIDENCE.out.site_depths, - fasta_fai + if(val_callers.size() > 1){ + VCF_MERGE_JASMINE( + VIOLA.out.vcf, + ch_fasta, + ch_fai, ) + ch_versions = ch_versions.mix(VCF_MERGE_JASMINE.out.versions) - ch_metrics = ch_reports.mix(VCF_METRICS_SVTK_SVTEST.out.metrics) - ch_versions = ch_versions.mix(VCF_METRICS_SVTK_SVTEST.out.versions) - } + VCF_MERGE_JASMINE.out.merged_vcfs.set { ch_merged_vcfs } + } else { - if(callers.size > 1){ - VCF_MERGE_JASMINE( - called_vcfs.map { it[0..1] }, - fasta, - fasta_fai, + Channel.fromPath("${projectDir}/assets/header.txt") + .collect() + .set { ch_new_header } + + REHEADER_CALLED_VCFS( + VIOLA.out.vcf, + ch_new_header, + ch_fai + ) + ch_versions = ch_versions.mix(REHEADER_CALLED_VCFS.out.versions.first()) + + TABIX_VCFS( + REHEADER_CALLED_VCFS.out.vcf ) + ch_versions = ch_versions.mix(TABIX_VCFS.out.versions.first()) - VCF_MERGE_JASMINE.out.merged_vcfs.set { merged_vcfs } - } else { - called_vcfs + REHEADER_CALLED_VCFS.out.vcf + .join(TABIX_VCFS.out.tbi, failOnDuplicate:true, failOnMismatch:true) .map { meta, vcf, tbi -> - new_meta = meta.findAll { !(it.key == "caller") } + new_meta = meta - meta.subMap("caller") [ new_meta, vcf, tbi ] } - .set { merged_vcfs } + .set { ch_merged_vcfs } } emit: - vcfs = merged_vcfs - - coverage_counts = COLLECTREADCOUNTS.out.tsv - - split_reads = COLLECTSVEVIDENCE.out.split_read_evidence.combine(COLLECTSVEVIDENCE.out.split_read_evidence_index, by:0) - read_pairs = COLLECTSVEVIDENCE.out.paired_end_evidence.combine(COLLECTSVEVIDENCE.out.paired_end_evidence_index, by:0) - site_depths = COLLECTSVEVIDENCE.out.site_depths.combine(COLLECTSVEVIDENCE.out.site_depths_index, by:0) + vcfs = ch_merged_vcfs // 
channel: [ val(meta), path(vcf), path(tbi) ] versions = ch_versions - metrics = ch_metrics reports = ch_reports } diff --git a/subworkflows/local/bam_variant_calling_delly/main.nf b/subworkflows/local/bam_variant_calling_delly/main.nf index 70453ac9..8acd8b9c 100644 --- a/subworkflows/local/bam_variant_calling_delly/main.nf +++ b/subworkflows/local/bam_variant_calling_delly/main.nf @@ -12,141 +12,129 @@ include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix/main' workflow BAM_VARIANT_CALLING_DELLY { take: - crams // channel: [mandatory] [ meta, cram, crai ] => The aligned CRAMs per sample with the regions they should be called on - beds // channel: [optional] [ meta, bed, bed_gz, bed_gz_tbi ] => A channel containing the normal BED, the bgzipped BED and its index file - fasta // channel: [mandatory] [ fasta ] => The fasta reference file - fasta_fai // channel: [mandatory] [ fasta_fai ] => The index of the fasta reference file + ch_crams // channel: [mandatory] [ meta, cram, crai ] => The aligned CRAMs per sample with the regions they should be called on + ch_beds // channel: [optional] [ meta, bed, bed_gz, bed_gz_tbi ] => A channel containing the normal BED, the bgzipped BED and its index file + ch_fasta // channel: [mandatory] [ fasta ] => The fasta reference file + ch_fai // channel: [mandatory] [ fai ] => The index of the fasta reference file main: ch_versions = Channel.empty() - // - // Split the BED files if the scatter count is bigger than 1 - // + // Scattering is disabled for now. We have to investigate whether it is really necessary. + // It can be an issue for calling translocations and bigger variants - beds - .map( - { meta, bed, bed_gz, bed_gz_tbi -> - [ meta, bed ] - } - ) - .set { single_beds } - - SCATTER_BEDS( - single_beds, - params.delly_scatter_size - ) + // // + // // Split the BED files if the scatter count is bigger than 1 + // // - ch_versions = ch_versions.mix(SCATTER_BEDS.out.versions) + // ch_beds + // .map( + // { meta, bed, bed_gz, bed_gz_tbi -> + // [ meta, bed ] + // } + // ) + // .set { ch_single_beds } - SCATTER_BEDS.out.scatter - .transpose() - .map{ meta, bed -> - new_meta = meta + [id:bed.baseName] - [ new_meta, bed ] - } - .set { split_beds } + // SCATTER_BEDS( + // ch_single_beds, + // params.delly_scatter_size + // ) - split_beds.dump(tag: 'split_beds', pretty: true) + // ch_versions = ch_versions.mix(SCATTER_BEDS.out.versions.first()) - // - // Reverse the BED file (It will only contain the regions that aren't of interest now) - // + // SCATTER_BEDS.out.scatter + // .transpose() + // .map{ meta, bed -> + // new_meta = meta + [id:bed.baseName] + // [ new_meta, bed ] + // } + // .set { ch_split_beds } - REVERSE_BED( - split_beds, - fasta_fai - ) + // // + // // Reverse the BED file (It will only contain the regions that aren't of interest now) + // // + + // REVERSE_BED( + // ch_split_beds, + // ch_fai + // ) - ch_versions = ch_versions.mix(REVERSE_BED.out.versions) + // ch_versions = ch_versions.mix(REVERSE_BED.out.versions.first()) // // Calling variants using Delly // - crams - .combine( - REVERSE_BED.out.bed - .map( - { meta, bed -> - new_meta = meta + [id:meta.sample] - [ new_meta, bed ] - } - ) - , by:0) - .map( - { meta, cram, crai, bed -> - new_meta = meta + [id:bed.baseName.replace("_reversed","")] - [ new_meta, cram, crai, [], [], bed ] - } - ) + ch_crams + .join(ch_beds.map{ meta, bed, bed_gz, bed_gz_tbi -> [ meta, bed ] }, failOnDuplicate:true, failOnMismatch:true) + .map { meta, cram, crai, bed -> + [ meta, cram, crai, [], [], 
bed ] + } .dump(tag: 'delly_input', pretty: true) - .set { delly_input } + .set { ch_delly_input } DELLY_CALL( - delly_input, - fasta, - fasta_fai + ch_delly_input, + ch_fasta, + ch_fai ) - ch_versions = ch_versions.mix(DELLY_CALL.out.versions) - - // - // Concatenate the BCF files if the scatter count is bigger than 1 and convert the BCF to VCF - // + ch_versions = ch_versions.mix(DELLY_CALL.out.versions.first()) + + // // + // // Concatenate the BCF files if the scatter count is bigger than 1 and convert the BCF to VCF + // // + + // DELLY_CALL.out.bcf + // .join(DELLY_CALL.out.csi, failOnMismatch:true, failOnDuplicate:true) + // .map { meta, vcf, csi -> + // new_meta = meta + [id:meta.sample] + // [ new_meta, vcf, csi ] + // } + // .combine(SCATTER_BEDS.out.scatter, by:0) + // .map( + // { meta, vcf, csi, beds -> + // count = beds instanceof ArrayList ? beds.size() : 1 + // [ groupKey(meta, count), vcf, csi ] + // } + // ) + // .groupTuple() + // .dump(tag: 'bcftools_input', pretty: true) + // .set { ch_bcftools_input } + + + // BCFTOOLS_CONCAT( + // ch_bcftools_input + // ) + + // ch_versions = ch_versions.mix(BCFTOOLS_CONCAT.out.versions.first()) + + // // + // // Index the VCF file + // // + + // BCFTOOLS_SORT( + // BCFTOOLS_CONCAT.out.vcf + // ) + // ch_versions = ch_versions.mix(BCFTOOLS_SORT.out.versions.first()) + + // TABIX_TABIX( + // BCFTOOLS_SORT.out.vcf + // ) + // ch_versions = ch_versions.mix(TABIX_TABIX.out.versions.first()) DELLY_CALL.out.bcf - .join(DELLY_CALL.out.csi) - .map { meta, vcf, csi -> - new_meta = meta + [id:meta.sample] - [ new_meta, vcf, csi ] + .join(DELLY_CALL.out.csi, failOnDuplicate:true, failOnMismatch:true) + .map { meta, vcf, tbi -> + new_meta = meta + [caller:"delly"] + [ new_meta, vcf, tbi ] } - .combine(SCATTER_BEDS.out.scatter, by:0) - .map( - { meta, vcf, csi, beds -> - count = beds instanceof ArrayList ? 
beds.size() : 1 - [ groupKey(meta, count), vcf, csi ] - } - ) - .groupTuple() - .dump(tag: 'bcftools_input', pretty: true) - .set { bcftools_input } - - - BCFTOOLS_CONCAT( - bcftools_input - ) - - ch_versions = ch_versions.mix(BCFTOOLS_CONCAT.out.versions) - - BCFTOOLS_CONCAT.out.vcf.set { sort_input } - - // - // Index the VCF file - // - - BCFTOOLS_SORT( - sort_input - ) - - TABIX_TABIX( - BCFTOOLS_SORT.out.vcf - ) - ch_versions = ch_versions.mix(TABIX_TABIX.out.versions) - - BCFTOOLS_SORT.out.vcf - .combine(TABIX_TABIX.out.tbi, by:0) - .map( - { meta, vcf, tbi -> - new_meta = meta + [caller:"delly"] - [ new_meta, vcf, tbi ] - } - ) .dump(tag: 'delly_vcfs', pretty: true) - .set { delly_vcfs } + .set { ch_delly_vcfs } emit: - delly_vcfs - versions = ch_versions + delly_vcfs = ch_delly_vcfs // channel: [ val(meta), path(vcf), path(tbi) ] + + versions = ch_versions } diff --git a/subworkflows/local/bam_variant_calling_gridss/main.nf b/subworkflows/local/bam_variant_calling_gridss/main.nf index e91efac7..84b1446d 100644 --- a/subworkflows/local/bam_variant_calling_gridss/main.nf +++ b/subworkflows/local/bam_variant_calling_gridss/main.nf @@ -2,15 +2,16 @@ // Run Gridss // -include { GRIDSS_GRIDSS } from '../../../modules/nf-core/gridss/gridss/main' +include { SIMPLE_EVENT_ANNOTATION } from '../../../modules/local/gridss/simple_event_annotation/main' -include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix/main' +include { GRIDSS_GRIDSS } from '../../../modules/nf-core/gridss/gridss/main' +include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix/main' workflow BAM_VARIANT_CALLING_GRIDSS { take: crams // channel: [mandatory] [ meta, cram, crai ] => The aligned CRAMs per sample with the regions they should be called on fasta // channel: [mandatory] [ fasta ] => The fasta reference file - fasta_fai // channel: [mandatory] [ fasta_fai ] => The index of the fasta reference file + fai // channel: [mandatory] [ fai ] => The index of the fasta reference file bwa_index // channel: [mandatory] [ index ] => The BWA MEM index main: @@ -20,20 +21,23 @@ workflow BAM_VARIANT_CALLING_GRIDSS { GRIDSS_GRIDSS( crams.map {meta, cram, crai -> [meta, cram, []]}, fasta.map {[[],it]}, - fasta_fai.map {[[],it]}, + fai.map {[[],it]}, bwa_index ) - ch_versions = ch_versions.mix(GRIDSS_GRIDSS.out.versions) - TABIX_TABIX( + SIMPLE_EVENT_ANNOTATION( GRIDSS_GRIDSS.out.vcf ) + ch_versions = ch_versions.mix(SIMPLE_EVENT_ANNOTATION.out.versions) + TABIX_TABIX( + SIMPLE_EVENT_ANNOTATION.out.vcf + ) ch_versions = ch_versions.mix(TABIX_TABIX.out.versions) - GRIDSS_GRIDSS.out.vcf - .join(TABIX_TABIX.out.tbi) + SIMPLE_EVENT_ANNOTATION.out.vcf + .join(TABIX_TABIX.out.tbi, failOnMismatch:true, failOnDuplicate:true) .map( { meta, vcf, tbi -> new_meta = meta + [caller:"gridss"] diff --git a/subworkflows/local/bam_variant_calling_manta/main.nf b/subworkflows/local/bam_variant_calling_manta/main.nf index 1078361d..f85885dd 100644 --- a/subworkflows/local/bam_variant_calling_manta/main.nf +++ b/subworkflows/local/bam_variant_calling_manta/main.nf @@ -7,40 +7,40 @@ include { BCFTOOLS_REHEADER } from '../../../modules/nf-core/bcftools/rehea workflow BAM_VARIANT_CALLING_MANTA { take: - crams // channel: [mandatory] [ meta, cram, crai ] => The aligned CRAMs per sample with the regions they should be called on - beds // channel: [optional] [ meta, bed, bed_gz, bed_gz_tbi ] => A channel containing the normal BED, the bgzipped BED and its index file - fasta // channel: [mandatory] [ fasta ] => The fasta reference file - 
fasta_fai // channel: [mandatory] [ fasta_fai ] => The index of the fasta reference file + ch_crams // channel: [mandatory] [ meta, cram, crai ] => The aligned CRAMs per sample with the regions they should be called on + ch_beds // channel: [optional] [ meta, bed, bed_gz, bed_gz_tbi ] => A channel containing the normal BED, the bgzipped BED and its index file + ch_fasta // channel: [mandatory] [ fasta ] => The fasta reference file + ch_fai // channel: [mandatory] [ fai ] => The index of the fasta reference file main: ch_versions = Channel.empty() - beds + ch_beds .map( { meta, bed, bed_gz, bed_gz_tbi -> [ meta, bed_gz, bed_gz_tbi ] } ) .dump(tag: 'gzipped_beds', pretty: true) - .set { gzipped_beds } + .set { ch_gzipped_beds } // // Calling variants using Manta // - crams - .combine(gzipped_beds, by: 0) + ch_crams + .join(ch_gzipped_beds, failOnDuplicate:true, failOnMismatch:true) .dump(tag: 'manta_input', pretty: true) - .set { manta_input } + .set { ch_manta_input } MANTA_GERMLINE( - manta_input, - fasta, - fasta_fai + ch_manta_input, + ch_fasta, + ch_fai ) - ch_versions = ch_versions.mix(MANTA_GERMLINE.out.versions) + ch_versions = ch_versions.mix(MANTA_GERMLINE.out.versions.first()) // // Reformat the inversions into single inverted sequence junctions @@ -48,13 +48,13 @@ workflow BAM_VARIANT_CALLING_MANTA { MANTA_CONVERTINVERSION( MANTA_GERMLINE.out.diploid_sv_vcf, - fasta + ch_fasta ) - ch_versions = ch_versions.mix(MANTA_CONVERTINVERSION.out.versions) + ch_versions = ch_versions.mix(MANTA_CONVERTINVERSION.out.versions.first()) MANTA_CONVERTINVERSION.out.vcf - .combine(MANTA_CONVERTINVERSION.out.tbi, by:0) + .join(MANTA_CONVERTINVERSION.out.tbi, failOnDuplicate:true, failOnMismatch:true) .map( { meta, vcf, tbi -> new_meta = meta.clone() @@ -63,9 +63,10 @@ workflow BAM_VARIANT_CALLING_MANTA { } ) .dump(tag: 'manta_vcfs', pretty: true) - .set { manta_vcfs } + .set { ch_manta_vcfs } emit: - manta_vcfs - versions = ch_versions + manta_vcfs = ch_manta_vcfs // channel: [ val(meta), path(vcf), path(tbi) ] + + versions = ch_versions } diff --git a/subworkflows/local/bam_variant_calling_smoove/main.nf b/subworkflows/local/bam_variant_calling_smoove/main.nf index 3c78da8d..41f597f5 100644 --- a/subworkflows/local/bam_variant_calling_smoove/main.nf +++ b/subworkflows/local/bam_variant_calling_smoove/main.nf @@ -9,10 +9,10 @@ include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix/main' workflow BAM_VARIANT_CALLING_SMOOVE { take: - crams // channel: [mandatory] [ meta, cram, crai ] => The aligned CRAMs per sample with the regions they should be called on - beds // channel: [optional] [ meta, bed, bed_gz, bed_gz_tbi ] => A channel containing the normal BED, the bgzipped BED and its index file - fasta // channel: [mandatory] [ fasta ] => The fasta reference file - fasta_fai // channel: [mandatory] [ fasta_fai ] => The index of the fasta reference file + ch_crams // channel: [mandatory] [ meta, cram, crai ] => The aligned CRAMs per sample with the regions they should be called on + ch_beds // channel: [optional] [ meta, bed, bed_gz, bed_gz_tbi ] => A channel containing the normal BED, the bgzipped BED and its index file + ch_fasta // channel: [mandatory] [ fasta ] => The fasta reference file + ch_fai // channel: [mandatory] [ fai ] => The index of the fasta reference file main: @@ -22,42 +22,47 @@ workflow BAM_VARIANT_CALLING_SMOOVE { // Reverse the BED file (It will only contain the regions that aren't of interest now) // - beds - .map( - { meta, bed, bed_gz, bed_gz_tbi -> - [ meta, bed ] 
diff --git a/subworkflows/local/bam_variant_calling_smoove/main.nf b/subworkflows/local/bam_variant_calling_smoove/main.nf
index 3c78da8d..41f597f5 100644
--- a/subworkflows/local/bam_variant_calling_smoove/main.nf
+++ b/subworkflows/local/bam_variant_calling_smoove/main.nf
@@ -9,10 +9,10 @@ include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix/main'
 workflow BAM_VARIANT_CALLING_SMOOVE {
     take:
-        crams // channel: [mandatory] [ meta, cram, crai ] => The aligned CRAMs per sample with the regions they should be called on
-        beds // channel: [optional] [ meta, bed, bed_gz, bed_gz_tbi ] => A channel containing the normal BED, the bgzipped BED and its index file
-        fasta // channel: [mandatory] [ fasta ] => The fasta reference file
-        fasta_fai // channel: [mandatory] [ fasta_fai ] => The index of the fasta reference file
+        ch_crams  // channel: [mandatory] [ meta, cram, crai ] => The aligned CRAMs per sample with the regions they should be called on
+        ch_beds   // channel: [optional]  [ meta, bed, bed_gz, bed_gz_tbi ] => A channel containing the normal BED, the bgzipped BED and its index file
+        ch_fasta  // channel: [mandatory] [ fasta ] => The fasta reference file
+        ch_fai    // channel: [mandatory] [ fai ] => The index of the fasta reference file
     main:
@@ -22,42 +22,47 @@ workflow BAM_VARIANT_CALLING_SMOOVE {
     // Reverse the BED file (It will only contain the regions that aren't of interest now)
     //
-    beds
-        .map(
-            { meta, bed, bed_gz, bed_gz_tbi ->
-                [ meta, bed ]
-            }
-        )
-        .set { reverse_input }
+    ch_beds
+        .branch { meta, bed, bed_gz, bed_gz_tbi ->
+            bed: bed
+                return [ meta, bed ]
+            no_bed: !bed
+                return [ meta, [] ]
+        }
+        .set { ch_reverse_input }
     REVERSE_BED(
-        reverse_input,
-        fasta_fai
+        ch_reverse_input.bed,
+        ch_fai
     )
-    ch_versions = ch_versions.mix(REVERSE_BED.out.versions)
+    ch_versions = ch_versions.mix(REVERSE_BED.out.versions.first())
+
+    REVERSE_BED.out.bed
+        .mix(ch_reverse_input.no_bed)
+        .set { ch_reversed_beds }
     //
     // Calling variants using Smoove
     //
-    crams
-        .join(REVERSE_BED.out.bed)
+    ch_crams
+        .join(ch_reversed_beds, failOnMismatch:true, failOnDuplicate:true)
         .dump(tag: 'smoove_input', pretty: true)
-        .set { smoove_input }
+        .set { ch_smoove_input }
     SMOOVE_CALL(
-        smoove_input,
-        fasta,
-        fasta_fai
+        ch_smoove_input,
+        ch_fasta,
+        ch_fai
     )
-    ch_versions = ch_versions.mix(SMOOVE_CALL.out.versions)
+    ch_versions = ch_versions.mix(SMOOVE_CALL.out.versions.first())
     TABIX_TABIX(
         SMOOVE_CALL.out.vcf
     )
-    ch_versions = ch_versions.mix(TABIX_TABIX.out.versions)
+    ch_versions = ch_versions.mix(TABIX_TABIX.out.versions.first())
     SMOOVE_CALL.out.vcf
         .combine(TABIX_TABIX.out.tbi, by:0)
@@ -69,9 +74,10 @@ workflow BAM_VARIANT_CALLING_SMOOVE {
             }
         )
         .dump(tag: 'smoove_vcfs', pretty: true)
-        .set { smoove_vcfs }
+        .set { ch_smoove_vcfs }
     emit:
-    smoove_vcfs
-    versions = ch_versions
+    smoove_vcfs = ch_smoove_vcfs // channel: [ val(meta), path(vcf), path(tbi) ]
+
+    versions = ch_versions
 }
diff --git a/subworkflows/local/bam_variant_calling_whamg/main.nf b/subworkflows/local/bam_variant_calling_whamg/main.nf
index 1b760396..b28447f6 100644
--- a/subworkflows/local/bam_variant_calling_whamg/main.nf
+++ b/subworkflows/local/bam_variant_calling_whamg/main.nf
@@ -13,7 +13,7 @@ workflow BAM_VARIANT_CALLING_WHAMG {
     crams // channel: [mandatory] [ meta, cram, crai ] => The aligned CRAMs per sample with the regions they should be called on
     beds // channel: [optional] [ meta, bed, bed_gz, bed_gz_tbi ] => A channel containing the normal BED, the bgzipped BED and its index file
     fasta // channel: [mandatory] [ fasta ] => The fasta reference file
-    fasta_fai // channel: [mandatory] [ fasta_fai ] => The index of the fasta reference file
+    fai // channel: [mandatory] [ fai ] => The index of the fasta reference file
     main:
@@ -26,7 +26,7 @@ workflow BAM_VARIANT_CALLING_WHAMG {
     SAMTOOLS_CONVERT(
         crams,
         fasta,
-        fasta_fai
+        fai
     )
     SAMTOOLS_CONVERT.out.alignment_index.set { bams }
@@ -39,13 +39,13 @@ workflow BAM_VARIANT_CALLING_WHAMG {
     WHAMG(
         bams,
         fasta,
-        fasta_fai
+        fai
     )
     ch_versions = ch_versions.mix(WHAMG.out.versions)
     WHAMG.out.vcf
-        .join(WHAMG.out.tbi)
+        .join(WHAMG.out.tbi, failOnMismatch:true, failOnDuplicate:true)
         .set { whamg_vcfs }
     whamg_vcfs
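The reworked smoove subworkflow above also introduces the optional-BED pattern: samples are branched on whether a BED file is present, only the BED branch goes through REVERSE_BED, and an empty placeholder is mixed back in so later joins still see every sample. A stripped-down sketch of that pattern with invented values (REVERSE_BED itself is omitted here):

```nextflow
workflow {
    // Toy input: one sample with a BED, one with an empty list as placeholder.
    ch_beds = Channel.of( [ [id: 'sampleA'], file('regions.bed') ],
                          [ [id: 'sampleB'], [] ] )

    ch_beds
        .branch { meta, bed ->
            bed:    bed    // truthy: a BED was provided for this sample
            no_bed: !bed   // empty list: no BED for this sample
        }
        .set { ch_branched }

    // In the real subworkflow ch_branched.bed is processed first; here the two
    // branches are simply recombined so every sample is emitted downstream.
    ch_branched.bed
        .mix(ch_branched.no_bed)
        .view()
}
```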
diff --git a/subworkflows/local/vcf_genotype_sv_paragraph/main.nf b/subworkflows/local/vcf_genotype_sv_paragraph/main.nf
index 318ad956..31372a6c 100644
--- a/subworkflows/local/vcf_genotype_sv_paragraph/main.nf
+++ b/subworkflows/local/vcf_genotype_sv_paragraph/main.nf
@@ -4,31 +4,32 @@ import groovy.json.JsonSlurper
 // Merge VCFs from multiple callers
 //
-include { PARAGRAPH_IDXDEPTH } from '../../../modules/nf-core/paragraph/idxdepth/main'
-include { PARAGRAPH_MULTIGRMPY } from '../../../modules/nf-core/paragraph/multigrmpy/main'
-include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix/main'
-include { BCFTOOLS_MERGE } from '../../../modules/nf-core/bcftools/merge/main'
+include { PARAGRAPH_IDXDEPTH               } from '../../../modules/nf-core/paragraph/idxdepth/main'
+include { PARAGRAPH_MULTIGRMPY             } from '../../../modules/nf-core/paragraph/multigrmpy/main'
+include { TABIX_TABIX as TABIX_INDIVIDUALS } from '../../../modules/nf-core/tabix/tabix/main'
+include { TABIX_TABIX as TABIX_FAMILY      } from '../../../modules/nf-core/tabix/tabix/main'
+include { BCFTOOLS_MERGE                   } from '../../../modules/nf-core/bcftools/merge/main'
 workflow VCF_GENOTYPE_SV_PARAGRAPH {
     take:
-        vcfs // channel: [mandatory] [ meta, vcf, tbi ] VCFs containing the called structural variants
-        crams // channel: [mandatory] [ meta, cram, crai ] => The CRAM files used to create the VCF files
-        fasta // channel: [mandatory] [ fasta ] => The fasta reference file
-        fasta_fai // channel: [mandatory] [ fasta_fai ] => The index of the fasta reference file
+        ch_vcfs   // channel: [mandatory] [ meta, vcf, tbi ] VCFs containing the called structural variants
+        ch_crams  // channel: [mandatory] [ meta, cram, crai ] => The CRAM files used to create the VCF files
+        ch_fasta  // channel: [mandatory] [ fasta ] => The fasta reference file
+        ch_fai    // channel: [mandatory] [ fai ] => The index of the fasta reference file
     main:
     ch_versions = Channel.empty()
     PARAGRAPH_IDXDEPTH(
-        crams,
-        fasta.map { [[], it] },
-        fasta_fai.map { [[], it] }
+        ch_crams,
+        ch_fasta.map { [[], it] },
+        ch_fai.map { [[], it] }
     )
-    ch_versions = ch_versions.mix(PARAGRAPH_IDXDEPTH.out.versions)
+    ch_versions = ch_versions.mix(PARAGRAPH_IDXDEPTH.out.versions.first())
     PARAGRAPH_IDXDEPTH.out.depth
-        .tap { meta_channel }
+        .tap { ch_meta }
         .map { meta, json ->
             manifest = create_manifest(json.text, meta.id)
             [ meta, manifest ]
@@ -40,30 +41,31 @@ workflow VCF_GENOTYPE_SV_PARAGRAPH {
             id = manifest.baseName
             [ id, manifest ]
         }
-        .join(meta_channel.map { meta, json -> [ meta.id, meta ]})
+        .join(ch_meta.map { meta, json -> [ meta.id, meta ]}, failOnMismatch:true, failOnDuplicate:true)
         .map { id, manifest, meta ->
             [ meta, manifest ]
         }
-        .set { manifest }
+        .set { ch_manifest }
-    vcfs
-        .join(crams)
-        .join(manifest)
-        .set { grmpy_input }
+    ch_vcfs
+        .join(ch_crams, failOnMismatch:true, failOnDuplicate:true)
+        .join(ch_manifest, failOnMismatch:true, failOnDuplicate:true)
+        .set { ch_paragraph_input }
     PARAGRAPH_MULTIGRMPY(
-        grmpy_input,
-        fasta.map { [[], it] },
-        fasta_fai.map { [[], it] }
+        ch_paragraph_input,
+        ch_fasta.map { [[], it] },
+        ch_fai.map { [[], it] }
     )
-    ch_versions = ch_versions.mix(PARAGRAPH_MULTIGRMPY.out.versions)
+    ch_versions = ch_versions.mix(PARAGRAPH_MULTIGRMPY.out.versions.first())
-    TABIX_TABIX(
+    TABIX_INDIVIDUALS(
         PARAGRAPH_MULTIGRMPY.out.vcf
     )
+    ch_versions = ch_versions.mix(TABIX_INDIVIDUALS.out.versions.first())
     PARAGRAPH_MULTIGRMPY.out.vcf
-        .join(TABIX_TABIX.out.tbi)
+        .join(TABIX_INDIVIDUALS.out.tbi, failOnMismatch:true, failOnDuplicate:true)
         .map { meta, vcf, tbi ->
             new_meta = meta.findAll { !(it.key == "sample") } + [id:meta.family]
             [ groupKey(new_meta, meta.family_count), vcf, tbi ]
@@ -72,32 +74,48 @@
         .branch { meta, vcfs, tbis ->
             merge: vcfs.size() > 1
             dont_merge: vcfs.size() == 1
+                return [ meta, vcfs ]
         }
-        .set { merge_input }
+        .set { ch_merge_input }
     BCFTOOLS_MERGE(
-        merge_input.merge,
+        ch_merge_input.merge,
         [],
         [],
         []
     )
+    ch_versions = ch_versions.mix(BCFTOOLS_MERGE.out.versions.first())
     BCFTOOLS_MERGE.out.merged_variants
-        .mix(merge_input.dont_merge)
-        .set { genotyped_vcfs }
+        .mix(ch_merge_input.dont_merge)
+        .set { ch_genotyped_vcfs }
+
+    if(!params.annotate) {
+        TABIX_FAMILY(
+            ch_genotyped_vcfs
+        )
+        ch_versions = ch_versions.mix(TABIX_FAMILY.out.versions.first())
+    }
     emit:
-    genotyped_vcfs
-    versions = ch_versions
+    genotyped_vcfs = ch_genotyped_vcfs // channel: [ val(meta), path(vcf) ]
+
+    versions = ch_versions
 }
 def create_manifest(json, id) {
-    Map jsonMap = (Map) new JsonSlurper().parseText(json)
-    initDepth = jsonMap["autosome"]["depth"]
-    depth = workflow.profile.contains("test") && initDepth == 0.0 ? 0.1 : initDepth
-    path = jsonMap["bam_path"]
-    read_length = jsonMap["read_length"]
-    sex = "unknown" //TODO: add support for sex determination
-    header = "id\tpath\tdepth\tread length\tsex"
-    return "${header}\n${id}\t${path}\t${depth}\t${read_length}\t${sex}".toString()
+    // A function that converts the idxdepth JSON to manifest lines
+    if(!workflow.stubRun) {
+        Map jsonMap = (Map) new JsonSlurper().parseText(json)
+        initDepth = jsonMap["autosome"]["depth"]
+        depth = workflow.profile.contains("test") && initDepth == 0.0 ? 0.1 : initDepth
+        path = jsonMap["bam_path"]
+        read_length = jsonMap["read_length"]
+        sex = "unknown" //TODO: add support for sex determination
+        header = "id\tpath\tdepth\tread length\tsex"
+        return "${header}\n${id}\t${path}\t${depth}\t${read_length}\t${sex}".toString()
+    }
+    else {
+        return "stub"
+    }
 }
diff --git a/subworkflows/local/vcf_merge_jasmine/main.nf b/subworkflows/local/vcf_merge_jasmine/main.nf
index b5400751..733bb4c7 100644
--- a/subworkflows/local/vcf_merge_jasmine/main.nf
+++ b/subworkflows/local/vcf_merge_jasmine/main.nf
@@ -10,15 +10,15 @@ include { REHEADER_CALLED_VCFS } from '../../../modules
 workflow VCF_MERGE_JASMINE {
     take:
-        vcfs // channel: [mandatory] [ meta, vcf ] => The gzipped called VCFs
-        fasta // channel: [mandatory] [ fasta ] => The fasta reference file
-        fasta_fai // channel: [mandatory] [ fasta_fai ] => The index of the fasta reference file
+        ch_vcfs   // channel: [mandatory] [ meta, vcf ] => The bgzipped called VCFs
+        ch_fasta  // channel: [mandatory] [ fasta ] => The fasta reference file
+        ch_fai    // channel: [mandatory] [ fai ] => The index of the fasta reference file
     main:
     ch_versions = Channel.empty()
-    vcfs
+    ch_vcfs
         .map { meta, vcf ->
             [ meta.findAll { !(it.key == "caller")}, vcf ]
         }
@@ -27,43 +27,46 @@ workflow VCF_MERGE_JASMINE {
             [ meta, vcfs, [], [] ]
         }
         .dump(tag:'jasmine_input', pretty:true)
-        .set { jasmine_input }
+        .set { ch_jasmine_input }
     JASMINESV(
-        jasmine_input,
-        [],
-        [],
+        ch_jasmine_input,
+        ch_fasta,
+        ch_fai,
         []
     )
-    ch_versions = ch_versions.mix(JASMINESV.out.versions)
+    ch_versions = ch_versions.mix(JASMINESV.out.versions.first())
-    new_header = Channel.fromPath("${projectDir}/assets/header.txt").collect()
+    Channel.fromPath("${projectDir}/assets/header.txt")
+        .collect()
+        .set { ch_new_header }
     REHEADER_CALLED_VCFS(
         JASMINESV.out.vcf,
-        new_header,
-        fasta_fai
+        ch_new_header,
+        ch_fai
     )
-    ch_versions = ch_versions.mix(REHEADER_CALLED_VCFS.out.versions)
+    ch_versions = ch_versions.mix(REHEADER_CALLED_VCFS.out.versions.first())
     BCFTOOLS_SORT(
         REHEADER_CALLED_VCFS.out.vcf
     )
-    ch_versions = ch_versions.mix(BCFTOOLS_SORT.out.versions)
+    ch_versions = ch_versions.mix(BCFTOOLS_SORT.out.versions.first())
     TABIX_TABIX(
         BCFTOOLS_SORT.out.vcf
     )
-    ch_versions = ch_versions.mix(TABIX_TABIX.out.versions)
+    ch_versions = ch_versions.mix(TABIX_TABIX.out.versions.first())
     BCFTOOLS_SORT.out.vcf
-        .join(TABIX_TABIX.out.tbi)
-        .set { merged_vcfs }
+        .join(TABIX_TABIX.out.tbi, failOnMismatch:true, failOnDuplicate:true)
+        .set { ch_merged_vcfs }
     emit:
-    merged_vcfs
-    versions = ch_versions
+    merged_vcfs = ch_merged_vcfs // channel: [ val(meta), path(vcf), path(tbi) ]
+
+    versions = ch_versions
 }
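The `create_manifest` helper in the Paragraph subworkflow above turns the idxdepth JSON into the small tab-separated manifest that `paragraph/multigrmpy` reads. A standalone sketch of that conversion with invented values (only the field names and column layout mirror the helper):

```groovy
import groovy.json.JsonSlurper

// Hypothetical idxdepth output for one sample; the numbers are illustrative only.
def json    = '{"bam_path": "PosCon1.cram", "autosome": {"depth": 31.7}, "read_length": 151}'
def jsonMap = new JsonSlurper().parseText(json)

def header  = "id\tpath\tdepth\tread length\tsex"
def line    = "PosCon1\t${jsonMap.bam_path}\t${jsonMap.autosome.depth}\t${jsonMap.read_length}\tunknown"

// Prints the two-line manifest: a header row plus one row per sample.
println "${header}\n${line}"
```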
diff --git a/subworkflows/local/vcf_metrics_svtk_svtest/main.nf b/subworkflows/local/vcf_metrics_svtk_svtest/main.nf
deleted file mode 100644
index e109fc7b..00000000
--- a/subworkflows/local/vcf_metrics_svtk_svtest/main.nf
+++ /dev/null
@@ -1,77 +0,0 @@
-//
-// Gather Sample Evidence
-//
-include { SVTK_STANDARDIZE } from '../../../modules/nf-core/svtk/standardize/main'
-include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix/main'
-include { SVTEST_VCF } from '../../../modules/local/svtest/vcf/main'
-include { SVTEST_SRFILE } from '../../../modules/local/svtest/sr-file/main'
-include { SVTEST_PEFILE } from '../../../modules/local/svtest/pe-file/main'
-include { SVTEST_RAWCOUNTS } from '../../../modules/local/svtest/raw-counts/main'
-
-workflow VCF_METRICS_SVTK_SVTEST {
-    take:
-        called_vcfs // channel: [mandatory] [ meta, vcf, tbi ] => The VCFs from all used variant callers
-        split_read_evidence // channel: [mandatory] [ meta, split_read_evidence ] => The split read evidence
-        paired_end_evidence // channel: [mandatory] [ meta, paired_end_evidence ] => The paired end evidence
-        site_depths // channel: [optional] [ meta, site_depths ] => The site depths
-        fasta_fai // channel: [mandatory] [ fasta_fai ] => The index of the fasta reference file
-
-    main:
-
-    ch_metrics = Channel.empty()
-    ch_versions = Channel.empty()
-
-    SVTK_STANDARDIZE(
-        called_vcfs.filter {it[0].caller != "gridss"}.map({ meta, vcf, tbi -> [ meta, vcf ]}),
-        fasta_fai
-    )
-
-    ch_versions = ch_versions.mix(SVTK_STANDARDIZE.out.versions)
-
-    TABIX_TABIX(
-        SVTK_STANDARDIZE.out.standardized_vcf
-    )
-
-    ch_versions = ch_versions.mix(TABIX_TABIX.out.versions)
-
-    SVTEST_VCF(
-        SVTK_STANDARDIZE.out.standardized_vcf
-            .combine(TABIX_TABIX.out.tbi, by:0)
-            .map(
-                { meta, vcf, tbi ->
-                    [ meta, vcf, tbi, [] ]
-                }
-            ),
-        fasta_fai
-    )
-
-    ch_metrics = ch_metrics.mix(SVTEST_VCF.out.metrics)
-    ch_versions = ch_versions.mix(SVTEST_VCF.out.versions)
-
-    SVTEST_SRFILE(
-        split_read_evidence
-    )
-
-    ch_metrics = ch_metrics.mix(SVTEST_SRFILE.out.metrics)
-    ch_versions = ch_versions.mix(SVTEST_SRFILE.out.versions)
-
-    SVTEST_PEFILE(
-        paired_end_evidence
-    )
-
-    ch_metrics = ch_metrics.mix(SVTEST_PEFILE.out.metrics)
-    ch_versions = ch_versions.mix(SVTEST_PEFILE.out.versions)
-
-    SVTEST_RAWCOUNTS(
-        site_depths
-    )
-
-    ch_metrics = ch_metrics.mix(SVTEST_RAWCOUNTS.out.metrics)
-    ch_versions = ch_versions.mix(SVTEST_RAWCOUNTS.out.versions)
-
-    ch_metrics.dump(tag: 'gathersampleevidence_metrics', pretty: true)
-
-    emit:
-    metrics = ch_metrics
-    versions = ch_versions
-}
diff --git a/tests/all_annotate.test b/tests/all_annotate.test
new file mode 100644
index 00000000..5702c938
--- /dev/null
+++ b/tests/all_annotate.test
@@ -0,0 +1,30 @@
+nextflow_pipeline {
+
+    name "Tests using all callers"
+    script "main.nf"
+
+    test("Success") {
+        when {
+            params {
+                // TODO re-add whamg and gridss when they are fixed
+                callers = "delly,manta,smoove"
+                annotate = true
+            }
+        }
+
+        then {
+            assert workflow.success
+            assert file("${outputDir}/PosCon1/PosCon1.vcf.gz").exists()
+            assert file("${outputDir}/PosCon1/PosCon1.vcf.gz.tbi").exists()
+            assert file("${outputDir}/PosCon2/PosCon2.vcf.gz").exists()
+            assert file("${outputDir}/PosCon2/PosCon2.vcf.gz.tbi").exists()
+            assert file("${outputDir}/PosCon3/PosCon3.vcf.gz").exists()
+            assert file("${outputDir}/PosCon3/PosCon3.vcf.gz.tbi").exists()
+            assert file("${outputDir}/PosCon4/PosCon4.vcf.gz").exists()
+            assert file("${outputDir}/PosCon4/PosCon4.vcf.gz.tbi").exists()
file("${outputDir}/ready/family1/family1.vcf.gz").exists() + assert file("${outputDir}/ready/PosCon3/PosCon3.vcf.gz").exists() + assert file("${outputDir}/ready/PosCon4/PosCon4.vcf.gz").exists() + } + } +} diff --git a/tests/all.test b/tests/all_no_annotate.test similarity index 90% rename from tests/all.test rename to tests/all_no_annotate.test index a4d5677b..426f4554 100644 --- a/tests/all.test +++ b/tests/all_no_annotate.test @@ -6,8 +6,8 @@ nextflow_pipeline { test("Success") { when { params { - // TODO re-add whamg when it is fixed - callers = "delly,manta,smoove,gridss" + // TODO re-add whamg and gridss when they are fixed + callers = "delly,manta,smoove" } } diff --git a/tests/gridss.test b/tests/gridss.test index 53cddc00..5c88e341 100644 --- a/tests/gridss.test +++ b/tests/gridss.test @@ -11,18 +11,18 @@ nextflow_pipeline { } then { - assert workflow.success - assert file("${outputDir}/PosCon1/PosCon1.vcf.gz").exists() - assert file("${outputDir}/PosCon1/PosCon1.vcf.gz.tbi").exists() - assert file("${outputDir}/PosCon2/PosCon2.vcf.gz").exists() - assert file("${outputDir}/PosCon2/PosCon2.vcf.gz.tbi").exists() - assert file("${outputDir}/PosCon3/PosCon3.vcf.gz").exists() - assert file("${outputDir}/PosCon3/PosCon3.vcf.gz.tbi").exists() - assert file("${outputDir}/PosCon4/PosCon4.vcf.gz").exists() - assert file("${outputDir}/PosCon4/PosCon4.vcf.gz.tbi").exists() - assert file("${outputDir}/ready/family1/family1.vcf.gz").exists() - assert file("${outputDir}/ready/PosCon3/PosCon3.vcf.gz").exists() - assert file("${outputDir}/ready/PosCon4/PosCon4.vcf.gz").exists() + assert workflow.failed + // assert file("${outputDir}/PosCon1/PosCon1.vcf.gz").exists() + // assert file("${outputDir}/PosCon1/PosCon1.vcf.gz.tbi").exists() + // assert file("${outputDir}/PosCon2/PosCon2.vcf.gz").exists() + // assert file("${outputDir}/PosCon2/PosCon2.vcf.gz.tbi").exists() + // assert file("${outputDir}/PosCon3/PosCon3.vcf.gz").exists() + // assert file("${outputDir}/PosCon3/PosCon3.vcf.gz.tbi").exists() + // assert file("${outputDir}/PosCon4/PosCon4.vcf.gz").exists() + // assert file("${outputDir}/PosCon4/PosCon4.vcf.gz.tbi").exists() + // assert file("${outputDir}/ready/family1/family1.vcf.gz").exists() + // assert file("${outputDir}/ready/PosCon3/PosCon3.vcf.gz").exists() + // assert file("${outputDir}/ready/PosCon4/PosCon4.vcf.gz").exists() } } diff --git a/workflows/cmgg-structural.nf b/workflows/cmgg-structural.nf index c36b9e50..cafd8cb4 100644 --- a/workflows/cmgg-structural.nf +++ b/workflows/cmgg-structural.nf @@ -14,9 +14,14 @@ def checkPathParamList = [ params.input, params.multiqc_config, params.fasta, - params.fasta_fai, - params.dict, - params.allele_loci_vcf + params.fai, + params.vep_cache, + params.gnomad_sv, + params.gnomad_sv_tbi, + params.genomes1000_sv, + params.genomes1000_sv_tbi, + params.phenotypes, + params.phenotypes_tbi ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } @@ -26,26 +31,23 @@ if (params.input) { ch_input = file(params.input, checkIfExists: true) } else { // Check callers def availableCallers = [ "delly", - "whamg", + // "whamg", "manta", - "gridss", + // "gridss", "smoove" ] for (caller in params.callers.tokenize(",")) { - if(!(caller in availableCallers)) { exit 1, "The caller '${caller}' is not supported please specify a comma delimited list with on or more of the following callers: ${availableCallers}".toString() } + if(!(caller in availableCallers)) { error("The caller '${caller}' is not supported please specify a comma 
diff --git a/workflows/cmgg-structural.nf b/workflows/cmgg-structural.nf
index c36b9e50..cafd8cb4 100644
--- a/workflows/cmgg-structural.nf
+++ b/workflows/cmgg-structural.nf
@@ -14,9 +14,14 @@ def checkPathParamList = [
     params.input,
     params.multiqc_config,
     params.fasta,
-    params.fasta_fai,
-    params.dict,
-    params.allele_loci_vcf
+    params.fai,
+    params.vep_cache,
+    params.gnomad_sv,
+    params.gnomad_sv_tbi,
+    params.genomes1000_sv,
+    params.genomes1000_sv_tbi,
+    params.phenotypes,
+    params.phenotypes_tbi
 ]
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
@@ -26,26 +31,23 @@ if (params.input) { ch_input = file(params.input, checkIfExists: true) } else {
 // Check callers
 def availableCallers = [
     "delly",
-    "whamg",
+    // "whamg",
     "manta",
-    "gridss",
+    // "gridss",
     "smoove"
 ]
 for (caller in params.callers.tokenize(",")) {
-    if(!(caller in availableCallers)) { exit 1, "The caller '${caller}' is not supported please specify a comma delimited list with on or more of the following callers: ${availableCallers}".toString() }
+    if(!(caller in availableCallers)) { error("The caller '${caller}' is not supported. Please specify a comma-delimited list with one or more of the following callers: ${availableCallers}".toString()) }
 }
 if ("whamg" in params.callers.tokenize(",")) {
-    exit 1, "Whamg currently isn't functional. This will be fixed in a further build of the pipeline"
+    error("Whamg currently isn't functional. This will be fixed in a future release of the pipeline")
 }
-// Parse parameters
-fasta = Channel.fromPath(params.fasta).collect()
-fasta_fai = params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : null
-dict = params.dict ? Channel.fromPath(params.dict).collect() : null
-bwa_index = params.bwa ? Channel.fromPath(params.bwa).map {[[],it]}.collect() : null
-allele_loci_vcf = params.allele_loci_vcf ? Channel.fromPath(params.allele_loci_vcf).collect() : []
+if ("gridss" in params.callers.tokenize(",")) {
+    error("Gridss currently isn't functional. This will be fixed in a future release of the pipeline")
+}
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -80,10 +82,11 @@ include { VCF_GENOTYPE_SV_PARAGRAPH } from '../subworkflows/local/vcf_ge
 // MODULE: Installed directly from nf-core/modules
 //
 include { TABIX_BGZIPTABIX } from '../modules/nf-core/tabix/bgziptabix/main'
+include { TABIX_TABIX as TABIX_ANNOTATED } from '../modules/nf-core/tabix/tabix/main'
 include { BEDTOOLS_SORT } from '../modules/nf-core/bedtools/sort/main'
-include { GATK4_CREATESEQUENCEDICTIONARY } from '../modules/nf-core/gatk4/createsequencedictionary/main'
 include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main'
 include { BWA_INDEX } from '../modules/nf-core/bwa/index/main'
+include { ENSEMBLVEP_VEP } from '../modules/nf-core/ensemblvep/vep/main'
 include { MULTIQC } from '../modules/nf-core/multiqc/main'
 include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
@@ -102,34 +105,58 @@ workflow CMGGSTRUCTURAL {
     ch_reports = Channel.empty()
     //
-    // Create optional inputs
+    // Create input channels from parameters
     //
-    if(!fasta_fai){
-        SAMTOOLS_FAIDX(
-            fasta.map {[[],it]}
-        )
+    ch_fasta = Channel.fromPath(params.fasta).collect()
+    ch_fai = params.fai ? Channel.fromPath(params.fai).collect() : null
+    ch_bwa_index = params.bwa ? Channel.fromPath(params.bwa).map {[[],it]}.collect() : null
+    ch_vep_cache = params.vep_cache ? Channel.fromPath(params.vep_cache).collect() : []
-    ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions)
-    fasta_fai = SAMTOOLS_FAIDX.out.fai.map { it[1] }.collect()
+    ch_vep_extra_files = []
+
+    if(params.vep_structuralvariantoverlap && ((params.gnomad_sv && params.gnomad_sv_tbi) || (params.genomes1000_sv && params.genomes1000_sv_tbi))) {
+        if(params.gnomad_sv && params.gnomad_sv_tbi) {
+            ch_vep_extra_files.add(file(params.gnomad_sv, checkIfExists:true))
+            ch_vep_extra_files.add(file(params.gnomad_sv_tbi, checkIfExists:true))
+        }
+        if(params.genomes1000_sv && params.genomes1000_sv_tbi) {
+            ch_vep_extra_files.add(file(params.genomes1000_sv, checkIfExists:true))
+            ch_vep_extra_files.add(file(params.genomes1000_sv_tbi, checkIfExists:true))
+        }
+    }
+    else if (params.vep_structuralvariantoverlap) {
+        error("Please specify '--gnomad_sv PATH/TO/GNOMADSV/FILE' and '--gnomad_sv_tbi PATH/TO/GNOMADS/INDEX/FILE' and/or '--genomes1000_sv PATH/TO/genomes1000/FILE' and '--genomes1000_sv_tbi PATH/TO/genomes1000/INDEX/FILE' to use the StructuralVariantOverlap VEP plugin.")
+    }
+
+    if(params.vep_phenotypes && params.phenotypes && params.phenotypes_tbi) {
+        ch_vep_extra_files.add(file(params.phenotypes, checkIfExists:true))
+        ch_vep_extra_files.add(file(params.phenotypes_tbi, checkIfExists:true))
+    }
+    else if(params.vep_phenotypes) {
+        error("Please specify '--phenotypes PATH/TO/PHENOTYPES/FILE' and '--phenotypes_tbi PATH/TO/PHENOTYPES/INDEX/FILE' to use the Phenotypes VEP plugin.")
     }
-    if(!dict) {
-        GATK4_CREATESEQUENCEDICTIONARY(
-            fasta
+    //
+    // Create optional inputs
+    //
+
+    if(!ch_fai){
+        SAMTOOLS_FAIDX(
+            ch_fasta.map {[[],it]}
         )
-        ch_versions = ch_versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions)
-        dict = GATK4_CREATESEQUENCEDICTIONARY.out.dict.collect()
+        ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions)
+        ch_fai = SAMTOOLS_FAIDX.out.fai.map { it[1] }.collect()
     }
-    if(!bwa_index && params.callers.contains("gridss")){
+    if(!ch_bwa_index && params.callers.contains("gridss")){
         BWA_INDEX(
-            fasta.map {[[id:'bwa'],it]}
+            ch_fasta.map {[[id:'bwa'],it]}
         )
         ch_versions = ch_versions.mix(BWA_INDEX.out.versions)
-        bwa_index = BWA_INDEX.out.index.collect()
+        ch_bwa_index = BWA_INDEX.out.index.collect()
     }
     //
@@ -160,14 +187,22 @@ workflow CMGGSTRUCTURAL {
             bed: [ new_meta, bed ]
             crams: [ new_meta, cram, crai ]
         })
-        .set { inputs }
+        .set { ch_inputs }
     //
     // Prepare the BED files
     //
+    ch_inputs.bed
+        .branch { meta, bed ->
+            bed: bed
+            no_bed: !bed
+                return [ meta, [], [], [] ]
+        }
+        .set { ch_all_beds }
+
     BEDTOOLS_SORT(
-        inputs.bed,
+        ch_all_beds.bed,
         []
     )
@@ -178,20 +213,21 @@ workflow CMGGSTRUCTURAL {
     )
     ch_versions = ch_versions.mix(TABIX_BGZIPTABIX.out.versions)
-    beds = BEDTOOLS_SORT.out.sorted.combine(TABIX_BGZIPTABIX.out.gz_tbi, by:0)
+    BEDTOOLS_SORT.out.sorted
+        .join(TABIX_BGZIPTABIX.out.gz_tbi, failOnDuplicate:true, failOnMismatch:true)
+        .mix(ch_all_beds.no_bed)
+        .set { ch_beds }
     //
     // Call the variants
     //
     BAM_STRUCTURAL_VARIANT_CALLING(
-        inputs.crams,
-        beds,
-        allele_loci_vcf,
-        fasta,
-        fasta_fai,
-        dict,
-        bwa_index
+        ch_inputs.crams,
+        ch_beds,
+        ch_fasta,
+        ch_fai,
+        ch_bwa_index
     )
     ch_versions = ch_versions.mix(BAM_STRUCTURAL_VARIANT_CALLING.out.versions)
@@ -203,10 +239,34 @@ workflow CMGGSTRUCTURAL {
     VCF_GENOTYPE_SV_PARAGRAPH(
         BAM_STRUCTURAL_VARIANT_CALLING.out.vcfs,
-        inputs.crams,
-        fasta,
-        fasta_fai
+        ch_inputs.crams,
+        ch_fasta,
+        ch_fai
     )
+    ch_versions = ch_versions.mix(VCF_GENOTYPE_SV_PARAGRAPH.out.versions)
+
+    //
+    // Annotate using Ensembl VEP
+    //
+
+    if(params.annotate) {
+        ENSEMBLVEP_VEP(
+            VCF_GENOTYPE_SV_PARAGRAPH.out.genotyped_vcfs,
+            params.genome,
+            params.species,
+            params.vep_cache_version,
+            ch_vep_cache,
+            ch_fasta,
+            ch_vep_extra_files
+        )
+
+        ch_reports = ch_reports.mix(ENSEMBLVEP_VEP.out.report)
+        ch_versions = ch_versions.mix(ENSEMBLVEP_VEP.out.versions)
+
+        TABIX_ANNOTATED(
+            ENSEMBLVEP_VEP.out.vcf
+        )
+    }
     //
     // Dump the software versions
@@ -228,6 +288,7 @@ workflow CMGGSTRUCTURAL {
     ch_methods_description = Channel.value(methods_description)
     ch_multiqc_files = Channel.empty()
+    ch_multiqc_files = ch_multiqc_files.mix(ch_reports)
     ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
     ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml'))
     ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
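The StructuralVariantOverlap and Phenotypes VEP plugins are only wired in when their companion data files are provided; otherwise the parameter checks above stop the run with `error()`. A hypothetical params block (all paths are placeholders) that would satisfy those checks:

```nextflow
// Hypothetical -c custom.config snippet; every path below is a placeholder.
params {
    annotate                     = true
    vep_cache                    = '/refs/vep_cache'

    // StructuralVariantOverlap plugin: SV population VCF(s) plus their .tbi indices
    vep_structuralvariantoverlap = true
    gnomad_sv                    = '/refs/gnomad_sv.vcf.gz'
    gnomad_sv_tbi                = '/refs/gnomad_sv.vcf.gz.tbi'

    // Phenotypes plugin: phenotypes file plus its index
    vep_phenotypes               = true
    phenotypes                   = '/refs/phenotypes.gff.gz'
    phenotypes_tbi               = '/refs/phenotypes.gff.gz.tbi'
}
```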