diff --git a/CHANGELOG.md b/CHANGELOG.md index 315efb773..0d2c7f047 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,8 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### `Added` ### `Fixed` -* [#145](https://github.com/nf-core/eager/issues/145) - Added Picard Memory Handling [fix](https://github.com/nf-core/eager/issues/144) - +* [#145](https://github.com/nf-core/eager/pull/145) - Added Picard Memory Handling [fix](https://github.com/nf-core/eager/issues/144) +* [#147](https://github.com/nf-core/eager/pull/147) - Fix Samtools Index for [large references](https://github.com/nf-core/eager/issues/146) ## [2.0.5] - 2019-01-28 diff --git a/README.md b/README.md index b5b8298b0..3485ac55c 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,18 @@ James Fellows Yates, Raphael Eisenhofer and Judith Neukamm. If you want to contribute, please open an issue and ask to be added to the project - happy to do so and everyone is welcome to contribute here! +## Contributors + +- [James A. Fellows-Yates](https://github.com/jfy133) +- [Stephen Clayton](https://github.com/sc13-bioinf) +- [Judith Neukamm](https://github.com/JudithNeukamm) +- [Raphael Eisenhofer](https://github.com/EisenRa) +- [Maxime Garcia](https://github.com/MaxUlysse) +- [Luc Venturini](https://github.com/lucventurini) +- [Hester van Schalkwyk](https://github.com/hesterjvs) + +If you've contributed and you're missing in here, please let me know and I'll add you in. + ## Tool References * **EAGER v1**, CircularMapper, DeDup* Peltzer, A., Jäger, G., Herbig, A., Seitz, A., Kniep, C., Krause, J., & Nieselt, K. (2016). EAGER: efficient ancient genome reconstruction. Genome Biology, 17(1), 1–14. 
[https://doi.org/10.1186/s13059-016-0918-z](https://doi.org/10.1186/s13059-016-0918-z) Download: [https://github.com/apeltzer/EAGER-GUI](https://github.com/apeltzer/EAGER-GUI) and [https://github.com/apeltzer/EAGER-CLI](https://github.com/apeltzer/EAGER-CLI) diff --git a/docs/usage.md b/docs/usage.md index dbb3889a1..60c65e6a0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -170,6 +170,10 @@ If you prefer, you can specify the full path to your reference genome when you r ``` > If you don't specify appropriate `--bwa_index`, `--fasta_index` parameters, the pipeline will create these indices for you automatically. Note, that saving these for later has to be turned on using `--saveReference`. You may also specify the path to a gzipped (`*.gz` file extension) FastA as reference genome - this will be uncompressed by the pipeline automatically for you. Note that other file extensions such as `.fna`, `.fa` are also supported but will be renamed to `.fasta` automatically by the pipeline. +### `--large_ref` + +This parameter is required to be set for large reference genomes. If your reference genome is larger than 3.5GB, the `samtools index` calls in the pipeline need to generate `CSI` indices instead of `BAI` indices to compensate for the size of the reference genome. This parameter is not required for smaller references (including a human `hg19` or `grch37`/`grch38` reference), but `>4GB` genomes have been shown to need `CSI` indices. + ### `--genome` (using iGenomes) The pipeline config files come bundled with paths to the illumina iGenomes reference index files. If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource. diff --git a/main.nf b/main.nf index 7513f0649..2c8a586d8 100644 --- a/main.nf +++ b/main.nf @@ -240,12 +240,6 @@ if("${params.fasta}".endsWith(".gz")){ .ifEmpty { exit 1, "No genome specified!
Please specify one with --fasta"} .into {ch_fasta_for_bwa_indexing;ch_fasta_for_faidx_indexing;ch_fasta_for_dict_indexing; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_circularmapper_index} } - - - - - - //Index files provided? Then check whether they are correct and complete if (params.aligner != 'bwa' && !params.circularmapper && !params.bwamem){ @@ -346,6 +340,7 @@ summary['Pipeline Version'] = workflow.manifest.version summary['Run Name'] = custom_runName ?: workflow.runName summary['Reads'] = params.reads summary['Fasta Ref'] = params.fasta +summary['BAM Index Type'] = (params.large_ref == "") ? 'BAI' : 'CSI' if(params.bwa_index) summary['BWA Index'] = params.bwa_index summary['Data Type'] = params.singleEnd ? 'Single-End' : 'Paired-End' summary['Max Memory'] = params.max_memory @@ -649,16 +644,17 @@ process bwa { output: file "*.sorted.bam" into ch_mapped_reads_idxstats,ch_mapped_reads_filter,ch_mapped_reads_preseq, ch_mapped_reads_damageprofiler - file "*.bai" into ch_bam_index_for_damageprofiler + file "*.{bai,csi}" into ch_bam_index_for_damageprofiler script: prefix = reads[0].toString() - ~/(_R1)?(\.combined\.)?(prefixed)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?$/ fasta = "${index}/*.fasta" + size = "${params.large_ref}" ? '-c' : '' """ bwa aln -t ${task.cpus} $fasta $reads -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f "${reads.baseName}.sai" bwa samse -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta "${reads.baseName}".sai $reads | samtools sort -@ ${task.cpus} -O bam - > "${prefix}".sorted.bam - samtools index "${prefix}".sorted.bam + samtools index "${size}" "${prefix}".sorted.bam """ } @@ -703,19 +699,20 @@ process circularmapper{ output: file "*.sorted.bam" into ch_mapped_reads_idxstats_cm,ch_mapped_reads_filter_cm,ch_mapped_reads_preseq_cm, ch_mapped_reads_damageprofiler_cm - file "*.bai" + file "*.{bai,csi}" script: filter = "${params.circularfilter}" ? 
'' : '-f true -x false' prefix = reads[0].toString() - ~/(_R1)?(\.combined\.)?(prefixed)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?$/ fasta = "${index}/*_*.fasta" + size = "${params.large_ref}" ? '-c' : '' """ bwa aln -t ${task.cpus} $fasta $reads -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f "${reads.baseName}.sai" bwa samse -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta "${reads.baseName}".sai $reads > tmp.out realignsamfile -e ${params.circularextension} -i tmp.out -r $fasta $filter samtools sort -@ ${task.cpus} -O bam tmp_realigned.bam > "${prefix}".sorted.bam - samtools index "${prefix}".sorted.bam + samtools index "${size}" "${prefix}".sorted.bam """ } @@ -731,15 +728,16 @@ process bwamem { output: file "*.sorted.bam" into ch_bwamem_mapped_reads_idxstats,ch_bwamem_mapped_reads_filter,ch_bwamem_mapped_reads_preseq, ch_bwamem_mapped_reads_damageprofiler - file "*.bai" + file "*.{bai,csi}" script: prefix = reads[0].toString() - ~/(_R1)?(\.combined\.)?(prefixed)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?$/ fasta = "${index}/*.fasta" + size = "${params.large_ref}" ? '-c' : '' """ bwa mem -t ${task.cpus} $fasta $reads -R "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" | samtools sort -@ ${task.cpus} -O bam - > "${prefix}".sorted.bam - samtools index -@ ${task.cpus} "${prefix}".sorted.bam + samtools index "${size}" -@ ${task.cpus} "${prefix}".sorted.bam """ } @@ -786,38 +784,39 @@ process samtools_filter { file "*filtered.bam" into ch_bam_filtered_qualimap, ch_bam_filtered_dedup, ch_bam_filtered_markdup, ch_bam_filtered_pmdtools, ch_bam_filtered_angsd, ch_bam_filtered_gatk file "*.fastq.gz" optional true file "*.unmapped.bam" optional true - file "*.bai" + file "*.{bai,csi}" script: prefix="$bam" - ~/(\.bam)?/ + size = "${params.large_ref}" ? 
'-c' : '' if("${params.bam_discard_unmapped}" && "${params.bam_unmapped_type}" == "discard"){ """ samtools view -h -b $bam -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam - samtools index ${prefix}.filtered.bam + samtools index "${size}" ${prefix}.filtered.bam """ } else if("${params.bam_discard_unmapped}" && "${params.bam_unmapped_type}" == "bam"){ """ samtools view -h $bam | tee >(samtools view - -@ ${task.cpus} -f4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.unmapped.bam) >(samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam) - samtools index ${prefix}.filtered.bam + samtools index "${size}" ${prefix}.filtered.bam """ } else if("${params.bam_discard_unmapped}" && "${params.bam_unmapped_type}" == "fastq"){ """ samtools view -h $bam | tee >(samtools view - -@ ${task.cpus} -f4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.unmapped.bam) >(samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam) - samtools index ${prefix}.filtered.bam + samtools index "${size}" ${prefix}.filtered.bam samtools fastq -tn ${prefix}.unmapped.bam | pigz -p ${task.cpus} > ${prefix}.unmapped.fastq.gz rm ${prefix}.unmapped.bam """ } else if("${params.bam_discard_unmapped}" && "${params.bam_unmapped_type}" == "both"){ """ samtools view -h $bam | tee >(samtools view - -@ ${task.cpus} -f4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.unmapped.bam) >(samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam) - samtools index ${prefix}.filtered.bam + samtools index "${size}" ${prefix}.filtered.bam samtools fastq -tn ${prefix}.unmapped.bam | pigz -p ${task.cpus} > ${prefix}.unmapped.fastq.gz """ } else { //Only apply quality filtering, default """ samtools view -h -b $bam -@ ${task.cpus} -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam - samtools index 
${prefix}.filtered.bam + samtools index "${size}" ${prefix}.filtered.bam """ } } @@ -841,25 +840,26 @@ process dedup{ file "*.hist" into ch_hist_for_preseq file "*.log" into ch_dedup_results_for_multiqc file "${prefix}.sorted.bam" into ch_dedup_bam - file "*.bai" + file "*.{bai,csi}" script: prefix="${bam.baseName}" treat_merged="${params.dedup_all_merged}" ? '-m' : '' - + size = "${params.large_ref}" ? '-c' : '' + if(params.singleEnd) { """ dedup -i $bam $treat_merged -o . -u mv *.log dedup.log samtools sort -@ ${task.cpus} "$prefix"_rmdup.bam -o "$prefix".sorted.bam - samtools index "$prefix".sorted.bam + samtools index "${size}" "$prefix".sorted.bam """ } else { """ dedup -i $bam $treat_merged -o . -u mv *.log dedup.log samtools sort -@ ${task.cpus} "$prefix"_rmdup.bam -o "$prefix".sorted.bam - samtools index "$prefix".sorted.bam + samtools index "${size}" "$prefix".sorted.bam """ } } @@ -1037,15 +1037,16 @@ process bam_trim { output: file "*.trimmed.bam" into ch_trimmed_bam_for_genotyping - file "*.bai" + file "*.{bai,csi}" script: prefix="${bam.baseName}" softclip = "${params.bamutils_softclip}" ? '-c' : '' + size = "${params.large_ref}" ? '-c' : '' """ bam trimBam $bam tmp.bam -L ${params.bamutils_clip_left} -R ${params.bamutils_clip_right} ${softclip} samtools sort -@ ${task.cpus} tmp.bam -o ${prefix}.trimmed.bam - samtools index ${prefix}.trimmed.bam + samtools index "${size}" ${prefix}.trimmed.bam """ } diff --git a/nextflow.config b/nextflow.config index f4ac236ad..150d9bc25 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,7 +23,8 @@ params { tracedir = "${params.outdir}/pipeline_info" readPaths = false bam = false - + large_ref = false + //More defaults complexity_filter = false complexity_filter_poly_g_min = 10