diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fca6661b8..a8c26c622 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -122,6 +122,9 @@ jobs: - name: MALTEXTRACT Basic with MALT plus MaltExtract run: | nextflow run ${GITHUB_WORKSPACE} "$TOWER" -name "$RUN_NAME-maltextract" -profile test,docker --paired_end --run_bam_filtering --bam_discard_unmapped --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt" --run_maltextract --maltextract_ncbifiles "/home/runner/work/eager/eager/databases/maltextract/" --maltextract_taxon_list 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/maltextract/MaltExtract_list.txt' + - name: METAGENOMIC Run the basic pipeline but with unmapped reads going into Kraken + run: | + nextflow run ${GITHUB_WORKSPACE} "$TOWER" -name "$RUN_NAME-kraken" -profile test_kraken,docker ${{ matrix.endedness }} --run_bam_filtering --bam_discard_unmapped --bam_unmapped_type 'fastq' - name: SEXDETERMINATION Run the basic pipeline with the bam input profile, but don't convert BAM, skip everything but sex determination run: | nextflow run ${GITHUB_WORKSPACE} "$TOWER" -name "$RUN_NAME-sexdeterrmine" -profile test_humanbam,docker --bam --skip_fastqc --skip_adapterremoval --skip_mapping --skip_deduplication --skip_qualimap --single_end --run_sexdeterrmine diff --git a/CHANGELOG.md b/CHANGELOG.md index 774191a63..aa3dd4d30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. * [#326](https://github.com/nf-core/eager/pull/326) - Add Biopython and [xopen](https://github.com/marcelm/xopen/) dependencies * [#336](https://github.com/nf-core/eager/issues/336) - Change default Y-axis maximum value of DamageProfiler to 30% to match popular (but slower) mapDamage, and allow user to set their own value. 
* [#352](https://github.com/nf-core/eager/pull/352) - Add social preview image +* [#355](https://github.com/nf-core/eager/pull/355) - Add Kraken2 metagenomics classifier ### `Fixed` diff --git a/README.md b/README.md index 6a8835b7c..54cb311bd 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,8 @@ Additional functionality contained by the pipeline currently includes: #### Metagenomic Screening * Taxonomic binner with alignment (`MALT`) -* aDNA characteristic screening of taxonomically binned data (`MaltExtract`) +* Taxonomic binner without alignment (`Kraken2`) +* aDNA characteristic screening of taxonomically binned data from MALT (`MaltExtract`) ## Quick Start @@ -157,3 +158,4 @@ If you've contributed and you're missing in here, please let me know and I'll ad * Vågene, Å.J. et al., 2018. Salmonella enterica genomes from victims of a major sixteenth-century epidemic in Mexico. Nature ecology & evolution, 2(3), pp.520–528. Available at: [http://dx.doi.org/10.1038/s41559-017-0446-6](http://dx.doi.org/10.1038/s41559-017-0446-6). * Herbig, A. et al., 2016. MALT: Fast alignment and analysis of metagenomic DNA sequence data applied to the Tyrolean Iceman. bioRxiv, p.050559. Available at: [http://biorxiv.org/content/early/2016/04/27/050559](http://biorxiv.org/content/early/2016/04/27/050559). * **MaltExtract** Huebler, R. et al., 2019. HOPS: Automated detection and authentication of pathogen DNA in archaeological remains. bioRxiv, p.534198. Available at: [https://www.biorxiv.org/content/10.1101/534198v1?rss=1](https://www.biorxiv.org/content/10.1101/534198v1?rss=1). Download: [https://github.com/rhuebler/MaltExtract](https://github.com/rhuebler/MaltExtract) +* **Kraken2** Wood, D et al., 2019. Improved metagenomic analysis with Kraken 2. Genome Biology volume 20, Article number: 257. Available at: [https://doi.org/10.1186/s13059-019-1891-0](https://doi.org/10.1186/s13059-019-1891-0). 
#!/usr/bin/env python
"""Reduce a Kraken report to a two-column ``TAXID,<sample>`` count table.

The report is read as tab-separated text. For every row, the second column
is taken as the read count and the fifth column as the taxon id; rows whose
count reaches the threshold are written out as CSV.
"""

import argparse
import csv

# 0-based positions of the fields read from each report row.
# NOTE(review): in the Kraken2 report layout these should be the clade-level
# read count and the NCBI taxid - confirm against the Kraken2 manual if the
# report is produced with non-default options.
_READS_COL = 1
_TAXID_COL = 4


def _get_args():
    """Parse the command line.

    Returns:
        tuple: ``(infile, countlim, outfile)`` where ``infile`` is the report
        path, ``countlim`` the integer read threshold (default 50) and
        ``outfile`` the requested output path, or ``None`` when ``-o`` was
        not given.
    """
    parser = argparse.ArgumentParser(
        prog='kraken_parse',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description='Parsing kraken')
    parser.add_argument('krakenReport', help="path to kraken report file")
    parser.add_argument(
        '-c',
        dest="count",
        type=int,  # let argparse reject non-integer thresholds with a usage error
        default=50,
        help="Minimum number of hits on clade to report it. Default = 50")
    parser.add_argument(
        '-o',
        dest="output",
        default=None,
        help="Output file. Default = <report basename>.kraken_parsed.csv")

    args = parser.parse_args()
    return (args.krakenReport, args.count, args.output)


def _get_basename(file_name):
    """Return ``file_name`` without leading directories and without
    everything from its first dot onwards (``a/b/s1.x.kreport`` -> ``s1``)."""
    # split("/")[-1] is a no-op for names without a slash, so no branch needed.
    return file_name.split("/")[-1].split(".")[0]


def parse_kraken(infile, countlim):
    """Collect the taxa of a report whose read count reaches ``countlim``.

    Args:
        infile (str): path to the tab-separated kraken report file.
        countlim (int): lowest read count for a row to be kept (inclusive).

    Returns:
        dict: key = taxid (str, fifth column), value = read count (int,
        second column), in report order.

    Raises:
        ValueError: if a non-blank row has fewer than five columns or its
            read count is not an integer.
    """
    resdict = {}
    # newline='' is what the csv module asks for when handing it a file object.
    with open(infile, 'r', newline='') as handle:
        reader = csv.reader(handle, delimiter='\t')
        for lineno, row in enumerate(reader, start=1):
            if not any(field.strip() for field in row):
                continue  # tolerate blank lines instead of dying on row[1]
            if len(row) <= _TAXID_COL:
                raise ValueError(
                    f"{infile}: line {lineno} has {len(row)} column(s), "
                    f"expected at least {_TAXID_COL + 1}")
            reads = int(row[_READS_COL])
            if reads >= countlim:
                resdict[row[_TAXID_COL]] = reads
    return resdict


def write_output(resdict, infile, outfile):
    """Write ``resdict`` to ``outfile`` as CSV.

    The header line is ``TAXID,<basename of infile>``; each following line is
    ``<taxid>,<read count>``.
    """
    basename = _get_basename(infile)
    with open(outfile, 'w') as out:
        out.write(f"TAXID,{basename}\n")
        out.writelines(
            f"{taxid},{reads}\n" for taxid, reads in resdict.items())


def main():
    """Command-line entry point: parse the report and write the CSV."""
    infile, countlim, outfile = _get_args()

    if not outfile:
        outfile = _get_basename(infile) + ".kraken_parsed.csv"

    counts = parse_kraken(infile=infile, countlim=countlim)
    write_output(resdict=counts, infile=infile, outfile=outfile)


if __name__ == '__main__':
    main()
#!/usr/bin/env python
"""Merge per-sample ``TAXID,<sample>`` count CSVs into one count table.

Every ``*.csv`` file in the current working directory is read with its first
column as index, the tables are outer-joined on that index, and taxa missing
from a sample are given a count of 0.
"""

import argparse
import os

import pandas as pd


def _get_args():
    """Parse the command line and return the output file path (str)."""
    parser = argparse.ArgumentParser(
        prog='merge_kraken_res',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description='Merging csv count files in one table')
    parser.add_argument(
        '-o',
        dest="output",
        default="kraken_count_table.csv",
        help="Output file. Default = kraken_count_table.csv")

    args = parser.parse_args()
    return args.output


def get_csv(exclude=()):
    """List the CSV files of the current working directory.

    Args:
        exclude (iterable of str): paths to leave out, e.g. the output table
            of a previous run so that it is not merged into itself.

    Returns:
        list of str: file names ending in ``.csv``, sorted so that the column
        order of the merged table does not depend on directory listing order.
    """
    skipped = {os.path.realpath(path) for path in exclude}
    return sorted(
        name for name in os.listdir()
        if name.endswith(".csv") and os.path.realpath(name) not in skipped
    )


def _get_basename(file_name):
    """Return ``file_name`` without leading directories and without
    everything from its first dot onwards. Not used by this script."""
    return file_name.split("/")[-1].split(".")[0]


def merge_csv(all_csv):
    """Outer-join the count tables on their first (index) column.

    Args:
        all_csv (list of str): paths of the CSV files to merge; each is read
            with its first column as index.

    Returns:
        pandas.DataFrame: one column per input column, index named ``TAXID``
        and sorted; cells missing from an input are 0 and all counts are
        integers.

    Raises:
        ValueError: if ``all_csv`` is empty.
    """
    if not all_csv:
        raise ValueError("no CSV count files to merge")
    frames = [pd.read_csv(path, index_col=0) for path in all_csv]
    # A single column-wise concat aligns all tables on the union of their
    # indexes, replacing the pairwise merge loop.
    merged = pd.concat(frames, axis=1, join="outer")
    merged = merged.rename_axis("TAXID").sort_index()
    # The outer join introduces NaN, which turns counts into floats; fill the
    # gaps and cast back so the table holds 12 rather than 12.0.
    return merged.fillna(0).astype(int)


def write_csv(pd_dataframe, outfile):
    """Write ``pd_dataframe`` (index included) to ``outfile`` as CSV."""
    pd_dataframe.to_csv(outfile)


def main():
    """Command-line entry point: merge the CSVs found in the working directory."""
    outfile = _get_args()
    all_csv = get_csv(exclude=(outfile,))
    resdf = merge_csv(all_csv)
    write_csv(resdf, outfile)
    print(resdf)


if __name__ == "__main__":
    main()
a fast and simple test. Use as follows: + * nextflow run nf-core/eager -profile test_kraken,docker (or singularity, or conda) + */ + +params { + config_profile_name = 'Test profile kraken' + config_profile_description = 'Minimal test dataset to check pipeline function with kraken metagenomic profiler' + // Limit resources so that this can run on Travis + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + genome = false + //Input data + single_end = false + metagenomic_tool = 'kraken' + run_metagenomic_screening = true + readPaths = [['JK2782_TGGCCGATCAACGA_L008', ['https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2782_TGGCCGATCAACGA_L008_R1_001.fastq.gz.tengrand.fq.gz','https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2782_TGGCCGATCAACGA_L008_R2_001.fastq.gz.tengrand.fq.gz']], + ['JK2802_AGAATAACCTACCA_L008', ['https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2802_AGAATAACCTACCA_L008_R1_001.fastq.gz.tengrand.fq.gz','https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2802_AGAATAACCTACCA_L008_R2_001.fastq.gz.tengrand.fq.gz']], + ] + // Genome references + fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Mammoth/Mammoth_MT_Krause.fasta' + database = 'https://github.com/nf-core/test-datasets/raw/eager/databases/kraken/eager_test.tar.gz' +} \ No newline at end of file diff --git a/docs/output.md b/docs/output.md index ddb6c4c62..eebeda18b 100644 --- a/docs/output.md +++ b/docs/output.md @@ -485,6 +485,8 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir * `sex_determination/` this contains the output for the sex determination run. 
This is a single `.tsv` file that includes a table with the Sample Name, the Nr of Autosomal SNPs, Nr of SNPs on the X/Y chromosome, the Nr of reads mapping to the Autosomes, the Nr of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each bam file, one row per bam. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer. * `nuclear_contamination/` this contains the output of the nuclear contamination processes. The directory contains one `*.X.contamination.out` file per individual, as well as `nuclear_contamination.txt` which is a summary table of the results for all individual. `nuclear_contamination.txt` contains a header, followed by one line per individual, comprised of the Method of Moments (MOM) and Maximum Likelihood (ML) contamination estimate (with their respective standard errors) for both Method1 and Method2. * `bedtools/` this contains two files as the output from bedtools coverage. One file contains the 'breadth' coverage (`*.breadth.gz`). This file will have the contents of your annotation file (e.g. BED/GFF), and the following subsequent columns: no. reads on feature, # bases at depth, length of feature, and % of feature. The second file (`*.depth.gz`), contains the contents of your annotation file (e.g. BED/GFF), and an additional column which is mean depth coverage (i.e. average number of reads covering each position). -* `metagenomic_classification/` This contains the output for a given metagenomic classifer (currently only for MALT). Malt will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. 
Additional a `malt.log` file is provided which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonmic assignment etc. +* `metagenomic_classification/` This contains the output for a given metagenomic classifier. + * Malt will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additionally, a `malt.log` file is provided which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. + * Kraken will contain the Kraken output and report files, as well as a merged Taxon count table. * `MaltExtract/` this will contain a `results` directory in which contains the output from MaltExtract - typically one folder for each filter type, an error and a log file. The characteristics of each node (e.g. damage, read lengths, edit distances - each in different txt formats) can be seen in each sub-folder of the filter folders. Output can be visualised either with the [HOPS postprocessing script](https://github.com/rhuebler/HOPS) or [MEx-IPA](https://github.com/jfy133/MEx-IPA) * `consensus_sequence` this contains three FASTA files from VCF2Genome, of a consensus sequence based on the reference FASTA with each sample's unique modifications. The main FASTA is a standard file with bases not passing the specified thresholds as Ns. The two other FASTAS (`_refmod.fasta.gz`) and (`_uncertainity.fasta.gz`) are IUPAC uncertainty codes (rather than Ns) and a special number-based uncertainity system used for other downstream tools, respectively. diff --git a/docs/usage.md b/docs/usage.md index 6d575143f..4de682f7d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -889,7 +889,7 @@ The name of the chromosome X in your bam. `'X'` for hs37d5, `'chrX'` for HG19. 
D ### Metagenomic Screening -An increasingly common line of analysis in high-throughput aDNA analysis today is simultaenously screening off target reads of the host for endogenous microbial signals - particularly of pathogens. Metagenomic screening is currently offered via MALT with aDNA specific verification via MaltExtract. +An increasingly common line of analysis in high-throughput aDNA analysis today is simultaneously screening off target reads of the host for endogenous microbial signals - particularly of pathogens. Metagenomic screening is currently offered via MALT with aDNA specific verification via MaltExtract, or Kraken2. Please note the following: @@ -899,115 +899,161 @@ Please note the following: > RUNNING MALT ON A SERVER WITH LESS THAN 128GB OF MEMORY SHOULD BE PERFORMED AT YOUR OWN RISK -#### -`-run_metagenomic_screening` +#### `--run_metagenomic_screening` Turn on the metagenomic screening module. #### `--metagenomic_tool` -Specify which taxonomic classifier to use. The only option avaliable is currently 'malt'. - -More can be seen in the [MALT documentation](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf) +Specify which taxonomic classifier to use. There are two options available: +- `kraken` with [Kraken2](https://ccb.jhu.edu/software/kraken2) +- `malt` : more can be seen in the [MALT documentation](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf) + :warning: **Important** It is very important to run `nextflow clean -f` on your nextflow run directory once completed. RMA6 files are VERY large and are _copied_ from a `work/` directory into the results folder. You should clean the work directory with the command to ensure non-redundency and large HDD footprints! +#### `--metagenomic_min_support_reads` + +Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'. +For malt, this only applies when `--malt_min_support_mode` is set to 'reads'. Default: 1 . 
+ #### `--database` -Specify the path to the _directory_ containing your taxonomic classifer's database. +Specify the path to the _directory_ containing your taxonomic classifier's database (malt or kraken). + +For Kraken2, it can be either the path to the _directory_ or the path to the `.tar.gz` compressed directory of the Kraken2 database. #### `--percent_identity` Specify the minimum percent identity (or similarity) a squence must have to the reference for it to be retained. Default is 85 +Only used when `--metagenomic_tool malt` is also supplied + #### `--malt_mode` Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA and DNA, protein and protein, or DNA reads against protein references respectively. -respectively. Ensure your database matches the mode. Check the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf) for more details. Default: 'BlastN' +Ensure your database matches the mode. Check the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf) for more details. Default: 'BlastN' + +Only when `--metagenomic_tool malt` is also supplied #### `--malt_alignment_mode` Specify what alignment algorithm to use. Options are 'Local' or 'SemiGlobal'. Local is a BLAST like alignment, but is much slower. Semi-global alignment aligns reads end-to-end. Default: 'SemiGlobal' +Only when `--metagenomic_tool malt` is also supplied + #### `--malt_top_percent` Specify the top percent value of the LCA algorthim. From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): "For each read, only those matches are used for taxonomic placement whose bit disjointScore is within 10% of the best disjointScore for that read.". Default: 1. 
+Only when `--metagenomic_tool malt` is also supplied + #### `--malt_min_support_mode` Specify whether to use a percentage, or raw number of reads as the value used to decide the minimum support a taxon requires to be retained. +Only when `--metagenomic_tool malt` is also supplied + #### `--malt_min_support_percent` Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01. -#### `--malt_min_support_reads` - -Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'reads'. Default: 1 . +Only when `--metagenomic_tool malt` is also supplied #### `--malt_max_queries` Specify the maximum number of alignments a read can have. All further alignments are discarded. Default: 100 +Only when `--metagenomic_tool malt` is also supplied + #### `--malt_memory_mode` How to load the database into memory. Options are 'load', 'page' or 'map'. 'load' directly loads the entire database into memory prior seed look up, this is slow but compatible with all servers/file systems. 'page' and 'map' perform a sort of 'chunked' database loading, allow seed look up prior entire database loading. Note that Page and Map modes do not work properly not with many remote filesystems such as GPFS. Default is 'load'. +Only when `--metagenomic_tool malt` is also supplied + #### `--run_maltextract` Turn on MaltExtract for MALT aDNA characteristics authentication of metagenomic output from MALT. More can be seen in the [MaltExtract documentation](https://github.com/rhuebler/) +Only when `--metagenomic_tool malt` is also supplied + #### `maltextract_taxon_list` Path to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. 
In `.txt` file should be one taxon per row, and the taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format. +Only when `--metagenomic_tool malt` is also supplied + #### `maltextract_ncbifiles` Path to directory containing containing the NCBI resource tree and taxonomy table files (ncbi.tre and ncbi.map; avaliable at the [HOPS repository](https://github.com/rhuebler/HOPS/Resources)). +Only when `--metagenomic_tool malt` is also supplied + #### `maltextract_filter` Specify which MaltExtract filter to use. This is used to specify what types of characteristics to scan for. The default will output statistics on all alignments, and then a second set with just reads with one C to T mismatch in the first 5 bases. Further details on other parameters can be seen in the [HOPS documentation](https://github.com/rhuebler/HOPS/#maltextract-parameters). Options: 'def_anc', 'ancient', 'default', 'crawl', 'scan', 'srna', 'assignment'. Default: 'def_anc'. +Only when `--metagenomic_tool malt` is also supplied + #### `maltextract_toppercent` Specify percent of top alignments for each read to be considered for each node. Default: 0.01. +Only when `--metagenomic_tool malt` is also supplied + #### `maltextract_destackingoff` Turn off destacking. If left on, a read that overlap with another read will be removed (leaving a depth coverage of 1). +Only when `--metagenomic_tool malt` is also supplied + #### `maltextract_downsamplingoff` Turn off downsampling. By default, downsampling is on and will randomly select 10,000 reads if the number of reads on a node exceeds this number. This is to speed up processing, under the assumption at 10,000 reads the species is a 'true positive'. +Only when `--metagenomic_tool malt` is also supplied + #### `maltextract_duplicateremovaloff` Turn off duplicate removal. By default, reads that are an exact copy (i.e. same start, stop coordinate and exact sequence match) will be removed as it is considered a PCR duplicate. 
+Only when `--metagenomic_tool malt` is also supplied + #### `maltextract_matches` Export alignments of hits for each node in BLAST format. By default turned off. +Only when `--metagenomic_tool malt` is also supplied + #### `maltextract_megansummary` Export 'minimal' summary files (i.e. without alignments) that can be loaded into [MEGAN6](https://doi.org/10.1371/journal.pcbi.1004957). By default turned off. +Only when `--metagenomic_tool malt` is also supplied + #### `maltextract_percentidentity` Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. Default: 85.0. +Only when `--metagenomic_tool malt` is also supplied + #### `maltextract_topalignment` Use the best alignment of each read for every statistic, except for those concerning read distribution and coverage. Default: off. +Only when `--metagenomic_tool malt` is also supplied + #### `maltextract_singlestranded` Switch damage patterns to single-stranded mode. Default: off. +Only when `--metagenomic_tool malt` is also supplied + ## Clean up Once completed a run has completed, you will have _lots_ of (some very large) intermediate files in your output directory, within the directory named `work`. diff --git a/main.nf b/main.nf index 5a6237bf6..4b9e85f95 100644 --- a/main.nf +++ b/main.nf @@ -183,7 +183,7 @@ def helpMessage() { --malt_top_percent Specify the percent for LCA algorithm (see MEGAN6 CE manual). Default: 1 --malt_min_support_mode Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained. Options: 'percent', 'reads'. 
Default: 'percent' --malt_min_support_percent Specify the minimum percentage of reads a taxon of sample total is required to have to be retained. Default: 0.01 - --malt_min_support_reads Specify a minimum number of reads a taxon of sample total is required to have to be retained. Not compatible with . Default: 1 + --metagenomic_min_support_reads Specify a minimum number of reads a taxon of sample total is required to have to be retained. Not compatible with . Default: 1 --malt_max_queries Specify the maximium number of queries a read can have. Default: 100 --malt_memory_mode Specify the memory load method. Do not use 'map' with GTFS file system. Options: 'load', 'page', 'map'. Default: 'load' @@ -427,8 +427,8 @@ if (params.run_metagenomic_screening) { exit 1, "Metagenomic classification can only run on unmapped reads in FASTSQ format. Please supply --bam_unmapped_type 'fastq'. You gave '${params.bam_unmapped_type}'!" } - if (params.metagenomic_tool != 'malt' ) { - exit 1, "Metagenomic classification can currently only be run with 'malt'. Please check your classifer. You gave '${params.metagenomic_tool}'!" + if (params.metagenomic_tool != 'malt' && params.metagenomic_tool != 'kraken') { + exit 1, "Metagenomic classification can currently only be run with 'malt' or 'kraken' (kraken2). Please check your classifer. You gave '${params.metagenomic_tool}'!" } if (params.database == '' ) { @@ -443,8 +443,8 @@ if (params.run_metagenomic_screening) { exit 1, "Unknown MALT alignment mode specified. Options: 'Local', 'SemiGlobal'. You gave '${params.malt_alignment_mode}'!" } - if (params.malt_min_support_mode == 'percent' && params.malt_min_support_reads != 1) { - exit 1, "Incompatible MALT min support configuration. Percent can only be used with --malt_min_support_percent. You modified --malt_min_support_reads!" + if (params.malt_min_support_mode == 'percent' && params.metagenomic_min_support_reads != 1) { + exit 1, "Incompatible MALT min support configuration. 
/*
 * Step 17-A: Metagenomic screening of unmapped reads: MALT
 */

// Hand the unmapped-read FASTQs emitted by samtools_filter to the selected
// classifier; the other classifier's input is defined as an empty channel so
// that both process declarations below can reference their input channel.
if (params.metagenomic_tool == 'malt') {
    ch_bam_filtering_for_metagenomic
        .set { ch_bam_filtering_for_metagenomic_malt }

    ch_bam_filtering_for_metagenomic_kraken = Channel.empty()
} else if (params.metagenomic_tool == 'kraken') {
    ch_bam_filtering_for_metagenomic
        .set { ch_bam_filtering_for_metagenomic_kraken }

    ch_bam_filtering_for_metagenomic_malt = Channel.empty()
} else {
    // The tool name is only validated when --run_metagenomic_screening is set.
    // Without this branch any other value would leave both channels undefined
    // and the script would fail at the malt/kraken process declarations.
    ch_bam_filtering_for_metagenomic_malt = Channel.empty()
    ch_bam_filtering_for_metagenomic_kraken = Channel.empty()
}
/*
 * Step 17-B: Metagenomic screening of unmapped reads: Kraken2
 */

// Resolve the Kraken2 database: a '.tar.gz' archive is unpacked by a helper
// process, any other value is used as given. The database parameter is only
// inspected when Kraken screening was actually requested.
if (params.run_metagenomic_screening && params.metagenomic_tool == 'kraken') {
    if (params.database.endsWith(".tar.gz")) {
        comp_kraken = file(params.database)

        process decomp_kraken {
            input:
            file(ckdb) from comp_kraken

            output:
            file(dbname) into ch_krakendb

            script:
            // Expected output is named after the archive up to its first dot.
            // NOTE(review): assumes the archive unpacks to a directory of
            // that name - confirm for user-supplied databases.
            dbname = params.database.tokenize("/")[-1].tokenize(".")[0]
            """
            tar xvzf $ckdb
            """
        }
    } else {
        ch_krakendb = file(params.database)
    }
} else {
    ch_krakendb = Channel.empty()
}


// Classify one unmapped-read FASTQ with kraken2, producing the per-read
// output and the report, both named after the FASTQ name up to its first dot.
process kraken {
    tag "$prefix"
    label 'mc_huge'
    publishDir "${params.outdir}/metagenomic_classification/kraken", mode:"copy"

    when:
    params.run_metagenomic_screening && params.run_bam_filtering && params.bam_discard_unmapped && params.bam_unmapped_type == 'fastq' && params.metagenomic_tool == 'kraken'

    input:
    file fastq from ch_bam_filtering_for_metagenomic_kraken
    file(krakendb) from ch_krakendb

    output:
    file "*.kraken.out" into ch_kraken_out
    set val(prefix), file("*.kreport") into ch_kraken_report

    script:
    prefix = fastq.toString().tokenize('.')[0]
    out = prefix+".kraken.out"
    kreport = prefix+".kreport"

    """
    kraken2 --db ${krakendb} --threads ${task.cpus} --output $out --report $kreport $fastq
    """
}

// Reduce one kraken report to a 'TAXID,<sample>' CSV, keeping taxa with at
// least --metagenomic_min_support_reads reads. Failures of this step are
// ignored (errorStrategy), so a failing sample is absent from the merged table.
process kraken_parse {
    tag "$name"
    errorStrategy 'ignore'

    input:
    set val(name), file(kraken_r) from ch_kraken_report

    output:
    set val(name), file('*.kraken_parsed.csv') into ch_kraken_parsed

    script:
    out = name+".kraken_parsed.csv"
    """
    kraken_parse.py -c ${params.metagenomic_min_support_reads} -o $out $kraken_r
    """
}

// Merge all per-sample CSVs into a single count table.
process kraken_merge {
    publishDir "${params.outdir}/metagenomic_classification/kraken", mode:"copy"

    input:
    // ch_kraken_parsed carries (name, csv) pairs; keep only the csv so that
    // the sample names are not handed to the file input along with the files.
    file(csv_count) from ch_kraken_parsed.map { it[1] }.collect()

    output:
    file('kraken_count_table.csv') into kraken_merged

    script:
    out = "kraken_count_table.csv"
    """
    merge_kraken_res.py -o $out
    """
}
params { malt_top_percent = 1 malt_min_support_mode = 'percent' malt_min_support_percent = 0.01 - malt_min_support_reads = 1 malt_max_queries = 100 malt_memory_mode = 'load' malt_weighted_lca = false @@ -251,6 +251,7 @@ profiles { test_fna { includeConfig 'conf/test_fna.config'} test_humanbam { includeConfig 'conf/test_humanbam.config' } test_pretrim { includeConfig 'conf/test_pretrim.config' } + test_kraken { includeConfig 'conf/test_kraken.config' } } // Load igenomes.config if required if(!params.igenomes_ignore){