diff --git a/.gitignore b/.gitignore index df1a8e9..1fe44f5 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ __pycache__/ db/* !bin/Darwin/barrnap-0.8/db/ !db/hmm_search/ +OUT/ diff --git a/README.md b/README.md index 70e248b..33871cd 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,7 @@ ln -s $DFAST_APP_ROOT/scripts/dfast_file_downloader.py /usr/local/bin/ ``` dfast_file_downloader.py -h ``` + ## Installation via conda DFAST is also available from [Bioconda](https://bioconda.github.io/recipes/dfast/README.html). Install with: ``` @@ -151,7 +152,7 @@ The following tools are run in parallel to predict biological features (e.g. CDS * CRISPR prediction (CRT) * Assembly gaps within sequences -Optionally, you can choose Prodigal and tRNAscan-SE to predict CDS and tRNA. (You need to install them manually.) +Optionally, you can choose Prodigal/GeneMarkS2, RNAmmer, tRNAscan-SE to predict CDS, rRNA, tRNA, respectively. See [FAQ](docs/FAQ.md). (You need to install them manually.) ### Functional annotation 1. OrthoSearch (Optional. Set `--references` option to enable this.) @@ -160,6 +161,8 @@ Optionally, you can choose Prodigal and tRNAscan-SE to predict CDS and tRNA. (Yo 4. HMMscan against the profile HMM database of TIGRFAM 5. CDDsearch against COG database from NCBI Conserved Domain Database +By default, GHOSTX is used to align protein sequences. Diamond/BLASTP can be used optionally. See [FAQ](docs/FAQ.md). (Diamond needs to be installed manually.) + ### Output * Sequence and annotation data in GFF3 and GenBank format * Sequence data in FASTA format @@ -171,7 +174,7 @@ Optionally, you can choose Prodigal and tRNAscan-SE to predict CDS and tRNA. (Yo ``` usage: dfast -g your_genome.fna [options] -DFAST: DDBJ Fast Annotation and Submission Tool version 1.2.3. +DFAST: DDBJ Fast Annotation and Submission Tool version 1.x.x. 
Basic options: -g PATH, --genome PATH @@ -213,10 +216,11 @@ Workflow options: db_path[,db_name[,pident,q_cov,s_cov,e_value]]) --references PATH Reference file(s) for OrthoSearch. Use semicolons for multiple files, e.g. 'genome1.faa;genome2.gbk' - --aligner STR Aligner to use [ghostx(=default)|blastp] + --aligner STR Aligner to use [ghostx(=default)|blastp|diamond] --use_prodigal Use Prodigal to predict CDS instead of MGA - --use_trnascan STR Use tRNAscan-SE to predict tRNA instead of Aragorn, - [bact|arch] + --use_genemarks2 STR Use GeneMarkS2 to predict CDS instead of MGA. [auto|bact|arch] + --use_trnascan STR Use tRNAscan-SE to predict tRNA instead of Aragorn. [bact|arch] + --use_rnammer STR Use RNAmmer to predict rRNA instead of Barrnap. [bact|arch] --gcode INT Genetic code [11(=default),4(=Mycoplasma)] --no_hmm Disable HMMscan --no_cdd Disable CDDsearch @@ -287,8 +291,8 @@ According to the user's report, DFAST fails on ArchLinux due to `libidn-11` requ Yasuhiro TANIZAWA, Takatomo FUJISAWA, Eli KAMINUMA, Yasukazu NAKAMURA, and Masanori ARITA * stand-alone version (DFAST-core) DFAST: a flexible prokaryotic genome annotation pipeline for faster genome publication. - *Bioinformatics*. 2017 Nov 2. doi: 10.1093/bioinformatics/btx713 (advance article). + *Bioinformatics*; 2018; 34(6): 1037–1039. 
Yasuhiro TANIZAWA, Takatomo FUJISAWA, Yasukazu NAKAMURA - https://academic.oup.com/bioinformatics/article-lookup/doi/10.1093/bioinformatics/btx713 + https://academic.oup.com/bioinformatics/article/34/6/1037/4587587 diff --git a/dfast b/dfast index 6e36422..29dd76d 100755 --- a/dfast +++ b/dfast @@ -12,7 +12,7 @@ from dfc.pipeline import Pipeline from dfc.utils.config_util import load_config, show_config, set_references, set_database, set_aligner, \ disable_cdd_search, disable_hmm_scan, enable_prodigal, enable_trnascan, disable_rrna_prediction, \ disable_trna_prediction, disable_cds_prediction, disable_crispr_prediction, set_genetic_code, set_gff, \ - set_threshold, set_values_from_metadata, enable_mga + set_threshold, set_values_from_metadata, enable_mga, enable_genemarks2, enable_rnammer from dfc.utils.path_util import set_binaries_path from dfc.utils.fix_origin import fix_origin @@ -64,11 +64,13 @@ group_workflow = parser.add_argument_group('Workflow options') group_workflow.add_argument("--threshold", help='Thresholds for default database search (format: "pident,q_cov,s_cov,e_value", default: "0,75,75,1e-6")', metavar="STR") group_workflow.add_argument("--database", help="Additional reference database to be searched against prior to the default database. (format: db_path[,db_name[,pident,q_cov,s_cov,e_value]])", metavar="PATH") group_workflow.add_argument("--references", help="Reference file(s) for OrthoSearch. Use semicolons for multiple files, e.g. 
'genome1.faa;genome2.gbk'", metavar="PATH") -group_workflow.add_argument("--aligner", help="Aligner to use [ghostx(=default)|blastp]", choices=["ghostx", "blastp"], metavar="STR") +group_workflow.add_argument("--aligner", help="Aligner to use [ghostx(=default)|blastp|diamond]", choices=["ghostx", "blastp", "diamond"], metavar="STR") group_genecall = group_workflow.add_mutually_exclusive_group() group_genecall.add_argument("--use_prodigal", help="Use Prodigal to predict CDS instead of MGA", action="store_true") +group_genecall.add_argument("--use_genemarks2", help="Use GeneMarkS2 to predict CDS instead of MGA. [auto|bact|arch]", choices=["auto", "bact", "arch", "bacteria", "archaea"], metavar="STR") # group_genecall.add_argument("--use_mga", help="Use MetaGeneAnnotator to predict CDS instead of Prodigal", action="store_true") -group_workflow.add_argument("--use_trnascan", help="Use tRNAscan-SE to predict tRNA instead of Aragorn, [bact|arch]", choices=["bact", "arch"], metavar="STR") +group_workflow.add_argument("--use_trnascan", help="Use tRNAscan-SE to predict tRNA instead of Aragorn. [bact|arch]", choices=["bac", "bact", "arc", "arch"], metavar="STR") +group_workflow.add_argument("--use_rnammer", help="Use RNAmmer to predict rRNA instead of Barrnap. 
[bact|arch]", choices=["bact", "arch"], metavar="STR") group_workflow.add_argument("--gcode", help="Genetic code [11(=default),4(=Mycoplasma)]", metavar="INT", type=int, default=11) group_workflow.add_argument("--no_hmm", help="Disable HMMscan", action="store_true") group_workflow.add_argument("--no_cdd", help="Disable CDDsearch", action="store_true") @@ -116,7 +118,7 @@ logger.setLevel(DEBUG) if args.help: parser.print_help() - exit() + exit(1) app_root = os.path.abspath(os.path.dirname(os.path.realpath(__file__))) @@ -210,14 +212,15 @@ if args.no_crispr: # --no_crispr disable_crispr_prediction(config) if args.gcode != 11: + if not (args.use_prodigal or args.use_genemarks2): + logger.error("'--gcode 4' cannot be specified when using MGA. Please set '--use_prodigal' or '--use_genemarks2'") + exit(1) set_genetic_code(config, args.gcode) - # disabled below. 2019.10.30 - # if gcode is set to the value other than 11, prodigal and aragorn will be used. - # args.use_prodigal = False # ignore --use_prodigal option - # args.use_trnascan = False # ignore --use_trnascan option if args.use_prodigal: # --use_prodigal enable_prodigal(config) +if args.use_genemarks2: # --use_genemarks2 + enable_genemarks2(config, args.use_genemarks2) # if args.use_mga: # enable_mga(config) @@ -225,6 +228,9 @@ if args.use_prodigal: # --use_prodigal if args.use_trnascan: # --use_trnascan enable_trnascan(config, args.use_trnascan) +if args.use_rnammer: # --use_rnammer + enable_rnammer(config, args.use_rnammer) + if args.no_hmm: # --no_hmm disable_hmm_scan(config) diff --git a/dfc/__init__.py b/dfc/__init__.py index 481cc65..20f5202 100644 --- a/dfc/__init__.py +++ b/dfc/__init__.py @@ -1 +1 @@ -dfast_version = "1.2.5" +dfast_version = "1.2.6" diff --git a/dfc/default_config.py b/dfc/default_config.py index da4d014..002ee09 100644 --- a/dfc/default_config.py +++ b/dfc/default_config.py @@ -155,7 +155,7 @@ class Config: "target": "rRNA", "enabled": False, "options": { - "model": "bac", # arc/bac/euk 
+ "model": "bac", # arc/bac "cmd_options": "" }, }, @@ -176,13 +176,26 @@ class Config: { # Prodigal for CDS prediction "tool_name": "Prodigal", - "tool_type": "CDS", + "target": "CDS", + "enabled": False, + "options": { + "transl_table": 11, + "cmd_options": "", + }, + }, + { + # GeneMarkS2 for CDS prediction + # By default GeneMarkS2 is disabled. To enable this, also set MGA disabled or enable merge_cds in FEATURE_ADJUSTMENT. + "tool_name": "GeneMarkS2", + "target": "CDS", "enabled": False, "options": { "transl_table": 11, + "genome_type": "bacteria", + "format": "gff", "cmd_options": "", }, - }, + }, ] FUNCTIONAL_ANNOTATION = [ diff --git a/dfc/structuralAnnotation.py b/dfc/structuralAnnotation.py index 6c6057f..48d3202 100644 --- a/dfc/structuralAnnotation.py +++ b/dfc/structuralAnnotation.py @@ -14,6 +14,7 @@ from .tools.CRT import CRT # from .tools.glimmer import Glimmer from .tools.prodigal import Prodigal +from .tools.genemarkS2 import GeneMarkS2 from .tools.tRNAscan import tRNAscan from .tools.rnammer import RNAmmer from .tools.gff_importer import GFFimporter @@ -26,6 +27,7 @@ "CRT": CRT, # "Glimmer": Glimmer, "Prodigal": Prodigal, + "GeneMarkS2": GeneMarkS2, "tRNAscan": tRNAscan, "RNAmmer": RNAmmer, "GFF_import": GFFimporter, diff --git a/dfc/tools/genemarkS2.py b/dfc/tools/genemarkS2.py new file mode 100644 index 0000000..ede6334 --- /dev/null +++ b/dfc/tools/genemarkS2.py @@ -0,0 +1,112 @@ +#! 
/usr/bin/env python +# coding: UTF8 + +# Written by Aaron Pfennig + +from .base_tools import StructuralAnnotationTool +from Bio import SeqIO +from ..models.bio_feature import ExtendedFeature +import os.path + +class GeneMarkS2(StructuralAnnotationTool): + """ + GeneMarkS2 + + Tool type: CDS prediction + URL: + REF: + + """ + version = None + TYPE = "CDS" + NAME = "GeneMarkS2" + VERSION_CHECK_CMD = ["gms2.pl | tail -n 1"] + VERSION_PATTERN = r"Version: (.+)_lic" + SHELL = True + + def __init__(self, options=None, workDir="OUT"): + super(GeneMarkS2, self).__init__(options, workDir) + self.transl_table = options.get("transl_table", 11) + self.out_format = options.get("format", "gff") + self.genome_type = options.get("genome_type", "bacteria") + self.cmd_options = options.get("cmd_options", "") + + def getCommand(self): + + # /home/apfennig3/Team1-GenePrediction/bin/gms2.pl --seq $genome --genome-type bacteria --output $tmp_dir/output.gtf --format gff + # GeneMarkS2 generates a log file in the current directory. + # To avoid leaving a log file, GeneMarkS2 is run after changing the directory. + gms2_work_dir = os.path.dirname(self.outputFile) + rel_input_file = os.path.join("..", "input", os.path.basename(self.genomeFasta)) + output_file = os.path.basename(self.outputFile) + cmd = ["cd", gms2_work_dir, ";", "gms2.pl", self.cmd_options, "--genome-type", + self.genome_type, "--gcode", str(self.transl_table), "--format", self.out_format, "--seq", rel_input_file, "--output", output_file] + # cmd = ["gms2.pl", self.cmd_options, "--genome-type", self.genome_type, "--gcode", str(self.transl_table), "--format", self.out_format, "--seq", self.genomeFasta, "--output", self.outputFile] + return cmd + + + def getFeatures(self): + """GeneMarkS2 generates standard GFF format. 
+ + + """ + + def _parseResult(): + with open(self.outputFile) as f: + for line in f: + if line.startswith("#"): + continue + # Genemark has an empty line between header and actual predictions + if len(line) == 1: + continue + sequence, toolName, featureType, left, right, score, strand, _, qualifiers = line.strip("\n").split("\t") + qualifiers = dict([x.split(' ') for x in qualifiers.strip(";").split("; ")]) + yield sequence, toolName, featureType, left, right, strand, qualifiers + + def _getLengthDict(fileName): + R = list(SeqIO.parse(open(fileName), "fasta")) + return {r.id: len(r) for r in R} + + def _get_feature(left, right, strand, partial_flag, seq_length, i): + left_flag, right_flag = partial_flag[0], partial_flag[1] + left, right, codon_start = int(left), int(right), 1 + + if left_flag == "1" and left <= 3: + if strand == "+": + codon_start = left + left = 1 + if right_flag == "1" and seq_length - right <= 2: + if strand == "-": + codon_start = seq_length - right + 1 + right = seq_length + location = self.getLocation(left, right, strand, partial_flag) + + annotations = {"partial_flag": partial_flag} + if partial_flag != "00": + annotations["partial"] = True + feature = ExtendedFeature(location=location, type="CDS", id="{0}_{1}".format(self.__class__.__name__, i), + seq_id=sequence, annotations=annotations) + feature.qualifiers = { + "product": ["hypothetical protein"], + "inference": ["COORDINATES:ab initio prediction:{0}:{1}".format(self.NAME, self.version)], + "transl_table": [str(self.transl_table)], + "codon_start": [codon_start] + } + return feature + + dict_length = _getLengthDict(self.genomeFasta) + + D = {} + i = 0 + for sequence, toolName, featureType, left, right, strand, qualifiers in _parseResult(): + partial_flag = qualifiers.get("partial", "00") + seq_length = dict_length[sequence] + + i += 1 + feature = _get_feature(left, right, strand, partial_flag, seq_length, i) + + if qualifiers.get("rbs_motif", "None") != "None": + 
feature.annotations["rbs"] = qualifiers["rbs_motif"] + D.setdefault(sequence, []).append(feature) + return D + diff --git a/dfc/utils/config_util.py b/dfc/utils/config_util.py index c146347..7d7ee9d 100644 --- a/dfc/utils/config_util.py +++ b/dfc/utils/config_util.py @@ -178,21 +178,43 @@ def enable_trnascan(config, model): if setting.get("tool_name", "") == "Aragorn": setting["enabled"] = False - -def enable_prodigal(config): +def enable_rnammer(config, model): + # model should be arc/bac + if model == "bact": + model = "bac" + elif model == "arch": + model = "arc" for setting in config.STRUCTURAL_ANNOTATION: - if setting.get("tool_name", "") == "Prodigal": + if setting.get("tool_name", "") == "RNAmmer": setting["enabled"] = True - if setting.get("tool_name", "") == "MGA": + setting["options"]["model"] = model + if setting.get("tool_name", "") == "Barrnap": setting["enabled"] = False -def enable_mga(config): + +def _select_CDS_prediction_tool(config, tool_name): for setting in config.STRUCTURAL_ANNOTATION: - if setting.get("tool_name", "") == "MGA": - setting["enabled"] = True - if setting.get("tool_name", "") == "Prodigal": - setting["enabled"] = False + if setting.get("target", "") == "CDS": + if setting.get("tool_name", "") == tool_name: + setting["enabled"] = True + else: + setting["enabled"] = False + +def enable_prodigal(config): + _select_CDS_prediction_tool(config, "Prodigal") +def enable_mga(config): + _select_CDS_prediction_tool(config, "MGA") + +def enable_genemarks2(config, genome_type): + # genome type must be bacteria or archaea + if genome_type == "bact": + genome_type = "bacteria" + elif genome_type == "arch": + genome_type = "archaea" + _select_CDS_prediction_tool(config, "GeneMarkS2") + gms2_config = [conf for conf in config.STRUCTURAL_ANNOTATION if conf["tool_name"] == "GeneMarkS2"][0] + gms2_config["options"]["genome_type"] = genome_type def set_gff(config, gff_file_name): targets = [] @@ -213,26 +235,14 @@ def set_genetic_code(config, value): 
if value == 11: pass else: - logger.warning("Genetic code is set to {}. Prodigal will be used for CDS prediction.".format(value)) + # genetic code 4 can be specified when using Prodigal/GeneMarkS2/Aragorn + logger.warning("Genetic code is set to {}.".format(value)) for setting in config.STRUCTURAL_ANNOTATION: - if setting.get("tool_name", "") == "Prodigal": - setting["enabled"] = True - setting["options"]["transl_table"] = value - if setting.get("tool_name", "") == "MGA": - setting["enabled"] = False - if setting.get("tool_name", "") == "Aragorn": - # setting["enabled"] = True - setting["options"]["transl_table"] = value - if setting.get("tool_name", "") == "tRNAscan": - pass - # setting["enabled"] = False + if "transl_table" in setting["options"]: + setting["options"]["transl_table"] = value for setting in config.FUNCTIONAL_ANNOTATION: if setting.get("component_name", "") == "PseudoGeneDetection": setting["options"]["transl_table"] = value - # if value == 4: - # setting["options"]["genetic_code_file"] = setting["options"].get("genetic_code_file", "").replace("transl_table_11.txt", "transl_table_4.txt") - # if value == 25: - # setting["options"]["genetic_code_file"] = setting["options"].get("genetic_code_file", "").replace("transl_table_11.txt", "transl_table_25.txt") diff --git a/dfc/utils/reffile_util.py b/dfc/utils/reffile_util.py index bb81335..c963f9d 100644 --- a/dfc/utils/reffile_util.py +++ b/dfc/utils/reffile_util.py @@ -44,7 +44,7 @@ def check_db_file(db_name, aligner): file_ext = ".dmnd" file_name = db_name + file_ext if not os.path.exists(file_name): - logger.warning("Diamond index files do not exist.") + logger.warning("Diamond index file does not exist.") prepare_database_dmnd(ref_file) else: logger.error( @@ -208,8 +208,8 @@ def run_hmmpress(file_name): def prepare_database_dmnd(file_name): base_name, _ext = (os.path.splitext(file_name)) output_file = base_name + ".faa" - logger.info("Converting DFAST reference '{0}' to FASTA '{1}'".format( - 
+ file_name, output_file)) + # logger.info("Converting DFAST reference '{0}' to FASTA '{1}'".format( + # file_name, output_file)) fasta_file = dfast2fasta(file_name, output_file) diamond = Diamond() logger.info( diff --git a/docs/FAQ.md b/docs/FAQ.md index c85a8f4..24fedda 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -5,20 +5,54 @@ 2. Set the option `--use_prodigal`. When enabled, Prodigal will be used instead of the default prediction tool (MGA). -Note that Prodigal is not bundled in the DFAST distribution. Please install them by yourself. +Note that Prodigal is not bundled in the DFAST distribution. Please install it by yourself. -## 2. How to use tRNAscan-SE to predict tRNAs +## 2. How to use GeneMarkS2 to predict CDSs +1. Make sure that 'GeneMarkS2' is in your `PATH`, e.g. create a symbolic link for gms2.pl in '/usr/local/bin'. +2. Set the option `--use_genemarks2 GENOME-TYPE`. + GENOME-TYPE should be 'auto', 'bact(eria)', or 'arch(aea)'. + When enabled, GeneMarkS2 will be used instead of the default prediction tool (MGA). + +GeneMarkS2 is invoked with a command like the one below: +`gms2.pl --genome-type bacteria(or auto, archaea) --gcode 11 --format gff --seq input/genome.fna --output output_file.txt` +The codon table for Mycoplasma can be used by specifying DFAST's command line option `--gcode 4`. + + +Note that GeneMarkS2 is not bundled in the DFAST distribution. Please install it by yourself. + + +## 3. How to use RNAmmer to predict rRNAs +1. Make sure that 'RNAmmer' is in your `PATH`, e.g. create a symbolic link for the 'rnammer' executable in '/usr/local/bin'. +2. Set the option `--use_rnammer [bact|arch]`. + `--use_rnammer bact` for bacterial genome, `--use_rnammer arch` for archaeal genome. + When enabled, RNAmmer will be used instead of the default prediction tool (Barrnap). 
+ +RNAmmer is invoked with the following command, +`rnammer -S bac(or arc) -m tsu,lsu,ssu -gff output_file.txt input/genome.fna`, +meaning that all kinds of rRNA genes will be predicted using the parameters for bacteria (or archaea). +Note that RNAmmer is not bundled in the DFAST distribution. Please install it by yourself. + +## 4. How to use tRNAscan-SE to predict tRNAs 1. Make sure that 'tRNAscan-SE' is in your `PATH`. 2. Set the option `--use_trnascan [bact|arch]`. `--use_trnascan bact` for bacterial genome, `--use_trnascan arch` for archaeal genome. When enabled, tRNAscan-SE will be used instead of the default prediction tool (Aragorn). Both tRNAscan-SE 1.3 and 2.0 can be used. -Note that tRNAscan-SE are not bundled in the DFAST distribution. Please install them by yourself. +Note that tRNAscan-SE is not bundled in the DFAST distribution. Please install it by yourself. + + +## 5. How to use Diamond to align protein sequences +1. Make sure that the binary for Diamond is in your `PATH`. +2. Set the option `--aligner diamond`. + +If an index file for Diamond (.dmnd) does not exist, DFAST attempts to build it. You can build it manually with `scripts/reference_util.py formatdb-dmnd`. +Note that Diamond is not bundled in the DFAST distribution. Please install it by yourself. + + +## 6. What are the meanings of 'note' qualifiers in CDS features? The note qualifier in GenBank or GFF format shows the result of an alignment to the reference sequence. ``` note="Q890K8 chromosomal replication initiator protein DnaA (Lactobacillus plantarum WCFS1)