Merge pull request #26 from nigyta/genemarkS2

Added options to enable GeneMarkS2, Diamond, RNAmmer
nigyta · Feb 10, 2020 · 100d57a · 100d57a
2 parents 14a891f + 243b63a
commit 100d57a
Show file tree

Hide file tree

Showing 10 changed files with 233 additions and 51 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,4 @@ __pycache__/
 db/*
 !bin/Darwin/barrnap-0.8/db/
 !db/hmm_search/
+OUT/
diff --git a/README.md b/README.md
@@ -90,6 +90,7 @@ ln -s $DFAST_APP_ROOT/scripts/dfast_file_downloader.py /usr/local/bin/
     ```
     dfast_file_downloader.py -h
     ```
+
 ## <a id="condainstallation"></a>Installation via conda
 DFAST is also available from [Bioconda](https://bioconda.github.io/recipes/dfast/README.html). Install with:
 ```
@@ -151,7 +152,7 @@ The following tools are run in parallel to predict biological features (e.g. CDS
 * CRISPR prediction (CRT)
 * Assembly gaps within sequences
 
-Optionally, you can choose Prodigal and tRNAscan-SE to predict CDS and tRNA. (You need to install them manually.)
+Optionally, you can choose Prodigal/GeneMarkS2, RNAmmer, and tRNAscan-SE to predict CDS, rRNA, and tRNA, respectively. See [FAQ](docs/FAQ.md). (You need to install them manually.)
 
 ### Functional annotation
 1. OrthoSearch (Optional. Set `--references` option to enable this.)
@@ -160,6 +161,8 @@ Optionally, you can choose Prodigal and tRNAscan-SE to predict CDS and tRNA. (Yo
 4. HMMscan against the profile HMM database of TIGRFAM
 5. CDDsearch against COG database from NCBI Conserved Domain Database
 
+By default, GHOSTX is used to align protein sequences. Diamond/BLASTP can be used optionally. See [FAQ](docs/FAQ.md). (Diamond needs to be installed manually.) 
+
 ### Output
 * Sequence and annotation data in GFF3 and GenBank format
 * Sequence data in FASTA format
@@ -171,7 +174,7 @@ Optionally, you can choose Prodigal and tRNAscan-SE to predict CDS and tRNA. (Yo
 ```  
 usage: dfast -g your_genome.fna [options]
 
-DFAST: DDBJ Fast Annotation and Submission Tool version 1.2.3.
+DFAST: DDBJ Fast Annotation and Submission Tool version 1.x.x.
 
 Basic options:
   -g PATH, --genome PATH
@@ -213,10 +216,11 @@ Workflow options:
                         db_path[,db_name[,pident,q_cov,s_cov,e_value]])
   --references PATH     Reference file(s) for OrthoSearch. Use semicolons for
                         multiple files, e.g. 'genome1.faa;genome2.gbk'
-  --aligner STR         Aligner to use [ghostx(=default)|blastp]
+  --aligner STR         Aligner to use [ghostx(=default)|blastp|diamond]
   --use_prodigal        Use Prodigal to predict CDS instead of MGA
-  --use_trnascan STR    Use tRNAscan-SE to predict tRNA instead of Aragorn,
-                        [bact|arch]
+  --use_genemarks2 STR  Use GeneMarkS2 to predict CDS instead of MGA. [auto|bact|arch]
+  --use_trnascan STR    Use tRNAscan-SE to predict tRNA instead of Aragorn. [bact|arch]
+  --use_rnammer STR     Use RNAmmer to predict rRNA instead of Barrnap. [bact|arch]
   --gcode INT           Genetic code [11(=default),4(=Mycoplasma)]
   --no_hmm              Disable HMMscan
   --no_cdd              Disable CDDsearch
@@ -287,8 +291,8 @@ According to the user's report, DFAST fails on ArchLinux due to `libidn-11` requ
     Yasuhiro TANIZAWA, Takatomo FUJISAWA, Eli KAMINUMA, Yasukazu NAKAMURA, and Masanori ARITA  
 * stand-alone version (DFAST-core)  
     DFAST: a flexible prokaryotic genome annotation pipeline for faster genome publication.  
-    *Bioinformatics*. 2017 Nov 2. doi: 10.1093/bioinformatics/btx713 (advance article).  
+    *Bioinformatics*; 2018; 34(6): 1037–1039.  
     Yasuhiro TANIZAWA, Takatomo FUJISAWA, Yasukazu NAKAMURA  
-    https://academic.oup.com/bioinformatics/article-lookup/doi/10.1093/bioinformatics/btx713  
+    https://academic.oup.com/bioinformatics/article/34/6/1037/4587587
 
 
diff --git a/dfast b/dfast
@@ -12,7 +12,7 @@ from dfc.pipeline import Pipeline
 from dfc.utils.config_util import load_config, show_config, set_references, set_database, set_aligner, \
     disable_cdd_search, disable_hmm_scan, enable_prodigal, enable_trnascan, disable_rrna_prediction, \
     disable_trna_prediction, disable_cds_prediction, disable_crispr_prediction, set_genetic_code, set_gff, \
-    set_threshold, set_values_from_metadata, enable_mga
+    set_threshold, set_values_from_metadata, enable_mga, enable_genemarks2, enable_rnammer
 from dfc.utils.path_util import set_binaries_path
 from dfc.utils.fix_origin import fix_origin
 
@@ -64,11 +64,13 @@ group_workflow = parser.add_argument_group('Workflow options')
 group_workflow.add_argument("--threshold", help='Thresholds for default database search (format: "pident,q_cov,s_cov,e_value", default: "0,75,75,1e-6")', metavar="STR")
 group_workflow.add_argument("--database", help="Additional reference database to be searched against prior to the default database. (format: db_path[,db_name[,pident,q_cov,s_cov,e_value]])", metavar="PATH")
 group_workflow.add_argument("--references", help="Reference file(s) for OrthoSearch. Use semicolons for multiple files, e.g. 'genome1.faa;genome2.gbk'", metavar="PATH")
-group_workflow.add_argument("--aligner", help="Aligner to use [ghostx(=default)|blastp]", choices=["ghostx", "blastp"], metavar="STR")
+group_workflow.add_argument("--aligner", help="Aligner to use [ghostx(=default)|blastp|diamond]", choices=["ghostx", "blastp", "diamond"], metavar="STR")
 group_genecall = group_workflow.add_mutually_exclusive_group()
 group_genecall.add_argument("--use_prodigal", help="Use Prodigal to predict CDS instead of MGA", action="store_true")
+group_genecall.add_argument("--use_genemarks2", help="Use GeneMarkS2 to predict CDS instead of MGA. [auto|bact|arch]", choices=["auto", "bact", "arch", "bacteria", "archaea"], metavar="STR")
 # group_genecall.add_argument("--use_mga", help="Use MetaGeneAnnotator to predict CDS instead of Prodigal", action="store_true")
-group_workflow.add_argument("--use_trnascan", help="Use tRNAscan-SE to predict tRNA instead of Aragorn, [bact|arch]", choices=["bact", "arch"], metavar="STR")
+group_workflow.add_argument("--use_trnascan", help="Use tRNAscan-SE to predict tRNA instead of Aragorn. [bact|arch]", choices=["bac", "bact", "arc", "arch"], metavar="STR")
+group_workflow.add_argument("--use_rnammer", help="Use RNAmmer to predict rRNA instead of Barrnap. [bact|arch]", choices=["bact", "arch"], metavar="STR")
 group_workflow.add_argument("--gcode", help="Genetic code [11(=default),4(=Mycoplasma)]", metavar="INT", type=int, default=11)
 group_workflow.add_argument("--no_hmm", help="Disable HMMscan", action="store_true")
 group_workflow.add_argument("--no_cdd", help="Disable CDDsearch", action="store_true")
@@ -116,7 +118,7 @@ logger.setLevel(DEBUG)
 
 if args.help:
     parser.print_help()
-    exit()
+    exit(1)
 
 app_root = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
 
@@ -210,21 +212,25 @@ if args.no_crispr:  # --no_crispr
     disable_crispr_prediction(config)
 
 if args.gcode != 11:
+    if not (args.use_prodigal or args.use_genemarks2):
+        logger.error("'--gcode 4' cannot be specified when using MGA. Please set '--use_prodigal' or '--use_genemarks2'")
+        exit(1)
     set_genetic_code(config, args.gcode)
-    # disabled below. 2019.10.30
-    # if gcode is set to the value other than 11, prodigal and aragorn will be used.
-    # args.use_prodigal = False  # ignore --use_prodigal option
-    # args.use_trnascan = False  # ignore --use_trnascan option
 
 if args.use_prodigal:  # --use_prodigal
     enable_prodigal(config)
+if args.use_genemarks2: # --use_genemarks2
+    enable_genemarks2(config, args.use_genemarks2)
 
 # if args.use_mga:
 #     enable_mga(config)
 
 if args.use_trnascan:  # --use_trnascan
     enable_trnascan(config, args.use_trnascan)
 
+if args.use_rnammer:  # --use_rnammer
+    enable_rnammer(config, args.use_rnammer)
+
 if args.no_hmm:  # --no_hmm
     disable_hmm_scan(config)
 

diff --git a/dfc/__init__.py b/dfc/__init__.py
@@ -1 +1 @@
-dfast_version = "1.2.5"
+dfast_version = "1.2.6"
diff --git a/dfc/default_config.py b/dfc/default_config.py
@@ -155,7 +155,7 @@ class Config:
             "target": "rRNA",
             "enabled": False,
             "options": {
-                "model": "bac",  # arc/bac/euk
+                "model": "bac",  # arc/bac
                 "cmd_options": ""
             },
         },
@@ -176,13 +176,26 @@ class Config:
         {
             # Prodigal for CDS prediction
             "tool_name": "Prodigal",
-            "tool_type": "CDS",
+            "target": "CDS",
+            "enabled": False,
+            "options": {
+                "transl_table": 11,
+                "cmd_options": "",
+            },
+        },
+        {
+            # GeneMarkS2 for CDS prediction
+            # By default GeneMarkS2 is disabled. To enable it, also disable MGA or enable merge_cds in FEATURE_ADJUSTMENT.
+            "tool_name": "GeneMarkS2",
+            "target": "CDS",
             "enabled": False,
             "options": {
                 "transl_table": 11,
+                "genome_type": "bacteria",
+                "format": "gff",
                 "cmd_options": "",
             },
-         },
+        },
     ]
 
     FUNCTIONAL_ANNOTATION = [

diff --git a/dfc/structuralAnnotation.py b/dfc/structuralAnnotation.py
@@ -14,6 +14,7 @@
 from .tools.CRT import CRT
 # from .tools.glimmer import Glimmer
 from .tools.prodigal import Prodigal
+from .tools.genemarkS2 import GeneMarkS2
 from .tools.tRNAscan import tRNAscan
 from .tools.rnammer import RNAmmer
 from .tools.gff_importer import GFFimporter
@@ -26,6 +27,7 @@
     "CRT": CRT,
     # "Glimmer": Glimmer,
     "Prodigal": Prodigal,
+    "GeneMarkS2": GeneMarkS2,
     "tRNAscan": tRNAscan,
     "RNAmmer": RNAmmer,
     "GFF_import": GFFimporter,

diff --git a/dfc/tools/genemarkS2.py b/dfc/tools/genemarkS2.py
@@ -0,0 +1,112 @@
+#! /usr/bin/env python
+# coding: UTF8
+
+# Written by Aaron Pfennig
+
+from .base_tools import StructuralAnnotationTool
+from Bio import SeqIO
+from ..models.bio_feature import ExtendedFeature
+import os.path
+
+class GeneMarkS2(StructuralAnnotationTool):
+    """
+    GeneMarkS2
+
+    Tool type: CDS prediction
+    URL: 
+    REF:
+
+    """
+    version = None
+    TYPE = "CDS"
+    NAME = "GeneMarkS2"
+    VERSION_CHECK_CMD = ["gms2.pl | tail -n 1"]
+    VERSION_PATTERN = r"Version: (.+)_lic"
+    SHELL = True
+
+    def __init__(self, options=None, workDir="OUT"):
+        super(GeneMarkS2, self).__init__(options, workDir)
+        self.transl_table = options.get("transl_table", 11)
+        self.out_format = options.get("format", "gff")
+        self.genome_type = options.get("genome_type", "bacteria")
+        self.cmd_options = options.get("cmd_options", "")
+
+    def getCommand(self):
+
+        # /home/apfennig3/Team1-GenePrediction/bin/gms2.pl --seq $genome --genome-type bacteria --output $tmp_dir/output.gtf --format gff
+        # GeneMarkS2 generates a log file in the current directory. 
+        # To avoid leaving a log file, GeneMarkS2 is run after changing the directory.
+        gms2_work_dir = os.path.dirname(self.outputFile)
+        rel_input_file = os.path.join("..", "input", os.path.basename(self.genomeFasta))
+        output_file = os.path.basename(self.outputFile)
+        cmd = ["cd", gms2_work_dir, ";", "gms2.pl", self.cmd_options, "--genome-type", 
+                self.genome_type, "--gcode", str(self.transl_table), "--format", self.out_format, "--seq", rel_input_file, "--output", output_file]
+        # cmd = ["gms2.pl", self.cmd_options, "--genome-type", self.genome_type, "--gcode", str(self.transl_table), "--format", self.out_format, "--seq", self.genomeFasta, "--output", self.outputFile]
+        return cmd
+
+
+    def getFeatures(self):
+        """GeneMarkS2 generates standard GFF format.
+           
+
+        """
+
+        def _parseResult():
+            with open(self.outputFile) as f:
+                for line in f:
+                    if line.startswith("#"):
+                        continue
+                    # Genemark has an empty line between header and actual predictions
+                    if len(line) == 1:
+                        continue
+                    sequence, toolName, featureType, left, right, score, strand, _, qualifiers = line.strip("\n").split("\t")
+                    qualifiers = dict([x.split(' ') for x in qualifiers.strip(";").split("; ")])
+                    yield sequence, toolName, featureType, left, right, strand, qualifiers
+
+        def _getLengthDict(fileName):
+            R = list(SeqIO.parse(open(fileName), "fasta"))
+            return {r.id: len(r) for r in R}
+
+        def _get_feature(left, right, strand, partial_flag, seq_length, i):
+            left_flag, right_flag = partial_flag[0], partial_flag[1]
+            left, right, codon_start = int(left), int(right), 1
+
+            if left_flag == "1" and left <= 3:
+                if strand == "+":
+                    codon_start = left
+                left = 1
+            if right_flag == "1" and seq_length - right <= 2:
+                if strand == "-":
+                    codon_start = seq_length - right + 1
+                right = seq_length
+            location = self.getLocation(left, right, strand, partial_flag)
+
+            annotations = {"partial_flag": partial_flag}
+            if partial_flag != "00":
+                annotations["partial"] = True
+            feature = ExtendedFeature(location=location, type="CDS", id="{0}_{1}".format(self.__class__.__name__, i),
+                                      seq_id=sequence, annotations=annotations)
+            feature.qualifiers = {
+                "product": ["hypothetical protein"],
+                "inference": ["COORDINATES:ab initio prediction:{0}:{1}".format(self.NAME, self.version)],
+                "transl_table": [str(self.transl_table)],
+                "codon_start": [codon_start]
+            }
+            return feature
+
+        dict_length = _getLengthDict(self.genomeFasta)
+
+        D = {}
+        i = 0
+        for sequence, toolName, featureType, left, right, strand, qualifiers in _parseResult():
+            partial_flag = qualifiers.get("partial", "00")
+            seq_length = dict_length[sequence]
+
+            i += 1
+            feature = _get_feature(left, right, strand, partial_flag, seq_length, i)
+
+            if qualifiers.get("rbs_motif", "None") != "None":
+                feature.annotations["rbs"] = qualifiers["rbs_motif"]
+            D.setdefault(sequence, []).append(feature)
+        return D
+
diff --git a/dfc/utils/config_util.py b/dfc/utils/config_util.py
@@ -178,21 +178,43 @@ def enable_trnascan(config, model):
         if setting.get("tool_name", "") == "Aragorn":
             setting["enabled"] = False
 
-
-def enable_prodigal(config):
+def enable_rnammer(config, model):
+    # model should be arc/bac
+    if model == "bact":
+        model = "bac"
+    elif model == "arch":
+        model = "arc"
     for setting in config.STRUCTURAL_ANNOTATION:
-        if setting.get("tool_name", "") == "Prodigal":
+        if setting.get("tool_name", "") == "RNAmmer":
             setting["enabled"] = True
-        if setting.get("tool_name", "") == "MGA":
+            setting["options"]["model"] = model
+        if setting.get("tool_name", "") == "Barrnap":
             setting["enabled"] = False
 
-def enable_mga(config):
+
+def _select_CDS_prediction_tool(config, tool_name):
     for setting in config.STRUCTURAL_ANNOTATION:
-        if setting.get("tool_name", "") == "MGA":
-            setting["enabled"] = True
-        if setting.get("tool_name", "") == "Prodigal":
-            setting["enabled"] = False
+        if setting.get("target", "") == "CDS":
+            if setting.get("tool_name", "") == tool_name:
+                setting["enabled"] = True
+            else:
+                setting["enabled"] = False
+
+def enable_prodigal(config):
+    _select_CDS_prediction_tool(config, "Prodigal")
 
+def enable_mga(config):
+    _select_CDS_prediction_tool(config, "MGA")
+
+def enable_genemarks2(config, genome_type):
+    # genome type is auto, bacteria, or archaea; the short forms bact/arch are expanded below
+    if genome_type == "bact":
+        genome_type = "bacteria"
+    elif genome_type == "arch":
+        genome_type = "archaea"
+    _select_CDS_prediction_tool(config, "GeneMarkS2")
+    gms2_config = [conf for conf in config.STRUCTURAL_ANNOTATION if conf["tool_name"] == "GeneMarkS2"][0]
+    gms2_config["options"]["genome_type"] = genome_type
 
 def set_gff(config, gff_file_name):
     targets = []
@@ -213,26 +235,14 @@ def set_genetic_code(config, value):
     if value == 11:
         pass
     else:
-        logger.warning("Genetic code is set to {}. Prodigal will be used for CDS prediction.".format(value))
+        # genetic code 4 can be specified when using Prodigal/GeneMarkS2/Aragorn
+        logger.warning("Genetic code is set to {}.".format(value))
         for setting in config.STRUCTURAL_ANNOTATION:
-            if setting.get("tool_name", "") == "Prodigal":
-                setting["enabled"] = True
-                setting["options"]["transl_table"] = value 
-            if setting.get("tool_name", "") == "MGA":
-                setting["enabled"] = False
-            if setting.get("tool_name", "") == "Aragorn":
-                # setting["enabled"] = True
-                setting["options"]["transl_table"] = value 
-            if setting.get("tool_name", "") == "tRNAscan":
-                pass
-                # setting["enabled"] = False
+            if "transl_table" in setting["options"]:
+                setting["options"]["transl_table"] = value
         for setting in config.FUNCTIONAL_ANNOTATION:
             if setting.get("component_name", "") == "PseudoGeneDetection":
                 setting["options"]["transl_table"] = value
-        #         if value == 4:
-        #             setting["options"]["genetic_code_file"] = setting["options"].get("genetic_code_file", "").replace("transl_table_11.txt", "transl_table_4.txt")
-        #         if value == 25:
-        #             setting["options"]["genetic_code_file"] = setting["options"].get("genetic_code_file", "").replace("transl_table_11.txt", "transl_table_25.txt")
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,3 +7,4 @@ __pycache__/ @@
     db/*
     !bin/Darwin/barrnap-0.8/db/
     !db/hmm_search/
+    OUT/