Commit
Development (#184)
* This should address #173 (both configuration file and docs) and #158

* Fix #181 and small bug fix for parsing Mikado annotations.

* Progress for #142 - this should fix the wrong ORF calculation in cases where the CDS was open at the 5' end.

* Fixed the previous commit (again for #142)

* #142: corrected and tested the issue with one-off exons, for padding.

* This should fix and test #142 for good.

* Removed spurious warning/error messages

* #142: solved a bug which caused truncated transcripts at the 5' end not to be padded.

* #142: solved a problem which caused a false abort for transcripts on the - strand with a changed stop codon.

* #142: fixing previous commit

* Pushing the fix for #182 onto the development branch

* Fix #183

* Fix #183 and previous commit

* #183: now Mikado configure will set a seed when generating the configuration file. The seed will be explicitly mentioned in the log.

* #177: made ORF loading slightly faster with pysam. Also made XML serialisation much faster using SQL sessions and multiprocessing.Pool instead of queues.

* Solved an annoying bug that caused Mikado to crash with TAIR GFF3s.
lucventurini authored Jun 18, 2019
1 parent 275ff65 commit 28898f1
Showing 29 changed files with 577 additions and 475 deletions.
14 changes: 10 additions & 4 deletions Mikado/configuration/configuration_blueprint.json
@@ -4,15 +4,17 @@
     "type": "object",
     "properties": {
         "SimpleComment": {"type": "object", "properties": {},
-            "SimpleComment": ["Configuration file for Mikado. Sections:",
+            "SimpleComment": ["Configuration file for Mikado. Please note that absent values, e.g. if a field is deleted, will be imputed by the default values for the program.",
+            "Sections:",
             "- log_settings: settings related to the verbosity of logs.",
             "- db_settings: Database settings, for Mikado serialise and pick.",
             "- reference: Settings related to the genome reference.",
             "- prepare: settings related to the Mikado prepare stage",
             "- serialise: settings related to the Mikado serialise stage",
             "- pick: settings related to the Mikado pick stage",
             "- multiprocessing_method: which method (fork, spawn, forkserver) Mikado should use for multiprocessing."],
-            "Comment": ["Configuration file for Mikado. Sections:",
+            "Comment": ["Configuration file for Mikado. Please note that absent values, e.g. if a field is deleted, will be imputed by the default values for the program.",
+            "Sections:",
             "- log_settings: settings related to the verbosity of logs.",
             "- db_settings: Database settings, for Mikado serialise and pick.",
             "- reference: Settings related to the genome reference.",
@@ -21,6 +23,10 @@
             "- pick: settings related to the Mikado pick stage",
             "- multiprocessing_method: which method (fork, spawn, forkserver) Mikado should use for multiprocessing."]
         },
+        "seed": {
+            "type": ["integer", "null"],
+            "default": null
+        },
         "multiprocessing_method": {
             "type": "string",
             "default": "",
@@ -420,7 +426,7 @@
         },
         "ts_distance": {
             "type": "integer",
-            "default": 300,
+            "default": 1000,
             "minimum": 0
         },
         "pad": {
@@ -430,7 +436,7 @@
         "ts_max_splices": {
             "type": "integer",
             "minimum": 0,
-            "default": 1
+            "default": 2
         }
     }
 },
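Aside: the amended schema comment above promises that absent keys are imputed from the blueprint defaults. A hedged sketch of how such imputation can work, with a toy schema standing in for the real blueprint (the helper and the layout are hypothetical, not Mikado's actual code):

# Hypothetical helper: fill missing configuration keys from a JSON schema's
# "default" values, mimicking the behaviour described in the comment above.
def impute_defaults(config, schema):
    for key, subschema in schema.get("properties", {}).items():
        if key not in config and "default" in subschema:
            config[key] = subschema["default"]
        elif isinstance(config.get(key), dict):
            impute_defaults(config[key], subschema)
    return config

toy_schema = {"properties": {
    "seed": {"type": ["integer", "null"], "default": None},
    "pick": {"properties": {
        "ts_distance": {"type": "integer", "default": 1000},
        "ts_max_splices": {"type": "integer", "default": 2}}}}}
print(impute_defaults({"pick": {}}, toy_schema))
# {'pick': {'ts_distance': 1000, 'ts_max_splices': 2}, 'seed': None}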
15 changes: 15 additions & 0 deletions Mikado/configuration/configurator.py
@@ -23,6 +23,8 @@
 from ..exceptions import InvalidJson, UnrecognizedRescaler
 from ..utilities import merge_dictionaries
 from ..utilities.log_utils import create_default_logger
+import sys
+import random


 __author__ = "Luca Venturini"
@@ -603,6 +605,13 @@ def check_json(json_conf, simple=False, external_dict=None, logger=None):
     logger.debug("Scoring parameters: {}".format("\n".join(["\n"] + [
         "{}: {}".format(_, json_conf["scoring"][_]) for _ in json_conf["scoring"].keys()])))

+    seed = json_conf.get("seed", None)
+    if seed is None:
+        seed = random.randint(0, sys.maxsize)
+        logger.info("Random seed: {}", seed)
+    json_conf["seed"] = seed
+    random.seed(seed)
+
     return json_conf
@@ -644,4 +653,10 @@ def to_json(string, simple=False, logger=None):
     except Exception as exc:
         raise OSError((exc, string))

+    seed = json_dict.get("seed", None)
+    if seed is None:
+        seed = random.randint(0, sys.maxsize)
+        logger.info("Random seed: {}", seed)
+    random.seed(seed)
+
     return json_dict
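The effect of the seed handling added above: once the (possibly freshly drawn) seed is stored and fed to random.seed(), every later stochastic decision replays identically on a re-run. A minimal standalone sketch of the same pattern (illustrative, not the Mikado functions themselves):

import random
import sys

def init_seed(conf):
    # Draw a seed if none was given, keep it in the configuration so it can
    # be logged and reused, then seed the global RNG.
    seed = conf.get("seed")
    if seed is None:
        seed = random.randint(0, sys.maxsize)
        conf["seed"] = seed
    random.seed(seed)
    return seed

conf = {"seed": None}
init_seed(conf)
first_draw = random.random()
init_seed(conf)                        # same stored seed ...
assert random.random() == first_draw   # ... same sequence of draws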
3 changes: 2 additions & 1 deletion Mikado/loci/abstractlocus.py
@@ -193,10 +193,11 @@ def __getstate__(self):

         if hasattr(self, "json_conf"):
             # This removes unpicklable compiled attributes, eg in "requirements" or "as_requirements"
-            state["json_conf"] = self.json_conf.copy()
+            if "json_conf" not in state:
+                state["json_conf"] = self.json_conf.copy()
             for key in self.json_conf:
                 if (isinstance(self.json_conf[key], dict) and
                         self.json_conf[key].get("compiled", None) is not None):
                     assert "json_conf" in state
                     assert key in state["json_conf"]
                     del state["json_conf"][key]["compiled"]
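Background for the guard above: the "compiled" entries hold results of compile(), i.e. code objects, which pickle rejects, so they must be stripped from the state without mutating the live configuration. A toy version of the same pattern (hypothetical class, not Mikado's):

import pickle

class Holder:
    def __init__(self):
        self.json_conf = {"requirements": {
            "expression": "x > 5",
            "compiled": compile("x > 5", "<expression>", "eval")}}

    def __getstate__(self):
        state = self.__dict__.copy()
        # Copy the outer dict so the live object keeps its compiled entry.
        state["json_conf"] = dict(self.json_conf)
        for key, value in self.json_conf.items():
            if isinstance(value, dict) and value.get("compiled") is not None:
                state["json_conf"][key] = {
                    k: v for k, v in value.items() if k != "compiled"}
        return state

blob = pickle.dumps(Holder())   # works: no code object reaches pickle
assert b"compiled" not in blob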
108 changes: 66 additions & 42 deletions Mikado/loci/locus.py
@@ -709,7 +709,10 @@ def pad_transcripts(self) -> set:
         """

         try:
-            self.fai = pysam.FastaFile(self.json_conf["reference"]["genome"])
+            if isinstance(self.json_conf["reference"]["genome"], pysam.FastaFile):
+                self.fai = self.json_conf["reference"]["genome"]
+            else:
+                self.fai = pysam.FastaFile(self.json_conf["reference"]["genome"])
         except KeyError:
             raise KeyError(self.json_conf.keys())
@@ -730,13 +733,25 @@
                 templates.add(__to_modify[tid][1].id)

             self.logger.debug("Expanding %s to have start %s (from %s) and end %s (from %s)",
-                              tid, __to_modify[tid][0],
-                              self[tid].start, __to_modify[tid][1], self[tid].end)
-            new_transcript = expand_transcript(self[tid].deepcopy(),
-                                               __to_modify[tid][0],
-                                               __to_modify[tid][1],
-                                               self.fai,
-                                               self.logger)
+                              tid, __to_modify[tid][0] if not __to_modify[tid][0] else __to_modify[tid][0].start,
+                              self[tid].start,
+                              __to_modify[tid][1] if not __to_modify[tid][1] else __to_modify[tid][1].end,
+                              self[tid].end)
+            try:
+                new_transcript = expand_transcript(self[tid].deepcopy(),
+                                                   __to_modify[tid][0],
+                                                   __to_modify[tid][1],
+                                                   self.fai,
+                                                   self.logger)
+            except KeyboardInterrupt:
+                raise
+            except Exception as exc:
+                self.logger.exception(exc)
+                raise
+            if (new_transcript.start == self.transcripts[tid].end) and (new_transcript.end == self.transcripts[tid].end):
+                self.logger.debug("No expansion took place for %s!", tid)
+            else:
+                self.logger.debug("Expansion took place for %s!", tid)
             self.transcripts[tid] = new_transcript

         self.exons = set()
@@ -755,6 +770,7 @@ def define_graph(self, objects: dict, inters=None, three_prime=False):
             inters = self._share_extreme

         for obj, other_obj in combinations(objects.keys(), 2):
+            self.logger.debug("Comparing %s to %s (%s')", obj, other_obj, "5" if not three_prime else "3")
             if obj == other_obj:
                 continue
             else:
@@ -847,9 +863,11 @@ def _share_five_prime(self, first: Transcript, second: Transcript):
         first, second = sorted([first, second], key=operator.attrgetter("start"))
         # Now let us check whether the second falls within an intron
         matched = first.segmenttree.find(second.exons[0][0], second.exons[0][1])
-        if matched[0].value == "intron":
+        self.logger.debug("{second.id} last exon {second.exons[0]} intersects in {first.id}: {matched}".format(
+            **locals()))
+        if matched[0].value == "intron" or second.exons[0][0] < matched[0].start:
             decision = False
-            reason = "{second} first exon ends within an intron of {first}".format(**locals())
+            reason = "{second.id} first exon ends within an intron of {first.id}".format(**locals())
         else:
             upstream = [_ for _ in first.find_upstream(second.exons[0][0], second.exons[0][1])
                         if _.value == "exon" and _ not in matched]
@@ -868,8 +886,8 @@
             decision = (ts_distance <= self.ts_distance) and (ts_splices <= self.ts_max_splices)
             if decision:
                 decision = (second, first)
-            reason = "{first.id} {doesit} overlap {second.id} (distance {ts_distance} max {self.ts_distance}, splices {ts_splices} max {self.ts_max_splices})".format(
-                doesit="does" if decision else "does not", **locals())
+            reason = "{first.id} {doesit} overlap {second.id} (distance {ts_distance} max {self.ts_distance}, splices \
+{ts_splices} max {self.ts_max_splices})".format(doesit="does" if decision else "does not", **locals())
         self.logger.debug(reason)
         return decision

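The decision computed above reduces to the two thresholds whose blueprint defaults this commit raises: ts_distance (300 to 1000) and ts_max_splices (1 to 2). Restated schematically (illustrative only, not the actual method):

def may_share_extreme(ts_distance, ts_splices,
                      max_distance=1000, max_splices=2):
    # Two transcripts may be padded to a common extreme only if the extra
    # distance walked along the template and the splice sites crossed on
    # the way both stay within the configured limits.
    return ts_distance <= max_distance and ts_splices <= max_splices

print(may_share_extreme(800, 1))   # True: close enough, one splice crossed
print(may_share_extreme(800, 3))   # False: too many splice sites crossed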
@@ -886,7 +904,7 @@ def _share_three_prime(self, first: Transcript, second: Transcript):
         first, second = sorted([first, second], key=operator.attrgetter("end"), reverse=False)
         # Now let us check whether the second falls within an intron
         matched = second.segmenttree.find(first.exons[-1][0], first.exons[-1][1])
-        if matched[-1].value == "intron":
+        if matched[-1].value == "intron" or first.exons[-1][1] > matched[-1].end:
             decision = False
             reason = "{second.id} last exon ends within an intron of {first.id}".format(**locals())
         else:
@@ -1083,17 +1101,16 @@ def expand_transcript(transcript: Transcript,
         start_transcript, end_transcript = end_transcript, start_transcript

     # Make a backup copy of the transcript
+    logger.debug("Starting expansion of %s", transcript.id)
     backup = transcript.deepcopy()

     # First get the ORFs
     transcript.logger = logger
     # Remove the CDS and unfinalize
-    logger.debug("Starting expansion of %s", transcript.id)
     strand = transcript.strand
     transcript.strip_cds()
     transcript.unfinalize()

-    assert strand == transcript.strand
-
     downstream = 0
     down_exons = []
@@ -1114,6 +1131,7 @@

     new_exons = up_exons + down_exons
     if not new_exons:
+        logger.debug("%s does not need to be expanded, exiting", transcript.id)
         return backup

     transcript.add_exons(new_exons)
@@ -1123,43 +1141,52 @@
     if transcript.strand == "-":
         downstream, upstream = upstream, downstream

-    if up_exons or down_exons:
-        seq = check_expanded(transcript, backup, start_transcript, end_transcript,
-                             fai, upstream, downstream, logger)
-        transcript = enlarge_orfs(transcript, backup, seq, upstream, downstream, logger)
-        transcript.finalize()
+    if (up_exons or down_exons):
+        if backup.is_coding:
+            seq = check_expanded(transcript, backup, start_transcript, end_transcript,
+                                 fai, upstream, downstream, logger)
+            transcript = enlarge_orfs(transcript, backup, seq, upstream, downstream, logger)
+            transcript.finalize()
     else:
         return backup

     # Now finalize again
-    if upstream > 0 or downstream > 0:
+    logger.debug("%s: start (before %s, now %s, %s), end (before %s, now %s, %s)",
+                 transcript.id,
+                 backup.start, transcript.start, transcript.start < backup.start,
+                 backup.end, transcript.end, transcript.end > backup.end)
+    if transcript.start < backup.start or transcript.end > backup.end:
         transcript.attributes["padded"] = True

     # Now check that we have a valid expansion
     if backup.is_coding and not transcript.is_coding:
         # Something has gone wrong. Just return the original transcript.
+        assert new_exons
-        logger.info("Padding %s would lead to an invalid CDS. Aborting.",
+        logger.info("Padding %s would lead to an invalid CDS (up exons: %s). Aborting.",
                     transcript.id, up_exons)
         return backup
-    elif (backup.is_coding and ((backup.strand == "-" and backup.combined_cds_end < transcript.combined_cds_end) or
-                                (backup.combined_cds_end > transcript.combined_cds_end))):
-        message = "Padding %s would lead to an in-frame stop codon (%s to %s, vs original %s to %s. Aborting." % (
-            transcript.id, transcript.combined_cds_start, transcript.combined_cds_end,
-            backup.combined_cds_start, backup.combined_cds_end
-        )
-        logger.info(message)
-        return backup
-    else:
-        message = "{transcript.id} has now start {transcript.start}, end {transcript.end}"
-        if (backup.is_coding and ((backup.combined_cds_end != transcript.combined_cds_end) or
-                (backup.combined_cds_start != transcript.combined_cds_start))):
+    elif backup.is_coding:
+        abort = False
+        if backup.strand == "-" and backup.combined_cds_end < transcript.combined_cds_end:
+            abort = True
+        elif backup.strand != "-" and backup.combined_cds_end > transcript.combined_cds_end:
+            abort = True
+        if abort is True:
+            msg = "Padding {} (strand: {}) would lead to an in-frame stop codon ({} to {}, vs original {} to {}.\
+Aborting.".format(transcript.id, backup.strand, transcript.combined_cds_start, transcript.combined_cds_end,
+                  backup.combined_cds_start, backup.combined_cds_end)
+            logger.info(msg)
+            return backup

+    message = "{transcript.id} has now start {transcript.start}, end {transcript.end}"
+    if (backup.is_coding and ((backup.combined_cds_end != transcript.combined_cds_end) or
+                              (backup.combined_cds_start != transcript.combined_cds_start))):
         transcript.attributes["cds_padded"] = True
         message += "; CDS moved to {transcript.combined_cds_start}, end {transcript.combined_cds_end}"
-    else:
-        transcript.attributes["cds_padded"] = False
-        message += "."
-    logger.info(message.format(**locals()))
+    elif backup.is_coding:
+        transcript.attributes["cds_padded"] = False
+        message += "."
+    logger.info(message.format(**locals()))

     return transcript

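The strand-aware abort above is the core of the fix for transcripts on the - strand with a changed stop codon: genomic coordinates run against the reading direction on the minus strand, so the comparison sign flips. A schematic restatement (illustrative, not the actual implementation):

def stop_codon_moved(strand, old_cds_end, new_cds_end):
    # Padding must never pull the stop codon upstream of the original one.
    # On "+" that shows as a smaller genomic CDS end; on "-" as a larger one.
    if strand == "-":
        return new_cds_end > old_cds_end
    return new_cds_end < old_cds_end

print(stop_codon_moved("+", 5000, 4700))   # True: premature stop, abort
print(stop_codon_moved("-", 2000, 1500))   # False: stop kept or extended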
@@ -1401,14 +1428,11 @@ def enlarge_orfs(transcript: Transcript,
             internal_orfs = []
     else:
         internal_orfs = []
-        internal_orfs = []

     if not internal_orfs:
         return transcript

-    logger.debug("Enlarging the ORFs for TID %s", transcript.id)
     new_orfs = []

     for orf in internal_orfs:
-        logger.debug("Old ORF: %s", str(orf))
         try:
4 changes: 4 additions & 0 deletions Mikado/loci/reference_gene.py
@@ -5,6 +5,7 @@
 Minimal checks.
 """

+import re
 import copy
 import logging
 import operator
@@ -157,6 +158,9 @@ def add_exon(self, row):

         for parent in (_ for _ in row.parent if _ not in self.transcripts):
             found = False
+            if parent.endswith("-Protein") and re.sub("-Protein", "", parent) in self.transcripts:
+                continue
+
             for tid in self.transcripts:
                 if parent in self.transcripts[tid].derived_children:
                     found = True
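This is the fix for the TAIR GFF3 crash from the commit message: TAIR files attach protein features whose Parent is the mRNA ID plus a "-Protein" suffix, and that alias must not be mistaken for an unknown transcript. The check in isolation (hypothetical IDs):

import re

transcripts = {"AT1G01010.1"}

def is_protein_alias(parent):
    # TAIR GFF3s use "<mRNA id>-Protein" as the Parent of protein features;
    # if the bare mRNA ID is already known, skip the feature instead of
    # creating a phantom transcript.
    return parent.endswith("-Protein") and \
        re.sub("-Protein", "", parent) in transcripts

print(is_protein_alias("AT1G01010.1-Protein"))   # True: alias of a known mRNA
print(is_protein_alias("AT1G01010.2-Protein"))   # False: no matching mRNA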
