Commit

lucventurini committed Oct 18, 2018
1 parent 185aa09 commit 6d35778
Showing 8 changed files with 562 additions and 59 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -21,6 +21,7 @@ Bugfixes and improvements:
- [#132](https://github.com/lucventurini/mikado/issues/132), [#133](https://github.com/lucventurini/mikado/issues/133): Mikado will now evaluate the CDS of transcripts during Mikado prepare.
- [#134](https://github.com/lucventurini/mikado/issues/134): when checking for potential Alternative Splicing Events (ASEs), Mikado will now check whether the CDS phases of the transcripts are in frame with each other. Moreover, **Mikado will now calculate the CDS overlap percentage based on the primary transcript CDS length**, not on the minimum CDS length between primary and candidate. Please note that the change **regarding the frame** also affects the monosublocus stage. Mikado still considers only the primary ORFs for the overlap (see the sketch after this diff).
- Solved a bug that led Mikado to recalculate the phases of each model during picking, potentially introducing mistakes for models truncated at the 5' end.

# Version 1.2.4

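For illustration, here is a minimal sketch of the new overlap rule described in the #134 entry above. The helper name and interval format are mine, not Mikado's API; it only shows how dividing by the primary CDS length, instead of the shorter of the two CDS, changes the percentage.

```python
def cds_overlap_fraction(primary_cds, candidate_cds):
    """CDS models given as lists of 1-based, inclusive (start, end) intervals."""
    overlap = 0
    for p_start, p_end in primary_cds:
        for c_start, c_end in candidate_cds:
            overlap += max(0, min(p_end, c_end) - max(p_start, c_start) + 1)
    # New behaviour: always divide by the primary CDS length. The old behaviour
    # divided by min(primary, candidate), which inflated scores of short candidates.
    return overlap / sum(end - start + 1 for start, end in primary_cds)

primary = [(100, 199)]    # 100 bp of CDS
candidate = [(150, 199)]  # 50 bp of CDS, fully contained in the primary
print(cds_overlap_fraction(primary, candidate))  # 0.5 (the old rule returned 1.0)
```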
124 changes: 109 additions & 15 deletions Mikado/preparation/checking.py
@@ -1,13 +1,15 @@
import functools
import multiprocessing
import multiprocessing.queues
import os

import pyfaidx

from Mikado.transcripts.transcriptchecker import TranscriptChecker
from .. import exceptions
from ..loci import Transcript
from ..utilities.log_utils import create_null_logger, create_queue_logger
import logging
import queue
import time

__author__ = 'Luca Venturini'

@@ -51,11 +53,14 @@ def create_transcript(lines,
"""

    if logger is None:
-        logger = create_null_logger("checker")
+        logger = create_null_logger()

-    logger.debug("Starting with %s", lines["tid"])
+    if "tid" not in lines:
+        logger.error("Lines datastore lacks the transcript ID. Exiting.")
+        return None

    try:
+        logger.debug("Starting with %s", lines["tid"])
        transcript_line = Transcript()
        transcript_line.chrom = lines["chrom"]
        if "source" in lines:
@@ -70,8 +75,15 @@
        transcript_line.parent = lines["parent"]

        for feature in lines["features"]:
-            coords = [(_[0], _[1]) for _ in lines["features"][feature]]
-            phases = [_[2] for _ in lines["features"][feature]]
+            coords, phases = [], []
+            for feat in lines["features"][feature]:
+                assert isinstance(feat, (list, tuple)) and 2 <= len(feat) <= 3, feat
+                coords.append((feat[0], feat[1]))
+                if len(feat) == 3 and feat[2] in (0, 1, 2, None):
+                    phases.append(feat[2])
+                else:
+                    phases.append(None)
+            assert len(phases) == len(coords)
            transcript_line.add_exons(coords, features=feature, phases=phases)

        transcript_object = TranscriptChecker(transcript_line,
@@ -100,8 +112,6 @@
        logger.exception(exc)
        transcript_object = None

-    logger.debug("Finished with %s", lines["tid"])
-
    return transcript_object
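As context for the hunk above: the new loop normalises malformed phases instead of trusting the third field blindly. A standalone sketch with hypothetical records follows — this is not Mikado's internal data format, just the same logic in isolation:

```python
# Hypothetical exon records: (start, end) or (start, end, phase).
features = {"CDS": [(101, 200, 0), (301, 400, "1"), (501, 600)]}

coords, phases = [], []
for feat in features["CDS"]:
    coords.append((feat[0], feat[1]))
    # The string "1" fails the (0, 1, 2, None) membership test, so it is
    # nulled rather than propagated as an invalid phase.
    phases.append(feat[2] if len(feat) == 3 and feat[2] in (0, 1, 2, None) else None)

print(coords)  # [(101, 200), (301, 400), (501, 600)]
print(phases)  # [0, None, None]
```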


@@ -124,25 +134,33 @@ def __init__(self,
):

        super().__init__()
-        self.__identifier = identifier
+        self.__identifier = ""
+        self.__set_identifier(identifier)
        # self.strand_specific = strand_specific
-        self.canonical = canonical_splices
+        self.__canonical = []
+        self.__set_canonical(canonical_splices)
+        self.__log_level = "DEBUG"
        self.log_level = log_level
-        self.logger = None
-        self.logging_queue = logging_queue
+        self.logger = None  # This gets populated by the create_queue_logger function below
+        self.__logging_queue = None
+        self.__set_logging_queue(logging_queue)
        self.name = "Checker-{0}".format(self.identifier)
-        create_queue_logger(self)
+        try:
+            create_queue_logger(self)
+        except AttributeError as exc:
+            raise AttributeError(exc)
+        self.__lenient = False
        self.lenient = lenient
        self.__fasta = fasta
-        self.submission_queue = submission_queue
+        self.__submission_queue = None
+        self.__set_submission_queue(submission_queue)
        self.fasta = pyfaidx.Fasta(self.__fasta)
        self.fasta_out = os.path.join(tmpdir, "{0}-{1}".format(
            fasta_out, self.identifier
        ))
        self.gtf_out = os.path.join(tmpdir, "{0}-{1}".format(
            gtf_out, self.identifier
        ))
+        self.logger.debug(self.canonical)

def run(self):

@@ -154,7 +172,12 @@ def run(self):

        fasta_out = open(self.fasta_out, "w")
        gtf_out = open(self.gtf_out, "w")
        self.logger.debug("Starting %s", self.name)
+        self.logger.debug("Created output FASTA {self.fasta_out} and GTF {self.gtf_out}".format(**locals()))
+        time.sleep(0.1)
+        self.logger.debug(self.canonical)
+
+        __printed = 0
        while True:
            lines, start, end, counter = self.submission_queue.get()
            if lines == "EXIT":
@@ -176,13 +199,23 @@
                continue
            else:
                self.logger.debug("Printing %s", lines["tid"])
+                __printed += 1
                print("\n".join(["{0}/{1}".format(counter, line) for line in
                                 transcript.format("gtf").split("\n")]), file=gtf_out)
                print("\n".join(["{0}/{1}".format(counter, line) for line in
                                 transcript.fasta.split("\n")]), file=fasta_out)

+        time.sleep(0.1)
+        fasta_out.flush()
        fasta_out.close()
+        gtf_out.flush()
        gtf_out.close()
+        if __printed > 0:
+            self.logger.debug("Size of FASTA out and GTF out: %s, %s",
+                              os.stat(fasta_out.name).st_size, os.stat(gtf_out.name).st_size)
+            assert os.stat(gtf_out.name).st_size > 0
+            assert os.stat(fasta_out.name).st_size > 0
+        time.sleep(0.1)
        return
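One note on the `"{0}/{1}".format(counter, line)` prints in the hunk above: every GTF and FASTA line is written with a `counter/` prefix, presumably so the parent process can strip it and restore the original submission order once the workers finish. A toy illustration (the GTF line is made up):

```python
counter = 42
gtf_line = 'Chr1\tMikado\texon\t101\t200\t.\t+\t.\tgene_id "g1";'
tagged = "{0}/{1}".format(counter, gtf_line)

# Downstream, the prefix can be split off again to recover both pieces:
index, payload = tagged.split("/", 1)
assert int(index) == counter and payload == gtf_line
```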

def __getstate__(self):
@@ -200,3 +233,64 @@ def __setstate__(self, state):
    @property
    def identifier(self):
        return self.__identifier
+
+    def __set_identifier(self, identifier):
+
+        if identifier is None:
+            raise ValueError("The identifier must be defined!")
+        self.__identifier = str(identifier)
+
+    @property
+    def log_level(self):
+        return self.__log_level
+
+    @log_level.setter
+    def log_level(self, log_level):
+        _ = logging._checkLevel(log_level)
+        self.__log_level = log_level
+
+    @property
+    def lenient(self):
+        return self.__lenient
+
+    @lenient.setter
+    def lenient(self, lenient):
+        if lenient not in (False, True):
+            raise ValueError("Invalid lenient value: {}".format(lenient))
+        self.__lenient = lenient
+
+    @property
+    def submission_queue(self):
+        return self.__submission_queue
+
+    def __set_submission_queue(self, submission):
+        if not isinstance(submission, (multiprocessing.queues.Queue, queue.Queue)):
+            raise ValueError("Invalid queue object: {}".format(type(submission)))
+        self.__submission_queue = submission
+
+    @property
+    def logging_queue(self):
+        return self.__logging_queue
+
+    def __set_logging_queue(self, logging_queue):
+        if not isinstance(logging_queue, (multiprocessing.queues.Queue, queue.Queue)):
+            raise ValueError("Invalid queue object: {}".format(type(logging_queue)))
+        self.__logging_queue = logging_queue
+
+    @property
+    def canonical(self):
+        return self.__canonical
+
+    def __set_canonical(self, canonical):
+        if not isinstance(canonical, (tuple, list)):
+            raise TypeError("Canonical splices should be lists or tuples")
+
+        if len(canonical) == 0:
+            raise ValueError("The list of canonical splices cannot be empty!")
+
+        for el in canonical:
+            if (len(el) != 2 or (not (isinstance(el[0], str) and len(el[0]) == 2) or
+                                 not (isinstance(el[1], str) and len(el[1]) == 2))):
+                raise ValueError("Invalid splicing pattern!")
+
+        self.__canonical = canonical
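To make the contract of these new validators concrete, here is a hedged construction sketch. The keyword names match the `__init__` parameters visible in this diff, but the values, file names, and the call site itself are illustrative, not taken from Mikado:

```python
import multiprocessing

# All values below are made up; a real run is orchestrated by Mikado prepare.
sub_q, log_q = multiprocessing.Queue(), multiprocessing.Queue()

checker = CheckingProcess(
    submission_queue=sub_q,   # rejected unless it is a multiprocessing or queue.Queue
    logging_queue=log_q,
    identifier=1,             # coerced to the string "1" by __set_identifier
    fasta="genome.fa",        # hypothetical genome file
    canonical_splices=[("GT", "AG"), ("GC", "AG"), ("AT", "AC")],
    log_level="INFO",         # validated via logging._checkLevel
    lenient=False,            # only True/False pass the setter
    tmpdir="/tmp",
    fasta_out="checked.fasta",
    gtf_out="checked.gtf",
)

# Invalid inputs now fail fast at construction instead of deep inside run():
# submission_queue=[]     -> ValueError("Invalid queue object: <class 'list'>")
# checker.lenient = "yes" -> ValueError("Invalid lenient value: yes")
```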
1 change: 0 additions & 1 deletion Mikado/preparation/prepare.py
@@ -419,7 +419,6 @@ def prepare(args, logger):
args.json_conf["prepare"]["files"]["output_dir"],
args.json_conf["prepare"]["files"]["out"]), 'w')


logger.info("Loading reference file")
args.json_conf["reference"]["genome"] = pyfaidx.Fasta(args.json_conf["reference"]["genome"])

