Commit

lucventurini committed Oct 18, 2018
1 parent 185aa09 commit 6d35778
Showing 8 changed files with 562 additions and 59 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -21,6 +21,7 @@ Bugfixes and improvements:
- [#132](https://github.com/lucventurini/mikado/issues/132), [#133](https://github.com/lucventurini/mikado/issues/133): Mikado will now evaluate the CDS of transcripts during Mikado prepare.
- [#134](https://github.com/lucventurini/mikado/issues/134): when checking for potential Alternative Splicing Events (ASEs), Mikado will now check whether the CDS phases of the transcripts are in frame with each other. Moreover, **Mikado will now calculate the CDS overlap percentage based on the primary transcript CDS length**, not on the minimum CDS length between primary and candidate. Please note that the change **regarding the frame** also affects the monosublocus stage. Mikado still considers only the primary ORFs for the overlap (see the sketch after this diff).
- Solved a bug that led Mikado to recalculate the phases of each model during picking, potentially introducing mistakes for models truncated at the 5' end.

# Version 1.2.4

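For illustration, here is a minimal sketch of the new overlap rule described in the #134 entry above. The helper name and interval format are mine, not Mikado's API; it only shows how dividing by the primary CDS length, instead of the shorter of the two CDS, changes the percentage.

```python
def cds_overlap_fraction(primary_cds, candidate_cds):
    """CDS models given as lists of 1-based, inclusive (start, end) intervals."""
    overlap = 0
    for p_start, p_end in primary_cds:
        for c_start, c_end in candidate_cds:
            overlap += max(0, min(p_end, c_end) - max(p_start, c_start) + 1)
    # New behaviour: always divide by the primary CDS length. The old behaviour
    # divided by min(primary, candidate), which inflated scores of short candidates.
    return overlap / sum(end - start + 1 for start, end in primary_cds)

primary = [(100, 199)]    # 100 bp of CDS
candidate = [(150, 199)]  # 50 bp of CDS, fully contained in the primary
print(cds_overlap_fraction(primary, candidate))  # 0.5 (the old rule returned 1.0)
```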
124 changes: 109 additions & 15 deletions Mikado/preparation/checking.py
@@ -1,13 +1,15 @@
import functools
import multiprocessing
import multiprocessing.queues
import os

import pyfaidx

from Mikado.transcripts.transcriptchecker import TranscriptChecker
from .. import exceptions
from ..loci import Transcript
from ..utilities.log_utils import create_null_logger, create_queue_logger
import logging
import queue
import time

__author__ = 'Luca Venturini'

@@ -51,11 +53,14 @@ def create_transcript(lines,
"""

    if logger is None:
-        logger = create_null_logger("checker")
+        logger = create_null_logger()

-    logger.debug("Starting with %s", lines["tid"])
+    if "tid" not in lines:
+        logger.error("Lines datastore lacks the transcript ID. Exiting.")
+        return None

    try:
+        logger.debug("Starting with %s", lines["tid"])
        transcript_line = Transcript()
        transcript_line.chrom = lines["chrom"]
        if "source" in lines:
@@ -70,8 +75,15 @@
        transcript_line.parent = lines["parent"]

        for feature in lines["features"]:
-            coords = [(_[0], _[1]) for _ in lines["features"][feature]]
-            phases = [_[2] for _ in lines["features"][feature]]
+            coords, phases = [], []
+            for feat in lines["features"][feature]:
+                assert isinstance(feat, (list, tuple)) and 2 <= len(feat) <= 3, feat
+                coords.append((feat[0], feat[1]))
+                if len(feat) == 3 and feat[2] in (0, 1, 2, None):
+                    phases.append(feat[2])
+                else:
+                    phases.append(None)
+            assert len(phases) == len(coords)
            transcript_line.add_exons(coords, features=feature, phases=phases)

        transcript_object = TranscriptChecker(transcript_line,
@@ -100,8 +112,6 @@
        logger.exception(exc)
        transcript_object = None

-    logger.debug("Finished with %s", lines["tid"])
-
    return transcript_object
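As context for the hunk above: the new loop normalises malformed phases instead of trusting the third field blindly. A standalone sketch with hypothetical records follows — this is not Mikado's internal data format, just the same logic in isolation:

```python
# Hypothetical exon records: (start, end) or (start, end, phase).
features = {"CDS": [(101, 200, 0), (301, 400, "1"), (501, 600)]}

coords, phases = [], []
for feat in features["CDS"]:
    coords.append((feat[0], feat[1]))
    # The string "1" fails the (0, 1, 2, None) membership test, so it is
    # nulled rather than propagated as an invalid phase.
    phases.append(feat[2] if len(feat) == 3 and feat[2] in (0, 1, 2, None) else None)

print(coords)  # [(101, 200), (301, 400), (501, 600)]
print(phases)  # [0, None, None]
```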


@@ -124,25 +134,33 @@ def __init__(self,
):

        super().__init__()
-        self.__identifier = identifier
+        self.__identifier = ""
+        self.__set_identifier(identifier)
        # self.strand_specific = strand_specific
-        self.canonical = canonical_splices
+        self.__canonical = []
+        self.__set_canonical(canonical_splices)
+        self.__log_level = "DEBUG"
        self.log_level = log_level
-        self.logger = None
-        self.logging_queue = logging_queue
+        self.logger = None  # This gets populated by the create_queue_logger function below
+        self.__logging_queue = None
+        self.__set_logging_queue(logging_queue)
        self.name = "Checker-{0}".format(self.identifier)
-        create_queue_logger(self)
+        try:
+            create_queue_logger(self)
+        except AttributeError as exc:
+            raise AttributeError(exc)
+        self.__lenient = False
        self.lenient = lenient
        self.__fasta = fasta
-        self.submission_queue = submission_queue
+        self.__submission_queue = None
+        self.__set_submission_queue(submission_queue)
        self.fasta = pyfaidx.Fasta(self.__fasta)
        self.fasta_out = os.path.join(tmpdir, "{0}-{1}".format(
            fasta_out, self.identifier
        ))
        self.gtf_out = os.path.join(tmpdir, "{0}-{1}".format(
            gtf_out, self.identifier
        ))
+        self.logger.debug(self.canonical)

def run(self):

@@ -154,7 +172,12 @@ def run(self):

        fasta_out = open(self.fasta_out, "w")
        gtf_out = open(self.gtf_out, "w")
        self.logger.debug("Starting %s", self.name)
+        self.logger.debug("Created output FASTA {self.fasta_out} and GTF {self.gtf_out}".format(**locals()))
+        time.sleep(0.1)
+        self.logger.debug(self.canonical)
+
+        __printed = 0
        while True:
            lines, start, end, counter = self.submission_queue.get()
            if lines == "EXIT":
@@ -176,13 +199,23 @@
                continue
            else:
                self.logger.debug("Printing %s", lines["tid"])
+                __printed += 1
                print("\n".join(["{0}/{1}".format(counter, line) for line in
                                 transcript.format("gtf").split("\n")]), file=gtf_out)
                print("\n".join(["{0}/{1}".format(counter, line) for line in
                                 transcript.fasta.split("\n")]), file=fasta_out)

+        time.sleep(0.1)
+        fasta_out.flush()
        fasta_out.close()
+        gtf_out.flush()
        gtf_out.close()
+        if __printed > 0:
+            self.logger.debug("Size of FASTA out and GTF out: %s, %s",
+                              os.stat(fasta_out.name).st_size, os.stat(gtf_out.name).st_size)
+            assert os.stat(gtf_out.name).st_size > 0
+            assert os.stat(fasta_out.name).st_size > 0
+        time.sleep(0.1)
        return
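One note on the `"{0}/{1}".format(counter, line)` prints in the hunk above: every GTF and FASTA line is written with a `counter/` prefix, presumably so the parent process can strip it and restore the original submission order once the workers finish. A toy illustration (the GTF line is made up):

```python
counter = 42
gtf_line = 'Chr1\tMikado\texon\t101\t200\t.\t+\t.\tgene_id "g1";'
tagged = "{0}/{1}".format(counter, gtf_line)

# Downstream, the prefix can be split off again to recover both pieces:
index, payload = tagged.split("/", 1)
assert int(index) == counter and payload == gtf_line
```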

def __getstate__(self):
@@ -200,3 +233,64 @@ def __setstate__(self, state):
    @property
    def identifier(self):
        return self.__identifier
+
+    def __set_identifier(self, identifier):
+
+        if identifier is None:
+            raise ValueError("The identifier must be defined!")
+        self.__identifier = str(identifier)
+
+    @property
+    def log_level(self):
+        return self.__log_level
+
+    @log_level.setter
+    def log_level(self, log_level):
+        _ = logging._checkLevel(log_level)
+        self.__log_level = log_level
+
+    @property
+    def lenient(self):
+        return self.__lenient
+
+    @lenient.setter
+    def lenient(self, lenient):
+        if lenient not in (False, True):
+            raise ValueError("Invalid lenient value: {}".format(lenient))
+        self.__lenient = lenient
+
+    @property
+    def submission_queue(self):
+        return self.__submission_queue
+
+    def __set_submission_queue(self, submission):
+        if not isinstance(submission, (multiprocessing.queues.Queue, queue.Queue)):
+            raise ValueError("Invalid queue object: {}".format(type(submission)))
+        self.__submission_queue = submission
+
+    @property
+    def logging_queue(self):
+        return self.__logging_queue
+
+    def __set_logging_queue(self, logging_queue):
+        if not isinstance(logging_queue, (multiprocessing.queues.Queue, queue.Queue)):
+            raise ValueError("Invalid queue object: {}".format(type(logging_queue)))
+        self.__logging_queue = logging_queue
+
+    @property
+    def canonical(self):
+        return self.__canonical
+
+    def __set_canonical(self, canonical):
+        if not isinstance(canonical, (tuple, list)):
+            raise TypeError("Canonical splices should be lists or tuples")
+
+        if len(canonical) == 0:
+            raise ValueError("The list of canonical splices cannot be empty!")
+
+        for el in canonical:
+            if (len(el) != 2 or (not (isinstance(el[0], str) and len(el[0]) == 2) or
+                                 not (isinstance(el[1], str) and len(el[1]) == 2))):
+                raise ValueError("Invalid splicing pattern!")
+
+        self.__canonical = canonical
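To make the contract of these new validators concrete, here is a hedged construction sketch. The keyword names match the `__init__` parameters visible in this diff, but the values, file names, and the call site itself are illustrative, not taken from Mikado:

```python
import multiprocessing

# All values below are made up; a real run is orchestrated by Mikado prepare.
sub_q, log_q = multiprocessing.Queue(), multiprocessing.Queue()

checker = CheckingProcess(
    submission_queue=sub_q,   # rejected unless it is a multiprocessing or queue.Queue
    logging_queue=log_q,
    identifier=1,             # coerced to the string "1" by __set_identifier
    fasta="genome.fa",        # hypothetical genome file
    canonical_splices=[("GT", "AG"), ("GC", "AG"), ("AT", "AC")],
    log_level="INFO",         # validated via logging._checkLevel
    lenient=False,            # only True/False pass the setter
    tmpdir="/tmp",
    fasta_out="checked.fasta",
    gtf_out="checked.gtf",
)

# Invalid inputs now fail fast at construction instead of deep inside run():
# submission_queue=[]     -> ValueError("Invalid queue object: <class 'list'>")
# checker.lenient = "yes" -> ValueError("Invalid lenient value: yes")
```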
1 change: 0 additions & 1 deletion Mikado/preparation/prepare.py
@@ -419,7 +419,6 @@ def prepare(args, logger):
args.json_conf["prepare"]["files"]["output_dir"],
args.json_conf["prepare"]["files"]["out"]), 'w')


logger.info("Loading reference file")
args.json_conf["reference"]["genome"] = pyfaidx.Fasta(args.json_conf["reference"]["genome"])

