Fix for #153

EI-CoreBioinformatics · Mar 8, 2019 · cfe3119 · cfe3119
1 parent efea2c6
commit cfe3119
Show file tree

Hide file tree

Showing 2 changed files with 79 additions and 87 deletions.
diff --git a/Mikado/transcripts/transcript.py b/Mikado/transcripts/transcript.py
@@ -588,11 +588,13 @@ def __setstate__(self, state):
         self.logger = None
 
     def __getattribute__(self, item):
-
-        if "external" in item and item != "external" and "." in item:
-            return getattr(self.external_scores, item.split(".")[1])
-        else:
-            return super().__getattribute__(item)
+        try:
+            return object.__getattribute__(self, item)  # regular attribute lookup is attempted first
+        except AttributeError as exc:
+            if "external" in item and item != "external" and "." in item:  # dotted name such as "external.<key>"
+                return getattr(self.external_scores, item.split(".")[1])  # second dot-separated part, read from external_scores
+            else:
+                raise AttributeError(exc)  # any other missing name: raise a new AttributeError wrapping the original
 
     # ######## Class instance methods ####################
 
@@ -603,30 +605,31 @@ def add_exon(self, gffline, feature=None, phase=None):
         :type feature: flag to indicate what kind of feature we are adding
         """
 
-        if isinstance(gffline, (tuple, list)):
+        _gtype = type(gffline)
+
+        if _gtype in (tuple, list):
             assert len(gffline) == 2
             try:
                 start, end = sorted(gffline)
             except TypeError:
-                raise TypeError((gffline, type(gffline)))
+                raise TypeError((gffline, _gtype))
             if feature is None:
                 feature = "exon"
-        elif isinstance(gffline, intervaltree.Interval):
+        elif _gtype is intervaltree.Interval:
             start, end = gffline[0], gffline[1]
             if feature is None:
                 feature = "exon"
-        elif isinstance(gffline, Interval):
+        elif _gtype is Interval:
             start, end = gffline.start, gffline.end
             if feature is None:
                 feature = "exon"
-        elif not isinstance(gffline, (GtfLine, GffLine)):
-            raise InvalidTranscript("Unkwown feature type! %s",
-                                    type(gffline))
+        elif _gtype not in (GtfLine, GffLine):
+            raise InvalidTranscript("Unkwown feature type! %s", _gtype)
         else:
             start, end = sorted([gffline.start, gffline.end])
             if feature is None:
                 feature = gffline.feature
-            if isinstance(gffline, GffLine) and "cdna_match" in gffline.feature.lower():
+            if _gtype is GffLine and "cdna_match" in gffline.feature.lower():
                 gffline.parent = gffline.id
 
             if self.id not in gffline.parent:
@@ -669,11 +672,7 @@ def add_exon(self, gffline, feature=None, phase=None):
 
         segment = tuple([int(start), int(end)])
         if segment in store:
-            # raise InvalidTranscript(
-            #     "Attempt to add {} to {}, but it is already present!".format(
-            #         segment, self.id))
             return
-        # assert isinstance(segment[0], int) and isinstance(segment[1], int)
         if self.__expandable is True:
             self.start = min([self.start, start])
             self.end = max([self.end, end])
@@ -1382,10 +1381,6 @@ def get_available_metrics(cls) -> list:
         metrics = [member[0] for member in inspect.getmembers(cls) if
                    "__" not in member[0] and isinstance(cls.__dict__[member[0]], Metric)]
 
-        # metrics = list(x[0] for x in filter(
-        #     lambda y: "__" not in y[0] and isinstance(cls.__dict__[y[0]], Metric),
-        #     inspect.getmembers(cls)))
-        # assert "tid" in metrics and "parent" in metrics and "score" in metrics
         _metrics = sorted([metric for metric in metrics])
         final_metrics = ["tid", "alias", "parent", "original_source", "score"] + _metrics
         return final_metrics
@@ -1552,9 +1547,12 @@ def parent(self, parent):
         :type parent: list
         :type parent: str
         """
-        if isinstance(parent, (list, type(None))):
+
+        _ptype = type(parent)
+
+        if _ptype in (list, type(None)):
             self.__parent = parent
-        elif isinstance(parent, str):
+        elif _ptype is str:
             if "," in parent:
                 self.__parent = parent.split(",")
             else:
@@ -1896,6 +1894,13 @@ def exons(self, *args):
 
         self.__exons = list(sorted(args[0]))
 
+    def _set_exons(self, exons):
+        """Assign ``exons`` directly, bypassing the checks within the direct setter, for speed purposes in finalising."""
+        if self.finalized is True:
+            raise NotImplementedError("I cannot reset the exons in a finalised transcript.")  # refused once finalised
+
+        self.__exons = exons  # stored as given; the exons setter instead stores list(sorted(...))
+
     @property
     def combined_cds_introns(self):
         """This property returns the introns which are located between CDS
@@ -1976,14 +1981,6 @@ def combined_cds(self, combined):
         if ((not isinstance(combined, list)) or
                 any(self.__wrong_combined_entry(comb) for comb in combined)):
             raise TypeError("Invalid value for combined CDS: {0}".format(combined))
-        # if len(combined) > 0:
-        #     if isinstance(combined[0], tuple):
-        #         try:
-        #             combined = [intervaltree.Interval(_[0], _[1]) for _ in combined]
-        #         except IndexError:
-        #             raise IndexError(combined)
-        #     else:
-        #         assert isinstance(combined[0], intervaltree.Interval)
 
         if len(combined) > 0:
             ar = np.array(list(zip(*combined)))
@@ -2282,14 +2279,11 @@ def combined_utr_fraction(self):
     @Metric
     def cdna_length(self):
         """This property returns the length of the transcript."""
-        # try:
-        #     self.__cdna_length = sum([e[1] - e[0] + 1 for e in self.exons])
-        # except TypeError:
-        #     raise TypeError(self.exons)
-        # if self.__cdna_length is None:
-        ar = np.array(list(zip(*self.exons)))
-        self.__cdna_length = int(np.subtract(ar[1], ar[0] -1).sum())
-
+        if self.__cdna_length is None and self.finalized is True:  # finalised but never computed: treated as an error
+            raise AssertionError
+        if self.finalized is False or self.__cdna_length is None:  # recomputed on every access until finalised
+            ar = np.array(list(zip(*self.exons)))  # row 0: first element of each exon, row 1: second element
+            self.__cdna_length = int(np.subtract(ar[1], ar[0] - 1).sum())  # sum over exons of (e[1] - e[0] + 1)
         return self.__cdna_length
 
     cdna_length.category = "cDNA"

diff --git a/Mikado/transcripts/transcript_methods/finalizing.py b/Mikado/transcripts/transcript_methods/finalizing.py
@@ -20,7 +20,9 @@ def __basic_final_checks(transcript):
     :return:
     """
 
-    if len(transcript.exons) == 0:
+    _exons = transcript.exons
+
+    if not _exons:
         if transcript._possibly_without_exons is True:
             transcript.logger.debug("Inferring that %s is a single-exon transcript")
             new_exon = (transcript.start, transcript.end)
@@ -33,41 +35,41 @@ def __basic_final_checks(transcript):
             raise exc
         else:
             # Let us try to derive exons from CDS ...
-            transcript.exons = sorted([tuple([int(exon[0]), int(exon[1])]) for exon in transcript.combined_cds])
+            _exons = sorted([tuple([int(exon[0]), int(exon[1])]) for exon in transcript.combined_cds])
             if len(transcript.combined_utr) == 0:
                 # Enlarge the terminal exons to include the starts
                 if transcript.start is not None:
-                    transcript.exons[0] = (transcript.start, transcript.exons[0][1])
+                    _exons[0] = (transcript.start, _exons[0][1])
                 if transcript.end is not None:
-                    transcript.exons[-1] = (transcript.exons[-1][0], transcript.end)
+                    _exons[-1] = (_exons[-1][0], transcript.end)
             else:
                 __utr = sorted([tuple([int(exon[0]), int(exon[1])]) for exon in transcript.combined_utr])
                 try:
-                    __before = [_ for _ in __utr if _[1] < transcript.exons[0][0]]
+                    __before = [_ for _ in __utr if _[1] < _exons[0][0]]
                 except IndexError:
-                    raise IndexError((__utr, transcript.exons))
-                if __before[-1][1] == transcript.exons[0][0] - 1:
-                    transcript.exons[0] = (__before[-1][0], transcript.exons[0][1])
+                    raise IndexError((__utr, _exons))
+                if __before[-1][1] == _exons[0][0] - 1:
+                    _exons[0] = (__before[-1][0], _exons[0][1])
                     __before.pop()
-                __after = [_ for _ in __utr if _[0] > transcript.exons[-1][1]]
-                if __after[0][0] == transcript.exons[-1][1] + 1:
-                    transcript.exons[-1] = (transcript.exons[-1][0], __after[0][1])
+                __after = [_ for _ in __utr if _[0] > _exons[-1][1]]
+                if __after[0][0] == _exons[-1][1] + 1:
+                    _exons[-1] = (_exons[-1][0], __after[0][1])
                     __after = __after[1:]
-                transcript.exons = __before + transcript.exons + __after
+                _exons = __before + _exons + __after
 
     transcript.logger.debug("Converting to tuples")
-    transcript.exons = [tuple([int(exon[0]), int(exon[1])]) for exon in transcript.exons]
+    _exons = [tuple([int(exon[0]), int(exon[1])]) for exon in _exons]
 
     new_exons = []
-    invalid = False
+    # invalid = False
 
     # Set the start and end automatically if none has been explicitly provided
     if transcript.start is None:
-        transcript.start = min(_[0] for _ in transcript.exons)
+        transcript.start = min(_[0] for _ in _exons)
     if transcript.end is None:
-        transcript.end = max(_[1] for _ in transcript.exons)
+        transcript.end = max(_[1] for _ in _exons)
 
-    for exon in transcript.exons:
+    for exon in _exons:
         if not isinstance(exon, tuple):
             if (isinstance(exon, Interval) or
                     (isinstance(exon, list) and len(exon) == 2 and
@@ -83,7 +85,7 @@ def __basic_final_checks(transcript):
             raise exc
         new_exons.append(exon)
 
-    transcript.exons = sorted(new_exons)
+    transcript._set_exons(sorted(new_exons))
 
     if len(transcript.exons) > 1 and transcript.strand is None:
         if transcript._accept_undefined_multi is False:
@@ -533,70 +535,65 @@ def __check_phase_correctness(transcript):
     :return: Mikado.loci.transcript.Transcript
     """
 
-    if min(len(transcript.segments), len(transcript.internal_orfs)) == 0:
-        transcript.logger.debug("Redefining segments for %s", transcript.id)
+    segments, internal_orfs = transcript.segments, transcript.internal_orfs
+
+    if min(len(segments), len(internal_orfs)) == 0:
+        # transcript.logger.debug("Redefining segments for %s", transcript.id)
         # Define exons
-        transcript.segments = [("exon", tuple([e[0], e[1]]))
-                               for e in transcript.exons]
+        segments = [("exon", tuple([e[0], e[1]])) for e in transcript.exons]
         # Define CDS
-        if len(transcript.internal_orfs) > 0:
-            for orf in transcript.internal_orfs:
+        if len(internal_orfs) > 0:
+            for orf in internal_orfs:
                 for segment in orf:
                     if segment[0] == "exon":
                         continue
                     elif segment[0] == "UTR":
-                        transcript.segments.append(("UTR", (segment[1][0], segment[1][1])))
+                        segments.append(("UTR", (segment[1][0], segment[1][1])))
                     elif segment[0] == "CDS":
-                        transcript.segments.append(("CDS", (segment[1][0], segment[1][1])))
+                        segments.append(("CDS", (segment[1][0], segment[1][1])))
         else:
-            transcript.segments.extend([("CDS", tuple([c[0], c[1]]))
-                                    for c in transcript.combined_cds])
+            segments.extend([("CDS", tuple([c[0], c[1]])) for c in transcript.combined_cds])
         # Define UTR segments
-        transcript.segments.extend([("UTR", tuple([u[0], u[1]]))
-                                    for u in transcript.combined_utr])
+        segments.extend([("UTR", tuple([u[0], u[1]])) for u in transcript.combined_utr])
         # Mix and sort
-        transcript.segments = sorted(transcript.segments, key=operator.itemgetter(1, 0))
+        segments = sorted(segments, key=operator.itemgetter(1, 0))
         # Add to the store as a single entity
-        if not transcript.internal_orfs and any(_[0] == "CDS" for _ in transcript.segments):
-            transcript.internal_orfs = [transcript.segments]
+        if not internal_orfs and any(_[0] == "CDS" for _ in segments):
+            internal_orfs = [segments]
         else:
             transcript.selected_internal_orf_index = None
-    elif len(transcript.internal_orfs) == 0:
+    elif len(internal_orfs) == 0:
         exception = AssertionError("No internal ORF for {}".format(transcript.id))
         transcript.logger.exception(exception)
         raise exception
     else:
-        transcript.logger.debug("Segments and ORFs defined for %s", transcript.id)
+        pass
 
-    transcript.logger.debug("{} has {} internal ORF{}".format(
-        transcript.id, len(transcript.internal_orfs),
-        "s" if len(transcript.internal_orfs) > 1 else ""))
+    transcript.segments, transcript.internal_orfs = segments, internal_orfs
 
     __orfs_to_remove = []
-    for orf_index in range(len(transcript.internal_orfs)):
-        transcript.logger.debug("ORF #%d for %s: %s",
-                                orf_index, transcript.id, transcript.internal_orfs[orf_index])
+    for orf_index in range(len(internal_orfs)):
+        # transcript.logger.debug("ORF #%d for %s: %s",
+        #                         orf_index, transcript.id, transcript.internal_orfs[orf_index])
         try:
-            transcript = __check_internal_orf(transcript,
-                                              orf_index)
+            transcript = __check_internal_orf(transcript, orf_index)
         except (InvalidTranscript, InvalidCDS) as exc:
             transcript.logger.warning("ORF %s of %s is invalid, removing. Reason: %s",
                                       orf_index, transcript.id, exc)
             __orfs_to_remove.append(orf_index)
 
-            # transcript.logger.warning("Stripping the CDS from %s, error: %s",
-            #                           transcript.id, exc)
-    __num_orfs = transcript.number_internal_orfs
+    __num_orfs = len(internal_orfs)
     if (__num_orfs > 0) and (len(__orfs_to_remove) == __num_orfs):
         transcript.logger.warning("Every ORF of %s is invalid, stripping the CDS", transcript.id)
         transcript.strip_cds(strand_specific=True)
     elif len(__orfs_to_remove):
         transcript.logger.warning("Stripping %s of %s ORFs out of %s",
                                   transcript.id, len(__orfs_to_remove), __num_orfs)
         for orf_index in reversed(sorted(__orfs_to_remove)):
-            transcript.internal_orfs.pop(orf_index)
+            internal_orfs.pop(orf_index)
+        transcript.internal_orfs = internal_orfs
     else:
-        transcript.logger.debug("All internal ORFs of %s pass the muster.", transcript.id)
+        pass
 
     if len(transcript.internal_orfs) > 0:
         transcript.selected_internal_orf_index = 0
@@ -748,6 +745,7 @@ def finalize(transcript):
                 transcript.attributes.pop(prop)
 
     # transcript = __calc_cds_introns(transcript)
+    _ = transcript.cdna_length
 
     transcript.finalized = True
     transcript.logger.debug("Finished finalising %s", transcript.id)