This bug fix and associated test should finally close #129. Tomorrow, more testing.
EI-CoreBioinformatics · Oct 3, 2018 · b76b734 · b76b734
1 parent 7a4a075
commit b76b734
Show file tree

Hide file tree

Showing 3 changed files with 113 additions and 24 deletions.
diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py
@@ -579,8 +579,8 @@ def pad_transcripts(self):
         except KeyError:
             raise KeyError(self.json_conf.keys())
 
-        five_graph = self.define_graph(self.transcripts, self.__share_extreme, three_prime=False)
-        three_graph = self.define_graph(self.transcripts, self.__share_extreme, three_prime=True)
+        five_graph = self.define_graph(self.transcripts, self._share_extreme, three_prime=False)
+        three_graph = self.define_graph(self.transcripts, self._share_extreme, three_prime=True)
 
         self.logger.debug("5' graph: %s", five_graph.edges)
         self.logger.debug("3' graph: %s", three_graph.edges)
@@ -618,6 +618,8 @@ def _find_communities_boundaries(self, five_clique, three_clique):
         five_found = set()
 
         __to_modify = dict()
+        if self.strand == "-":
+            five_clique, three_clique = three_clique, five_clique
 
         self.logger.debug("5' communities to uniform: %s", five_clique)
 
@@ -670,7 +672,7 @@ def _find_communities_boundaries(self, five_clique, three_clique):
 
         return __to_modify
 
-    def __share_extreme(self, first, second, three_prime=False):
+    def _share_extreme(self, first, second, three_prime=False):
 
         """
         This function will determine whether two transcripts "overlap" at the 3' or 5' end.
@@ -682,20 +684,22 @@ def __share_extreme(self, first, second, three_prime=False):
         :return:
         """
 
-        if three_prime is False:
+        if (three_prime is False and first.strand in ("+", ".")) or (three_prime is True and first.strand == "-"):
             # 5' case
             first, second = sorted([first, second], key=operator.attrgetter("start"))  # so we know which comes first
             dist = second.start + 1 - first.start
+            assert dist > 0
             splices = len([_ for _ in first.splices if _ <= second.start])
             decision = (dist <= self.ts_distance) and (splices <= self.ts_max_splices)
-
-        else:
-
+        elif (three_prime is False and first.strand == "-") or (three_prime is True and first.strand in ("+", ".")):
             # 3' case
             first, second = sorted([first, second], key=operator.attrgetter("end"))
             dist = second.end + 1 - first.end
+            assert dist > 0
             splices = len([_ for _ in second.splices if _ >= first.end])
             decision = (dist <= self.ts_distance) and (splices <= self.ts_max_splices)
+        else:
+            raise ValueError("Undetermined case")
 
         self.logger.debug("%s and %s do %s overlap (distance %s - max %s, splices %s - max %s)",
                           first.id, second.id, "" if decision else "not",
@@ -860,6 +864,9 @@ def expand_transcript(transcript, new_start, new_end, fai, logger):
         transcript.exons = sorted(transcript.exons)
 
     transcript.finalize()
+    if transcript.strand == "-":
+        downstream, upstream = upstream, downstream
+
     # Now for the difficult part
     if internal_orfs and (new_start or new_end):
         logger.debug("Enlarging the ORFs for TID %s (%s)",
@@ -880,6 +887,8 @@ def expand_transcript(transcript, new_start, new_end, fai, logger):
         for orf in internal_orfs:
             logger.debug("Old ORF: %s", str(orf))
             try:
+                logger.debug("Sequence for %s: %s[..]%s (upstream %s, downstream %s)",
+                             transcript.id, seq[:10], seq[-10:], upstream, downstream)
                 orf.expand(seq, upstream, downstream, expand_orf=True, logger=logger)
             except AssertionError as err:
                 logger.error(err)

diff --git a/Mikado/parsers/bed12.py b/Mikado/parsers/bed12.py
@@ -681,12 +681,12 @@ def expand(self, sequence, upstream, downstream, expand_orf=False, logger=create
         if self.strand == "-":
             raise NotImplementedError("I can only expand ORFs on the sense strand")
 
+        old_sequence = sequence[upstream:len(self) + upstream]
+        assert len(old_sequence) + upstream + downstream == len(sequence)
         self.fasta_length = len(sequence)
 
         # I presume that the sequence is already in the right orientation
 
-        old_sequence = sequence[upstream:len(self) + upstream]
-
         self.start_codon = str(old_sequence[self.thick_start + self.phase -1 :self.thick_start + self.phase + 2]).upper()
         # last_codon_start = self.thick_end + ((self.thick_end - self.thick_start + 1 + self.phase) % 3 - 3)
         self.stop_codon = str(old_sequence[self.thick_end - 3:self.thick_end]).upper()

diff --git a/Mikado/tests/locus_tester.py b/Mikado/tests/locus_tester.py
@@ -1900,23 +1900,103 @@ def test_locus_unpickling(self):
 
 class PaddingTester(unittest.TestCase):
 
+    @staticmethod
+    def load_from_bed(manager, resource):
+        transcripts = dict()
+        with pkg_resources.resource_stream(manager, resource) as bed:
+            for line in bed:
+                line = line.decode()
+                line = BED12(line, coding=True)
+                line.coding = True
+                transcript = Transcript(line)
+                assert transcript.start > 0
+                assert transcript.end > 0
+                assert transcript.is_coding, transcript.format("bed12")
+                transcript.finalize()
+                transcript.verified_introns = transcript.introns
+                transcript.parent = "{}.gene".format(transcript.id)
+                transcripts[transcript.id] = transcript
+        return transcripts
+
+    def test_negative_padding(self):
+        genome = pkg_resources.resource_filename("Mikado.tests", "neg_pad.fa")
+        transcripts = self.load_from_bed("Mikado.tests", "neg_pad.bed12")
+        locus = Mikado.loci.Locus(transcripts['Human_coding_ENSP00000371111.2.m1'])
+        locus.json_conf["reference"]["genome"] = genome
+        for t in transcripts:
+            if t == locus.primary_transcript_id:
+                continue
+            locus.add_transcript_to_locus(transcripts[t])
+
+        self.assertEqual(transcripts['Human_coding_ENSP00000371111.2.m1'].combined_cds_end, 1646)
+        self.assertEqual(transcripts['Human_coding_ENSP00000371111.2.m1'].combined_cds_start, 33976)
+        self.assertEqual(transcripts['Human_coding_ENSP00000371111.2.m1'].combined_cds_end,
+                         transcripts['Human_coding_ENSP00000371111.2.m1'].start)
+        self.assertEqual(transcripts['Human_coding_ENSP00000371111.2.m1'].combined_cds_start,
+                         transcripts['Human_coding_ENSP00000371111.2.m1'].end)
+
+        cds_coordinates = dict()
+        for transcript in locus:
+            cds_coordinates[transcript] = (locus[transcript].combined_cds_start, locus[transcript].combined_cds_end)
+
+        corr = {1: "Human_coding_ENSP00000371111.2.m1", # 1645	33976
+                2: "Mikado_gold_mikado.0G230.1", # 1	34063
+                3: "ACOCA10068_run2_woRNA_ACOCA10068_r3_0032600.1" # 1032	34095
+                }
+
+        for pad_distance, max_splice in zip((130, 700, 1500, 2000), (1, )):
+            with self.subTest(pad_distance=pad_distance, max_splice=max_splice):
+                logger = create_default_logger("logger", level="WARNING")
+                locus.logger = logger
+                locus.json_conf["pick"]["alternative_splicing"]["ts_distance"] = pad_distance
+                locus.json_conf["pick"]["alternative_splicing"]["ts_max_splices"] = max_splice
+                locus.pad_transcripts()
+                for transcript in locus:
+                    self.assertGreater(locus[transcript].combined_cds_length, 0, transcript)
+                    self.assertEqual(locus[transcript].combined_cds_start, cds_coordinates[transcript][0])
+                    self.assertEqual(locus[transcript].combined_cds_end, cds_coordinates[transcript][1])
+                if pad_distance > 120:  # Ends must be uniform
+                    self.assertEqual(locus[corr[1]].end, locus[corr[3]].end,
+                                     ([locus[corr[_]].end for _ in range(1, 4)],
+                                     locus._share_extreme(transcripts[corr[1]],
+                                                          transcripts[corr[2]],
+                                                          three_prime=False))
+                                     )
+                    self.assertEqual(locus[corr[1]].end, locus[corr[2]].end,
+                                     ([locus[corr[_]].end for _ in range(1, 4)],
+                                     locus._share_extreme(transcripts[corr[1]],
+                                                          transcripts[corr[2]],
+                                                          three_prime=False))
+                                     )
+
+                elif pad_distance < 20:
+                    self.assertNotEqual(locus[corr[1]].end, locus[corr[3]].end)
+                    self.assertNotEqual(locus[corr[1]].end, locus[corr[2]].end)
+                    self.assertNotEqual(locus[corr[2]].end, locus[corr[3]].end)
+
+                if pad_distance >= (abs(transcripts[corr[1]].start - transcripts[corr[2]].start)):
+                    self.assertEqual(locus[corr[1]].start,
+                                     locus[corr[2]].start)
+                    self.assertEqual(locus[corr[1]].start,
+                                     locus[corr[3]].start)
+                else:
+
+                    self.assertNotEqual(locus[corr[1]].start, locus[corr[2]].start,
+                                        (abs(transcripts[corr[1]].start - transcripts[corr[2]].start),
+                                         pad_distance,
+                                         locus._share_extreme(transcripts[corr[1]], transcripts[corr[2]],
+                                                              three_prime=True)
+                                        ))
+
+                if pad_distance >= (abs(transcripts[corr[1]].start - transcripts[corr[3]].start)):
+                    self.assertEqual(locus[corr[3]].start,
+                                     locus[corr[1]].start)
+                else:
+                    self.assertNotEqual(locus[corr[3]].start, locus[corr[1]].start)
+
     def test_padding(self):
-        bed = pkg_resources.resource_stream("Mikado.tests", "padding_test.bed12")
         genome = pkg_resources.resource_filename("Mikado.tests", "padding_test.fa")
-        transcripts = dict()
-        for line in bed:
-            line = line.decode()
-            line = BED12(line, coding=True)
-            line.coding = True
-            transcript = Transcript(line)
-            assert transcript.start > 0
-            assert transcript.end > 0
-            assert transcript.is_coding, transcript.format("bed12")
-            assert transcript.strand == "+"
-            transcript.finalize()
-            transcript.verified_introns = transcript.introns
-            transcript.parent = "{}.gene".format(transcript.id)
-            transcripts[transcript.id] = transcript
+        transcripts = self.load_from_bed("Mikado.tests", "padding_test.bed12")
 
         locus = Mikado.loci.Locus(transcripts['mikado.44G2.1'])
         locus.json_conf["reference"]["genome"] = genome