From b76b734ec83beaa5de3f3490b4485470edff9e1b Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 3 Oct 2018 02:17:31 +0100 Subject: [PATCH] This bug fix and associated test should finally close #129. Tomorrow more testing. --- Mikado/loci/locus.py | 23 +++++--- Mikado/parsers/bed12.py | 4 +- Mikado/tests/locus_tester.py | 110 ++++++++++++++++++++++++++++++----- 3 files changed, 113 insertions(+), 24 deletions(-) diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py index da9a6ef31..a12f3215f 100644 --- a/Mikado/loci/locus.py +++ b/Mikado/loci/locus.py @@ -579,8 +579,8 @@ def pad_transcripts(self): except KeyError: raise KeyError(self.json_conf.keys()) - five_graph = self.define_graph(self.transcripts, self.__share_extreme, three_prime=False) - three_graph = self.define_graph(self.transcripts, self.__share_extreme, three_prime=True) + five_graph = self.define_graph(self.transcripts, self._share_extreme, three_prime=False) + three_graph = self.define_graph(self.transcripts, self._share_extreme, three_prime=True) self.logger.debug("5' graph: %s", five_graph.edges) self.logger.debug("3' graph: %s", three_graph.edges) @@ -618,6 +618,8 @@ def _find_communities_boundaries(self, five_clique, three_clique): five_found = set() __to_modify = dict() + if self.strand == "-": + five_clique, three_clique = three_clique, five_clique self.logger.debug("5' communities to uniform: %s", five_clique) @@ -670,7 +672,7 @@ def _find_communities_boundaries(self, five_clique, three_clique): return __to_modify - def __share_extreme(self, first, second, three_prime=False): + def _share_extreme(self, first, second, three_prime=False): """ This function will determine whether two transcripts "overlap" at the 3' or 5' end. @@ -682,20 +684,22 @@ def __share_extreme(self, first, second, three_prime=False): :return: """ - if three_prime is False: + if (three_prime is False and first.strand in ("+", ".")) or (three_prime is True and first.strand == "-"): # 5' case first, second = sorted([first, second], key=operator.attrgetter("start")) # so we know which comes first dist = second.start + 1 - first.start + assert dist > 0 splices = len([_ for _ in first.splices if _ <= second.start]) decision = (dist <= self.ts_distance) and (splices <= self.ts_max_splices) - - else: - + elif (three_prime is False and first.strand == "-") or (three_prime is True and first.strand in ("+", ".")): # 3' case first, second = sorted([first, second], key=operator.attrgetter("end")) dist = second.end + 1 - first.end + assert dist > 0 splices = len([_ for _ in second.splices if _ >= first.end]) decision = (dist <= self.ts_distance) and (splices <= self.ts_max_splices) + else: + raise ValueError("Undetermined case") self.logger.debug("%s and %s do %s overlap (distance %s - max %s, splices %s - max %s)", first.id, second.id, "" if decision else "not", @@ -860,6 +864,9 @@ def expand_transcript(transcript, new_start, new_end, fai, logger): transcript.exons = sorted(transcript.exons) transcript.finalize() + if transcript.strand == "-": + downstream, upstream = upstream, downstream + # Now for the difficult part if internal_orfs and (new_start or new_end): logger.debug("Enlarging the ORFs for TID %s (%s)", @@ -880,6 +887,8 @@ def expand_transcript(transcript, new_start, new_end, fai, logger): for orf in internal_orfs: logger.debug("Old ORF: %s", str(orf)) try: + logger.debug("Sequence for %s: %s[..]%s (upstream %s, downstream %s)", + transcript.id, seq[:10], seq[-10:], upstream, downstream) orf.expand(seq, upstream, downstream, expand_orf=True, logger=logger) except AssertionError as err: logger.error(err) diff --git a/Mikado/parsers/bed12.py b/Mikado/parsers/bed12.py index 0b1ca25be..23deab07b 100644 --- a/Mikado/parsers/bed12.py +++ b/Mikado/parsers/bed12.py @@ -681,12 +681,12 @@ def expand(self, sequence, upstream, downstream, expand_orf=False, logger=create if self.strand == "-": raise NotImplementedError("I can only expand ORFs on the sense strand") + old_sequence = sequence[upstream:len(self) + upstream] + assert len(old_sequence) + upstream + downstream == len(sequence) self.fasta_length = len(sequence) # I presume that the sequence is already in the right orientation - old_sequence = sequence[upstream:len(self) + upstream] - self.start_codon = str(old_sequence[self.thick_start + self.phase -1 :self.thick_start + self.phase + 2]).upper() # last_codon_start = self.thick_end + ((self.thick_end - self.thick_start + 1 + self.phase) % 3 - 3) self.stop_codon = str(old_sequence[self.thick_end - 3:self.thick_end]).upper() diff --git a/Mikado/tests/locus_tester.py b/Mikado/tests/locus_tester.py index 475e0f19b..a29d2be4d 100644 --- a/Mikado/tests/locus_tester.py +++ b/Mikado/tests/locus_tester.py @@ -1900,23 +1900,103 @@ def test_locus_unpickling(self): class PaddingTester(unittest.TestCase): + @staticmethod + def load_from_bed(manager, resource): + transcripts = dict() + with pkg_resources.resource_stream(manager, resource) as bed: + for line in bed: + line = line.decode() + line = BED12(line, coding=True) + line.coding = True + transcript = Transcript(line) + assert transcript.start > 0 + assert transcript.end > 0 + assert transcript.is_coding, transcript.format("bed12") + transcript.finalize() + transcript.verified_introns = transcript.introns + transcript.parent = "{}.gene".format(transcript.id) + transcripts[transcript.id] = transcript + return transcripts + + def test_negative_padding(self): + genome = pkg_resources.resource_filename("Mikado.tests", "neg_pad.fa") + transcripts = self.load_from_bed("Mikado.tests", "neg_pad.bed12") + locus = Mikado.loci.Locus(transcripts['Human_coding_ENSP00000371111.2.m1']) + locus.json_conf["reference"]["genome"] = genome + for t in transcripts: + if t == locus.primary_transcript_id: + continue + locus.add_transcript_to_locus(transcripts[t]) + + self.assertEqual(transcripts['Human_coding_ENSP00000371111.2.m1'].combined_cds_end, 1646) + self.assertEqual(transcripts['Human_coding_ENSP00000371111.2.m1'].combined_cds_start, 33976) + self.assertEqual(transcripts['Human_coding_ENSP00000371111.2.m1'].combined_cds_end, + transcripts['Human_coding_ENSP00000371111.2.m1'].start) + self.assertEqual(transcripts['Human_coding_ENSP00000371111.2.m1'].combined_cds_start, + transcripts['Human_coding_ENSP00000371111.2.m1'].end) + + cds_coordinates = dict() + for transcript in locus: + cds_coordinates[transcript] = (locus[transcript].combined_cds_start, locus[transcript].combined_cds_end) + + corr = {1: "Human_coding_ENSP00000371111.2.m1", # 1645 33976 + 2: "Mikado_gold_mikado.0G230.1", # 1 34063 + 3: "ACOCA10068_run2_woRNA_ACOCA10068_r3_0032600.1" # 1032 34095 + } + + for pad_distance, max_splice in zip((130, 700, 1500, 2000), (1, )): + with self.subTest(pad_distance=pad_distance, max_splice=max_splice): + logger = create_default_logger("logger", level="WARNING") + locus.logger = logger + locus.json_conf["pick"]["alternative_splicing"]["ts_distance"] = pad_distance + locus.json_conf["pick"]["alternative_splicing"]["ts_max_splices"] = max_splice + locus.pad_transcripts() + for transcript in locus: + self.assertGreater(locus[transcript].combined_cds_length, 0, transcript) + self.assertEqual(locus[transcript].combined_cds_start, cds_coordinates[transcript][0]) + self.assertEqual(locus[transcript].combined_cds_end, cds_coordinates[transcript][1]) + if pad_distance > 120: # Ends must be uniform + self.assertEqual(locus[corr[1]].end, locus[corr[3]].end, + ([locus[corr[_]].end for _ in range(1, 4)], + locus._share_extreme(transcripts[corr[1]], + transcripts[corr[2]], + three_prime=False)) + ) + self.assertEqual(locus[corr[1]].end, locus[corr[2]].end, + ([locus[corr[_]].end for _ in range(1, 4)], + locus._share_extreme(transcripts[corr[1]], + transcripts[corr[2]], + three_prime=False)) + ) + + elif pad_distance < 20: + self.assertNotEqual(locus[corr[1]].end, locus[corr[3]].end) + self.assertNotEqual(locus[corr[1]].end, locus[corr[2]].end) + self.assertNotEqual(locus[corr[2]].end, locus[corr[3]].end) + + if pad_distance >= (abs(transcripts[corr[1]].start - transcripts[corr[2]].start)): + self.assertEqual(locus[corr[1]].start, + locus[corr[2]].start) + self.assertEqual(locus[corr[1]].start, + locus[corr[3]].start) + else: + + self.assertNotEqual(locus[corr[1]].start, locus[corr[2]].start, + (abs(transcripts[corr[1]].start - transcripts[corr[2]].start), + pad_distance, + locus._share_extreme(transcripts[corr[1]], transcripts[corr[2]], + three_prime=True) + )) + + if pad_distance >= (abs(transcripts[corr[1]].start - transcripts[corr[3]].start)): + self.assertEqual(locus[corr[3]].start, + locus[corr[1]].start) + else: + self.assertNotEqual(locus[corr[3]].start, locus[corr[1]].start) + def test_padding(self): - bed = pkg_resources.resource_stream("Mikado.tests", "padding_test.bed12") genome = pkg_resources.resource_filename("Mikado.tests", "padding_test.fa") - transcripts = dict() - for line in bed: - line = line.decode() - line = BED12(line, coding=True) - line.coding = True - transcript = Transcript(line) - assert transcript.start > 0 - assert transcript.end > 0 - assert transcript.is_coding, transcript.format("bed12") - assert transcript.strand == "+" - transcript.finalize() - transcript.verified_introns = transcript.introns - transcript.parent = "{}.gene".format(transcript.id) - transcripts[transcript.id] = transcript + transcripts = self.load_from_bed("Mikado.tests", "padding_test.bed12") locus = Mikado.loci.Locus(transcripts['mikado.44G2.1']) locus.json_conf["reference"]["genome"] = genome