Skip to content

Commit

Permalink
This bug fix and associated test should finally close #129. Tomorrow …
Browse files Browse the repository at this point in the history
…more testing.
  • Loading branch information
lucventurini committed Oct 3, 2018
1 parent 7a4a075 commit b76b734
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 24 deletions.
23 changes: 16 additions & 7 deletions Mikado/loci/locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,8 +579,8 @@ def pad_transcripts(self):
except KeyError:
raise KeyError(self.json_conf.keys())

five_graph = self.define_graph(self.transcripts, self.__share_extreme, three_prime=False)
three_graph = self.define_graph(self.transcripts, self.__share_extreme, three_prime=True)
five_graph = self.define_graph(self.transcripts, self._share_extreme, three_prime=False)
three_graph = self.define_graph(self.transcripts, self._share_extreme, three_prime=True)

self.logger.debug("5' graph: %s", five_graph.edges)
self.logger.debug("3' graph: %s", three_graph.edges)
Expand Down Expand Up @@ -618,6 +618,8 @@ def _find_communities_boundaries(self, five_clique, three_clique):
five_found = set()

__to_modify = dict()
if self.strand == "-":
five_clique, three_clique = three_clique, five_clique

self.logger.debug("5' communities to uniform: %s", five_clique)

Expand Down Expand Up @@ -670,7 +672,7 @@ def _find_communities_boundaries(self, five_clique, three_clique):

return __to_modify

def __share_extreme(self, first, second, three_prime=False):
def _share_extreme(self, first, second, three_prime=False):

"""
This function will determine whether two transcripts "overlap" at the 3' or 5' end.
Expand All @@ -682,20 +684,22 @@ def __share_extreme(self, first, second, three_prime=False):
:return:
"""

if three_prime is False:
if (three_prime is False and first.strand in ("+", ".")) or (three_prime is True and first.strand == "-"):
# 5' case
first, second = sorted([first, second], key=operator.attrgetter("start")) # so we know which comes first
dist = second.start + 1 - first.start
assert dist > 0
splices = len([_ for _ in first.splices if _ <= second.start])
decision = (dist <= self.ts_distance) and (splices <= self.ts_max_splices)

else:

elif (three_prime is False and first.strand == "-") or (three_prime is True and first.strand in ("+", ".")):
# 3' case
first, second = sorted([first, second], key=operator.attrgetter("end"))
dist = second.end + 1 - first.end
assert dist > 0
splices = len([_ for _ in second.splices if _ >= first.end])
decision = (dist <= self.ts_distance) and (splices <= self.ts_max_splices)
else:
raise ValueError("Undetermined case")

self.logger.debug("%s and %s do %s overlap (distance %s - max %s, splices %s - max %s)",
first.id, second.id, "" if decision else "not",
Expand Down Expand Up @@ -860,6 +864,9 @@ def expand_transcript(transcript, new_start, new_end, fai, logger):
transcript.exons = sorted(transcript.exons)

transcript.finalize()
if transcript.strand == "-":
downstream, upstream = upstream, downstream

# Now for the difficult part
if internal_orfs and (new_start or new_end):
logger.debug("Enlarging the ORFs for TID %s (%s)",
Expand All @@ -880,6 +887,8 @@ def expand_transcript(transcript, new_start, new_end, fai, logger):
for orf in internal_orfs:
logger.debug("Old ORF: %s", str(orf))
try:
logger.debug("Sequence for %s: %s[..]%s (upstream %s, downstream %s)",
transcript.id, seq[:10], seq[-10:], upstream, downstream)
orf.expand(seq, upstream, downstream, expand_orf=True, logger=logger)
except AssertionError as err:
logger.error(err)
Expand Down
4 changes: 2 additions & 2 deletions Mikado/parsers/bed12.py
Original file line number Diff line number Diff line change
Expand Up @@ -681,12 +681,12 @@ def expand(self, sequence, upstream, downstream, expand_orf=False, logger=create
if self.strand == "-":
raise NotImplementedError("I can only expand ORFs on the sense strand")

old_sequence = sequence[upstream:len(self) + upstream]
assert len(old_sequence) + upstream + downstream == len(sequence)
self.fasta_length = len(sequence)

# I presume that the sequence is already in the right orientation

old_sequence = sequence[upstream:len(self) + upstream]

self.start_codon = str(old_sequence[self.thick_start + self.phase -1 :self.thick_start + self.phase + 2]).upper()
# last_codon_start = self.thick_end + ((self.thick_end - self.thick_start + 1 + self.phase) % 3 - 3)
self.stop_codon = str(old_sequence[self.thick_end - 3:self.thick_end]).upper()
Expand Down
110 changes: 95 additions & 15 deletions Mikado/tests/locus_tester.py
Original file line number Diff line number Diff line change
Expand Up @@ -1900,23 +1900,103 @@ def test_locus_unpickling(self):

class PaddingTester(unittest.TestCase):

@staticmethod
def load_from_bed(manager, resource):
    """Load the coding transcripts stored in a packaged BED12 resource.

    :param manager: first argument handed to ``pkg_resources.resource_stream``
        (the test suite passes a package name such as ``"Mikado.tests"``).
    :param resource: name of the BED12 resource to read.
    :return: dictionary mapping each transcript ID to its finalized
        ``Transcript`` object.
    """
    loaded = dict()
    with pkg_resources.resource_stream(manager, resource) as handle:
        for raw in handle:
            # The stream yields bytes; BED12 is given the decoded text.
            record = BED12(raw.decode(), coding=True)
            record.coding = True
            transcript = Transcript(record)
            # Sanity checks on the fixture before it is used by a test.
            assert transcript.start > 0
            assert transcript.end > 0
            assert transcript.is_coding, transcript.format("bed12")
            transcript.finalize()
            # Every intron is marked as verified, and each transcript gets
            # its own parent derived from its ID.
            transcript.verified_introns = transcript.introns
            transcript.parent = "{}.gene".format(transcript.id)
            loaded[transcript.id] = transcript
    return loaded

def test_negative_padding(self):
    """Check that padding keeps the CDS intact on the ``neg_pad`` fixture.

    The locus is built from the three transcripts in ``neg_pad.bed12``
    (presumably all on the minus strand, given that the CDS "start" is
    asserted to coincide with the transcript end - TODO confirm against
    the fixture). For every combination of padding distance and maximum
    number of splices the test verifies that:

    * each transcript still has a CDS, with unchanged CDS boundaries;
    * the transcript ends are uniform whenever the distance allows it;
    * the transcript starts are uniform only when the padding distance
      is at least the original distance between the two starts.

    The combinations are generated with ``itertools.product``: ``zip``
    stops at the shortest iterable, which with a single ``max_splice``
    value would silently test only the first padding distance.
    """
    # Function-scope import keeps this method self-contained.
    import itertools

    genome = pkg_resources.resource_filename("Mikado.tests", "neg_pad.fa")
    transcripts = self.load_from_bed("Mikado.tests", "neg_pad.bed12")
    locus = Mikado.loci.Locus(transcripts['Human_coding_ENSP00000371111.2.m1'])
    locus.json_conf["reference"]["genome"] = genome
    for t in transcripts:
        if t == locus.primary_transcript_id:
            continue
        locus.add_transcript_to_locus(transcripts[t])

    self.assertEqual(transcripts['Human_coding_ENSP00000371111.2.m1'].combined_cds_end, 1646)
    self.assertEqual(transcripts['Human_coding_ENSP00000371111.2.m1'].combined_cds_start, 33976)
    self.assertEqual(transcripts['Human_coding_ENSP00000371111.2.m1'].combined_cds_end,
                     transcripts['Human_coding_ENSP00000371111.2.m1'].start)
    self.assertEqual(transcripts['Human_coding_ENSP00000371111.2.m1'].combined_cds_start,
                     transcripts['Human_coding_ENSP00000371111.2.m1'].end)

    # Remember the CDS boundaries before padding: they must never change.
    cds_coordinates = dict()
    for transcript in locus:
        cds_coordinates[transcript] = (locus[transcript].combined_cds_start,
                                       locus[transcript].combined_cds_end)

    corr = {1: "Human_coding_ENSP00000371111.2.m1",  # 1645 33976
            2: "Mikado_gold_mikado.0G230.1",  # 1 34063
            3: "ACOCA10068_run2_woRNA_ACOCA10068_r3_0032600.1"  # 1032 34095
            }

    # product(), not zip(): every padding distance must be exercised.
    for pad_distance, max_splice in itertools.product((130, 700, 1500, 2000), (1, )):
        with self.subTest(pad_distance=pad_distance, max_splice=max_splice):
            logger = create_default_logger("logger", level="WARNING")
            locus.logger = logger
            locus.json_conf["pick"]["alternative_splicing"]["ts_distance"] = pad_distance
            locus.json_conf["pick"]["alternative_splicing"]["ts_max_splices"] = max_splice
            locus.pad_transcripts()
            for transcript in locus:
                self.assertGreater(locus[transcript].combined_cds_length, 0, transcript)
                self.assertEqual(locus[transcript].combined_cds_start, cds_coordinates[transcript][0])
                self.assertEqual(locus[transcript].combined_cds_end, cds_coordinates[transcript][1])

            if pad_distance > 120:  # Ends must be uniform
                # The failure message reports the pair actually compared.
                self.assertEqual(locus[corr[1]].end, locus[corr[3]].end,
                                 ([locus[corr[_]].end for _ in range(1, 4)],
                                  locus._share_extreme(transcripts[corr[1]],
                                                       transcripts[corr[3]],
                                                       three_prime=False))
                                 )
                self.assertEqual(locus[corr[1]].end, locus[corr[2]].end,
                                 ([locus[corr[_]].end for _ in range(1, 4)],
                                  locus._share_extreme(transcripts[corr[1]],
                                                       transcripts[corr[2]],
                                                       three_prime=False))
                                 )

            elif pad_distance < 20:
                self.assertNotEqual(locus[corr[1]].end, locus[corr[3]].end)
                self.assertNotEqual(locus[corr[1]].end, locus[corr[2]].end)
                self.assertNotEqual(locus[corr[2]].end, locus[corr[3]].end)

            if pad_distance >= (abs(transcripts[corr[1]].start - transcripts[corr[2]].start)):
                self.assertEqual(locus[corr[1]].start,
                                 locus[corr[2]].start)
                self.assertEqual(locus[corr[1]].start,
                                 locus[corr[3]].start)
            else:
                self.assertNotEqual(locus[corr[1]].start, locus[corr[2]].start,
                                    (abs(transcripts[corr[1]].start - transcripts[corr[2]].start),
                                     pad_distance,
                                     locus._share_extreme(transcripts[corr[1]], transcripts[corr[2]],
                                                          three_prime=True)
                                     ))

            if pad_distance >= (abs(transcripts[corr[1]].start - transcripts[corr[3]].start)):
                self.assertEqual(locus[corr[3]].start,
                                 locus[corr[1]].start)
            else:
                self.assertNotEqual(locus[corr[3]].start, locus[corr[1]].start)

def test_padding(self):
bed = pkg_resources.resource_stream("Mikado.tests", "padding_test.bed12")
genome = pkg_resources.resource_filename("Mikado.tests", "padding_test.fa")
transcripts = dict()
for line in bed:
line = line.decode()
line = BED12(line, coding=True)
line.coding = True
transcript = Transcript(line)
assert transcript.start > 0
assert transcript.end > 0
assert transcript.is_coding, transcript.format("bed12")
assert transcript.strand == "+"
transcript.finalize()
transcript.verified_introns = transcript.introns
transcript.parent = "{}.gene".format(transcript.id)
transcripts[transcript.id] = transcript
transcripts = self.load_from_bed("Mikado.tests", "padding_test.bed12")

locus = Mikado.loci.Locus(transcripts['mikado.44G2.1'])
locus.json_conf["reference"]["genome"] = genome
Expand Down

0 comments on commit b76b734

Please sign in to comment.