Skip to content

Commit

Permalink
Fixed the padding (EI-CoreBioinformatics#142) with some new tests as …
Browse files Browse the repository at this point in the history
  • Loading branch information
lucventurini committed Nov 9, 2018
1 parent 3897746 commit 111e1cc
Show file tree
Hide file tree
Showing 3 changed files with 232 additions and 29 deletions.
78 changes: 50 additions & 28 deletions Mikado/loci/locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -941,8 +941,8 @@ def ts_max_splices(self):


def expand_transcript(transcript: Transcript,
start_transcript: Transcript,
end_transcript: Transcript,
start_transcript: [Transcript, bool],
end_transcript: [Transcript, bool],
fai, logger):

# If there is nothing to do, just get out
Expand Down Expand Up @@ -975,6 +975,8 @@ def expand_transcript(transcript: Transcript,
downstream = 0
up_exons = []
down_exons = []
new_first_exon = None # We have to use it for monoexonic transcripts

if start_transcript:
transcript.start = start_transcript.start
upstream_exons = sorted([ _ for _ in
Expand All @@ -989,24 +991,26 @@ def expand_transcript(transcript: Transcript,
transcript.id, start_transcript.id)

if intersecting_upstream[0].value == "exon":
new_exon = (min(intersecting_upstream[0][0], backup.start),
transcript.exons[0][1])
if new_exon != transcript.exons[0]:
upstream += backup.start - new_exon[0]
up_exons.append(new_exon)
new_first_exon = (min(intersecting_upstream[0][0], backup.start),
transcript.exons[0][1])
if new_first_exon != transcript.exons[0]:
upstream += backup.start - new_first_exon[0]
up_exons.append(new_first_exon)
else:
new_first_exon = None
if intersecting_upstream[0] in upstream_exons:
upstream_exons.remove(intersecting_upstream[0])
upstream += sum(_[1] - _[0] + 1 for _ in upstream_exons)
up_exons.extend(upstream_exons)
up_exons.extend([(_[0], _[1]) for _ in upstream_exons])
elif intersecting_upstream[0].value == "intron":
# Now we have to expand until the first exon in the upstream_exons
upstream_exon = upstream_exons[-1]
new_exon = (upstream_exon[0], transcript.exons[0][1])
new_first_exon = (upstream_exon[0], transcript.exons[0][1])
upstream_exons.remove(upstream_exon)
upstream += backup.start - new_exon[0]
upstream += backup.start - new_first_exon[0]
upstream += sum(_[1] - _[0] + 1 for _ in upstream_exons)
up_exons.extend(upstream_exons)
up_exons.append(new_exon)
up_exons.extend([(_[0], _[1]) for _ in upstream_exons])
up_exons.append(new_first_exon)

if end_transcript:

Expand All @@ -1022,10 +1026,19 @@ def expand_transcript(transcript: Transcript,
transcript.id, start_transcript.id)

if intersecting_downstream[-1].value == "exon":
new_exon = (transcript.exons[-1][0], max(intersecting_downstream[-1][1], transcript.exons[-1][1]))
if new_exon != transcript.exons[-1]:
downstream += new_exon[1] - backup.end
down_exons.append(new_exon)
if transcript.monoexonic and new_first_exon is not None:
new_exon = (new_first_exon[0], max(intersecting_downstream[-1][1], new_first_exon[1]))

if new_exon != new_first_exon:
up_exons.remove(new_first_exon)
downstream += new_exon[1] - backup.end
down_exons.append(new_exon)
else:
new_exon = (transcript.exons[-1][0], max(intersecting_downstream[-1][1], transcript.exons[-1][1]))
if new_exon != transcript.exons[-1]:
downstream += new_exon[1] - backup.end
down_exons.append(new_exon)

if intersecting_downstream[-1] in downstream_exons:
downstream_exons.remove(intersecting_downstream[-1])
downstream += sum(_[1] - _[0] + 1 for _ in downstream_exons)
Expand All @@ -1035,7 +1048,11 @@ def expand_transcript(transcript: Transcript,
downstream_exon = downstream_exons[0]
assert downstream_exon[1] > backup.end
assert downstream_exon[0] > backup.end
new_exon = (transcript.exons[-1][0], downstream_exon[1])
if transcript.monoexonic and new_first_exon is not None:
new_exon = (new_first_exon[0], downstream_exon[1])
up_exons.remove(new_first_exon)
else:
new_exon = (transcript.exons[-1][0], downstream_exon[1])
downstream_exons.remove(downstream_exon)
downstream += new_exon[1] - backup.end
downstream += sum(_[1] - _[0] + 1 for _ in downstream_exons)
Expand All @@ -1044,8 +1061,7 @@ def expand_transcript(transcript: Transcript,

first_exon, last_exon = transcript.exons[0], transcript.exons[-1]

assert upstream >= 0
assert downstream >= 0
assert upstream >= 0 and downstream >= 0

if upstream > 0:
# Remove the first exon
Expand Down Expand Up @@ -1082,15 +1098,21 @@ def expand_transcript(transcript: Transcript,
raise InvalidTranscript(error)
seq = TranscriptChecker(transcript, genome_seq, is_reference=True).cdna
assert len(seq) == transcript.cdna_length, (len(seq), transcript.cdna_length, transcript.exons)
assert len(seq) == backup.cdna_length + upstream + downstream, (
len(seq), backup.cdna_length + upstream + downstream,
backup.cdna_length, upstream, downstream,
(transcript.start, transcript.end), (backup.start, backup.end),
(None if not start_transcript else start_transcript.start,
None if not end_transcript else end_transcript.end),
set.difference(set(transcript.exons), set(backup.exons)),
set.difference(set(backup.exons), set(transcript.exons))
)
if not len(seq) == backup.cdna_length + upstream + downstream:

error = [len(seq), backup.cdna_length + upstream + downstream,
backup.cdna_length, upstream, downstream,
(transcript.start, transcript.end), (backup.id, backup.start, backup.end),
(None if not start_transcript else (start_transcript.id, start_transcript.end)),
(None if not end_transcript else (end_transcript.id, end_transcript.end)),
(backup.exons,
None if not start_transcript else start_transcript.exons,
None if not end_transcript else end_transcript.exons),
set.difference(set(transcript.exons), set(backup.exons)),
set.difference(set(backup.exons), set(transcript.exons))
]
error = "\n".join([str(_) for _ in error])
raise AssertionError(error)

for orf in internal_orfs:
logger.debug("Old ORF: %s", str(orf))
Expand Down
178 changes: 178 additions & 0 deletions Mikado/tests/locus_tester.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from Mikado.parsers import GFF # ,GTF, bed12
from Mikado.parsers.GTF import GtfLine
from Mikado.loci import Transcript, Superlocus, Abstractlocus, Locus, Monosublocus, MonosublocusHolder, Sublocus
from Mikado.loci.locus import expand_transcript
from Mikado.utilities.log_utils import create_null_logger, create_default_logger
from Mikado.utilities import overlap
import itertools
Expand All @@ -21,6 +22,9 @@
import pickle
import inspect
from Mikado.parsers.bed12 import BED12
import tempfile
import gzip
import pyfaidx
# from Mikado.scales.contrast import compare as c_compare


Expand Down Expand Up @@ -2120,6 +2124,18 @@ def test_locus_unpickling(self):

class PaddingTester(unittest.TestCase):

@classmethod
def setUpClass(cls):
    """Unpack the bundled chr5 genome into a temporary FASTA and index it.

    The gzipped ``chr5.fas.gz`` resource shipped in ``Mikado.tests`` is
    decompressed into a named temporary ``.fa`` file, which is then opened
    with ``pyfaidx.Fasta`` and exposed to the tests as ``cls.fai``.
    """
    # Placeholder first, so the attribute exists even if creating the
    # temporary file raises.
    cls.__genomefile__ = None

    # delete=False: the file is not removed when the handle is closed.
    # NOTE(review): no removal of this file is visible in this part of the
    # class - confirm that it is cleaned up elsewhere.
    genome_handle = tempfile.NamedTemporaryFile(
        mode="wb", delete=False, suffix=".fa", prefix="prepare")
    cls.__genomefile__ = genome_handle

    with pkg_resources.resource_stream("Mikado.tests", "chr5.fas.gz") as compressed:
        genome_handle.write(gzip.decompress(compressed.read()))
    # Push the bytes to disk before pyfaidx opens the file by name.
    genome_handle.flush()
    cls.fai = pyfaidx.Fasta(genome_handle.name)


@staticmethod
def load_from_bed(manager, resource):
transcripts = dict()
Expand Down Expand Up @@ -2312,6 +2328,168 @@ def test_padding_noncoding(self):
self.assertNotEqual(locus["mikado.44G2.4"].end, locus["mikado.44G2.5"].end)
self.assertFalse(locus["mikado.44G2.1"].attributes.get("padded", False))

def test_pad_monoexonic(self):
    """Pad a monoexonic transcript against three different templates.

    The transcript under test is a single exon, (2001, 3000), on Chr5 "+".
    It is expanded against, in turn:

    * a two-exon template overlapping both of its ends;
    * a monoexonic template containing it completely;
    * a three-exon template whose middle exon contains it, so that padding
      is expected to also bring in the flanking template exons.

    For each template three cases are run: pad the start only (case 0),
    pad the end only (case 1), pad both ends (case 2). A padded end must
    match the template; an end that is not padded must keep the original
    coordinate.
    """

    transcript = Transcript()
    transcript.chrom, transcript.strand, transcript.id = "Chr5", "+", "mono.1"
    transcript.add_exons([(2001, 3000)])
    transcript.finalize()
    # Pristine copy: every expectation about an *unchanged* end is checked
    # against this object. expand_transcript assigns to the transcript it
    # receives, so comparing the result with that same transcript could
    # never detect an unwanted change.
    backup = transcript.deepcopy()
    logger = create_null_logger("test_pad_monoexonic")

    # (template exons, upstream-only exon, downstream-only exon).
    # The last two are exons found only in the template that must be present
    # after padding that side and absent otherwise; None means "no check".
    scenarios = (
        (((1931, 2500), (2701, 3500)), None, None),
        (((1931, 3500),), None, None),
        (((1501, 1700), (1931, 3500), (4001, 5000)), (1501, 1700), (4001, 5000)),
    )
    # case 0: start only; case 1: end only; case 2: both ends.
    sides = ((True, False), (False, True), (True, True))

    for template_exons, upstream_exon, downstream_exon in scenarios:
        template = Transcript()
        template.chrom, template.strand, template.id = "Chr5", "+", "multi.1"
        template.add_exons(list(template_exons))
        template.finalize()

        for case, (pad_start, pad_end) in enumerate(sides):
            # The template is part of the label so that a failure can be
            # attributed to a specific scenario.
            with self.subTest(template=template_exons, case=case):
                candidate = backup.deepcopy()
                start = template if pad_start else False
                end = template if pad_end else False

                expanded_one = expand_transcript(candidate,
                                                 start, end, self.fai, logger=logger)
                if start:
                    self.assertEqual(expanded_one.start, template.start)
                    if upstream_exon is not None:
                        self.assertIn(upstream_exon, expanded_one.exons)
                else:
                    self.assertEqual(expanded_one.start, backup.start)
                    if upstream_exon is not None:
                        self.assertNotIn(upstream_exon, expanded_one.exons)
                if end:
                    self.assertEqual(expanded_one.end, template.end)
                    if downstream_exon is not None:
                        self.assertIn(downstream_exon, expanded_one.exons)
                else:
                    self.assertEqual(expanded_one.end, backup.end)
                    if downstream_exon is not None:
                        self.assertNotIn(downstream_exon, expanded_one.exons)

def test_pad_multiexonic(self):
    """Pad a two-exon transcript against three different templates.

    The transcript under test has exons (2001, 2400) and (2800, 3000) on
    Chr5 "+". It is expanded against, in turn:

    * a two-exon template overlapping both of its ends;
    * a monoexonic template spanning it completely;
    * a three-exon template whose middle exon spans it, so that padding is
      expected to also bring in the flanking template exons.

    For each template three cases are run: pad the start only (case 0),
    pad the end only (case 1), pad both ends (case 2). A padded end must
    match the template; an end that is not padded must keep the original
    coordinate.
    """

    transcript = Transcript()
    # NOTE(review): the id says "mono" although the transcript has two exons;
    # it is kept as-is because it is only a label.
    transcript.chrom, transcript.strand, transcript.id = "Chr5", "+", "mono.1"
    transcript.add_exons([(2001, 2400), (2800, 3000)])
    transcript.finalize()
    # Pristine copy: every expectation about an *unchanged* end is checked
    # against this object. expand_transcript assigns to the transcript it
    # receives, so comparing the result with that same transcript could
    # never detect an unwanted change.
    backup = transcript.deepcopy()
    # NOTE(review): the logger name matches the monoexonic test; left
    # untouched, presumably a copy-paste leftover.
    logger = create_null_logger("test_pad_monoexonic")

    # (template exons, upstream-only exon, downstream-only exon).
    # The last two are exons found only in the template that must be present
    # after padding that side and absent otherwise; None means "no check".
    scenarios = (
        (((1931, 2500), (2701, 3500)), None, None),
        (((1931, 3500),), None, None),
        (((1501, 1700), (1931, 3500), (4001, 5000)), (1501, 1700), (4001, 5000)),
    )
    # case 0: start only; case 1: end only; case 2: both ends.
    sides = ((True, False), (False, True), (True, True))

    for template_exons, upstream_exon, downstream_exon in scenarios:
        template = Transcript()
        template.chrom, template.strand, template.id = "Chr5", "+", "multi.1"
        template.add_exons(list(template_exons))
        template.finalize()

        for case, (pad_start, pad_end) in enumerate(sides):
            # The template is part of the label so that a failure can be
            # attributed to a specific scenario.
            with self.subTest(template=template_exons, case=case):
                candidate = backup.deepcopy()
                start = template if pad_start else False
                end = template if pad_end else False

                expanded_one = expand_transcript(candidate,
                                                 start, end, self.fai, logger=logger)
                if start:
                    self.assertEqual(expanded_one.start, template.start)
                    if upstream_exon is not None:
                        self.assertIn(upstream_exon, expanded_one.exons)
                else:
                    self.assertEqual(expanded_one.start, backup.start)
                    if upstream_exon is not None:
                        self.assertNotIn(upstream_exon, expanded_one.exons)
                if end:
                    self.assertEqual(expanded_one.end, template.end)
                    if downstream_exon is not None:
                        self.assertIn(downstream_exon, expanded_one.exons)
                else:
                    self.assertEqual(expanded_one.end, backup.end)
                    if downstream_exon is not None:
                        self.assertNotIn(downstream_exon, expanded_one.exons)


if __name__ == '__main__':
unittest.main(verbosity=2)
5 changes: 4 additions & 1 deletion Mikado/transcripts/transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,10 @@ def add_exon(self, gffline, feature=None, phase=None):

if isinstance(gffline, (tuple, list)):
assert len(gffline) == 2
start, end = sorted(gffline)
try:
start, end = sorted(gffline)
except TypeError:
raise TypeError((gffline, type(gffline)))
if feature is None:
feature = "exon"
elif isinstance(gffline, intervaltree.Interval):
Expand Down

0 comments on commit 111e1cc

Please sign in to comment.