Skip to content

Commit

Permalink
For EI-CoreBioinformatics#280: BED12 objects now use numpy arrays rather than simple lists.
Browse files Browse the repository at this point in the history
Also, properties are now used to avoid recalculating the invalidity of a BED12 object over and over again. This should lead to some further speed improvements.
  • Loading branch information
lucventurini committed Mar 3, 2020
1 parent b79bb90 commit 8ca4b86
Show file tree
Hide file tree
Showing 8 changed files with 179 additions and 46 deletions.
179 changes: 157 additions & 22 deletions Mikado/parsers/bed12.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import numpy
import os
from fastnumbers import fast_int, fast_float
from fastnumbers import fast_int, fast_float, isint
from Bio import Seq
import Bio.SeqRecord
from . import Parser
Expand Down Expand Up @@ -294,14 +294,15 @@ def __init__(self, *args: Union[str, list, tuple, GffLine],
# If >=1, i.e. at least one internal stop codon, the ORF is invalid
self._internal_stop_codons = 0
self.chrom = None
self.start = self.end = self.thick_start = self.thick_end = 0
self.__start = self.__end = self.__thick_start = self.__thick_end = 0
self.name = ""
self.score = 0
self.strand = None
self.rgb = ''
self.block_sizes = [0]
self.block_starts = [0]
self.block_count = 1
self.__block_sizes = np.zeros(1, dtype=np.integer)
self.__block_starts = np.zeros(1, dtype=np.integer)
self.__block_count = 1
self.__invalid = None
self.invalid_reason = None
self.fasta_length = None
self.__in_index = True
Expand Down Expand Up @@ -552,15 +553,18 @@ def __check_validity(self, transcriptomic, fasta_index, sequence):
:return:
"""

if transcriptomic is True:
self.has_start_codon = False
self.has_stop_codon = False
del self.invalid

if transcriptomic is True and self.coding is True:
if not (fasta_index is not None or sequence is not None):
self.logger.debug("No check on the validity of %s", self.chrom)
self.logger.debug("No further check on the validity of %s as no sequence has been provided.",
self.chrom)
return

if transcriptomic is True:
self.has_start_codon = False
self.has_stop_codon = False

if transcriptomic is True and self.coding is True and (fasta_index is not None or sequence is not None):
self.logger.debug("Starting to check the validity of %s", self.chrom)
self.validity_checked = True
Expand All @@ -584,7 +588,7 @@ def __check_validity(self, transcriptomic, fasta_index, sequence):

assert isinstance(sequence, str)
# Just double check that the sequence length is the same as what the BED would suggest
if self.invalid is True:
if self.__is_invalid() is True:
self.logger.debug("%s is invalid (%s)", self.chrom, self.invalid_reason)
self.coding = False
return
Expand Down Expand Up @@ -639,7 +643,8 @@ def __check_validity(self, transcriptomic, fasta_index, sequence):
gap='N')

self._internal_stop_codons = str(translated_seq).count("*")
if self.invalid is True:
del self.invalid
if self.__is_invalid() is True:
return

def _adjust_start(self, sequence, orf_sequence):
Expand Down Expand Up @@ -718,6 +723,7 @@ def _adjust_start(self, sequence, orf_sequence):
self.chrom, self.end, self.thick_end, self.thick_start)
self.phase = 0

del self.invalid
if self.invalid:
self.logger.debug("%s is not coding after checking. Reason: %s", self.chrom, self.invalid_reason)
self.coding = False
Expand Down Expand Up @@ -953,6 +959,18 @@ def invalid(self):
:rtype bool
"""

if self.__invalid is None:
self.__invalid = self.__is_invalid()

return self.__invalid

@invalid.deleter
def invalid(self):
# Deleter of the cached ``invalid`` property: it clears the stored result
# (sets it back to None) so that the next read of ``invalid`` recomputes it.
self.__invalid = None


def __is_invalid(self):

if self._internal_stop_codons >= 1:
self.invalid_reason = "{} internal stop codons found".format(self._internal_stop_codons)
return True
Expand Down Expand Up @@ -1029,6 +1047,70 @@ def transcriptomic(self, value):
elif not value:
self.phase = None

@property
def start(self):
    """Start coordinate of the BED12 object.

    :rtype: int
    """
    return self.__start

@start.setter
def start(self, value):
    """Set the start coordinate.

    :param value: a numpy integer scalar, or any value for which ``isint`` returns True.
    :raises ValueError: if ``value`` is not an integer.
    """
    if not isint(value) and not isinstance(value, np.integer):
        raise ValueError("Start must be an integer, not {}! Value: {}".format(type(value), value))
    self.__start = fast_int(value)
    # Drop the cached validity result so that it is recomputed on next access.
    del self.invalid

@start.deleter
def start(self):
    """Reset the start coordinate to 0 and drop the cached validity result."""
    self.__start = 0
    del self.invalid

@property
def end(self):
    """End coordinate of the BED12 object.

    :rtype: int
    """
    return self.__end

@end.setter
def end(self, value):
    """Set the end coordinate.

    :param value: a numpy integer scalar, or any value for which ``isint`` returns True.
    :raises ValueError: if ``value`` is not an integer.
    """
    if not isint(value) and not isinstance(value, np.integer):
        raise ValueError("End must be an integer, not {}! Value: {}".format(type(value), value))
    self.__end = fast_int(value)
    # Drop the cached validity result so that it is recomputed on next access.
    del self.invalid

@end.deleter
def end(self):
    """Reset the end coordinate to 0 and drop the cached validity result."""
    self.__end = 0
    del self.invalid

@property
def thick_start(self):
    """Thick start coordinate of the BED12 object.

    :rtype: int
    """
    return self.__thick_start

@thick_start.setter
def thick_start(self, value):
    """Set the thick start coordinate.

    :param value: a numpy integer scalar, or any value for which ``isint`` returns True.
    :raises ValueError: if ``value`` is not an integer.
    """
    if not isint(value) and not isinstance(value, np.integer):
        raise ValueError("Thick start must be an integer, not {}! Value: {}".format(type(value), value))
    self.__thick_start = fast_int(value)
    # Drop the cached validity result so that it is recomputed on next access.
    del self.invalid

@thick_start.deleter
def thick_start(self):
    """Reset the thick start coordinate to 0 and drop the cached validity result."""
    self.__thick_start = 0
    del self.invalid

@property
def thick_end(self):
    """Thick end coordinate of the BED12 object.

    :rtype: int
    """
    return self.__thick_end

@thick_end.setter
def thick_end(self, value):
    """Set the thick end coordinate.

    :param value: a numpy integer scalar, or any value for which ``isint`` returns True.
    :raises ValueError: if ``value`` is not an integer.
    """
    if not isint(value) and not isinstance(value, np.integer):
        # Same diagnostic detail (offending type and value) as the ``end`` setter.
        raise ValueError("Thick end must be an integer, not {}! Value: {}".format(type(value), value))
    self.__thick_end = fast_int(value)
    # Drop the cached validity result so that it is recomputed on next access.
    del self.invalid

@thick_end.deleter
def thick_end(self):
    """Reset the thick end coordinate to 0 and drop the cached validity result."""
    self.__thick_end = 0
    del self.invalid

@property
def phase(self):
"""This property is used for transcriptomic BED objects
Expand All @@ -1050,8 +1132,61 @@ def phase(self, val):
self.name, val))
elif self.transcriptomic is True and val not in (0, 1, 2):
raise ValueError("A transcriptomic BED cannot have null frame.")
del self.invalid
self.__phase = val

@phase.deleter
def phase(self):
# Deleter of the ``phase`` property: resets the stored phase to None, then
# deletes ``invalid`` so that the cached validity result is discarded.
self.__phase = None
del self.invalid

@property
def block_count(self):
    """Number of blocks of the BED12 object.

    :rtype: int
    """
    return self.__block_count

@block_count.setter
def block_count(self, value):
    """Set the number of blocks.

    :param value: a numpy integer scalar, or any value for which ``isint`` returns True.
    :raises ValueError: if ``value`` is not an integer.
    """
    if not isint(value) and not isinstance(value, np.integer):
        raise ValueError("Block count must be an integer, not {}! Value: {}".format(type(value), value))
    self.__block_count = fast_int(value)
    # Drop the cached validity result so that it is recomputed on next access.
    del self.invalid

@property
def block_sizes(self):
    """Sizes of the blocks, stored as a numpy array of integers.

    :rtype: numpy.ndarray
    """
    return self.__block_sizes

@block_sizes.setter
def block_sizes(self, sizes):
    """Set the block sizes.

    :param sizes: any value ``np.array`` converts into an array with an integer dtype.
    :raises TypeError: if the resulting array does not have an integer dtype.
    """
    sizes = np.array(sizes)
    if not issubclass(sizes.dtype.type, np.integer):
        raise TypeError("Block sizes should be integers!")
    self.__block_sizes = sizes
    # Drop the cached validity result so that it is recomputed on next access.
    del self.invalid

@block_sizes.deleter
def block_sizes(self):
    """Reset the block sizes to a single zero and drop the cached validity result."""
    # np.int_ rather than the abstract np.integer: converting the abstract class
    # to a dtype is deprecated in NumPy, and np.int_ is what it resolved to.
    self.__block_sizes = np.zeros(1, dtype=np.int_)
    del self.invalid

@property
def block_starts(self):
    """Start offsets of the blocks, stored as a numpy array of integers.

    :rtype: numpy.ndarray
    """
    return self.__block_starts

@block_starts.setter
def block_starts(self, starts):
    """Set the block starts.

    :param starts: any value ``np.array`` converts into an array with an integer dtype.
    :raises TypeError: if the resulting array does not have an integer dtype.
    """
    starts = np.array(starts)
    if not issubclass(starts.dtype.type, np.integer):
        raise TypeError("Block starts should be integers! Dtype: {}; array: {}".format(
            starts.dtype, starts
        ))
    self.__block_starts = starts
    # Drop the cached validity result so that it is recomputed on next access.
    del self.invalid

@block_starts.deleter
def block_starts(self):
    """Reset the block starts to a single zero and drop the cached validity result."""
    # np.int_ rather than the abstract np.integer: converting the abstract class
    # to a dtype is deprecated in NumPy, and np.int_ is what it resolved to.
    self.__block_starts = np.zeros(1, dtype=np.int_)
    del self.invalid

@property
def _max_regression(self):
"""
Expand Down Expand Up @@ -1173,12 +1308,11 @@ def blocks(self):
"""This will return the coordinates of the blocks, with a 1-offset (as in GFF3)"""

# First thing: calculate where each start point will be
_blocks = []
starts = [_ + self.start - 1 for _ in self.block_starts]
for pos in range(self.block_count):
_blocks.append((starts[pos] + 1, starts[pos] + self.block_sizes[pos]))
starts = self.block_starts + self.start - 1
_bstarts = starts + 1
_bends = starts + self.block_sizes

return _blocks
return list(zip(_bstarts, _bends))

def to_transcriptomic(self, sequence=None, fasta_index=None, start_adjustment=False,
lenient=False, alias=None, coding=True):
Expand Down Expand Up @@ -1218,12 +1352,13 @@ def to_transcriptomic(self, sequence=None, fasta_index=None, start_adjustment=Fa
if self.strand == "+":
bsizes = self.block_sizes[:]
else:
bsizes = list(reversed(self.block_sizes[:]))
tStart, tEnd = sum(self.block_sizes) - tEnd, sum(self.block_sizes) - tStart
bsizes = np.flip(self.block_sizes)
tStart, tEnd = self.block_sizes.sum() - tEnd, self.block_sizes.sum() - tStart

bstarts = [0]
for bs in bsizes[:-1]:
bstarts.append(bs + bstarts[-1])
bstarts = np.concatenate([np.zeros(1, dtype=np.integer), bsizes[:-1].cumsum()])
# bstarts = [0]
# for bs in bsizes[:-1]:
# bstarts.append(bs + bstarts[-1])
assert len(bstarts) == len(bsizes) == self.block_count, (bstarts, bsizes, self.block_count)

if self.coding:
Expand All @@ -1238,7 +1373,7 @@ def to_transcriptomic(self, sequence=None, fasta_index=None, start_adjustment=Fa

new = list((self.name.split(";")[0],
0,
sum(self.block_sizes),
self.block_sizes.sum(),
new_name,
self.score,
"+"))
Expand Down
6 changes: 3 additions & 3 deletions Mikado/tests/test_bed12.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def test_diexonic_pos_transfer(self):
seq += "ATG" * 79
seq += "TAA" * 1
seq += "A" * 80
self.assertEqual(len(seq), sum(bed.block_sizes))
self.assertEqual(len(seq), bed.block_sizes.sum())
tbed = bed.to_transcriptomic(sequence=seq)
self.assertEqual(tbed.start, 1)
self.assertEqual(tbed.end, 390)
Expand All @@ -107,7 +107,7 @@ def test_diexonic_neg_transfer(self):
seq += "ATG" * 79
seq += "TAA" * 1
seq += "A" * 70
self.assertEqual(len(seq), sum(bed.block_sizes))
self.assertEqual(len(seq), bed.block_sizes.sum())
tbed = bed.to_transcriptomic(sequence=seq)
self.assertEqual(tbed.start, 1)
self.assertEqual(tbed.end, 390)
Expand Down Expand Up @@ -163,7 +163,7 @@ def test_wheat_1(self):
self.assertEqual(bed.thick_start, 207087616)
self.assertEqual(bed.thick_end, 207088433)

self.assertEqual(len(string_seq), sum(bed.block_sizes))
self.assertEqual(len(string_seq), bed.block_sizes.sum())
tbed = bed.to_transcriptomic(sequence=string_seq)
self.assertEqual(tbed.thick_end - tbed.thick_start + 1, 672)
self.assertEqual(tbed.thick_start, string_seq.index("ATGGCGCTGATCGATTGGA") + 1)
Expand Down
2 changes: 1 addition & 1 deletion Mikado/tests/test_scores.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def setUp(self):
self.assertEqual(b3.thick_start, 201)
self.assertFalse(b3.invalid, b3.invalid_reason)
self.assertEqual(b3.block_count, 3)
self.assertEqual(b3.block_sizes, [300, 300, 300])
self.assertTrue((b3.block_sizes == [300, 300, 300]).all())
self.t3 = Transcript(b3)
self.t3.finalize()
self.assertTrue(self.t3.is_coding)
Expand Down
14 changes: 4 additions & 10 deletions Mikado/tests/test_transcript_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,12 +300,10 @@ def test_casePositive(self):
self.assertEqual(b12.thick_start, tr.combined_cds_start)
self.assertEqual(b12.thick_end, tr.combined_cds_end)
self.assertEqual(len(b12.block_sizes), tr.exon_num)
self.assertEqual(b12.block_sizes,
[200, 200, 400, 500],
self.assertTrue((b12.block_sizes == [200, 200, 400, 500]).all(),
b12.block_sizes)
self.assertEqual(b12.strand, "+")
self.assertEqual(b12.block_starts,
[0, 300, 700, 2400],
self.assertTrue((b12.block_starts == [0, 300, 700, 2400]).all(),
b12.block_starts)
self.assertEqual(str(b12),
"\t".join([str(_) for _ in
Expand Down Expand Up @@ -340,13 +338,9 @@ def test_caseNegative(self):
self.assertEqual(b12.thick_start, tr.combined_cds_end)
self.assertEqual(b12.thick_end, tr.combined_cds_start)
self.assertEqual(len(b12.block_sizes), tr.exon_num)
self.assertEqual(b12.block_sizes,
[200, 200, 400, 500],
b12.block_sizes)
self.assertTrue((b12.block_sizes == [200, 200, 400, 500]).all(), b12.block_sizes)
self.assertEqual(b12.strand, "-")
self.assertEqual(b12.block_starts,
[0, 300, 700, 2400],
b12.block_starts)
self.assertTrue((b12.block_starts == [0, 300, 700, 2400]).all(), b12.block_starts)

self.assertEqual(tr.format("bed12"), str(b12))
self.assertEqual(str(b12),
Expand Down
8 changes: 4 additions & 4 deletions Mikado/tests/test_transcript_negative.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def setUp(self):
self.orf.thick_end = self.tr.cdna_length - self.tr.selected_end_distance_from_tes
self.orf.block_count = 1
self.orf.blockSize = self.tr.cdna_length
self.orf.block_starts = 0
self.orf.block_starts = [0]
self.orf.has_start_codon = True
self.orf.has_stop_codon = True
self.orf.transcriptomic = True
Expand Down Expand Up @@ -399,7 +399,7 @@ def testDoubleOrf(self):
first_orf.thick_end = 501
first_orf.block_count = 1
first_orf.blockSize = self.tr.cdna_length
first_orf.block_starts = 0
first_orf.block_starts = [0]
first_orf.has_start_codon = True
first_orf.has_stop_codon = True
first_orf.transcriptomic = True
Expand All @@ -417,7 +417,7 @@ def testDoubleOrf(self):
second_orf.thick_end = 401
second_orf.block_count = 1
second_orf.blockSize = self.tr.cdna_length
second_orf.block_starts = 0
second_orf.block_starts = [0]
second_orf.has_start_codon = True
second_orf.has_stop_codon = True
second_orf.transcriptomic = True
Expand All @@ -437,7 +437,7 @@ def testDoubleOrf(self):
third_orf.thick_end = 1602
third_orf.block_count = 1
third_orf.blockSize = self.tr.cdna_length
third_orf.block_starts = 0
third_orf.block_starts = [0]
third_orf.has_start_codon = True
third_orf.has_stop_codon = True
third_orf.transcriptomic = True
Expand Down
2 changes: 1 addition & 1 deletion Mikado/tests/test_transcript_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def setUp(self):
self.orf.thick_end = 8666 - 5928 + 1
self.orf.block_count = 1
self.orf.blockSize = self.tr.cdna_length
self.orf.block_starts = 0
self.orf.block_starts = [0]
self.orf.has_start_codon = True
self.orf.has_stop_codon = True
self.orf.transcriptomic = True
Expand Down
Loading

0 comments on commit 8ca4b86

Please sign in to comment.