From 8ca4b8675bd82f8750eac2c129ec6ce8c2541c9c Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Tue, 3 Mar 2020 12:16:06 +0000 Subject: [PATCH] For #280: now BED12 objects use numpy arrays rather than simple lists. Also, now using properties to avoid calculating the invalidity of a BED12 object over and over again. This should lead to some more speed improvements. --- Mikado/parsers/bed12.py | 179 +++++++++++++++--- Mikado/tests/test_bed12.py | 6 +- Mikado/tests/test_scores.py | 2 +- Mikado/tests/test_transcript_methods.py | 14 +- Mikado/tests/test_transcript_negative.py | 8 +- Mikado/tests/test_transcript_single.py | 2 +- .../transcript_methods/printing.py | 12 +- .../transcript_methods/splitting.py | 2 +- 8 files changed, 179 insertions(+), 46 deletions(-) diff --git a/Mikado/parsers/bed12.py b/Mikado/parsers/bed12.py index aad0ea3d2..09d62c30b 100644 --- a/Mikado/parsers/bed12.py +++ b/Mikado/parsers/bed12.py @@ -8,7 +8,7 @@ import numpy import os -from fastnumbers import fast_int, fast_float +from fastnumbers import fast_int, fast_float, isint from Bio import Seq import Bio.SeqRecord from . import Parser @@ -294,14 +294,15 @@ def __init__(self, *args: Union[str, list, tuple, GffLine], # If >=1, i.e. at least one internal stop codon, the ORF is invalid self._internal_stop_codons = 0 self.chrom = None - self.start = self.end = self.thick_start = self.thick_end = 0 + self.__start = self.__end = self.__thick_start = self.__thick_end = 0 self.name = "" self.score = 0 self.strand = None self.rgb = '' - self.block_sizes = [0] - self.block_starts = [0] - self.block_count = 1 + self.__block_sizes = np.zeros(1, dtype=np.integer) + self.__block_starts = np.zeros(1, dtype=np.integer) + self.__block_count = 1 + self.__invalid = None self.invalid_reason = None self.fasta_length = None self.__in_index = True @@ -552,15 +553,18 @@ def __check_validity(self, transcriptomic, fasta_index, sequence): :return: """ - if transcriptomic is True: - self.has_start_codon = False - self.has_stop_codon = False + del self.invalid if transcriptomic is True and self.coding is True: if not (fasta_index is not None or sequence is not None): - self.logger.debug("No check on the validity of %s", self.chrom) + self.logger.debug("No further check on the validity of %s as no sequence has been provided.", + self.chrom) return + if transcriptomic is True: + self.has_start_codon = False + self.has_stop_codon = False + if transcriptomic is True and self.coding is True and (fasta_index is not None or sequence is not None): self.logger.debug("Starting to check the validity of %s", self.chrom) self.validity_checked = True @@ -584,7 +588,7 @@ def __check_validity(self, transcriptomic, fasta_index, sequence): assert isinstance(sequence, str) # Just double check that the sequence length is the same as what the BED would suggest - if self.invalid is True: + if self.__is_invalid() is True: self.logger.debug("%s is invalid (%s)", self.chrom, self.invalid_reason) self.coding = False return @@ -639,7 +643,8 @@ def __check_validity(self, transcriptomic, fasta_index, sequence): gap='N') self._internal_stop_codons = str(translated_seq).count("*") - if self.invalid is True: + del self.invalid + if self.__is_invalid() is True: return def _adjust_start(self, sequence, orf_sequence): @@ -718,6 +723,7 @@ def _adjust_start(self, sequence, orf_sequence): self.chrom, self.end, self.thick_end, self.thick_start) self.phase = 0 + del self.invalid if self.invalid: self.logger.debug("%s is not coding after checking. Reason: %s", self.chrom, self.invalid_reason) self.coding = False @@ -953,6 +959,18 @@ def invalid(self): :rtype bool """ + if self.__invalid is None: + self.__invalid = self.__is_invalid() + + return self.__invalid + + @invalid.deleter + def invalid(self): + self.__invalid = None + + + def __is_invalid(self): + if self._internal_stop_codons >= 1: self.invalid_reason = "{} internal stop codons found".format(self._internal_stop_codons) return True @@ -1029,6 +1047,70 @@ def transcriptomic(self, value): elif not value: self.phase = None + @property + def start(self): + return self.__start + + @start.setter + def start(self, value): + if not isint(value) and not isinstance(value, np.integer): + raise ValueError("Thick end must be an integer!") + self.__start = fast_int(value) + del self.invalid + + @start.deleter + def start(self): + self.__start = 0 + del self.invalid + + @property + def end(self): + return self.__end + + @end.setter + def end(self, value): + if not isint(value) and not isinstance(value, np.integer): + raise ValueError("Thick end must be an integer, not {}! Value: {}".format(type(value), value)) + self.__end = fast_int(value) + del self.invalid + + @end.deleter + def end(self): + self.__end = 0 + del self.invalid + + @property + def thick_start(self): + return self.__thick_start + + @thick_start.setter + def thick_start(self, value): + if not isint(value) and not isinstance(value, np.integer): + raise ValueError("Thick end must be an integer!") + self.__thick_start = fast_int(value) + del self.invalid + + @thick_start.deleter + def thick_start(self): + self.__thick_start = 0 + del self.invalid + + @property + def thick_end(self): + return self.__thick_end + + @thick_end.setter + def thick_end(self, value): + if not isint(value) and not isinstance(value, np.integer): + raise ValueError("Thick end must be an integer!") + self.__thick_end = fast_int(value) + del self.invalid + + @thick_end.deleter + def thick_end(self): + self.__thick_end = 0 + del self.invalid + @property def phase(self): """This property is used for transcriptomic BED objects @@ -1050,8 +1132,61 @@ def phase(self, val): self.name, val)) elif self.transcriptomic is True and val not in (0, 1, 2): raise ValueError("A transcriptomic BED cannot have null frame.") + del self.invalid self.__phase = val + @phase.deleter + def phase(self): + self.__phase = None + del self.invalid + + @property + def block_count(self): + return self.__block_count + + @block_count.setter + def block_count(self, value): + if not isint(value) and not isinstance(value, np.integer): + raise ValueError("Thick end must be an integer!") + self.__block_count = fast_int(value) + del self.invalid + + @property + def block_sizes(self): + return self.__block_sizes + + @block_sizes.setter + def block_sizes(self, sizes): + sizes = np.array(sizes) + if not issubclass(sizes.dtype.type, np.integer): + raise TypeError("Block sizes should be integers!") + self.__block_sizes = sizes + del self.invalid + + @block_sizes.deleter + def block_sizes(self): + self.__block_sizes = np.zeros(1, dtype=np.integer) + del self.invalid + + @property + def block_starts(self): + return self.__block_starts + + @block_starts.setter + def block_starts(self, starts): + starts = np.array(starts) + if not issubclass(starts.dtype.type, np.integer): + raise TypeError("Block sizes should be integers! Dtype: {}; array: {}".format( + starts.dtype, starts + )) + self.__block_starts = starts + del self.invalid + + @block_starts.deleter + def block_starts(self): + self.__block_starts = np.zeros(1, dtype=np.integer) + del self.invalid + @property def _max_regression(self): """ @@ -1173,12 +1308,11 @@ def blocks(self): """This will return the coordinates of the blocks, with a 1-offset (as in GFF3)""" # First thing: calculate where each start point will be - _blocks = [] - starts = [_ + self.start - 1 for _ in self.block_starts] - for pos in range(self.block_count): - _blocks.append((starts[pos] + 1, starts[pos] + self.block_sizes[pos])) + starts = self.block_starts + self.start - 1 + _bstarts = starts + 1 + _bends = starts + self.block_sizes - return _blocks + return list(zip(_bstarts, _bends)) def to_transcriptomic(self, sequence=None, fasta_index=None, start_adjustment=False, lenient=False, alias=None, coding=True): @@ -1218,12 +1352,13 @@ def to_transcriptomic(self, sequence=None, fasta_index=None, start_adjustment=Fa if self.strand == "+": bsizes = self.block_sizes[:] else: - bsizes = list(reversed(self.block_sizes[:])) - tStart, tEnd = sum(self.block_sizes) - tEnd, sum(self.block_sizes) - tStart + bsizes = np.flip(self.block_sizes) + tStart, tEnd = self.block_sizes.sum() - tEnd, self.block_sizes.sum() - tStart - bstarts = [0] - for bs in bsizes[:-1]: - bstarts.append(bs + bstarts[-1]) + bstarts = np.concatenate([np.zeros(1, dtype=np.integer), bsizes[:-1].cumsum()]) + # bstarts = [0] + # for bs in bsizes[:-1]: + # bstarts.append(bs + bstarts[-1]) assert len(bstarts) == len(bsizes) == self.block_count, (bstarts, bsizes, self.block_count) if self.coding: @@ -1238,7 +1373,7 @@ def to_transcriptomic(self, sequence=None, fasta_index=None, start_adjustment=Fa new = list((self.name.split(";")[0], 0, - sum(self.block_sizes), + self.block_sizes.sum(), new_name, self.score, "+")) diff --git a/Mikado/tests/test_bed12.py b/Mikado/tests/test_bed12.py index 30b99fe91..fd61e0c2c 100644 --- a/Mikado/tests/test_bed12.py +++ b/Mikado/tests/test_bed12.py @@ -80,7 +80,7 @@ def test_diexonic_pos_transfer(self): seq += "ATG" * 79 seq += "TAA" * 1 seq += "A" * 80 - self.assertEqual(len(seq), sum(bed.block_sizes)) + self.assertEqual(len(seq), bed.block_sizes.sum()) tbed = bed.to_transcriptomic(sequence=seq) self.assertEqual(tbed.start, 1) self.assertEqual(tbed.end, 390) @@ -107,7 +107,7 @@ def test_diexonic_neg_transfer(self): seq += "ATG" * 79 seq += "TAA" * 1 seq += "A" * 70 - self.assertEqual(len(seq), sum(bed.block_sizes)) + self.assertEqual(len(seq), bed.block_sizes.sum()) tbed = bed.to_transcriptomic(sequence=seq) self.assertEqual(tbed.start, 1) self.assertEqual(tbed.end, 390) @@ -163,7 +163,7 @@ def test_wheat_1(self): self.assertEqual(bed.thick_start, 207087616) self.assertEqual(bed.thick_end, 207088433) - self.assertEqual(len(string_seq), sum(bed.block_sizes)) + self.assertEqual(len(string_seq), bed.block_sizes.sum()) tbed = bed.to_transcriptomic(sequence=string_seq) self.assertEqual(tbed.thick_end - tbed.thick_start + 1, 672) self.assertEqual(tbed.thick_start, string_seq.index("ATGGCGCTGATCGATTGGA") + 1) diff --git a/Mikado/tests/test_scores.py b/Mikado/tests/test_scores.py index 2940ef742..53adc9081 100644 --- a/Mikado/tests/test_scores.py +++ b/Mikado/tests/test_scores.py @@ -38,7 +38,7 @@ def setUp(self): self.assertEqual(b3.thick_start, 201) self.assertFalse(b3.invalid, b3.invalid_reason) self.assertEqual(b3.block_count, 3) - self.assertEqual(b3.block_sizes, [300, 300, 300]) + self.assertTrue((b3.block_sizes == [300, 300, 300]).all()) self.t3 = Transcript(b3) self.t3.finalize() self.assertTrue(self.t3.is_coding) diff --git a/Mikado/tests/test_transcript_methods.py b/Mikado/tests/test_transcript_methods.py index a329ed6c2..742cfbd97 100644 --- a/Mikado/tests/test_transcript_methods.py +++ b/Mikado/tests/test_transcript_methods.py @@ -300,12 +300,10 @@ def test_casePositive(self): self.assertEqual(b12.thick_start, tr.combined_cds_start) self.assertEqual(b12.thick_end, tr.combined_cds_end) self.assertEqual(len(b12.block_sizes), tr.exon_num) - self.assertEqual(b12.block_sizes, - [200, 200, 400, 500], + self.assertTrue((b12.block_sizes == [200, 200, 400, 500]).all(), b12.block_sizes) self.assertEqual(b12.strand, "+") - self.assertEqual(b12.block_starts, - [0, 300, 700, 2400], + self.assertTrue((b12.block_starts == [0, 300, 700, 2400]).all(), b12.block_starts) self.assertEqual(str(b12), "\t".join([str(_) for _ in @@ -340,13 +338,9 @@ def test_caseNegative(self): self.assertEqual(b12.thick_start, tr.combined_cds_end) self.assertEqual(b12.thick_end, tr.combined_cds_start) self.assertEqual(len(b12.block_sizes), tr.exon_num) - self.assertEqual(b12.block_sizes, - [200, 200, 400, 500], - b12.block_sizes) + self.assertTrue((b12.block_sizes == [200, 200, 400, 500]).all(), b12.block_sizes) self.assertEqual(b12.strand, "-") - self.assertEqual(b12.block_starts, - [0, 300, 700, 2400], - b12.block_starts) + self.assertTrue((b12.block_starts == [0, 300, 700, 2400]).all(), b12.block_starts) self.assertEqual(tr.format("bed12"), str(b12)) self.assertEqual(str(b12), diff --git a/Mikado/tests/test_transcript_negative.py b/Mikado/tests/test_transcript_negative.py index aaaa7887a..9e81b3000 100644 --- a/Mikado/tests/test_transcript_negative.py +++ b/Mikado/tests/test_transcript_negative.py @@ -72,7 +72,7 @@ def setUp(self): self.orf.thick_end = self.tr.cdna_length - self.tr.selected_end_distance_from_tes self.orf.block_count = 1 self.orf.blockSize = self.tr.cdna_length - self.orf.block_starts = 0 + self.orf.block_starts = [0] self.orf.has_start_codon = True self.orf.has_stop_codon = True self.orf.transcriptomic = True @@ -399,7 +399,7 @@ def testDoubleOrf(self): first_orf.thick_end = 501 first_orf.block_count = 1 first_orf.blockSize = self.tr.cdna_length - first_orf.block_starts = 0 + first_orf.block_starts = [0] first_orf.has_start_codon = True first_orf.has_stop_codon = True first_orf.transcriptomic = True @@ -417,7 +417,7 @@ def testDoubleOrf(self): second_orf.thick_end = 401 second_orf.block_count = 1 second_orf.blockSize = self.tr.cdna_length - second_orf.block_starts = 0 + second_orf.block_starts = [0] second_orf.has_start_codon = True second_orf.has_stop_codon = True second_orf.transcriptomic = True @@ -437,7 +437,7 @@ def testDoubleOrf(self): third_orf.thick_end = 1602 third_orf.block_count = 1 third_orf.blockSize = self.tr.cdna_length - third_orf.block_starts = 0 + third_orf.block_starts = [0] third_orf.has_start_codon = True third_orf.has_stop_codon = True third_orf.transcriptomic = True diff --git a/Mikado/tests/test_transcript_single.py b/Mikado/tests/test_transcript_single.py index aec25bee0..8d7149d63 100644 --- a/Mikado/tests/test_transcript_single.py +++ b/Mikado/tests/test_transcript_single.py @@ -56,7 +56,7 @@ def setUp(self): self.orf.thick_end = 8666 - 5928 + 1 self.orf.block_count = 1 self.orf.blockSize = self.tr.cdna_length - self.orf.block_starts = 0 + self.orf.block_starts = [0] self.orf.has_start_codon = True self.orf.has_stop_codon = True self.orf.transcriptomic = True diff --git a/Mikado/transcripts/transcript_methods/printing.py b/Mikado/transcripts/transcript_methods/printing.py index 5f67cb8cc..c031bd241 100644 --- a/Mikado/transcripts/transcript_methods/printing.py +++ b/Mikado/transcripts/transcript_methods/printing.py @@ -9,6 +9,8 @@ from Mikado.parsers.GTF import GtfLine from Mikado.parsers.GFF import GffLine from Mikado.parsers.bed12 import BED12 +import numpy as np + __author__ = 'Luca Venturini' @@ -372,10 +374,12 @@ def as_bed12(transcript, transcriptomic=False): bed12.thick_start = bed12.thick_end = bed12.start bed12.block_count = transcript.exon_num bed12.block_sizes = [exon[1] - exon[0] + 1 for exon in transcript.exons] - bed12.block_starts = [0] - for pos, intron in enumerate(sorted(transcript.introns)): - bed12.block_starts.append( - bed12.block_starts[pos] + bed12.block_sizes[pos] + intron[1] - intron[0] + 1) + _introns = np.concatenate([np.array([intron[1] - intron[0] + 1 for intron in sorted(transcript.introns)], + dtype=np.integer), + np.zeros(1, dtype=np.integer)]) + bed12.block_starts = np.concatenate([np.zeros(1, dtype=np.integer), + (bed12.block_sizes + _introns).cumsum()[:-1]], axis=0) + assert bed12.block_starts[0] == 0, bed12.block_starts if transcriptomic: bed12 = bed12.to_transcriptomic(alias=transcript.alias, start_adjustment=False, coding=transcript.is_coding) diff --git a/Mikado/transcripts/transcript_methods/splitting.py b/Mikado/transcripts/transcript_methods/splitting.py index 0cdc2c734..b3833d8bd 100644 --- a/Mikado/transcripts/transcript_methods/splitting.py +++ b/Mikado/transcripts/transcript_methods/splitting.py @@ -742,7 +742,7 @@ def __relocate_orfs(transcript, bed12_objects, tstart, tend): new.thick_start = min(new.thick_start, tend) - tstart + 1 new.thick_end = min(obj.thick_end, tend) - tstart + 1 new.block_sizes = [new.end] - new.block_starts = [new.block_starts] + new.block_starts = new.block_starts[:] assert new.thick_start > 0, new.thick_start assert new.thick_end > 0, new.thick_end