diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c67153b1..526b077a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,8 @@ Bugfixes and improvements: - For the external scores, Mikado can now accept any type of numerical or boolean value. Mikado will understand at serialisation time whether a particular score can be used raw (ie its values are strictly comprised between 0 and 1) or whether it has to be forcibly scaled. - This allows Mikado to use e.g. transcript expression as a valid metric. - Now coding and non-coding transcripts will be in different loci. +- Corrected a bug in the calculation of overlapping intervals during BLAST serialisation. +- BLAST HSPs now also record whether there is an in-frame stop codon. - Mikado prepare now can accept models that lack any exon features but still have valid CDS/UTR features - this is necessary for some protein prediction tools. - Fixed [#139](https://github.com/lucventurini/mikado/issues/139): Mikado was reverse complementing non-uppercase letters incorrectly. - [#135](https://github.com/lucventurini/mikado/issues/135): Mikado so far operated under the assumption that the 8th field in GTF files was the **frame**, not the **phase** like in GFF3s. This is actually incompatible with EnsEMBL and many widespread tools such as [GenomeTools](http://genometools.org/) or [GffRead](https://github.com/gpertea/gffread). Starting from this version, Mikado uniforms with these other tools. 
diff --git a/Mikado/parsers/blast_utils.py b/Mikado/parsers/blast_utils.py index 732bb53ab..81157f665 100644 --- a/Mikado/parsers/blast_utils.py +++ b/Mikado/parsers/blast_utils.py @@ -293,9 +293,6 @@ def __calculate_merges(intervals: np.array): :return: """ - if intervals.shape[1] != 2: - raise ValueError("Invalid array shape: {}".format(intervals.shape)) - if intervals.shape[0] == 1: return intervals @@ -343,7 +340,17 @@ def merge(intervals: [(int, int)], query_length=None, offset=1): # Assume tuple of the form (start,end) # Create array and sort - intervals = np.array([sorted(_) for _ in intervals], dtype=np.int) + offset = int(offset) + if offset not in [0, 1]: + raise ValueError("Invalid offset - only 0 and 1 allowed: {}".format(offset)) + + try: + intervals = np.array([sorted(_) for _ in intervals], dtype=np.int) + if intervals.shape[1] != 2: + raise ValueError("Invalid shape for intervals: {}".format(intervals.shape)) + except (TypeError, ValueError): + raise TypeError("Invalid array for intervals: {}".format(intervals)) + intervals = intervals[np.lexsort((intervals[:,1], intervals[:,0]))] intervals = __calculate_merges(intervals) total_length_covered = int(abs(intervals[:,1] - intervals[:,0] + offset).sum()) diff --git a/Mikado/tests/test_blast_related.py b/Mikado/tests/test_blast_related.py index b66e73737..1aa1ba116 100644 --- a/Mikado/tests/test_blast_related.py +++ b/Mikado/tests/test_blast_related.py @@ -7,7 +7,7 @@ import os import gzip import subprocess -import shutil +from Mikado.serializers.blast_serializer import utils as seri_blast_utils import time @@ -124,5 +124,62 @@ def test_asn(self): os.remove(valid_asn) os.chdir(master) + +class TestMerging(unittest.TestCase): + + """Small class to test basic cases of the merging algorithm.""" + + def test_merging_1(self): + + l = [(-10, -5), (-6, 8), (5, 10), (20, 40)] + tot_length = 51 + corr_merged = [(-10, 10), (20, 40)] + merged, tot_length = seri_blast_utils.merge(l, query_length=tot_length, 
offset=1) + self.assertEqual(merged, corr_merged) + self.assertEqual(tot_length, 10 - -10 +1 + 40 - 20 + 1) + + def test_merging_2(self): + + l = [(100, 200)] + for offset in [0, 1, 2]: + with self.subTest(offset=offset): + tot_length = l[0][1] - l[0][0] + offset + if offset == 2: + with self.assertRaises(ValueError): + _ = seri_blast_utils.merge(l, offset=offset) + else: + merged, length = seri_blast_utils.merge(l, offset=offset) + self.assertEqual(length, tot_length) + self.assertEqual(merged, l) + + def test_various_merging(self): + + invalid = [ + [('a', 0)], + [('a', 'b')], + [(10, 20), ('a', 'b')], + [(10, 20, 30), (40, 50, 60)] + ] + + for inv in invalid: + with self.subTest(inv=inv): + with self.assertRaises(TypeError, msg=inv): + seri_blast_utils.merge(inv) + + valid = { + 0: [[('10', '20')], [(10, 20)]], + 1: [[(10, 30)], [(10, 30)]], + 2: [[(-10.0, 5.5)], [(-10, 5)]], + 3: [[(-4, -10)], [(-10, -4)]], + 4: [[(-5, -10), (-2.2, -7.3)], [(-10, -2)]] + } + + for val in valid: + inp, out = valid[val] + with self.subTest(val=val, msg=valid[val]): + _ = seri_blast_utils.merge(inp) + self.assertEqual(out, _[0]) + + if __name__ == '__main__': unittest.main() \ No newline at end of file