Skip to content

Commit

Permalink
Updated the changelog, progress on EI-CoreBioinformatics#137
Browse files Browse the repository at this point in the history
  • Loading branch information
lucventurini committed Nov 1, 2018
1 parent 62d3adf commit 696114e
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 5 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ Bugfixes and improvements:
- For the external scores, Mikado can now accept any type of numerical or boolean value. Mikado will understand at serialisation time whether a particular score can be used raw (ie its values are strictly comprised between 0 and 1) or whether it has to be forcibly scaled.
- This allows Mikado to use e.g. transcript expression as a valid metric.
- Now coding and non-coding transcripts will be in different loci.
- Corrected a bug in the calculation of overlapping intervals during BLAST serialisation.
- BLAST HSPs now also store whether they contain an in-frame stop codon.
- Mikado prepare can now accept models that lack any exon features but still have valid CDS/UTR features - this is necessary for some protein prediction tools.
- Fixed [#139](https://github.com/lucventurini/mikado/issues/139): Mikado was reverse complementing non-uppercase letters incorrectly.
- [#135](https://github.com/lucventurini/mikado/issues/135): Until now, Mikado operated under the assumption that the 8th field in GTF files was the **frame**, not the **phase** like in GFF3s. This is actually incompatible with EnsEMBL and many widespread tools such as [GenomeTools](http://genometools.org/) or [GffRead](https://github.com/gpertea/gffread). Starting from this version, Mikado conforms to the convention used by these other tools.
Expand Down
15 changes: 11 additions & 4 deletions Mikado/parsers/blast_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,9 +293,6 @@ def __calculate_merges(intervals: np.array):
:return:
"""

if intervals.shape[1] != 2:
raise ValueError("Invalid array shape: {}".format(intervals.shape))

if intervals.shape[0] == 1:
return intervals

Expand Down Expand Up @@ -343,7 +340,17 @@ def merge(intervals: [(int, int)], query_length=None, offset=1):

# Assume tuple of the form (start,end)
# Create array and sort
intervals = np.array([sorted(_) for _ in intervals], dtype=np.int)
offset = int(offset)
if offset not in [0, 1]:
raise ValueError("Invalid offset - only 0 and 1 allowed: {}".format(offset))

try:
intervals = np.array([sorted(_) for _ in intervals], dtype=np.int)
if intervals.shape[1] != 2:
raise ValueError("Invalid shape for intervals: {}".format(intervals.shape))
except (TypeError, ValueError):
raise TypeError("Invalid array for intervals: {}".format(intervals))

intervals = intervals[np.lexsort((intervals[:,1], intervals[:,0]))]
intervals = __calculate_merges(intervals)
total_length_covered = int(abs(intervals[:,1] - intervals[:,0] + offset).sum())
Expand Down
59 changes: 58 additions & 1 deletion Mikado/tests/test_blast_related.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import os
import gzip
import subprocess
import shutil
from Mikado.serializers.blast_serializer import utils as seri_blast_utils
import time


Expand Down Expand Up @@ -124,5 +124,62 @@ def test_asn(self):
os.remove(valid_asn)
os.chdir(master)


class TestMerging(unittest.TestCase):

    """Exercise the interval-merging helper on a handful of simple inputs."""

    def test_merging_1(self):
        # Four intervals, three of which chain together through overlaps;
        # the covered length is checked with inclusive coordinates (offset=1).
        intervals = [(-10, -5), (-6, 8), (5, 10), (20, 40)]
        query_length = 51
        expected = [(-10, 10), (20, 40)]

        merged, covered = seri_blast_utils.merge(
            intervals, query_length=query_length, offset=1)

        self.assertEqual(merged, expected)
        self.assertEqual(covered, sum(end - start + 1 for start, end in expected))

    def test_merging_2(self):
        # A lone interval must come back unchanged for offsets 0 and 1;
        # an offset of 2 is expected to be rejected with a ValueError.
        single = [(100, 200)]
        start, end = single[0]

        for offset in (0, 1, 2):
            with self.subTest(offset=offset):
                if offset == 2:
                    with self.assertRaises(ValueError):
                        seri_blast_utils.merge(single, offset=offset)
                    continue

                expected_length = end - start + offset
                merged, length = seri_blast_utils.merge(single, offset=offset)
                self.assertEqual(length, expected_length)
                self.assertEqual(merged, single)

    def test_various_merging(self):
        # Inputs that cannot be turned into an (N, 2) integer array:
        # each of them is expected to raise a TypeError.
        rejected = [
            [('a', 0)],
            [('a', 'b')],
            [(10, 20), ('a', 'b')],
            [(10, 20, 30), (40, 50, 60)],
        ]

        for inv in rejected:
            with self.subTest(inv=inv):
                with self.assertRaises(TypeError, msg=inv):
                    seri_blast_utils.merge(inv)

        # Accepted inputs, stored as [input, expected merged intervals].
        accepted = {
            0: [[('10', '20')], [(10, 20)]],
            1: [[(10, 30)], [(10, 30)]],
            2: [[(-10.0, 5.5)], [(-10, 5)]],
            3: [[(-4, -10)], [(-10, -4)]],
            4: [[(-5, -10), (-2.2, -7.3)], [(-10, -2)]],
        }

        for val, case in accepted.items():
            inp, out = case
            with self.subTest(val=val, msg=case):
                result = seri_blast_utils.merge(inp)
                self.assertEqual(out, result[0])


# Run the unittest runner when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit 696114e

Please sign in to comment.