diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a458ee08..8b4169751 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,8 @@ Other changes: in BED12 format as sources of valid junctions. - Slightly increased the unit-test coverage for the locus classes, e.g. properly covering the `as_dict` and `load_dict` methods. Minor bugfixes related to the introduction of these unit-tests. -- `Mikado.parsers.to_gff` has been renamed to `Mikado.parsers.parser_factory`. +- `Mikado.parsers.to_gff` has been renamed to `Mikado.parsers.parser_factory`. +- Mikado will error informatively if the scoring configuration file is malformed. # Version 2.1.1 diff --git a/Mikado/configuration/configuration.py b/Mikado/configuration/configuration.py index ecb43a33f..b4c01062e 100644 --- a/Mikado/configuration/configuration.py +++ b/Mikado/configuration/configuration.py @@ -97,7 +97,7 @@ def load_scoring(self, logger=None): logger = create_null_logger("check_scoring") if self.pick.scoring_file is None: if self._loaded_scoring != self.pick.scoring_file: - logger.debug("Resetting the scoring to its previous value") + logger.warning(f"Resetting the scoring to its previous value ({self._loaded_scoring})") self.pick.scoring_file = self._loaded_scoring elif self._loaded_scoring != self.pick.scoring_file: logger.debug("Overwriting the scoring self using '%s' as scoring file", self.pick.scoring_file) @@ -137,13 +137,11 @@ def load_scoring(self, logger=None): checked = ScoringFile.Schema().load(scoring) checked.check(minimal_orf_length=self.pick.orf_loading.minimal_orf_length) self._loaded_scoring = self.pick.scoring_file - # self = check_scoring(self) - # self = check_all_requirements(self) except (InvalidConfiguration, ValidationError) as exc: - logger.debug("Invalid option: %s", option) - logger.warning(exc) - continue - # self.scoring = dataclasses.asdict(checked.scoring) + msg = f"The configuration file {option} is invalid:\n" + msg += str(exc) + logger.critical(msg) + raise 
InvalidConfiguration(msg) self.scoring = checked found = True self.pick.scoring_file = option diff --git a/Mikado/loci/superlocus.py b/Mikado/loci/superlocus.py index fbeb54054..d9d489fc0 100644 --- a/Mikado/loci/superlocus.py +++ b/Mikado/loci/superlocus.py @@ -527,9 +527,7 @@ async def _load_introns(self): """ if len(self.introns) == 0: - if self.monoexonic is False: - raise ValueError("%s is multiexonic but has no introns defined!", - self.id) + assert self.monoexonic is True, f"{self.id} is multiexonic but has no introns defined!" self.logger.debug("No introns for %s", self.id) return @@ -537,10 +535,10 @@ async def _load_introns(self): if not self.configuration.db_settings.db: return # No data to load - ver_introns = dict(((junc.junction_start, junc.junction_end), junc.strand) - for junc in junction_baked(self.session).params( - chrom=self.chrom, junctionStart=self.start, junctionEnd=self.end - )) + ver_introns = collections.defaultdict(set) + for junc in junction_baked(self.session).params(chrom=self.chrom, + junctionStart=self.start, junctionEnd=self.end): + ver_introns[(junc.junction_start, junc.junction_end)].add(junc.strand) self.logger.debug("Found %d verifiable introns for %s", len(ver_introns), self.id) @@ -549,11 +547,15 @@ async def _load_introns(self): self.logger.debug("Checking %s%s:%d-%d", self.chrom, self.strand, intron[0], intron[1]) if (intron[0], intron[1]) in ver_introns: - self.logger.debug("Verified intron %s:%d-%d", - self.chrom, intron[0], intron[1]) - self.locus_verified_introns.add((intron[0], - intron[1], - ver_introns[(intron[0], intron[1])])) + if self.stranded is False: + for strand in ver_introns[(intron[0], intron[1])]: + self.locus_verified_introns.add((intron[0], + intron[1], + strand)) + elif self.strand in ver_introns[(intron[0], intron[1])]: + self.locus_verified_introns.add((intron[0], + intron[1], + self.strand)) async def get_sources(self): if self.configuration.pick.output_format.report_all_external_metrics is True: @@ 
-566,7 +568,7 @@ async def get_sources(self): if param.startswith("external")}) sources.update({param for param in self.configuration.scoring.cds_requirements.parameters.keys() if param.startswith("external")}) - sources.update({param for param in self.configuration.scoring.cds_requirements.parameters.keys() + sources.update({param for param in self.configuration.scoring.as_requirements.parameters.keys() if param.startswith("external")}) sources.update({param for param in self.configuration.scoring.scoring.keys() if param.startswith("external")}) diff --git a/Mikado/serializers/junction.py b/Mikado/serializers/junction.py index 836a232ce..8b208e8bc 100644 --- a/Mikado/serializers/junction.py +++ b/Mikado/serializers/junction.py @@ -99,10 +99,19 @@ def __init__(self, junction_start, junction_end, name, strand, score, chrom_id): self.score = score def __str__(self): - return "{chrom}\t{start}\t{end}".format( + return "{chrom}\t{start}\t{end}\t{strand}".format( chrom=self.chrom, start=self.start, - end=self.end + end=self.end, + strand=self.strand + ) + + def __repr__(self): + return "{chrom}\t{start}\t{end}\t{strand}".format( + chrom=self.chrom, + start=self.start, + end=self.end, + strand=self.strand ) @hybrid_method diff --git a/Mikado/tests/test_external_async.py b/Mikado/tests/test_external_async.py index ba3f36a36..f7479a8e7 100644 --- a/Mikado/tests/test_external_async.py +++ b/Mikado/tests/test_external_async.py @@ -1,15 +1,17 @@ import asyncio -from Mikado._transcripts.scoring_configuration import MinMaxScore +import itertools +from Mikado._transcripts.scoring_configuration import MinMaxScore, SizeFilter from Mikado.configuration.configurator import load_and_validate_config from Mikado.loci import Superlocus from Mikado.serializers.external import External, ExternalSource from Mikado.serializers.blast_serializer.query import Query +from Mikado.serializers.orf import Orf +from Mikado.serializers.junction import Chrom, Junction from Mikado.transcripts import 
Transcript -from Mikado.utilities.dbutils import DBBASE +from Mikado.utilities.dbutils import DBBASE as db import unittest from sqlalchemy.orm import sessionmaker from sqlalchemy import create_engine -import tempfile class AsyncExternalTest(unittest.TestCase): @@ -41,7 +43,6 @@ def test_get_external(self): 'use_raw': True, 'percentage': True}) transcript.attributes["tpm"] = 10 - db = DBBASE int_source = ExternalSource('int', 'int', 0) float_source = ExternalSource('float', 'float', 0) bool_source = ExternalSource('bool', 'bool', 0) @@ -61,10 +62,8 @@ def test_get_external(self): query = Query(transcript.id, transcript.cdna_length) query2 = Query(transcript2.id, transcript2.cdna_length) - dbname = tempfile.NamedTemporaryFile(suffix=".db", delete=False) engine = create_engine("sqlite:///:memory:") db.metadata.create_all(engine) - print(engine.url, dbname.name) SessionMaker = sessionmaker(bind=engine) session = SessionMaker() session.add_all([int_source, float_source, bool_source, raw_int_source, raw_float_source, raw_bool_source]) @@ -90,3 +89,110 @@ def test_get_external(self): 'raw_bool': (True, True) } }) + + sup.configuration.pick.output_format.report_all_external_metrics = False + external = asyncio.run(sup.get_external(qobj, [1])) + self.assertEqual(len(external), 0) + # The values configured below are meaningless; they exist only to verify that we are loading *only* these metrics. + # We should *NOT* have 'float' as it is not present in any section. 
+ sup.configuration.scoring.scoring["external.int"] = MinMaxScore(rescaling="max", filter=None) + sup.configuration.scoring.requirements.parameters["external.raw_float"] = SizeFilter(operator="gt", + value=100) + sup.configuration.scoring.cds_requirements.parameters["external.raw_int"] = SizeFilter(operator="lt", + value=1) + sup.configuration.scoring.as_requirements.parameters["external.raw_bool"] = SizeFilter(operator="lt", + value=1) + sup.configuration.scoring.not_fragmentary.parameters["external.bool"] = SizeFilter(operator="ne", + value=False) + external = asyncio.run(sup.get_external(qobj, [1])) + self.assertEqual(external, { + 'ENST00000560636': { + 'int': (10, False), + 'raw_float': (8.0, True), + 'bool': (False, False), + 'raw_int': (8, True), + 'raw_bool': (True, True) + } + }) + + +class AsyncJunctionTest(unittest.TestCase): + + def test_retrieval(self): + engine = create_engine("sqlite:///:memory:") + db.metadata.create_all(engine) + SessionMaker = sessionmaker(bind=engine) + session = SessionMaker() + + transcript = Transcript(accept_undefined_multi=True) + transcript.chrom = "15" + transcript.source = "protein_coding" + transcript.start = 47631264 + transcript.end = 48051999 + + exons = [(47631264, 47631416), + (47704590, 47704669), + (47762671, 47762742), + (47893062, 47893093), + (47895572, 47895655), + (48051942, 48051999)] + + transcript.strand = "+" + transcript.add_exons(exons) + transcript.id = "ENST00000560636" + transcript.parent = "ENSG00000137872" + transcript2 = transcript.copy() + transcript2.id = "ENST00000560637" + + chrom_one = Chrom("1", 10**8) + chrom_fifteen = Chrom("15", 5 * 10 ** 8) + session.add_all([chrom_one, chrom_fifteen]) + session.commit() + # Junction(junction_start, junction_end, name, strand, score, chrom_id) + # This junction is on a different chrom + junction_chrom_one = Junction(47704669 + 1, 47762671 - 1, "chrom_one", "+", 10, chrom_one.chrom_id) + # This junction is too far away + outside_chrom_15 = Junction(47704669 - 10 
** 6 + 1, 47762671 - 10 ** 6 - 1, "chrom_15_outside", "+", 10, + chrom_fifteen.chrom_id) + # This junction is in the right place but wrong strand + wrong_strand_chrom_15 = Junction(47704669 + 1, 47762671 - 1, "chrom_15_wrong_strand", "-", 10, + chrom_fifteen.chrom_id) + # This one is correct + chrom_15_junction = Junction(47704669 + 1, 47762671 - 1, "chrom_15", "+", 10, chrom_fifteen.chrom_id) + session.add_all([junction_chrom_one, outside_chrom_15, wrong_strand_chrom_15, chrom_15_junction]) + session.commit() + + self.assertEqual(junction_chrom_one.chrom, "1") + for junc in [outside_chrom_15, wrong_strand_chrom_15, chrom_15_junction]: + self.assertEqual(junc.chrom, "15") + + for strand, stranded in itertools.product(("+", "-", None), (True, False)): + transcript.unfinalize() + transcript.strand = strand + transcript.finalize() + sup = Superlocus(transcript, stranded=stranded) + self.assertTrue((chrom_15_junction.junction_start, chrom_15_junction.end) in + sup.introns, (chrom_15_junction, sup.introns)) + sup.session = session + asyncio.run(sup._load_introns()) + if stranded is True and strand is not None: + self.assertEqual(sup.locus_verified_introns, {(chrom_15_junction.junction_start, + chrom_15_junction.junction_end, + strand)}, + (stranded, strand)) + elif stranded is False: + self.assertEqual(sup.locus_verified_introns, {(chrom_15_junction.junction_start, + chrom_15_junction.junction_end, + chrom_15_junction.strand), + (wrong_strand_chrom_15.junction_start, + wrong_strand_chrom_15.junction_end, + wrong_strand_chrom_15.strand)}, + (stranded, strand)) + elif stranded is True and strand is None: + self.assertEqual(sup.locus_verified_introns, set()) + + +class AsyncOrfLoading(unittest.TestCase): + + def test_load_orfs(self): + """"""