diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b4169751..61b262f78 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,10 +10,17 @@ Other changes: provided as a stream is *disabled* though. - Fix [#382](https://github.com/EI-CoreBioinformatics/mikado/issues/382): now Mikado can accept generic BED12 files as input junctions, not just Portcullis junctions. This allows e.g. a user to provide a ***set of gene models*** - in BED12 format as sources of valid junctions. -- Slightly increased the unit-test coverage for the locus classes, e.g. properly covering the `as_dict` and `load_dict` - methods. Minor bugfixes related to the introduction of these unit-tests. + in BED12 format as sources of valid junctions. +- Fix [#387](https://github.com/EI-CoreBioinformatics/mikado/issues/387): now Mikado will use a static seed by default, + rather than generating a new one per call, unless specifically instructed to do so. The old behaviour can still be + replicated by either setting the `seed` parameter to `null` (i.e. `None`) in the configuration file, or by + specifying `--random-seed` during the command invocation. +- General increase in code unit-test coverage; in particular: + - Slightly increased the unit-test coverage for the locus classes, e.g. properly covering the `as_dict` and `load_dict` + methods. Minor bugfixes related to the introduction of these unit-tests. - `Mikado.parsers.to_gff` has been renamed to `Mikado.parsers.parser_factory`. +- The code related to the transcript padding has been moved to the submodule `Mikado.transcripts.pad`, rather than + being part of the `Mikado.loci.locus` submodule. - Mikado will error informatively if the scoring configuration file is malformed. 
# Version 2.1.1 diff --git a/Mikado/_transcripts/transcript_base.py b/Mikado/_transcripts/transcript_base.py index fee851719..e9c9dfbe5 100644 --- a/Mikado/_transcripts/transcript_base.py +++ b/Mikado/_transcripts/transcript_base.py @@ -827,8 +827,8 @@ def get_internal_orf_beds(self) -> List[BED12]: new_row.thick_start = utr + 1 new_row.thick_end = new_row.thick_start + cds_len - 1 new_row.name = "{}_orf{}".format(self.tid, index) - new_row.block_starts = [row.thick_start] - new_row.block_sizes = [cds_len] + new_row.block_starts = [0] + new_row.block_sizes = [self.cdna_length] new_row.phase = phase # self.logger.debug(new_row) new_row = BED12(new_row, @@ -849,6 +849,10 @@ def get_internal_orf_beds(self) -> List[BED12]: yield new_row + @property + def orfs(self) -> List[BED12]: + return list(self.get_internal_orf_beds()) + @Metric def is_reference(self): """Checks whether the transcript has been marked as reference by Mikado prepare""" diff --git a/Mikado/configuration/configuration.py b/Mikado/configuration/configuration.py index 1f028f25a..74c5bb5d4 100644 --- a/Mikado/configuration/configuration.py +++ b/Mikado/configuration/configuration.py @@ -1,6 +1,7 @@ import copy import dataclasses from dataclasses import field +import random from marshmallow import validate, ValidationError from marshmallow_dataclass import dataclass, Optional from .picking_config import PickConfiguration @@ -41,8 +42,10 @@ class MikadoConfiguration: "required": True }) seed: int = field(default=0, metadata={ - "metadata": {"description": "Random number generator seed, to ensure reproducibility across runs"}, - "validate": validate.Range(min=0, max=2 ** 32 - 1) + "metadata": {"description": "Random number generator seed, to ensure reproducibility across runs. 
Set to None" + "('null' in YAML/JSON/TOML files) to let Mikado select a random seed every time."}, + "validate": validate.Range(min=0, max=2 ** 32 - 1), + "allow_none": True, "required": True }) multiprocessing_method: Optional[str] = field(default="spawn", metadata={ "metadata": {"description": "Which method (fork, spawn, forkserver) Mikado should use for multiprocessing"}, @@ -75,11 +78,18 @@ def __post_init__(self): def copy(self): return copy.copy(self) - def check(self): + def check(self, logger=create_null_logger()): + if self.seed is None: + self.seed = random.randint(0, 2 ** 32 - 1) + logger.info(f"Random seed: {self.seed}") if self.scoring is None or not hasattr(self.scoring.requirements, "parameters"): - self.load_scoring() + self.load_scoring(logger=logger) self.scoring.check(minimal_orf_length=self.pick.orf_loading.minimal_orf_length) - self.Schema().validate(dataclasses.asdict(self)) + errors = self.Schema().validate(dataclasses.asdict(self)) + if len(errors) > 0: + exc = InvalidConfiguration(f"The configuration is invalid, please double check. 
Errors:\n{errors}") + logger.critical(exc) + raise exc def load_scoring(self, logger=None): """ diff --git a/Mikado/configuration/configurator.py b/Mikado/configuration/configurator.py index f1b61c637..e41114a2a 100644 --- a/Mikado/configuration/configurator.py +++ b/Mikado/configuration/configurator.py @@ -118,24 +118,16 @@ def check_and_load_scoring(configuration: Union[DaijinConfiguration, MikadoConfi try: configuration.load_scoring(logger=logger) - configuration.check() + configuration.check(logger=logger) configuration = check_db(configuration) if not configuration.multiprocessing_method: configuration.multiprocessing_method = get_start_method() - - except Exception as exc: + except InvalidConfiguration as exc: logger.exception(exc) raise - seed = configuration.seed - - if seed != 0: - # numpy.random.seed(seed % (2 ** 32 - 1)) - random.seed(seed % (2 ** 32 - 1)) - else: - # numpy.random.seed(None) - random.seed(None) - + assert configuration.seed is not None + random.seed(configuration.seed % (2 ** 32 - 1)) return configuration @@ -212,10 +204,6 @@ def load_and_validate_config(raw_configuration: Union[None, MikadoConfiguration, logger.exception("Loading the configuration file failed with error:\n%s\n\n\n", exc) raise InvalidConfiguration("The configuration file passed is invalid. Please double check.") - if config.seed == 0 or config.seed is None: - config.seed = random.randint(1, 2 ** 32 - 1) - logger.info("Random seed: {}", config.seed) - random.seed(config.seed % (2 ** 32 - 1)) return config diff --git a/Mikado/configuration/daijin_configurator.py b/Mikado/configuration/daijin_configurator.py index 444dbdd41..b07bf21b0 100644 --- a/Mikado/configuration/daijin_configurator.py +++ b/Mikado/configuration/daijin_configurator.py @@ -6,7 +6,7 @@ import toml import yaml from pkg_resources import resource_stream -from .configurator import create_cluster_config +from .configurator import create_cluster_config, load_and_validate_config from . 
import print_config from .daijin_configuration import DaijinConfiguration from .._transcripts.scoring_configuration import ScoringFile @@ -254,6 +254,8 @@ def create_daijin_config(args: Namespace, config=None, level="ERROR", piped=Fals final_config = config.copy() + final_config = load_and_validate_config(final_config) + if args.exe: with open(args.exe, "wt") as out: for key, val in dataclasses.asdict(final_config.load).items(): diff --git a/Mikado/loci/abstractlocus.py b/Mikado/loci/abstractlocus.py index a57963f75..cdd5c6362 100644 --- a/Mikado/loci/abstractlocus.py +++ b/Mikado/loci/abstractlocus.py @@ -258,6 +258,7 @@ def as_dict(self) -> dict: state["transcripts"] = dict((tid, state["transcripts"][tid].as_dict()) for tid in state["transcripts"]) assert "metrics_calculated" in state state["json_conf"] = dataclasses.asdict(state["json_conf"]) + assert state["json_conf"]["seed"] is not None return state def load_dict(self, state: dict, load_transcripts=True, load_configuration=True): diff --git a/Mikado/loci/excluded.py b/Mikado/loci/excluded.py index 3420d4aed..7fc7b9865 100644 --- a/Mikado/loci/excluded.py +++ b/Mikado/loci/excluded.py @@ -34,7 +34,6 @@ def __init__(self, monosublocus_instance=None, configuration=None, logger=None): Abstractlocus.__init__(self, configuration=configuration) self.splitted = False self.metrics_calculated = False - # self.configuration = configuration self.logger = logger if isinstance(monosublocus_instance, Transcript): Abstractlocus.__init__(self, transcript_instance=monosublocus_instance) diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py index 44c59d4af..cd83f996d 100644 --- a/Mikado/loci/locus.py +++ b/Mikado/loci/locus.py @@ -10,13 +10,11 @@ import operator from collections import defaultdict import pysam +from ..transcripts.expansion import expand_transcript from ..transcripts.transcript import Transcript -# from ..configuration.picking_config import valid_as_ccodes, redundant_as_ccodes -from 
..transcripts.transcriptchecker import TranscriptChecker from .abstractlocus import Abstractlocus from ..parsers.GFF import GffLine from ..scales.assignment.assigner import Assigner -from ..exceptions import InvalidTranscript import networkx as nx import random @@ -537,7 +535,7 @@ def add_transcript_to_locus(self, transcript: Transcript, check_in_locus=True, # Add a check similar to what we do for the minimum requirements and the fragments if to_be_added and self.configuration.scoring.as_requirements: - to_be_added = self.__check_as_requirements(transcript, is_reference=reference_pass) + to_be_added = self._check_as_requirements(transcript, is_reference=reference_pass) if to_be_added is True: is_alternative, ccode, _ = self.is_alternative_splicing(transcript) @@ -565,7 +563,7 @@ def add_transcript_to_locus(self, transcript: Transcript, check_in_locus=True, self.locus_verified_introns.update(transcript.verified_introns) - def __check_as_requirements(self, transcript: Transcript, is_reference=False) -> bool: + def _check_as_requirements(self, transcript: Transcript, is_reference=False) -> bool: """Private method to evaluate a transcript for inclusion in the locus. This method uses the "as_requirements" section of the configuration file to perform the evaluation. @@ -573,7 +571,6 @@ def __check_as_requirements(self, transcript: Transcript, is_reference=False) -> will always evaluate to True (ie the transcript is valid). 
""" - to_be_added = True if is_reference is True and self.configuration.pick.run_options.check_references is False: return True @@ -982,7 +979,8 @@ def pad_transcripts(self, backup=None) -> set: __to_modify[tid][1], self.fai, self.logger) - if (new_transcript.start == self.transcripts[tid].end) and (new_transcript.end == self.transcripts[tid].end): + if (new_transcript.start == self.transcripts[tid].end) and \ + (new_transcript.end == self.transcripts[tid].end): self.logger.debug("No expansion took place for %s!", tid) else: self.logger.debug("Expansion took place for %s!", tid) @@ -1059,8 +1057,7 @@ def define_graph(self, objects: dict, inters=None, three_prime=False) -> nx.DiGr graph = nx.DiGraph() graph.add_nodes_from(objects.keys()) - if inters is None: - inters = self._share_extreme + inters = self._share_extreme if inters is None else inters if len(objects) >= 2: if (three_prime is True and self.strand != "-") or (three_prime is False and self.strand == "-"): @@ -1424,388 +1421,3 @@ def _remove_from_redundant_splicing_codes(self, *ccodes): sub = [_ for _ in sub if _ not in ccodes] self.logger.debug("New redundant ccodes: %s", sub) self.redundant_ccodes = sub - - -def expand_transcript(transcript: Transcript, - backup: Transcript, - start_transcript: [Transcript, bool], - end_transcript: [Transcript, bool], - fai: pysam.libcfaidx.FastaFile, - logger): - - """This method will enlarge the coordinates and exon structure of a transcript, given: - :param transcript: the transcript to modify. - :type transcript: Transcript - :param start_transcript: the template transcript for the 5' end. - :param end_transcript: the template transcript for the 3' end. - :param fai: the indexed genomic sequence. - :param logger: the logger to be used in the function. 
- """ - - # If there is nothing to do, just get out - transcript.finalize() - if start_transcript not in (False, None): - start_transcript.finalize() - if end_transcript not in (False, None): - end_transcript.finalize() - - if start_transcript in (False, None) and end_transcript in (False, None): - logger.debug("%s does not need to be expanded, exiting", transcript.id) - return transcript - - if transcript.strand == "-": - start_transcript, end_transcript = end_transcript, start_transcript - - # Make a backup copy of the transcript - # First get the ORFs - # Remove the CDS and unfinalize - logger.debug("Starting expansion of %s", transcript.id) - strand = transcript.strand - transcript.strip_cds() - transcript.unfinalize() - assert strand == transcript.strand - - upstream, up_exons, new_first_exon, up_remove = _enlarge_start(transcript, backup, start_transcript) - downstream, up_exons, down_exons, down_remove = _enlarge_end(transcript, - backup, end_transcript, up_exons, new_first_exon) - - first_exon, last_exon = transcript.exons[0], transcript.exons[-1] - - assert upstream >= 0 and downstream >= 0 - - if up_remove is True: - # Remove the first exon - transcript.remove_exon(first_exon) - if down_remove is True: - if not (up_remove is True and first_exon == last_exon): - transcript.remove_exon(last_exon) - - new_exons = up_exons + down_exons - if not new_exons: - logger.debug("%s does not need to be expanded, exiting", transcript.id) - return backup - - transcript.add_exons(new_exons) - transcript.start, transcript.end = None, None - transcript.finalize() - - if transcript.strand == "-": - downstream, upstream = upstream, downstream - - if (up_exons or down_exons): - if backup.is_coding: - seq = check_expanded(transcript, backup, start_transcript, end_transcript, - fai, upstream, downstream, logger) - transcript = enlarge_orfs(transcript, backup, seq, upstream, downstream, logger) - transcript.finalize() - else: - return backup - - # Now finalize again - 
logger.debug("%s: start (before %s, now %s, %s), end (before %s, now %s, %s)", - transcript.id, - backup.start, transcript.start, transcript.start < backup.start, - backup.end, transcript.end, transcript.end > backup.end) - if transcript.start < backup.start or transcript.end > backup.end: - transcript.attributes["padded"] = True - - # Now check that we have a valid expansion - if backup.is_coding and not transcript.is_coding: - # Something has gone wrong. Just return the original transcript. - assert new_exons - logger.info("Padding %s would lead to an invalid CDS (up exons: %s). Aborting.", - transcript.id, up_exons) - return backup - elif backup.is_coding: - abort = False - if backup.strand == "-" and backup.combined_cds_end < transcript.combined_cds_end: - abort = True - elif backup.strand != "-" and backup.combined_cds_end > transcript.combined_cds_end: - abort = True - if abort is True: - msg = "Padding {} (strand: {}) would lead to an in-frame stop codon ({} to {}, \ -vs original {} to {}. Aborting.".format( - transcript.id, backup.strand, transcript.combined_cds_start, transcript.combined_cds_end, - backup.combined_cds_start, backup.combined_cds_end) - logger.info(msg) - return backup - - return transcript - - -def _enlarge_start(transcript: Transcript, - backup: Transcript, - start_transcript: Transcript) -> (int, list, [None, tuple], bool): - - """This method will enlarge the transcript at the 5' end, using another transcript as the template. - :param transcript: the original transcript to modify. - :param backup: a copy of the transcript. As we are modifying the original one, we do need a hard copy. - :param start_transcript: the template transcript. - - The function returns the following: - :returns: the upstream modification, the list of upstream exons to add, the new first exon (if any), - a boolean flag indicating whether the first exon of the transcript should be removed. 
- """ - - upstream = 0 - up_exons = [] - new_first_exon = None - to_remove = False - if start_transcript: - transcript.start = start_transcript.start - upstream_exons = sorted([_ for _ in - start_transcript.find_upstream(transcript.exons[0][0], transcript.exons[0][1]) - if _.value == "exon"]) - intersecting_upstream = sorted(start_transcript.search( - transcript.exons[0][0], transcript.exons[0][1])) - - if not intersecting_upstream: - raise KeyError("No exon or intron found to be intersecting with %s vs %s, this is a mistake", - transcript.id, start_transcript.id) - - if intersecting_upstream[0].value == "exon": - new_first_exon = (min(intersecting_upstream[0][0], backup.start), - transcript.exons[0][1]) - if new_first_exon != transcript.exons[0]: - upstream += backup.start - new_first_exon[0] - up_exons.append(new_first_exon) - to_remove = True - else: - new_first_exon = None - if intersecting_upstream[0] in upstream_exons: - upstream_exons.remove(intersecting_upstream[0]) - upstream += sum(_[1] - _[0] + 1 for _ in upstream_exons) - up_exons.extend([(_[0], _[1]) for _ in upstream_exons]) - elif intersecting_upstream[0].value == "intron": - # Check whether the first exon of the model *ends* within an *intron* of the template - # If that is the case, we have to keep the first exon in place and - # just expand it until the end - # Now we have to expand until the first exon in the upstream_exons - if intersecting_upstream[0][1] == transcript.exons[0][0] - 1: - assert upstream_exons - to_remove = False - elif upstream_exons: - to_remove = True - upstream_exon = upstream_exons[-1] - new_first_exon = (upstream_exon[0], transcript.exons[0][1]) - upstream_exons.remove(upstream_exon) - upstream += backup.start - new_first_exon[0] - up_exons.append(new_first_exon) - else: - # Something fishy going on here. Let us double check everything. - if start_transcript.exons[0][0] == transcript.start: - raise ValueError( - "Something has gone wrong. 
The template transcript should have returned upstream exons." - ) - elif start_transcript.exons[0][0] < transcript.start: - raise ValueError( - "Something has gone wrong. We should have found the correct exons." - ) - else: - pass - - upstream += sum(_[1] - _[0] + 1 for _ in upstream_exons) - up_exons.extend([(_[0], _[1]) for _ in upstream_exons]) - - return upstream, up_exons, new_first_exon, to_remove - - -def _enlarge_end(transcript: Transcript, - backup: Transcript, - end_transcript: Transcript, - up_exons: list, - new_first_exon: [None, tuple]) -> [int, list, list, bool]: - - """ - This method will enlarge the transcript at the 5' end, using another transcript as the template. - :param transcript: the original transcript to modify. - :param backup: a copy of the transcript. As we are modifying the original one, we do need a hard copy. - :param end_transcript: the template transcript. - :param up_exons: the list of exons added at the 5' end. - :param new_first_exon: the new coordinates of what used to be the first exon of the transcript. - This is necessary because if the transcript is monoexonic, we might need to re-modify it. - - The function returns the following: - :returns: the downstream modification, the (potentially modified) list of upstream exons to add, - the list of downstream exons to add, a boolean flag indicating whether the last exon of the transcript - should be removed. 
- """ - - downstream = 0 - down_exons = [] - to_remove = False - - if end_transcript: - transcript.end = end_transcript.end - downstream_exons = sorted([_ for _ in - end_transcript.find_downstream(transcript.exons[-1][0], transcript.exons[-1][1]) - if _.value == "exon"]) - intersecting_downstream = sorted(end_transcript.search( - transcript.exons[-1][0], transcript.exons[-1][1])) - if not intersecting_downstream: - raise KeyError("No exon or intron found to be intersecting with %s vs %s, this is a mistake", - transcript.id, end_transcript.id) - # We are taking the right-most intersecting element. - if intersecting_downstream[-1].value == "exon": - if transcript.monoexonic and new_first_exon is not None: - new_exon = (new_first_exon[0], max(intersecting_downstream[-1][1], new_first_exon[1])) - if new_exon != new_first_exon: - up_exons.remove(new_first_exon) - downstream += new_exon[1] - backup.end - down_exons.append(new_exon) - to_remove = True - else: - new_exon = (transcript.exons[-1][0], - max(intersecting_downstream[-1][1], transcript.exons[-1][1])) - if new_exon != transcript.exons[-1]: - downstream += new_exon[1] - backup.end - down_exons.append(new_exon) - to_remove = True - - if intersecting_downstream[-1] in downstream_exons: - downstream_exons.remove(intersecting_downstream[-1]) - downstream += sum(_[1] - _[0] + 1 for _ in downstream_exons) - down_exons.extend([(_[0], _[1]) for _ in downstream_exons]) - elif intersecting_downstream[-1].value == "intron": - # Now we have to expand until the first exon in the upstream_exons - if intersecting_downstream[-1][0] == transcript.exons[-1][1] + 1: - assert downstream_exons - to_remove = False - elif downstream_exons: - downstream_exon = downstream_exons[0] - assert downstream_exon[1] > backup.end - assert downstream_exon[0] > backup.end - if transcript.monoexonic and new_first_exon is not None: - new_exon = (new_first_exon[0], downstream_exon[1]) - up_exons.remove(new_first_exon) - to_remove = True - else: - 
new_exon = (transcript.exons[-1][0], downstream_exon[1]) - to_remove = True - downstream_exons.remove(downstream_exon) - downstream += new_exon[1] - backup.end - down_exons.append(new_exon) - else: - # Something fishy going on here. Let us double check everything. - if end_transcript.exons[-1][1] == transcript.end: - raise ValueError( - "Something has gone wrong. The template transcript should have returned upstream exons." - ) - elif end_transcript.exons[-1][1] > transcript.end: - raise ValueError( - "Something has gone wrong. We should have found the correct exons." - ) - downstream += sum(_[1] - _[0] + 1 for _ in downstream_exons) - down_exons.extend([(_[0], _[1]) for _ in downstream_exons]) - - return downstream, up_exons, down_exons, to_remove - - -def check_expanded(transcript, backup, start_transcript, end_transcript, fai, upstream, downstream, logger) -> str: - - """ - This function checks that the expanded transcript is valid, and it also calculates and returns its cDNA sequence. - :param transcript: the modified transcript. - :param backup: The original transcript, before expansion. - :param start_transcript: the transcript used as template at the 5' end. - :param end_transcript: the transcript used as template at the 3' end. - :param fai: The pysam.libcfaidx.FastaFile object indexing the genome. - :param upstream: the amount of transcriptomic base-pairs added to the transcript at its 5' end. - :param downstream: the amount of transcriptomic base-pairs added to the transcript at its 3' end. - :param logger: the logger to use. - :returns: the cDNA of the modified transcript, as a standard Python string. 
- """ - - assert transcript.exons != backup.exons - assert transcript.end <= fai.get_reference_length(transcript.chrom), ( - transcript.end, fai.get_reference_length(transcript.chrom)) - genome_seq = fai.fetch(transcript.chrom, transcript.start - 1, transcript.end) - - if not (transcript.exons[-1][1] - transcript.start + 1 == len(genome_seq)): - error = "{} should have a sequence of length {} ({} start, {} end), but one of length {} has been given" - error = error.format(transcript.id, transcript.exons[-1][1] - transcript.start + 1, - transcript.start, transcript.end, len(genome_seq)) - logger.error(error) - raise InvalidTranscript(error) - seq = TranscriptChecker(transcript, genome_seq, is_reference=True).cdna - assert len(seq) == transcript.cdna_length, (len(seq), transcript.cdna_length, transcript.exons) - if not len(seq) == backup.cdna_length + upstream + downstream: - error = [len(seq), backup.cdna_length + upstream + downstream, - backup.cdna_length, upstream, downstream, - (transcript.start, transcript.end), (backup.id, backup.start, backup.end), - (None if not start_transcript else (start_transcript.id, (start_transcript.start, - start_transcript.end))), - (None if not end_transcript else (end_transcript.id, (end_transcript.start, - end_transcript.end))), - (backup.id, backup.exons), - None if not start_transcript else (start_transcript.id, start_transcript.exons), - None if not end_transcript else (end_transcript.id, end_transcript.exons), - (transcript.id + "_expanded", transcript.exons), - set.difference(set(transcript.exons), set(backup.exons)), - set.difference(set(backup.exons), set(transcript.exons)) - ] - error = "\n".join([str(_) for _ in error]) - raise AssertionError(error) - return seq - - -def enlarge_orfs(transcript: Transcript, - backup: Transcript, - seq: str, - upstream: int, - downstream: int, - logger) -> Transcript: - - """ - This method will take an expanded transcript and recalculate its ORF(s). 
As a consequence of the expansion, - truncated transcripts might become whole. - :param transcript: the expanded transcript. - :param backup: the original transcript. Used to extract the original ORF(s). - :param seq: the new cDNA sequence of the expanded transcript. - :param upstream: the amount of expansion that happened at the 5'. - :param downstream: the amount of expansion that happened at the 3'. - :param logger: the logger. - :returns: the modified transcript with the ORF(s) recalculated. - """ - - if backup.combined_cds_length > 0: - try: - internal_orfs = list(backup.get_internal_orf_beds()) - except (ValueError, TypeError, AssertionError): - logger.error("Something went wrong with the CDS extraction for %s. Stripping it.", - backup.id) - internal_orfs = [] - else: - internal_orfs = [] - - if not internal_orfs: - return transcript - - new_orfs = [] - for orf in internal_orfs: - logger.debug("Old ORF: %s", str(orf)) - try: - logger.debug("Sequence for %s: %s[..]%s (upstream %s, downstream %s)", - transcript.id, seq[:10], seq[-10:], upstream, downstream) - orf.expand(seq, upstream, downstream, expand_orf=True, logger=logger) - except AssertionError as err: - logger.error(err) - logger.error("%s, %s, %s, %s", - upstream, - downstream, - transcript.exons, - transcript.cdna_length) - raise AssertionError(err) - logger.debug("New ORF: %s", str(orf)) - if orf.coding is False: - raise ValueError(orf) - elif orf.invalid: - raise InvalidTranscript(orf.invalid_reason) - - new_orfs.append(orf) - - transcript.load_orfs(new_orfs) - transcript.finalize() - if backup.is_coding and not transcript.is_coding: - raise InvalidTranscript(new_orfs) - return transcript diff --git a/Mikado/loci/superlocus.py b/Mikado/loci/superlocus.py index d9d489fc0..f1c5c2f65 100644 --- a/Mikado/loci/superlocus.py +++ b/Mikado/loci/superlocus.py @@ -33,7 +33,7 @@ from collections import OrderedDict as SortedDict from .locus import Locus from .excluded import Excluded -from typing import Union 
+from typing import Union, List, Dict from ..utilities import Interval, IntervalTree from itertools import combinations import random @@ -160,7 +160,6 @@ def __init__(self, self.engine = self.sessionmaker = self.session = None # Excluded object self.excluded = Excluded(configuration=self.configuration) - self.__retained_sources = set() self.__data_loaded = False self.__lost = dict() if transcript_instance is not None: @@ -236,10 +235,8 @@ def __create_sublocus_lines(self, superlocus_line: GffLine, new_id: str, print_c self.define_subloci() found = dict() for sublocus_instance in self.subloci: - try: - sublocus_instance.source = source - except AttributeError: - raise AttributeError(sublocus_instance) + assert hasattr(sublocus_instance, "source"), sublocus_instance + sublocus_instance.source = source sublocus_instance.parent = new_id if sublocus_instance.id in found: found[sublocus_instance.id] += 1 @@ -260,8 +257,7 @@ def format(self, print_cds=True, level=None): :param level: level which we wish to print for. 
Can be "loci", "subloci", "monosubloci" :return: formatted GFF strings """ - return self.__str__(print_cds=print_cds, - level=level) + return self.__str__(print_cds=print_cds, level=level) def __str__(self, level=None, print_cds=True): @@ -284,6 +280,8 @@ def __str__(self, level=None, print_cds=True): if abs(self.start) == float("inf") or abs(self.start) == maxsize: return '' + assert level in (None, "loci", "subloci", "monosubloci"), f"Unrecognized level: {level}" + superlocus_line = GffLine('') superlocus_line.chrom = self.chrom superlocus_line.feature = self.__name__ @@ -296,16 +294,10 @@ def __str__(self, level=None, print_cds=True): superlocus_line.id, superlocus_line.name = new_id, self.name if self.approximation_level > 0: superlocus_line.attributes["approximation_level"] = self.approximation_level - if len(self.__retained_sources) > 0: - superlocus_line.attributes["retained_sources"] = ",".join( - sorted(list(self.__retained_sources)) - ) lines = [] - if level not in (None, "loci", "subloci", "monosubloci"): - raise ValueError("Unrecognized level: {0}".format(level)) - elif level == "loci" or (level is None and self.loci_defined is True): + if level == "loci" or (level is None and self.loci_defined is True): lines = self.__create_locus_lines( superlocus_line, new_id, @@ -315,9 +307,6 @@ def __str__(self, level=None, print_cds=True): lines = self.__create_monolocus_holder_lines(superlocus_line, new_id, print_cds=print_cds) - # lines = self.__create_monolocus_lines(superlocus_line, - # new_id, - # print_cds=print_cds) elif level == "subloci" or (level is None and self.monosubloci_defined is False): lines = self.__create_sublocus_lines(superlocus_line, new_id, @@ -584,17 +573,15 @@ async def get_external(self, query_ids, qids): External.query_id.in_(qids))) for ext in self.session.execute(baked): source_id, query_id, score = ext.source_id, ext.query_id, ext.score - if source_id not in sources or query_id not in qids: - continue + assert source_id in sources 
and query_id in qids rtype = sources[source_id].rtype + assert rtype in ("int", "float", "bool"), f"Invalid rtype: {rtype}" if rtype == "int": score = int(score) elif rtype == "float": score = float(score) elif rtype == "bool": score = bool(int(score)) - else: - raise ValueError("Invalid rtype: {}".format(sources[ext.source_id].rtype)) external[query_ids[ext.query_id].query_name][ sources[ext.source_id].source] = (score, sources[ext.source_id].valid_raw) return external @@ -637,7 +624,7 @@ async def get_hits(self, query_ids, qids): ) return hits - async def get_orfs(self, qids): + async def get_orfs(self, qids) -> Dict[str, List]: orfs = collections.defaultdict(list) for orf in orfs_baked(self.session).params(queries=qids): orfs[orf.query].append(orf.as_bed12()) diff --git a/Mikado/parsers/__init__.py b/Mikado/parsers/__init__.py index a47166b6b..6c8f17847 100644 --- a/Mikado/parsers/__init__.py +++ b/Mikado/parsers/__init__.py @@ -74,6 +74,8 @@ def parser_factory(string, input_format=None): continue if found: break + else: + raised[test.__annot_type__] = "No valid line found." except InvalidParsingFormat as exc: raised[test.__annot_type__] = exc continue @@ -83,6 +85,7 @@ def parser_factory(string, input_format=None): elif found: return test(string) else: + raise InvalidParsingFormat( "Invalid file specified: {} should have been of format {}, but it could not be verified. 
Error:\n{}".format( fname if fname != "-" else "stream", input_format, raised[input_format] diff --git a/Mikado/parsers/bed12.py b/Mikado/parsers/bed12.py index 67cd80794..f7adf86b7 100644 --- a/Mikado/parsers/bed12.py +++ b/Mikado/parsers/bed12.py @@ -34,6 +34,7 @@ import numpy as np import random import pprint as pp +from math import modf from Bio.Data import IUPACData @@ -46,7 +47,15 @@ IUPACData.extended_protein_values) assert standard.start_codons == ["ATG"] assert CodonTable.ambiguous_dna_by_id[1].start_codons != ["ATG"] -CodonTable.ambiguous_dna_by_id[0] = standard + +ambiguous_dna_by_id = dict() +ambiguous_dna_by_name = dict() +for key, table in CodonTable.ambiguous_dna_by_name.items(): + ambiguous_dna_by_name[key] = table + +for key, table in CodonTable.ambiguous_dna_by_id.items(): + ambiguous_dna_by_id[key] = table +ambiguous_dna_by_id[0] = standard @functools.lru_cache(typed=True, maxsize=2**10) @@ -421,26 +430,41 @@ def table(self): @table.setter def table(self, table): - # We are going to receive a string, so we need first to convert to integer - try: + if isinstance(table, bool): # Boolean can be considered as int so this requires special handling + raise ValueError(f"Invalid table specified: {table} (type {type(table)})") + elif table is not None and not isinstance(table, (int, float, bytes, str)): + raise ValueError(f"Invalid table specified: {table} (type {type(table)})") + elif isinstance(table, (str, bytes)): + table = table.decode() if isinstance(table, bytes) else table + if table.isdigit() is True: + table = int(table) + elif re.search(r"^[0-9]*\.[0-9]$", table): + table = float(table) + if modf(table) != 0: + raise ValueError(f"Invalid table specified: {table}") + table = int(table) + elif isinstance(table, float): + if modf(table) != 0: + raise ValueError(f"Invalid table specified: {table}") table = int(table) - except (ValueError, TypeError): - pass + if table is None: self.__table = standard self.__table_index = 0 elif isinstance(table, 
int): - self.__table = CodonTable.ambiguous_dna_by_id[table] - self.__table_index = 0 + if table not in ambiguous_dna_by_id.keys(): + raise ValueError(f"Invalid table code specified: {table}. Available codes: " + f"{', '.join([str(_) for _ in ambiguous_dna_by_id.keys()])}") + self.__table = ambiguous_dna_by_id[table] + assert self.__table.start_codons == ["ATG"] if table == 0 else True, f"Invalid codons for table 0: " \ + f"{self.__table.start_codons}" + self.__table_index = table elif isinstance(table, str): - self.__table = CodonTable.ambiguous_dna_by_name[table] - self.__table_index = self.__table._codon_table.id - elif isinstance(table, bytes): - self.__table = CodonTable.ambiguous_dna_by_name[table.decode()] - self.__table_index = self.__table._codon_table.id - else: - raise ValueError("Invalid table: {} (type: {})".format( - table, type(table))) + if table not in ambiguous_dna_by_name.keys(): + raise ValueError(f"Invalid table name specified: {table}. Available table: " + f"{', '.join([str(_) for _ in ambiguous_dna_by_name.keys()])}") + self.__table = ambiguous_dna_by_name[table] + self.__table_index = ambiguous_dna_by_name[table].id return @parent.setter @@ -1399,19 +1423,11 @@ def to_transcriptomic(self, sequence=None, fasta_index=None, start_adjustment=Fa seen += block[1] - block[0] + 1 # Check thick start and end are defined - error = "" - if tStart is None: - error += """The thick start of {self.id} ({self.chrom}:{self.start}-{self.end}) is invalid as it is outside of the defined exons. -Thick start: {self.thick_start} -Exons: {self.blocks}\n""".format(self=self) - - if tStart is None or tEnd is None: - error += """The thick end of {self.id} ({self.chrom}:{self.start}-{self.end}) is invalid as it is outside of the defined exons. 
-Thick end: {self.thick_end} -Exons: {self.blocks}\n""".format(self=self) - if error: - raise ValueError(error) + assert tStart is not None and tEnd is not None, f"The thick start, thick end of {self.id} are invalid " \ + f"as they are outside of the defined exons.\nThick start: " \ + f"{self.thick_start}\nThick end: {self.thick_end}\n" \ + f"Exons: {self.blocks}" if self.strand == "+": bsizes = self.block_sizes[:] @@ -1462,9 +1478,7 @@ def to_transcriptomic(self, sequence=None, fasta_index=None, start_adjustment=Fa transcriptomic=True, lenient=lenient, start_adjustment=start_adjustment) - if not isinstance(new, type(self)): - raise TypeError("The new object is of type {tnew} instead of {tself}!".format(tnew=type(new), - tself=type(self))) + assert isinstance(new, type(self)), f"The new object is of type {type(new)} instead of {type(self)}!" return new @property @@ -1570,7 +1584,7 @@ def __next__(self, seq=None): else: return self.gff_next() except (ValueError, KeyError, TypeError, UnicodeError, AttributeError, AssertionError, InvalidParsingFormat) as exc: - raise InvalidParsingFormat("This is not a valid BED12 file! Exception: {}".format(exc)) + raise InvalidParsingFormat(f"This is not a valid BED12 file! 
Exception: {exc}") def __getstate__(self): state = super().__getstate__() diff --git a/Mikado/preparation/prepare.py b/Mikado/preparation/prepare.py index d87fc2329..a9b4ebeed 100644 --- a/Mikado/preparation/prepare.py +++ b/Mikado/preparation/prepare.py @@ -360,6 +360,11 @@ def perform_check(keys, shelve_names, mikado_config: MikadoConfiguration, logger row_columns = ["chrom", "start", "end", "strand", "tid", "write_start", "write_length", "shelf"] +def _get_strand_specific_assemblies_boolean_vector(mikado_config): + return [(member in mikado_config.prepare.files.strand_specific_assemblies) + for member in mikado_config.prepare.files.gff] + + def _load_exon_lines_single_thread(mikado_config, shelve_names, logger, min_length, strip_cds, max_intron): logger.info("Starting to load lines from %d files (single-threaded)", @@ -373,7 +378,7 @@ def _load_exon_lines_single_thread(mikado_config, shelve_names, logger, min_leng to_do = list(zip( shelve_names, mikado_config.prepare.files.labels, - mikado_config.prepare.files.strand_specific_assemblies, + _get_strand_specific_assemblies_boolean_vector(mikado_config), mikado_config.prepare.files.reference, mikado_config.prepare.files.exclude_redundant, mikado_config.prepare.files.strip_cds, @@ -384,7 +389,7 @@ def _load_exon_lines_single_thread(mikado_config, shelve_names, logger, min_leng ( shelve_names, mikado_config.prepare.files.labels, - mikado_config.prepare.files.strand_specific_assemblies, + _get_strand_specific_assemblies_boolean_vector(mikado_config), mikado_config.prepare.files.reference, mikado_config.prepare.files.exclude_redundant, mikado_config.prepare.files.strip_cds, @@ -459,7 +464,7 @@ def _load_exon_lines_multi(mikado_config, shelve_names, logger, min_length, stri exclude_redundant, file_strip_cds, gff_name) in enumerate(zip( shelve_names, mikado_config.prepare.files.labels, - mikado_config.prepare.files.strand_specific_assemblies, + _get_strand_specific_assemblies_boolean_vector(mikado_config), 
mikado_config.prepare.files.reference, mikado_config.prepare.files.exclude_redundant, mikado_config.prepare.files.strip_cds, @@ -581,12 +586,7 @@ def prepare(mikado_config: MikadoConfiguration, logger): ) if mikado_config.prepare.strand_specific is True: - mikado_config.prepare.files.strand_specific_assemblies = [True] * len( - mikado_config.prepare.files.gff) - else: - mikado_config.prepare.files.strand_specific_assemblies = [ - (member in mikado_config.prepare.files.strand_specific_assemblies) - for member in mikado_config.prepare.files.gff] + mikado_config.prepare.files.strand_specific_assemblies = mikado_config.prepare.files.gff[:] ref_len = len(mikado_config.prepare.files.reference) file_len = len(mikado_config.prepare.files.gff) diff --git a/Mikado/subprograms/configure.py b/Mikado/subprograms/configure.py index ef3bba0c8..748f0ea54 100644 --- a/Mikado/subprograms/configure.py +++ b/Mikado/subprograms/configure.py @@ -101,7 +101,10 @@ def create_config(args): args.gff = [] config = parse_prepare_options(args, config) - config.seed = args.seed if args.seed is not None else config.seed + if args.random_seed is True: + config.seed = None + else: + config.seed = args.seed config.serialise.files.junctions = args.junctions if args.junctions is not None else \ config.serialise.files.junctions @@ -128,7 +131,6 @@ def create_config(args): config.pick.output_format.report_all_external_metrics = True if args.report_all_external_metrics else \ config.pick.output_format.report_all_external_metrics - if args.scoring is not None: if args.copy_scoring is not False: with open(args.copy_scoring, "wt") as out: @@ -181,6 +183,8 @@ def create_config(args): config.serialise.files.output_dir = args.out_dir config.pick.files.output_dir = args.out_dir + config.check() + # Check that the configuration file is correct with tempfile.NamedTemporaryFile("wt", suffix=".json", delete=True) as tempcheck: print_config(config, tempcheck, full=args.full, output_format="json") @@ -217,8 +221,10 
@@ def configure_parser(): parser = argparse.ArgumentParser(description="Configuration utility for Mikado") parser.add_argument("--full", action="store_true", default=False) - parser.add_argument("--seed", type=int, default=0, - help="Random seed number.") + seed_group = parser.add_mutually_exclusive_group() + seed_group.add_argument("--seed", type=int, default=0, help="Random seed number. Default: 0.") + seed_group.add_argument("--random-seed", action="store_true", default=False, + help="Generate a new random seed number (instead of the default of 0)") preparer = parser.add_argument_group("Options related to the prepare stage.") preparer.add_argument("--minimum-cdna-length", default=None, type=int, dest="minimum_cdna_length", help="Minimum cDNA length for transcripts.") diff --git a/Mikado/subprograms/pick.py b/Mikado/subprograms/pick.py index 5fb434c1b..4b21a165a 100644 --- a/Mikado/subprograms/pick.py +++ b/Mikado/subprograms/pick.py @@ -133,7 +133,13 @@ def _set_conf_values_from_args(conf: Union[DaijinConfiguration, MikadoConfigurat conf.multiprocessing_method = args.start_method if args.start_method else conf.multiprocessing_method conf.threads = args.procs if args.procs is not None else conf.threads - conf.seed = args.seed if args.seed is not None else conf.seed + if args.random_seed is True: + conf.seed = None + elif args.seed is not None: + conf.seed = args.seed + else: + pass + conf.pick.scoring_file = args.scoring_file if args.scoring_file is not None else conf.pick.scoring_file conf.prepare.max_intron_length = args.max_intron_length if args.max_intron_length is not None else \ @@ -402,8 +408,10 @@ def pick_parser(): either of the ORFs lacks a BLAST hit (but not both). 
- permissive: like lenient, but also split when both ORFs lack BLAST hits - split: split multi-orf transcripts regardless of what BLAST data is available.""") - parser.add_argument("--seed", type=int, default=None, - help="Random seed number.") + seed_group = parser.add_mutually_exclusive_group() + seed_group.add_argument("--seed", type=int, default=None, help="Random seed number. Default: 0.") + seed_group.add_argument("--random-seed", action="store_true", default=False, + help="Generate a new random seed number (instead of the default of 0)") parser.add_argument("gff", nargs="?", default=None) parser.set_defaults(func=pick) return parser diff --git a/Mikado/subprograms/prepare.py b/Mikado/subprograms/prepare.py index 24a68491c..1c0b0a04f 100644 --- a/Mikado/subprograms/prepare.py +++ b/Mikado/subprograms/prepare.py @@ -120,8 +120,14 @@ def parse_prepare_options(args, mikado_config) -> Union[DaijinConfiguration, Mik mikado_config.serialise.codon_table = str(args.codon_table) if ( getattr(args, "codon_table", None) not in (None, False, True)) else mikado_config.serialise.codon_table - mikado_config.seed = args.seed if args.seed is not None else mikado_config.seed + if args.random_seed is True: + mikado_config.seed = None + elif args.seed is not None: + mikado_config.seed = args.seed + else: + pass + mikado_config.check() assert isinstance(mikado_config.reference.genome, str) return mikado_config @@ -284,8 +290,10 @@ def positive(string): cds_stripping.add_argument("--strip-faulty-cds", default=None, action="store_true", help="Flag. If set, transcripts with an incorrect CDS will be retained but \ with their CDS stripped. Default behaviour: the whole transcript will be considered invalid and discarded.") - parser.add_argument("--seed", type=int, default=None, - help="Random seed number.") + seed_group = parser.add_mutually_exclusive_group() + seed_group.add_argument("--seed", type=int, default=None, help="Random seed number. 
Default: 0.") + seed_group.add_argument("--random-seed", action="store_true", default=False, + help="Generate a new random seed number (instead of the default of 0)") parser.add_argument("gff", help="Input GFF/GTF file(s).", nargs="*") parser.set_defaults(func=prepare_launcher) return parser diff --git a/Mikado/subprograms/serialise.py b/Mikado/subprograms/serialise.py index b8b7db15e..2d3c30947 100644 --- a/Mikado/subprograms/serialise.py +++ b/Mikado/subprograms/serialise.py @@ -289,7 +289,14 @@ def setup(args): logger.setLevel("INFO") logger.info("Command line: %s", " ".join(sys.argv)) - mikado_configuration.seed = args.seed if args.seed is not None else mikado_configuration.seed + if args.random_seed is True: + mikado_configuration.seed = None + elif args.seed is not None: + mikado_configuration.seed = args.seed + else: + pass + + mikado_configuration.check() random.seed(mikado_configuration.seed) logger.info("Random seed: %s", mikado_configuration.seed) logger.setLevel(mikado_configuration.log_settings.log_level) @@ -449,7 +456,9 @@ def serialise_parser(): generic.add_argument("db", type=str, default=None, nargs='?', help="Optional output database. Default: derived from configuration") - generic.add_argument("--seed", type=int, default=None, - help="Random seed number.") + seed_group = parser.add_mutually_exclusive_group() + seed_group.add_argument("--seed", type=int, default=None, help="Random seed number. 
Default: 0.") + seed_group.add_argument("--random-seed", action="store_true", default=False, + help="Generate a new random seed number (instead of the default of 0)") parser.set_defaults(func=serialise) return parser diff --git a/Mikado/tests/locus_test.py b/Mikado/tests/locus_test.py index c005f9fe1..f1db3f490 100644 --- a/Mikado/tests/locus_test.py +++ b/Mikado/tests/locus_test.py @@ -516,6 +516,7 @@ def setUp(self): self.assertIsNotNone(self.configuration.scoring, self.configuration) self.transcript1.configuration = self.configuration self.transcript2.configuration = self.configuration + self.assertEqual(self.transcript1.configuration.seed, self.transcript2.configuration.seed) def test_create_metrics_row(self): @@ -999,6 +1000,7 @@ def test_serialisation(self): def test_slocus_dicts(self): + self.assertEqual(self.transcript1.configuration.seed, self.transcript2.configuration.seed) locus = Superlocus(self.transcript1) locus.add_transcript_to_locus(self.transcript2, check_in_locus=False) locus.subloci = [Sublocus(self.transcript1)] @@ -1006,7 +1008,7 @@ def test_slocus_dicts(self): locus.loci = {l.id: l} ml = MonosublocusHolder(Monosublocus(self.transcript1)) locus.monoholders = [ml] - locus.excluded = Excluded(self.transcript2) + locus.excluded = Excluded(self.transcript2, configuration=locus.configuration) conf = locus.configuration.copy() _without = locus.as_dict(with_subloci=False, with_monoholders=False) self.assertEqual(_without["subloci"], []) @@ -1014,9 +1016,15 @@ def test_slocus_dicts(self): self.assertEqual(_without["excluded"], locus.excluded.as_dict()) self.assertEqual(_without["loci"], {l.id: l.as_dict()}) _with = locus.as_dict(with_subloci=True, with_monoholders=True) + self.assertIsNotNone(_with["json_conf"]["seed"]) + self.assertEqual(_with["json_conf"]["seed"], conf.seed) self.assertEqual(_with["subloci"], [locus.subloci[0].as_dict()]) self.assertEqual(_with["monoholders"], [ml.as_dict()]) - self.assertEqual(_with["excluded"], 
Excluded(self.transcript2).as_dict()) + self.assertEqual(conf.seed, locus.configuration.seed) + self.assertEqual(conf.seed, self.transcript2.configuration.seed) + excl = Excluded(self.transcript2, configuration=conf) + self.assertEqual(excl.configuration.seed, locus.configuration.seed) + self.assertEqual(_with["excluded"], Excluded(self.transcript2, configuration=conf).as_dict()) self.assertEqual(_with["loci"], {l.id: l.as_dict()}) self.assertIsInstance(_with["json_conf"], dict) # Now test the reloading @@ -4234,11 +4242,11 @@ def test_complete_padding(self): locus.logger = logger locus.configuration.pick.alternative_splicing.ts_distance = pad_distance locus.configuration.pick.alternative_splicing.ts_max_splices = max_splice - # locus.logger.setLevel("DEBUG") + locus.logger.setLevel("DEBUG") locus.pad_transcripts() locus.logger.setLevel("WARNING") - - self.assertEqual(locus[best].start, transcripts["AT5G01030.2"].start) + self.assertEqual(transcripts["AT5G01030.2"].start, 9869) + self.assertEqual(locus[best].start, 9869) self.assertIn(best, locus) if max_splice < 2 or pad_distance <= 250: with self.assertLogs(logger, "DEBUG") as cm: diff --git a/Mikado/tests/test_bed12.py b/Mikado/tests/test_bed12.py index abb23936b..e86160d51 100644 --- a/Mikado/tests/test_bed12.py +++ b/Mikado/tests/test_bed12.py @@ -97,6 +97,27 @@ def test_ambiguous(self): if ambigouous is None: return # Nothing to test + def test_set_table(self): + b = BED12() + for invalid in (True, list(), "Inexistent", b"Standard2"): + with self.assertRaises(ValueError): + b.table = invalid + self.assertNotIn(0, CodonTable.ambiguous_dna_by_id.keys()) + for num in range(0, max(CodonTable.ambiguous_dna_by_id.keys()) + 10): + if num in CodonTable.ambiguous_dna_by_id.keys(): + b.table = num + self.assertEqual(b.table, CodonTable.ambiguous_dna_by_id[num]) + elif num == 0: + b.table = num + self.assertEqual(b.table, standard) + else: + with self.assertRaises(ValueError): + b.table = num + + for valid in 
list(CodonTable.ambiguous_dna_by_name.keys()): + b.table = valid + self.assertEqual(b.table, CodonTable.ambiguous_dna_by_name[valid]) + class Bed12GenToTrans(unittest.TestCase): diff --git a/Mikado/tests/test_external_async.py b/Mikado/tests/test_external_async.py index f7479a8e7..6a7c3828d 100644 --- a/Mikado/tests/test_external_async.py +++ b/Mikado/tests/test_external_async.py @@ -3,6 +3,8 @@ from Mikado._transcripts.scoring_configuration import MinMaxScore, SizeFilter from Mikado.configuration.configurator import load_and_validate_config from Mikado.loci import Superlocus +from Mikado.parsers.bed12 import BED12 +from Mikado.serializers.blast_serializer import Target, Hit, Hsp from Mikado.serializers.external import External, ExternalSource from Mikado.serializers.blast_serializer.query import Query from Mikado.serializers.orf import Orf @@ -195,4 +197,43 @@ def test_retrieval(self): class AsyncOrfLoading(unittest.TestCase): def test_load_orfs(self): - """""" + + transcript_line = 'Chr1\t100\t2000\tID=foo;coding=True;phase=0'\ + '\t0\t+\t300\t1850\t0\t4\t400,400,400,200\t0,500,1100,1700' + transcript = Transcript(transcript_line) + orf = transcript.orfs[0].to_transcriptomic() + transcript2 = transcript.copy() + transcript2.unfinalize() + transcript2.chrom = "Chr2" + transcript2.id = "foo.2" + transcript2.finalize() + other_orf = transcript2.orfs[0].to_transcriptomic() + engine = create_engine("sqlite:///:memory:") + db.metadata.create_all(engine) + SessionMaker = sessionmaker(bind=engine) + session = SessionMaker() + query = Query(transcript.id, transcript.cdna_length) + query2 = Query(transcript2.id, transcript2.cdna_length) + session.add_all([query, query2]) + session.commit() + serialized_orf = Orf(orf, query.query_id) + self.assertEqual(serialized_orf.thick_end, orf.thick_end) + self.assertEqual(serialized_orf.cds_len, orf.cds_len) + serialized_other_orf = Orf(other_orf, query2.query_id) + session.add_all([serialized_orf, serialized_other_orf]) + 
session.commit() + sup = Superlocus(transcript) + sup.session = session + sup_orfs = asyncio.run(sup.get_orfs([query.query_id])) + self.assertEqual(len(sup_orfs), 1) + self.assertIn(transcript.id, sup_orfs) + self.assertEqual(len(sup_orfs[transcript.id]), 1) + self.assertIsInstance(sup_orfs[transcript.id][0], BED12, type(sup_orfs[transcript.id][0])) + self.assertTrue(sup_orfs[transcript.id][0] == orf, "\n" + "\n".join( + [str(orf), str(sup_orfs[transcript.id][0])])) + + +# TODO: Create a test for the BLAST hits/hsps + +class AsyncBlastTest(unittest.TestCase): + """Test for the functionality of loading a BLAST hit from a Superlocus object.""" diff --git a/Mikado/tests/test_system_calls.py b/Mikado/tests/test_system_calls.py index 9ceef1151..04e2dbafb 100644 --- a/Mikado/tests/test_system_calls.py +++ b/Mikado/tests/test_system_calls.py @@ -527,7 +527,7 @@ def test_cdna_redundant_cds_not(self): args.procs = 1 args.list = None args.gffs = None - args.strand_specific_assemblies = None + args.strand_specific_assemblies = [] args.labels = None args.configuration = self.conf args.exclude_redundant = b @@ -538,6 +538,8 @@ def test_cdna_redundant_cds_not(self): args.log = "prepare.log" self.logger.setLevel("DEBUG") assert os.path.exists(folder) + self.assertEqual(args.strand_specific_assemblies, []) + self.assertEqual(args.configuration.prepare.files.strand_specific_assemblies, []) args, mikado_configuration, _logger = prepare_setup(args) self.assertIsNotNone(mikado_configuration) # self.assertEqual(args.output_dir, folder) @@ -2427,6 +2429,7 @@ def test_xml_vs_tsv(self): args.log = "{}_{}.log".format(name, proc) args.xml = blast args.procs = proc + args.start_adjustment = True serialise(args) dbs[name][proc] = os.path.join(test_xml_vs_tsv_folder, args.db) logged = [_.rstrip() for _ in open(os.path.join(test_xml_vs_tsv_folder, args.log))] diff --git a/Mikado/transcripts/pad.py b/Mikado/transcripts/pad.py new file mode 100644 index 000000000..9fe7b97ac --- /dev/null +++ 
b/Mikado/transcripts/pad.py @@ -0,0 +1,388 @@ +from .transcript import Transcript +import pysam +from ..exceptions import InvalidTranscript +from .transcriptchecker import TranscriptChecker + + +def expand_transcript(transcript: Transcript, + backup: Transcript, + start_transcript: [Transcript, bool], + end_transcript: [Transcript, bool], + fai: pysam.libcfaidx.FastaFile, + logger): + + """This method will enlarge the coordinates and exon structure of a transcript, given: + :param transcript: the transcript to modify. + :type transcript: Transcript + :param backup: a copy of the transcript to be modified. + :type backup: Transcript + :param start_transcript: the template transcript for the 5' end. + :param end_transcript: the template transcript for the 3' end. + :param fai: the indexed genomic sequence. + :param logger: the logger to be used in the function. + """ + + # If there is nothing to do, just get out + assert transcript == backup + transcript.finalize() + if start_transcript not in (False, None): + start_transcript.finalize() + if end_transcript not in (False, None): + end_transcript.finalize() + + if start_transcript in (False, None) and end_transcript in (False, None): + logger.debug("%s does not need to be expanded, exiting", transcript.id) + return transcript + + if transcript.strand == "-": + start_transcript, end_transcript = end_transcript, start_transcript + + # Make a backup copy of the transcript + # First get the ORFs + # Remove the CDS and unfinalize + logger.debug("Starting expansion of %s", transcript.id) + strand = transcript.strand + transcript.strip_cds() + transcript.unfinalize() + assert strand == transcript.strand + + upstream, up_exons, new_first_exon, up_remove = _enlarge_start(transcript, backup, start_transcript) + downstream, up_exons, down_exons, down_remove = _enlarge_end(transcript, + backup, end_transcript, up_exons, new_first_exon) + + first_exon, last_exon = transcript.exons[0], transcript.exons[-1] + + assert upstream >= 0 
and downstream >= 0 + + if up_remove is True: + # Remove the first exon + transcript.remove_exon(first_exon) + if down_remove is True: + if not (up_remove is True and first_exon == last_exon): + transcript.remove_exon(last_exon) + + new_exons = up_exons + down_exons + if not new_exons: + logger.debug("%s does not need to be expanded, exiting", transcript.id) + return backup + + transcript.add_exons(new_exons) + transcript.start, transcript.end = None, None + transcript.finalize() + + if transcript.strand == "-": + downstream, upstream = upstream, downstream + + if backup.is_coding: + seq = check_expanded(transcript, backup, start_transcript, end_transcript, + fai, upstream, downstream, logger) + transcript = enlarge_orfs(transcript, backup, seq, upstream, downstream, logger) + transcript.finalize() + + logger.debug("%s: start (before %s, now %s, %s), end (before %s, now %s, %s)", + transcript.id, + backup.start, transcript.start, transcript.start < backup.start, + backup.end, transcript.end, transcript.end > backup.end) + if transcript.start < backup.start or transcript.end > backup.end: + transcript.attributes["padded"] = True + + # Now check that we have a valid expansion + if backup.is_coding and not transcript.is_coding: + # Something has gone wrong. Just return the original transcript. + assert new_exons + logger.info("Padding %s would lead to an invalid CDS (up exons: %s). Aborting.", + transcript.id, up_exons) + return backup + elif backup.is_coding: + abort = False + if backup.strand == "-" and backup.combined_cds_end < transcript.combined_cds_end: + abort = True + elif backup.strand != "-" and backup.combined_cds_end > transcript.combined_cds_end: + abort = True + if abort is True: + msg = "Padding {} (strand: {}) would lead to an in-frame stop codon ({} to {}, \ +vs original {} to {}. 
Aborting.".format( + transcript.id, backup.strand, transcript.combined_cds_start, transcript.combined_cds_end, + backup.combined_cds_start, backup.combined_cds_end) + logger.info(msg) + return backup + + return transcript + + +def _enlarge_start(transcript: Transcript, + backup: Transcript, + start_transcript: Transcript) -> (int, list, [None, tuple], bool): + + """This method will enlarge the transcript at the 5' end, using another transcript as the template. + :param transcript: the original transcript to modify. + :param backup: a copy of the transcript. As we are modifying the original one, we do need a hard copy. + :param start_transcript: the template transcript. + + The function returns the following: + :returns: the upstream modification, the list of upstream exons to add, the new first exon (if any), + a boolean flag indicating whether the first exon of the transcript should be removed. + """ + + upstream = 0 + up_exons = [] + new_first_exon = None + to_remove = False + if start_transcript: + transcript.start = start_transcript.start + upstream_exons = sorted( + [_ for _ in start_transcript.find_upstream(transcript.exons[0][0], transcript.exons[0][1]) + if _.value == "exon"]) + intersecting_upstream = sorted(start_transcript.search( + transcript.exons[0][0], transcript.exons[0][1])) + + if not intersecting_upstream: + raise KeyError("No exon or intron found to be intersecting with %s vs %s, this is a mistake", + transcript.id, start_transcript.id) + + if intersecting_upstream[0].value == "exon": + new_first_exon = (min(intersecting_upstream[0][0], backup.start), + transcript.exons[0][1]) + if new_first_exon != transcript.exons[0]: + upstream += backup.start - new_first_exon[0] + up_exons.append(new_first_exon) + to_remove = True + else: + new_first_exon = None + if intersecting_upstream[0] in upstream_exons: + upstream_exons.remove(intersecting_upstream[0]) + upstream += sum(_[1] - _[0] + 1 for _ in upstream_exons) + up_exons.extend([(_[0], _[1]) for _ in 
upstream_exons]) + elif intersecting_upstream[0].value == "intron": + # Check whether the first exon of the model *ends* within an *intron* of the template + # If that is the case, we have to keep the first exon in place and + # just expand it until the end + # Now we have to expand until the first exon in the upstream_exons + if intersecting_upstream[0][1] == transcript.exons[0][0] - 1: + assert upstream_exons + to_remove = False + elif upstream_exons: + to_remove = True + upstream_exon = upstream_exons[-1] + new_first_exon = (upstream_exon[0], transcript.exons[0][1]) + upstream_exons.remove(upstream_exon) + upstream += backup.start - new_first_exon[0] + up_exons.append(new_first_exon) + else: + # Something fishy going on here. Let us double check everything. + if start_transcript.exons[0][0] == transcript.start: + raise ValueError( + "Something has gone wrong. The template transcript should have returned upstream exons." + ) + elif start_transcript.exons[0][0] < transcript.start: + raise ValueError( + "Something has gone wrong. We should have found the correct exons." + ) + else: + pass + + upstream += sum(_[1] - _[0] + 1 for _ in upstream_exons) + up_exons.extend([(_[0], _[1]) for _ in upstream_exons]) + + return upstream, up_exons, new_first_exon, to_remove + + +def _enlarge_end(transcript: Transcript, + backup: Transcript, + end_transcript: Transcript, + up_exons: list, + new_first_exon: [None, tuple]) -> [int, list, list, bool]: + + """ + This method will enlarge the transcript at the 5' end, using another transcript as the template. + :param transcript: the original transcript to modify. + :param backup: a copy of the transcript. As we are modifying the original one, we do need a hard copy. + :param end_transcript: the template transcript. + :param up_exons: the list of exons added at the 5' end. + :param new_first_exon: the new coordinates of what used to be the first exon of the transcript. 
+ This is necessary because if the transcript is monoexonic, we might need to re-modify it. + + The function returns the following: + :returns: the downstream modification, the (potentially modified) list of upstream exons to add, + the list of downstream exons to add, a boolean flag indicating whether the last exon of the transcript + should be removed. + """ + + downstream = 0 + down_exons = [] + to_remove = False + + if end_transcript: + transcript.end = end_transcript.end + downstream_exons = sorted( + [_ for _ in end_transcript.find_downstream(transcript.exons[-1][0], transcript.exons[-1][1]) + if _.value == "exon"]) + intersecting_downstream = sorted(end_transcript.search( + transcript.exons[-1][0], transcript.exons[-1][1])) + if not intersecting_downstream: + raise KeyError("No exon or intron found to be intersecting with %s vs %s, this is a mistake", + transcript.id, end_transcript.id) + # We are taking the right-most intersecting element. + if intersecting_downstream[-1].value == "exon": + if transcript.monoexonic and new_first_exon is not None: + new_exon = (new_first_exon[0], max(intersecting_downstream[-1][1], new_first_exon[1])) + if new_exon != new_first_exon: + up_exons.remove(new_first_exon) + downstream += new_exon[1] - backup.end + down_exons.append(new_exon) + to_remove = True + else: + new_exon = (transcript.exons[-1][0], + max(intersecting_downstream[-1][1], transcript.exons[-1][1])) + if new_exon != transcript.exons[-1]: + downstream += new_exon[1] - backup.end + down_exons.append(new_exon) + to_remove = True + + if intersecting_downstream[-1] in downstream_exons: + downstream_exons.remove(intersecting_downstream[-1]) + downstream += sum(_[1] - _[0] + 1 for _ in downstream_exons) + down_exons.extend([(_[0], _[1]) for _ in downstream_exons]) + elif intersecting_downstream[-1].value == "intron": + # Now we have to expand until the first exon in the upstream_exons + if intersecting_downstream[-1][0] == transcript.exons[-1][1] + 1: + assert 
def check_expanded(transcript, backup, start_transcript, end_transcript, fai, upstream, downstream, logger) -> str:

    """
    This function checks that the expanded transcript is valid, and it also calculates and returns its cDNA sequence.

    The sanity checks are raised explicitly rather than through ``assert`` statements, so that they are still
    enforced when the interpreter runs with optimisations enabled (``python -O``). The exception type is unchanged.

    :param transcript: the modified transcript.
    :param backup: The original transcript, before expansion.
    :param start_transcript: the transcript used as template at the 5' end. It may be None.
    :param end_transcript: the transcript used as template at the 3' end. It may be None.
    :param fai: The pysam.libcfaidx.FastaFile object indexing the genome.
    :param upstream: the amount of transcriptomic base-pairs added to the transcript at its 5' end.
    :param downstream: the amount of transcriptomic base-pairs added to the transcript at its 3' end.
    :param logger: the logger to use.
    :returns: the cDNA of the modified transcript, as a standard Python string.
    :raises AssertionError: if the exons did not change, if the transcript ends beyond the end of its reference
                            sequence, or if the cDNA length is inconsistent with the transcript or with the
                            expected amount of expansion.
    :raises InvalidTranscript: if the fetched genomic sequence does not have the expected length.
    """

    if transcript.exons == backup.exons:
        raise AssertionError("The expanded transcript has the same exons as the original one.")

    reference_length = fai.get_reference_length(transcript.chrom)
    if transcript.end > reference_length:
        raise AssertionError((transcript.end, reference_length))

    # NOTE(review): "start - 1" presumably converts to the 0-based, half-open interval used by pysam - confirm.
    genome_seq = fai.fetch(transcript.chrom, transcript.start - 1, transcript.end)

    expected_span = transcript.exons[-1][1] - transcript.start + 1
    if expected_span != len(genome_seq):
        error = "{} should have a sequence of length {} ({} start, {} end), but one of length {} has been given"
        error = error.format(transcript.id, expected_span,
                             transcript.start, transcript.end, len(genome_seq))
        logger.error(error)
        raise InvalidTranscript(error)

    seq = TranscriptChecker(transcript, genome_seq, is_reference=True).cdna
    if len(seq) != transcript.cdna_length:
        raise AssertionError((len(seq), transcript.cdna_length, transcript.exons))

    expected_cdna_length = backup.cdna_length + upstream + downstream
    if len(seq) != expected_cdna_length:
        # Dump everything that might help in debugging the mismatch, one item per line.
        error = [len(seq), expected_cdna_length,
                 backup.cdna_length, upstream, downstream,
                 (transcript.start, transcript.end), (backup.id, backup.start, backup.end),
                 (None if not start_transcript else (start_transcript.id, (start_transcript.start,
                                                                           start_transcript.end))),
                 (None if not end_transcript else (end_transcript.id, (end_transcript.start,
                                                                       end_transcript.end))),
                 (backup.id, backup.exons),
                 None if not start_transcript else (start_transcript.id, start_transcript.exons),
                 None if not end_transcript else (end_transcript.id, end_transcript.exons),
                 (transcript.id + "_expanded", transcript.exons),
                 set(transcript.exons) - set(backup.exons),
                 set(backup.exons) - set(transcript.exons)
                 ]
        error = "\n".join([str(_) for _ in error])
        raise AssertionError(error)
    return seq


def enlarge_orfs(transcript: Transcript,
                 backup: Transcript,
                 seq: str,
                 upstream: int,
                 downstream: int,
                 logger) -> Transcript:

    """
    This method will take an expanded transcript and recalculate its ORF(s). As a consequence of the expansion,
    truncated transcripts might become whole.
    If the original transcript has no CDS, or if its ORFs cannot be extracted, the expanded transcript is
    returned as it is.

    :param transcript: the expanded transcript.
    :param backup: the original transcript. Used to extract the original ORF(s).
    :param seq: the new cDNA sequence of the expanded transcript.
    :param upstream: the amount of expansion that happened at the 5'.
    :param downstream: the amount of expansion that happened at the 3'.
    :param logger: the logger.
    :returns: the modified transcript with the ORF(s) recalculated.
    :raises AssertionError: if the expansion of one of the ORFs fails with an AssertionError.
    :raises ValueError: if one of the ORFs is marked as non-coding after the expansion.
    :raises InvalidTranscript: if one of the expanded ORFs is invalid, or if the original transcript was coding
                               and the expanded one is not.
    """

    internal_orfs = []
    if backup.combined_cds_length > 0:
        try:
            internal_orfs = list(backup.get_internal_orf_beds())
        except (ValueError, TypeError, AssertionError):
            logger.error("Something went wrong with the CDS extraction for %s. Stripping it.",
                         backup.id)
            internal_orfs = []

    if not internal_orfs:
        return transcript

    new_orfs = []
    for orf in internal_orfs:
        logger.debug("Old ORF: %s", str(orf))
        try:
            logger.debug("Sequence for %s: %s[..]%s (upstream %s, downstream %s)",
                         transcript.id, seq[:10], seq[-10:], upstream, downstream)
            orf.expand(seq, upstream, downstream, expand_orf=True, logger=logger)
        except AssertionError as err:
            logger.error(err)
            logger.error("%s, %s, %s, %s",
                         upstream,
                         downstream,
                         transcript.exons,
                         transcript.cdna_length)
            # Keep the original failure attached as the explicit cause of the re-raised error.
            raise AssertionError(err) from err
        logger.debug("New ORF: %s", str(orf))
        if orf.coding is False:
            raise ValueError(orf)
        elif orf.invalid:
            raise InvalidTranscript(orf.invalid_reason)

        new_orfs.append(orf)

    transcript.load_orfs(new_orfs)
    transcript.finalize()
    if backup.is_coding and not transcript.is_coding:
        raise InvalidTranscript(new_orfs)
    return transcript
daijin.toml" - shell: "daijin assemble --nolock --threads 2 --cores 4 --jobs 2 {input.conf}" + message: "daijin assemble -nd --nolock --threads 2 --cores 4 --jobs 2 daijin.toml" + shell: "daijin assemble -nd --nolock --threads 2 --cores 4 --jobs 2 {input.conf}" rule test_json: input: db=swissprot, config=configname