Fix #387. Add tests (with relative patches for minor, untriggered bugs) for BED12 and ORF loading. CHANGELOG updated.
EI-CoreBioinformatics · Mar 12, 2021 · 67d4672 · 67d4672
1 parent 501704b
commit 67d4672
Show file tree

Hide file tree

Showing 22 changed files with 626 additions and 507 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,10 +10,17 @@ Other changes:
   provided as a stream is *disabled* though.
 - Fix [#382](https://github.com/EI-CoreBioinformatics/mikado/issues/382): now Mikado can accept generic BED12 files 
   as input junctions, not just Portcullis junctions. This allows e.g. a user to provide a ***set of gene models*** 
-  in BED12 format as sources of valid junctions.  
-- Slightly increased the unit-test coverage for the locus classes, e.g. properly covering the `as_dict` and `load_dict`
-  methods. Minor bugfixes related to the introduction of these unit-tests.
+  in BED12 format as sources of valid junctions.
+- Fix [#387](https://github.com/EI-CoreBioinformatics/mikado/issues/387): now Mikado will always use a static seed 
+  by default, rather than generating a new one per call. The old behaviour can still be 
+  replicated by either setting the `seed` parameter to `null` (i.e. `None`) in the configuration file, or by 
+  specifying `--random-seed` during the command invocation.
+- General increase in code unit-test coverage; in particular:  
+  - Slightly increased the unit-test coverage for the locus classes, e.g. properly covering the `as_dict` and `load_dict`
+    methods. Minor bugfixes related to the introduction of these unit-tests.
 - `Mikado.parsers.to_gff` has been renamed to `Mikado.parsers.parser_factory`.
+- The code related to the transcript padding has been moved to the submodule `Mikado.transcripts.pad`, rather than 
+  being part of the `Mikado.loci.locus` submodule.
 - Mikado will error informatively if the scoring configuration file is malformed.
 
 # Version 2.1.1

diff --git a/Mikado/_transcripts/transcript_base.py b/Mikado/_transcripts/transcript_base.py
@@ -827,8 +827,8 @@ def get_internal_orf_beds(self) -> List[BED12]:
                 new_row.thick_start = utr + 1
                 new_row.thick_end = new_row.thick_start + cds_len - 1
                 new_row.name = "{}_orf{}".format(self.tid, index)
-                new_row.block_starts = [row.thick_start]
-                new_row.block_sizes = [cds_len]
+                new_row.block_starts = [0]
+                new_row.block_sizes = [self.cdna_length]
                 new_row.phase = phase
                 # self.logger.debug(new_row)
                 new_row = BED12(new_row,
@@ -849,6 +849,10 @@ def get_internal_orf_beds(self) -> List[BED12]:
 
                 yield new_row
 
+    @property
+    def orfs(self) -> List[BED12]:
+        return list(self.get_internal_orf_beds())
+
     @Metric
     def is_reference(self):
         """Checks whether the transcript has been marked as reference by Mikado prepare"""

diff --git a/Mikado/configuration/configuration.py b/Mikado/configuration/configuration.py
@@ -1,6 +1,7 @@
 import copy
 import dataclasses
 from dataclasses import field
+import random
 from marshmallow import validate, ValidationError
 from marshmallow_dataclass import dataclass, Optional
 from .picking_config import PickConfiguration
@@ -41,8 +42,10 @@ class MikadoConfiguration:
         "required": True
     })
     seed: int = field(default=0, metadata={
-        "metadata": {"description": "Random number generator seed, to ensure reproducibility across runs"},
-        "validate": validate.Range(min=0, max=2 ** 32 - 1)
+        "metadata": {"description": "Random number generator seed, to ensure reproducibility across runs. Set to None"
+                     "('null' in YAML/JSON/TOML files) to let Mikado select a random seed every time."},
+        "validate": validate.Range(min=0, max=2 ** 32 - 1),
+        "allow_none": True, "required": True
     })
     multiprocessing_method: Optional[str] = field(default="spawn", metadata={
         "metadata": {"description": "Which method (fork, spawn, forkserver) Mikado should use for multiprocessing"},
@@ -75,11 +78,18 @@ def __post_init__(self):
     def copy(self):
         return copy.copy(self)
 
-    def check(self):
+    def check(self, logger=create_null_logger()):
+        if self.seed is None:
+            self.seed = random.randint(0, 2 ** 32 - 1)
+            logger.info(f"Random seed: {self.seed}")
         if self.scoring is None or not hasattr(self.scoring.requirements, "parameters"):
-            self.load_scoring()
+            self.load_scoring(logger=logger)
         self.scoring.check(minimal_orf_length=self.pick.orf_loading.minimal_orf_length)
-        self.Schema().validate(dataclasses.asdict(self))
+        errors = self.Schema().validate(dataclasses.asdict(self))
+        if len(errors) > 0:
+            exc = InvalidConfiguration(f"The configuration is invalid, please double check. Errors:\n{errors}")
+            logger.critical(exc)
+            raise exc
 
     def load_scoring(self, logger=None):
         """

diff --git a/Mikado/configuration/configurator.py b/Mikado/configuration/configurator.py
@@ -118,24 +118,16 @@ def check_and_load_scoring(configuration: Union[DaijinConfiguration, MikadoConfi
 
     try:
         configuration.load_scoring(logger=logger)
-        configuration.check()
+        configuration.check(logger=logger)
         configuration = check_db(configuration)
         if not configuration.multiprocessing_method:
             configuration.multiprocessing_method = get_start_method()
-
-    except Exception as exc:
+    except InvalidConfiguration as exc:
         logger.exception(exc)
         raise
 
-    seed = configuration.seed
-
-    if seed != 0:
-        # numpy.random.seed(seed % (2 ** 32 - 1))
-        random.seed(seed % (2 ** 32 - 1))
-    else:
-        # numpy.random.seed(None)
-        random.seed(None)
-
+    assert configuration.seed is not None
+    random.seed(configuration.seed % (2 ** 32 - 1))
     return configuration
 
 
@@ -212,10 +204,6 @@ def load_and_validate_config(raw_configuration: Union[None, MikadoConfiguration,
         logger.exception("Loading the configuration file failed with error:\n%s\n\n\n", exc)
         raise InvalidConfiguration("The configuration file passed is invalid. Please double check.")
 
-    if config.seed == 0 or config.seed is None:
-        config.seed = random.randint(1, 2 ** 32 - 1)
-        logger.info("Random seed: {}", config.seed)
-
     random.seed(config.seed % (2 ** 32 - 1))
 
     return config
diff --git a/Mikado/configuration/daijin_configurator.py b/Mikado/configuration/daijin_configurator.py
@@ -6,7 +6,7 @@
 import toml
 import yaml
 from pkg_resources import resource_stream
-from .configurator import create_cluster_config
+from .configurator import create_cluster_config, load_and_validate_config
 from . import print_config
 from .daijin_configuration import DaijinConfiguration
 from .._transcripts.scoring_configuration import ScoringFile
@@ -254,6 +254,8 @@ def create_daijin_config(args: Namespace, config=None, level="ERROR", piped=Fals
 
     final_config = config.copy()
 
+    final_config = load_and_validate_config(final_config)
+
     if args.exe:
         with open(args.exe, "wt") as out:
             for key, val in dataclasses.asdict(final_config.load).items():

diff --git a/Mikado/loci/abstractlocus.py b/Mikado/loci/abstractlocus.py
@@ -258,6 +258,7 @@ def as_dict(self) -> dict:
         state["transcripts"] = dict((tid, state["transcripts"][tid].as_dict()) for tid in state["transcripts"])
         assert "metrics_calculated" in state
         state["json_conf"] = dataclasses.asdict(state["json_conf"])
+        assert state["json_conf"]["seed"] is not None
         return state
 
     def load_dict(self, state: dict, load_transcripts=True, load_configuration=True):

diff --git a/Mikado/loci/excluded.py b/Mikado/loci/excluded.py
@@ -34,7 +34,6 @@ def __init__(self, monosublocus_instance=None, configuration=None, logger=None):
         Abstractlocus.__init__(self, configuration=configuration)
         self.splitted = False
         self.metrics_calculated = False
-        # self.configuration = configuration
         self.logger = logger
         if isinstance(monosublocus_instance, Transcript):
             Abstractlocus.__init__(self, transcript_instance=monosublocus_instance)