Issue 240 (#245)

* Fix #240, #243
* Solved a bug that caused boolean values to be converted into integers for `pick`.
lucventurini authored Oct 25, 2019
1 parent b451404 commit 160a3a3
Showing 17 changed files with 1,217 additions and 73 deletions.
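For context on the second bullet of the commit message: in Python, `bool` is a subclass of `int`, so any code path that normalises metric values numerically can silently turn `True`/`False` into `1`/`0`. A minimal illustration of the pitfall (plain language behaviour, not Mikado code — the exact mechanism of the fixed bug is not shown in this diff):

    import numpy as np

    print(isinstance(True, int))        # True: bool subclasses int in Python
    print(int(True), int(False))        # 1 0
    print(np.array([True, False]) + 0)  # [1 0]: arithmetic coerces booleans to integers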
4 changes: 4 additions & 0 deletions Mikado/configuration/configuration_blueprint.json
@@ -639,6 +639,10 @@
                 "type": "boolean",
                 "default": false
             },
+            "check_references": {
+                "type": "boolean",
+                "default": false
+            },
             "single_thread": {
                 "type": "boolean",
                 "default": false
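The new flag is consumed at `json_conf["pick"]["run_options"]["check_references"]` (see the `abstractlocus.py` and `locus.py` hunks below). A minimal sketch of how it could be enabled in a pick configuration file, assuming the YAML layout mirrors the blueprint path:

    # Hypothetical excerpt of a Mikado configuration file
    pick:
      run_options:
        check_references: true  # also apply requirement/fragment checks to reference transcripts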
2 changes: 1 addition & 1 deletion Mikado/configuration/daijin_schema.json
@@ -276,7 +276,7 @@
       "- identity: minimum identity for any alignment. Default: 95%",
       "- coverage: minimum coverage for any alignment. Default: 70%"],
     "properties": {
-      "max_mem": {"type": "integer", "default": 6000, "minimum": 1000, "required": true},
+      "max_mem": {"type": "integer", "default": 6000, "minimum": 1000},
       "npaths": {"type": "integer", "default": 0},
       "identity": {"type": "number", "default": 0.95, "minimum": 0, "maximum": 1},
       "coverage": {"type": "number", "default": 0.70, "minimum": 0, "maximum": 1}
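A boolean `required` on an individual property is draft-03 JSON Schema syntax; from draft-04 onwards, validators expect `required` to be an array on the enclosing object, which is presumably why the flag is dropped here. Had the constraint been kept, the draft-04 form would look roughly like:

    "properties": {
        "max_mem": {"type": "integer", "default": 6000, "minimum": 1000}
    },
    "required": ["max_mem"]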
6 changes: 5 additions & 1 deletion Mikado/daijin/__init__.py
@@ -372,6 +372,7 @@ def assemble_transcripts_pipeline(args):
                                             prefix="assemble")
         yaml.dump(doc, yaml_file)
         yaml_file.flush()
+        shutil.copystat(args.config, yaml_file.name)

     if args.latency_wait is not None:
         latency = abs(args.latency_wait)
@@ -400,7 +401,8 @@ def assemble_transcripts_pipeline(args):
         "printdag": args.dag,
         "forceall": args.dag,
         "forcerun": args.forcerun,
-        "lock": (not args.nolock)
+        "lock": (not args.nolock),
+        "printreason": True
     }

     if "configfile" in inspect.getfullargspec(snakemake.snakemake).args:
@@ -494,6 +496,7 @@ def mikado_pipeline(args):
         )
         yaml.dump(doc, yaml_file)
         yaml_file.flush()
+        shutil.copystat(args.config, yaml_file.name)

     if SCHEDULER == "local":
         hpc_conf = None
@@ -541,6 +544,7 @@ def mikado_pipeline(args):
         "forceall": args.dag,
         "forcerun": args.forcerun,
         "lock": (not args.nolock),
+        "printreason": True
     }

     if "configfile" in inspect.getfullargspec(snakemake.snakemake).args:
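The two `shutil.copystat` additions propagate the original configuration file's metadata (permission bits, access and modification times) onto the temporary YAML copy handed to Snakemake; the file's contents are untouched. A standalone sketch of that behaviour, using throw-away files rather than Mikado's (an illustration, not project code):

    import os
    import shutil
    import tempfile

    with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as dst:
        dst.write("dummy: true\n")  # stand-in for the dumped configuration

    src = __file__  # any existing file serves as the "original" config here
    shutil.copystat(src, dst.name)  # copies timestamps and mode, not contents
    print(os.stat(src).st_mtime_ns == os.stat(dst.name).st_mtime_ns)  # True
    os.unlink(dst.name)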
44 changes: 31 additions & 13 deletions Mikado/loci/abstractlocus.py
@@ -570,6 +570,11 @@ def remove_transcript_from_locus(self, tid: str):
             self.initialized = False

         self.logger.debug("Deleted %s from %s", tid, self.id)
+        if tid in self._metrics:
+            del self._metrics[tid]
+        if tid in self.scores:
+            del self.scores[tid]
+
         self.metrics_calculated = False
         self.scores_calculated = False

@@ -987,6 +992,7 @@ def get_metrics(self):

         if self.metrics_calculated is True:
             return
+        self._metrics = dict()
         cds_bases = sum(_[1] - _[0] + 1 for _ in merge_ranges(
             itertools.chain(*[
                 self.transcripts[_].combined_cds for _ in self.transcripts
@@ -1135,19 +1141,20 @@ def _check_not_passing(self, previous_not_passing=set()):
             assert self.transcripts[tid].json_conf["prepare"]["files"][\
                 "reference"] == self.json_conf["prepare"]["files"]["reference"]

-            if self.transcripts[tid].is_reference is True:
-                # Reference transcripts should be kept in, no matter what.
-                self.logger.debug("Skipping %s from the requirement check as it is a reference transcript")
-                continue
-            elif self.transcripts[tid].original_source in self.json_conf["prepare"]["files"]["reference"]:
-                self.transcripts[tid].is_reference = True  # Bug
-                self.logger.debug("Skipping %s from the requirement check as it is a reference transcript", tid)
-                continue
-            else:
+            is_reference = ((self.transcripts[tid].is_reference is True) or
+                            (self.transcripts[tid].original_source in self.json_conf["prepare"]["files"]["reference"]))
+
+            if is_reference is False:
                 self.logger.debug("Transcript %s (source %s) is not a reference transcript (references: %s; in it: %s)",
                                   tid, self.transcripts[tid].original_source,
                                   self.json_conf["prepare"]["files"]["reference"],
-                                  self.transcripts[tid].original_source in self.json_conf["prepare"]["files"]["reference"])
+                                  self.transcripts[tid].original_source in self.json_conf["prepare"]["files"][
+                                      "reference"])
+            elif is_reference is True and self.json_conf["pick"]["run_options"]["check_references"] is False:
+                self.logger.debug("Skipping %s from the requirement check as it is a reference transcript", tid)
+                continue
+            elif is_reference is True and self.json_conf["pick"]["run_options"]["check_references"] is True:
+                self.logger.debug("Performing the requirement check for %s even if it is a reference transcript", tid)

         evaluated = dict()
         for key in self.json_conf["requirements"]["parameters"]:
@@ -1333,16 +1340,27 @@ def _calculate_score(self, param):
             try:
                 # metric = rgetattr(self.transcripts[tid], param)
                 if tid not in self._metrics and transcript.alias in self._metrics:
-                    metric = self._metrics[transcript.alias][param]
+                    if param in self._metrics[transcript.alias]:
+                        metric = self._metrics[transcript.alias][param]
+                    else:
+                        metric = rgetattr(self.transcripts[tid], param)
+                        self._metrics[transcript.alias][param] = metric
                 else:
-                    metric = self._metrics[tid][param]
+                    if tid not in self._metrics:
+                        self._metrics[tid] = dict()
+                    if param in self._metrics[tid]:
+                        metric = self._metrics[tid][param]
+                    else:
+                        metric = rgetattr(self.transcripts[tid], param)
+                        self._metrics[tid][param] = metric
                 if isinstance(metric, (tuple, list)):
                     metric = metric[0]
                 metrics[tid] = metric
             except TypeError:
                 raise TypeError(param)
             except KeyError:
-                raise KeyError(param)
+                metric = rgetattr(self.transcripts[tid], param)
+                raise KeyError((tid, param, metric))
             except AttributeError:
                 raise AttributeError(param)

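The rewritten `_calculate_score` block turns `self._metrics` into a lazy per-transcript cache: look the value up, and on a miss compute it once via `rgetattr` (by its name and usage, a recursive `getattr` that resolves dotted metric names) and memoise it before use. A standalone sketch of the same pattern, with hypothetical names (`cache`, `get_cached_metric`) rather than Mikado's own:

    from functools import reduce

    def rgetattr(obj, attr):
        """Recursive getattr, so dotted names like "a.b.c" resolve."""
        return reduce(getattr, attr.split("."), obj)

    def get_cached_metric(cache, transcripts, tid, param):
        # On a miss, compute the metric once and store it before returning.
        per_tid = cache.setdefault(tid, dict())
        if param not in per_tid:
            per_tid[param] = rgetattr(transcripts[tid], param)
        return per_tid[param]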
14 changes: 10 additions & 4 deletions Mikado/loci/locus.py
@@ -349,6 +349,10 @@ def __remove_redundant_after_padding(self):
                 continue
         return

+    def as_dict(self):
+        self.calculate_scores()
+        return super().as_dict()
+
     def remove_transcript_from_locus(self, tid: str):

         """Overloading of the AbstractLocus class, in order to ensure that the primary transcript will *not*
@@ -484,7 +488,8 @@ def is_putative_fragment(self):
         """This method will use the expression in the "not_fragmentary" section
         of the configuration to determine whether it is itself a putative fragment."""

-        if any(self.transcripts[tid].is_reference is True for tid in self.transcripts):
+        if not self.json_conf["pick"]["run_options"]["check_references"] and \
+                any(self.transcripts[tid].is_reference is True for tid in self.transcripts):
             return False

         self.json_conf["not_fragmentary"]["compiled"] = compile(
@@ -1065,7 +1070,7 @@ def __set_id(self, string):
             return
         primary_id = "{0}.1".format(string)
         old_primary = self.primary_transcript.id
-        self.primary_transcript.attributes["Alias"] = self.primary_transcript.id
+        self.primary_transcript.attributes["alias"] = self.primary_transcript.id
         self.primary_transcript.id = primary_id
         self.transcripts[primary_id] = self.primary_transcript
         self.primary_transcript_id = primary_id
@@ -1078,7 +1083,7 @@ def __set_id(self, string):

         for counter, tid in enumerate(order):
             counter += 2
-            self.transcripts[tid].attributes["Alias"] = tid
+            self.transcripts[tid].attributes["alias"] = tid
             new_id = "{0}.{1}".format(string, counter)
             self.transcripts[tid].id = new_id
             self.transcripts[new_id] = self.transcripts.pop(tid)
@@ -1087,6 +1092,7 @@ def __set_id(self, string):
         if self.scores_calculated is True:
             for tid in mapper:
                 self.scores[mapper[tid]] = self.scores.pop(tid)
+                self._metrics[mapper[tid]] = self._metrics.pop(tid)
         if self.metrics_calculated is True:
             for index in range(len(self.metric_lines_store)):
                 self.metric_lines_store[index]["tid"] = mapper[self.metric_lines_store[index]["tid"]]
@@ -1153,7 +1159,7 @@ def ts_max_splices(self):

     @property
     def has_reference_transcript(self):
-        return any(self.transcripts[transcript].is_reference is True for transcript in self)
+        return any(self.transcripts[transcript].is_reference for transcript in self)

     def _get_alternative_splicing_codes(self):
         """Method to retrieve the currently valid alternative splicing event codes"""
1 change: 1 addition & 0 deletions Mikado/loci/superlocus.py
@@ -937,6 +937,7 @@ def define_subloci(self):

         if self.subloci_defined is True:
             return
+
         self.compile_requirements()
         self.subloci = []
59 changes: 29 additions & 30 deletions Mikado/picking/_merge_loci_utils.py
@@ -5,6 +5,8 @@
 from ..loci import Locus
 import sys
 import collections
+import itertools
+import numpy as np
 from ._locus_line_creator import _create_locus_lines


@@ -83,39 +85,34 @@ def manage_index(data, dumps, source):
 def __create_gene_counters(common_index: dict) -> (dict, int):
     """Function to assign to each counter in the database the correct base and maximum number of genes.
     This allows to parallelise the printing.
+    The common index has the following structure:
+    d[counter] = (database index, chrom, number of genes in locus)
     """

-    chroms, nums = list(zip(*[common_index[index][1:3] for index in range(1, max(common_index.keys()) + 1)]))
-    total_genes = sum(nums)
+    chroms = []
+    num_genes = []
+
+    for index in range(1, max(common_index.keys()) + 1):
+        _, chrom, n_genes = common_index[index]
+        chroms.append(chrom)
+        num_genes.append(n_genes)
+
+    chroms = np.array(chroms)
+    num_genes = np.array(num_genes)
+
     gene_counters = dict()
+    total_genes = sum(num_genes)

     chrom_tots = collections.defaultdict(list)
-    assert len(chroms) == len(common_index), (len(chroms), len(common_index))
-    for pos in range(len(chroms)):
-        key = pos + 1
-        chrom, num = chroms[pos], nums[pos]
-        if chrom == '' and pos > 0:
-            assert num == 0
-            former = gene_counters[pos][0]
-        elif pos == 0 or chrom != chroms[pos - 1]:
-            if chroms[pos - 1] != "":
-                former = 0
-            else:  # The previous one is wrong ..
-                prev_pos = pos - 1
-                prev_chrom = chroms[prev_pos]
-                while prev_chrom == "":
-                    prev_pos -= 1
-                    if prev_pos < 0:
-                        break
-                    prev_chrom = chroms[prev_pos]
-                if prev_chrom == "" or prev_chrom != chrom:
-                    former = 0
-                else:
-                    former = gene_counters[pos][0] + gene_counters[pos][1]
-        else:
-            former = gene_counters[pos][0] + gene_counters[pos][1]
-        gene_counters[key] = (former, num)
-        if chrom:
-            chrom_tots[chrom].extend(list(range(former + 1, former + num + 1)))
+    for chrom in np.unique(chroms):
+        index = np.where(chroms == chrom)
+        totals = num_genes[index]
+        cumu = totals.cumsum()
+        for counter, former, num in zip(index[0], itertools.chain([0], cumu[:-1]), totals):
+            gene_counters[counter + 1] = (former, num)
+            if chrom:
+                chrom_tots[chrom].extend(list(range(former + 1, former + num + 1)))

     tot_found = 0
     for chrom in chrom_tots:
@@ -137,9 +134,11 @@ def __create_gene_counters(common_index: dict) -> (dict, int):
         tot_found += chrom_tots[chrom][-1]

     assert tot_found == total_genes, (tot_found, total_genes)
-    new_common = dict()

+    assert min(common_index) == 1
+
+    new_common = dict()
     for key in common_index:
         # DbIndex
         new_common[key] = (common_index[key][0], gene_counters[key][0], gene_counters[key][1])
     return new_common, total_genes
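To see what the numpy rewrite computes, here is a self-contained worked example with made-up input (three loci on Chr1, two on Chr2). Per chromosome, `cumsum` over the per-locus gene counts yields each locus's base offset ("former"), so gene numbering restarts at 1 on every chromosome and each locus can be printed independently:

    import itertools
    import numpy as np

    # Hypothetical input, ordered by locus counter: chromosome and gene count per locus.
    chroms = np.array(["Chr1", "Chr1", "Chr1", "Chr2", "Chr2"])
    num_genes = np.array([2, 3, 1, 4, 2])

    gene_counters = dict()
    for chrom in np.unique(chroms):
        index = np.where(chroms == chrom)
        totals = num_genes[index]
        cumu = totals.cumsum()
        # "former" = genes already assigned on this chromosome before each locus.
        for counter, former, num in zip(index[0], itertools.chain([0], cumu[:-1]), totals):
            gene_counters[int(counter) + 1] = (int(former), int(num))

    print(gene_counters)
    # {1: (0, 2), 2: (2, 3), 3: (5, 1), 4: (0, 4), 5: (4, 2)}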
23 changes: 14 additions & 9 deletions Mikado/picking/loci_processer.py
@@ -61,12 +61,17 @@ def merge_loci(num_temp, out_handles,
             checker.update(counters)
         logger.fatal("%d double indices found!", len([_ for _ in checker if checker[_] > 1]))

+    # Start iterating the output dictionaries ("cursors")
     for dbindex, cursor in enumerate(cursors):
-        d = dict((index[0], (dbindex, index[1], index[2])) for index in cursor.execute(
-            "SELECT counter, chrom, genes FROM loci").fetchall())
-        assert not set.intersection(set(d.keys()), set(common_index.keys())), set.intersection(
-            set(d.keys()), set(common_index.keys()))
-
+        # Get the counter (this is the dictionary key), chromosome, and number of genes
+        d = dict()
+        doubles = set()
+        for counter, chrom, genes in cursor.execute("SELECT counter, chrom, genes FROM loci"):
+            if counter in common_index:
+                doubles.add(counter)
+            d[counter] = (dbindex, chrom, genes)
+        if len(doubles) > 0:
+            raise AssertionError("Double indices found: {}".format(doubles))
         common_index.update(d)

     print_subloci = (out_handles[1][0] is not None)
@@ -75,10 +80,10 @@ def merge_loci(num_temp, out_handles,
         raise KeyError("I am missing some loci! {} vs {}".format(
             max_counter, max(common_index.keys())))

-    assert set(common_index.keys()) == set(range(1, max(common_index.keys()) + 1)), (
-        set.difference(set(range(1, max(common_index.keys()) + 1)), set(common_index.keys()))
-    )
-    assert len(common_index.keys()) == len(set(common_index.keys()))
+    __valid = set(range(1, max(common_index.keys()) + 1))
+    if set(common_index.keys()) != __valid:
+        missing = set.difference(__valid, set(common_index.keys()))
+        raise AssertionError("Missing the following loci: {}".format(missing))

     new_common, total_genes = __create_gene_counters(common_index)

4 changes: 1 addition & 3 deletions Mikado/serializers/external.py
@@ -48,7 +48,7 @@ def __init__(self, source, rtype, valid_raw):
         elif np.dtype("complex") == rtype:
             rtype = "complex"
         else:
-            raise ValueError("Invalid source rtype: {}".format(rtype))
+            raise ValueError("Invalid source rtype for {}: {}".format(source, rtype))

         self.rtype = rtype
         self.valid_raw = valid_raw
@@ -138,8 +138,6 @@ def __init__(self, handle,
                 type(fasta_index))
             self.logger.warning(error)

-
-
         try:
             self.data = pd.read_csv(self.handle, delimiter=delimiter, index_col=["tid"])
         except ValueError: