
Commit

Mikado now can accept numerical values such as TPM for the serialisation. Updated tests (EI-CoreBioinformatics#137) and documentation (EI-CoreBioinformatics#138)
lucventurini committed Oct 26, 2018
1 parent 755eb8b commit 392e2af
Showing 13 changed files with 145 additions and 55 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,9 @@

Users are ***very strongly recommended*** to update Mikado as soon as possible.

**IMPORTANT**: this release **changes the format of the Mikado database**. As such, old Mikado databases **have to be regenerated**, otherwise runs will fail.


One of the major highlights of this release is the completion of the "padding" functionality.
Briefly, if instructed to do so, Mikado can now make the ends of transcripts within a single locus uniform (similar to what was done for the latest _Arabidopsis thaliana_ annotation release).
The behaviour is controlled by the "pad" boolean switch, and by the "ts_max_splices" and "ts_distance" parameters under "pick".
@@ -12,6 +15,8 @@ Bugfixes and improvements:

- Fixed a bug which caused some loci to crash at the last part of the picking stage
- Made logging more sensible and informative for all three steps of the pipeline (prepare, serialise, pick)
- For the external scores, Mikado can now accept any type of numerical or boolean value. Mikado will determine at serialisation time whether a particular score can be used raw (i.e. all of its values fall strictly between 0 and 1) or whether it has to be forcibly scaled (see the sketch after this file's changes).
- This allows Mikado to use e.g. transcript expression as a valid metric.
- Coding and non-coding transcripts will now be placed in different loci.
- Mikado prepare can now accept models that lack exon features but still have valid CDS/UTR features - this is necessary for some protein prediction tools.
- Fixed [#139](https://github.com/lucventurini/mikado/issues/139): Mikado was reverse complementing non-uppercase letters incorrectly.
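As a concrete illustration of the raw-versus-scaled decision described in the changelog entry above, here is a minimal sketch, not Mikado's actual implementation: the column names and the `can_use_raw` helper are hypothetical. A column qualifies as raw only when it is numeric or boolean and every value already lies within [0, 1]; anything else (such as TPM values) is min-max rescaled at picking time.

```python
import pandas as pd

def can_use_raw(values: pd.Series) -> bool:
    """True if the column is numeric or boolean and bounded within [0, 1]."""
    if values.dtype.kind not in ("b", "i", "u", "f"):
        return False
    return 0 <= values.min() <= values.max() <= 1

scores = pd.DataFrame({
    "tid": ["mikado.1.1", "mikado.1.2"],
    "fraction_support": [0.8, 0.3],   # already in [0, 1]: usable raw
    "tpm": [153.2, 12.7],             # outside [0, 1]: will be min-max scaled
}).set_index("tid")

for column in scores.columns:
    print(column, "raw" if can_use_raw(scores[column]) else "scaled")
```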
22 changes: 18 additions & 4 deletions Mikado/loci/abstractlocus.py
@@ -835,7 +835,7 @@ def print_metrics(self):
row[key] = "NA"
for source in self.transcripts[tid].external_scores:
# Each score from external files also contains a multiplier.
row["external.{}".format(source)] = self.transcripts[tid].external_scores.get(source)
row["external.{}".format(source)] = self.transcripts[tid].external_scores.get(source)[0]

assert row != {}
yield row
@@ -1148,10 +1148,15 @@ def _calculate_score(self, param):
use_raw = self.json_conf["scoring"][param]["use_raw"]
multiplier = self.json_conf["scoring"][param]["multiplier"]

metrics = dict((tid, getattr(self.transcripts[tid], param)) for tid in self.transcripts)
if param.startswith("external"):
# For external metrics, we have a tuple - first item is score, second item is usable_raw
metrics = dict((tid, getattr(self.transcripts[tid], param)[0]) for tid in self.transcripts)
else:
metrics = dict((tid, getattr(self.transcripts[tid], param)) for tid in self.transcripts)

for tid in self.transcripts.keys():
tid_metric = metrics[tid]

if ("filter" in self.json_conf["scoring"][param] and
self.json_conf["scoring"][param]["filter"] != {}):
if "metric" not in self.json_conf["scoring"][param]["filter"]:
@@ -1171,8 +1176,14 @@ def _calculate_score(self, param):
for tid in self.transcripts:
self.scores[tid][param] = 0
else:
if param.startswith("external"):
# Take any transcript and verify
usable_raw = getattr(self.transcripts[list(self.transcripts.keys())[0]], param)[1]
else:
usable_raw = getattr(Transcript, param).usable_raw

if use_raw is True and not param.startswith("external") and getattr(Transcript, param).usable_raw is False:
assert usable_raw in (False, True)
if use_raw is True and usable_raw is False:
self.logger.warning("The \"%s\" metric cannot be used as a raw score for %s, switching to False",
param, self.id)
use_raw = False
@@ -1191,7 +1202,10 @@ def _calculate_score(self, param):
elif use_raw is True and rescaling == "min":
denominator = -1
else:
denominator = (max(metrics.values()) - min(metrics.values()))
try:
denominator = (max(metrics.values()) - min(metrics.values()))
except TypeError:
raise TypeError([param, metrics])
if denominator == 0:
denominator = 1

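The hunks above wire external metrics (stored as `(score, usable_raw)` tuples) into the scoring machinery. Below is a simplified sketch of the rescaling that `_calculate_score` performs, assuming only the "max"/"min" rescaling modes and no filters; the exact score formulas are an assumption based on the visible `denominator` branches, and the real method also applies the filters and targets from the scoring configuration.

```python
def calculate_score(metrics, rescaling, use_raw, multiplier):
    """Score one metric across all transcripts of a locus (sketch)."""
    if use_raw:
        # Raw metrics already live in [0, 1]; "min" flips the sign,
        # mirroring the `denominator = -1` branch in the diff above.
        denominator = -1 if rescaling == "min" else 1
        return {tid: multiplier * value / denominator
                for tid, value in metrics.items()}
    lowest, highest = min(metrics.values()), max(metrics.values())
    denominator = highest - lowest
    if denominator == 0:
        denominator = 1  # every transcript ties: avoid division by zero
    if rescaling == "max":
        return {tid: multiplier * (value - lowest) / denominator
                for tid, value in metrics.items()}
    # rescaling == "min": the smallest value gets the full score
    return {tid: multiplier * (highest - value) / denominator
            for tid, value in metrics.items()}

print(calculate_score({"t1": 10.0, "t2": 30.0}, "max", False, 5))
# {'t1': 0.0, 't2': 5.0}
```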
13 changes: 12 additions & 1 deletion Mikado/loci/superlocus.py
@@ -537,7 +537,18 @@ def _create_data_dict(self, engine, tid_keys):
external = []

for ext in external:
data_dict["external"][ext.query][ext.source] = ext.score
if ext.rtype == "int":
score = int(ext.score)
elif ext.rtype == "float":
score = float(ext.score)
elif ext.rtype == "complex":
score = complex(ext.score)
elif ext.rtype == "bool":
score = bool(int(ext.score))
else:
raise ValueError("Invalid rtype: {}".format(ext.rtype))

data_dict["external"][ext.query][ext.source] = (score, ext.valid_raw)

# Load the ORFs from the table
if query_ids:
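Because scores are now stored in the database as strings (see the serializer changes below), retrieval has to cast them back according to the recorded `rtype`. A self-contained sketch of that round trip follows; the `restore_score` helper is hypothetical, as the actual logic is inlined in `_create_data_dict` above.

```python
# Scores are persisted as strings alongside an rtype tag; cast them back on
# the way out. Booleans were stored as "0"/"1", hence the int() hop.
def restore_score(raw: str, rtype: str):
    casters = {
        "int": int,
        "float": float,
        "complex": complex,
        "bool": lambda value: bool(int(value)),
    }
    try:
        return casters[rtype](raw)
    except KeyError:
        raise ValueError("Invalid rtype: {}".format(rtype))

assert restore_score("153.2", "float") == 153.2
assert restore_score("1", "bool") is True
```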
87 changes: 69 additions & 18 deletions Mikado/serializers/external.py
@@ -9,30 +9,49 @@

import os
import pyfaidx
from sqlalchemy import Column, String, Integer, ForeignKey, Float
from sqlalchemy import Column, String, Integer, ForeignKey, Float, Boolean
from sqlalchemy.sql.schema import PrimaryKeyConstraint
from sqlalchemy.orm import column_property
from sqlalchemy.orm.session import Session # sessionmaker
from sqlalchemy import select
from ..utilities.dbutils import DBBASE, Inspector, connect
from .blast_serializer import Query
from ..utilities.log_utils import check_logger, create_default_logger
from csv import DictReader
import numbers
import sqlalchemy.exc
import pandas as pd
import numpy as np
import re
from collections import Counter
import sqlalchemy.exc


class ExternalSource(DBBASE):

__tablename__ = "external_sources"

source_id = Column(Integer, primary_key=True)
source = Column(String)
source = Column(String, unique=True)
rtype = Column(String, unique=False)
valid_raw = Column(Boolean)

def __init__(self, source):
def __init__(self, source, rtype, valid_raw):

self.source = source
if valid_raw not in (True, False):
raise ValueError("\"Valid raw\" flags must be boolean!")
if np.dtype("bool") == rtype:
rtype = "bool"
elif np.dtype("int") == rtype:
rtype = "int"
elif np.dtype("float") == rtype:
rtype = "float"
elif np.dtype("complex") == rtype:
rtype = "complex"
else:
raise ValueError("Invalid source rtype: {}".format(rtype))

self.rtype = rtype
self.valid_raw = valid_raw


class External(DBBASE):
@@ -46,20 +65,28 @@ class External(DBBASE):
ext_constraint = PrimaryKeyConstraint("query_id", "source_id", name="source_key")
source = column_property(select([ExternalSource.source]).where(
ExternalSource.source_id == source_id))
score = Column(Float)
score = Column(String, nullable=False)

query = column_property(select([Query.query_name]).where(
Query.query_id == query_id))

valid_raw = column_property(select([ExternalSource.valid_raw]).where(
ExternalSource.source_id == source_id))

rtype = column_property(select([ExternalSource.rtype]).where(
ExternalSource.source_id == source_id))

__table_args__ = (ext_constraint, )

def __init__(self, query_id, source_id, score):

self.query_id = query_id
self.source_id = source_id
if not isinstance(score, numbers.Number):
raise sqlalchemy.exc.ArgumentError("This class only accepts numeric scores")
self.score = score
raise sqlalchemy.exc.ArgumentError("Invalid score for external values: {}".format(type(score)))
score = str(score)
assert score.strip()
self.score = str(score)


class ExternalSerializer:
@@ -127,15 +154,6 @@ def __init__(self, handle,
raise error

self.data.fillna(0, inplace=True)
for column in self.data.columns:
try:
self.data[column].astype("float")
except ValueError:
exc = ValueError("Invalid non-numeric values in external table, for column {}. Aborting".format(
column
))
self.logger.critical(exc)
raise

self.engine = connect(json_conf, logger=logger)

@@ -157,9 +175,36 @@ def serialize(self):

sources = dict()
self.session.begin(subtransactions=True)

# Check columns
cols = []
for col in self.data.columns:
cols.append(re.sub("\.[0-9]*", '', str(col)))

cols = Counter(cols)

if cols.most_common()[0][1] > 1: # IE the most common element is present more than one time
raise IndexError("Duplicated values in the external table: {}".format(
",".join([_[0] for _ in cols.most_common() if _[1] > 1])
))

for source in self.data.columns:
source = ExternalSource(source)

if ((self.data[source].dtype == np.dtype("float") or
self.data[source].dtype == np.dtype("int")) and
0 <= self.data[source].min() <= self.data[source].max() <= 1):
valid_raw = True
else:
valid_raw = False

rtype = self.data[source].dtype
if rtype == np.dtype("bool"):
# We have to cast booleans as integers otherwise the conversion after extraction will fail
self.data[source] = self.data[source].astype(int)

source = ExternalSource(source, rtype=rtype, valid_raw=valid_raw)
self.session.add(source)

self.session.commit()

# Now retrieve the values from the dictionary
@@ -218,6 +263,12 @@ def close(self):
:return:
"""

if hasattr(self, "fasta_index") and self.fasta_index is not None:
self.fasta_index.close()
if hasattr(self, "session"):
self.session.close()
if hasattr(self, "engine"):
self.engine.dispose()
return

def load_fasta(self, cache):
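Two of the new checks in this serializer are worth spelling out: mapping a pandas column dtype to the stored `rtype` tag, and rejecting duplicated column headers (pandas silently renames clashing headers to `name.1`, `name.2`, …). A runnable sketch under those assumptions follows; the helper names are hypothetical, and the regex is anchored here unlike in the original.

```python
import re
from collections import Counter
import numpy as np
import pandas as pd

def dtype_to_rtype(dtype) -> str:
    # np.dtype("bool") == dtype is True for boolean columns, and so on;
    # this mirrors the chain of comparisons in ExternalSource.__init__.
    for name in ("bool", "int", "float", "complex"):
        if np.dtype(name) == dtype:
            return name
    raise ValueError("Invalid source rtype: {}".format(dtype))

def check_duplicates(columns):
    # Strip the ".N" suffix pandas appends to clashing headers, then fail
    # loudly if any base name appears more than once.
    stripped = Counter(re.sub(r"\.[0-9]*$", "", str(col)) for col in columns)
    duplicated = [name for name, count in stripped.items() if count > 1]
    if duplicated:
        raise IndexError("Duplicated values in the external table: {}".format(
            ",".join(duplicated)))

data = pd.DataFrame({"tpm": [1.5, 2.0], "is_complete": [True, False]})
check_duplicates(data.columns)
print({col: dtype_to_rtype(data[col].dtype) for col in data.columns})
# {'tpm': 'float', 'is_complete': 'bool'}
```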
17 changes: 9 additions & 8 deletions Mikado/subprograms/serialise.py
@@ -156,10 +156,10 @@ def load_external(args, logger):
return
else:
logger.info("Starting to load external data")
serializer = external.ExternalSerializer(args.json_conf["serialise"]["files"]["external_scores"],
with external.ExternalSerializer(args.json_conf["serialise"]["files"]["external_scores"],
json_conf=args.json_conf,
logger=logger)
serializer()
logger=logger) as serializer:
serializer()
logger.info("Finished loading external data")


@@ -192,7 +192,7 @@ def setup(args):
setattr(args, file_key, getattr(args, file_key).split(","))
args.json_conf["serialise"]["files"][file_key] = getattr(args, file_key)
elif key in ("SimpleComment", "Comment"):
# Necesarry for JSON configurations
# Necessary for JSON configurations
continue
else:
if getattr(args, key, None) or getattr(args, key, None) == 0:
@@ -287,7 +287,7 @@ def setup(args):
args.json_conf["serialise"]["procs"],
args.json_conf["serialise"]["single_thread"])

return args, logger
return args, logger, sql_logger


def serialise(args):
@@ -300,9 +300,9 @@
:return:
"""

args, logger = setup(args)
args, logger, sql_logger = setup(args)

logger.info("Command line: %s", " ".join(sys.argv))
# logger.info("Command line: %s", " ".join(sys.argv))

if args.json_conf["serialise"]["force"] is True:
if args.json_conf["db_settings"]["dbtype"] == "sqlite" and os.path.exists(args.json_conf["db_settings"]["db"]):
@@ -324,10 +324,10 @@
engine.execute("VACUUM")
dbutils.DBBASE.metadata.create_all(engine)

load_external(args, logger)
load_junctions(args, logger)
load_orfs(args, logger)
load_blast(args, logger)
load_external(args, logger)
logger.info("Finished")
try:
return 0
@@ -336,6 +336,7 @@
except Exception as exc:
logger.exception(exc)
finally:
logging.shutdown()
return 0


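`load_external` above now drives the serializer through a `with … as serializer:` block, which presumes `ExternalSerializer` implements the context-manager protocol on top of the new `close()` clean-up. A minimal sketch of that protocol, assuming `close()` is the method extended in `Mikado/serializers/external.py` (the mixin name is hypothetical):

```python
class SerializerContextMixin:
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always release the FASTA index, session and engine, even on error.
        self.close()
        return False  # do not suppress exceptions
```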
Binary file modified Mikado/tests/mikado.db
Binary file not shown.
6 changes: 3 additions & 3 deletions Mikado/tests/test_db_utils.py
@@ -54,11 +54,11 @@ def test_content(self):
# Simple tests based on the static content of the dictionary
self.assertEqual(session.query(Mikado.serializers.junction.Junction).count(), 371,
self.json["db_settings"])
self.assertEqual(session.query(Mikado.serializers.orf.Orf).count(), 80)
self.assertEqual(session.query(Mikado.serializers.orf.Orf).count(), 169)
self.assertEqual(session.query(Mikado.serializers.blast_serializer.Target).count(), 38909)
self.assertEqual(session.query(Mikado.serializers.blast_serializer.Query).count(), 93)
self.assertEqual(session.query(Mikado.serializers.blast_serializer.Hit).count(), 344)
self.assertEqual(session.query(Mikado.serializers.blast_serializer.Hsp).count(), 410)
self.assertEqual(session.query(Mikado.serializers.blast_serializer.Hit).count(), 562)
self.assertEqual(session.query(Mikado.serializers.blast_serializer.Hsp).count(), 669)

first_query = session.query(Mikado.serializers.blast_serializer.Query).limit(1).one()
astup = first_query.as_tuple()
12 changes: 4 additions & 8 deletions Mikado/tests/test_serialise_external.py
@@ -1,16 +1,12 @@
import logging
import os
import unittest
import Mikado
import tempfile
import sqlalchemy.orm
from sqlalchemy import and_ # , or_
from pkg_resources import resource_stream
import pandas as pd
import numpy as np
from Mikado.utilities.dbutils import DBBASE
from Mikado.serializers.external import External, ExternalSource, ExternalSerializer
from Mikado.serializers.orf import Query
from Mikado.utilities.log_utils import create_default_logger
import sqlalchemy.exc


@@ -37,15 +33,15 @@ def __create_session(self):

def test_serialize_source(self):

source = ExternalSource("foo")
source = ExternalSource("foo", np.dtype("float"), False)
self.session.add(source)
self.session.commit()

self.assertEqual(self.session.query(Mikado.serializers.external.ExternalSource).count(), 1)

def test_add_score(self):

source = ExternalSource("cdna_length")
source = ExternalSource("cdna_length", np.dtype("int"), False)
query = Query("foo.1", 200)
self.session.add(source)
self.session.add(query)
@@ -63,7 +59,7 @@ def test_add_score(self):
self.assertEqual(self.session.query(Mikado.serializers.external.External).count(), 1)

def test_wrong_score(self):
source = ExternalSource("cdna_length")
source = ExternalSource("cdna_length", np.dtype("int"), False)
query = Query("foo.1", 200)
self.session.add(source)
self.session.add(query)
5 changes: 3 additions & 2 deletions Mikado/tests/test_system_calls.py
@@ -801,14 +801,15 @@ def test_single_proc(self):
json_conf["log_settings"]["log_level"] = "WARNING"

pick_caller = picker.Picker(json_conf=json_conf)
with self.assertRaises(SystemExit), self.assertLogs("main_logger", "INFO"):
with self.assertRaises(SystemExit), self.assertLogs("main_logger", "INFO") as cm:
pick_caller()
self.assertTrue(os.path.exists(os.path.join(tempfile.gettempdir(), "mikado.monoproc.loci.gff3")))
with to_gff(os.path.join(tempfile.gettempdir(), "mikado.monoproc.loci.gff3")) as inp_gff:
lines = [_ for _ in inp_gff if not _.header is True]
self.assertGreater(len(lines), 0)
self.assertGreater(len([_ for _ in lines if _.is_transcript is True]), 0)
self.assertGreater(len([_ for _ in lines if _.feature == "mRNA"]), 0)
self.assertGreater(len([_ for _ in lines if _.feature == "mRNA"]), 0,
[_ for _ in cm.output if "WARNING" in _])
self.assertGreater(len([_ for _ in lines if _.feature == "CDS"]), 0)

[os.remove(_) for _ in glob.glob(os.path.join(tempfile.gettempdir(), "mikado.monoproc.") + "*")]
