
Commit

Mikado now can accept numerical values such as TPM for the serialisation. Updated tests (EI-CoreBioinformatics#137) and documentation (EI-CoreBioinformatics#138)
lucventurini committed Oct 26, 2018
1 parent 755eb8b commit 392e2af
Showing 13 changed files with 145 additions and 55 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,9 @@

Users are ***very strongly recommended*** to update Mikado as soon as possible.

**IMPORTANT**: this release **changes the format of the Mikado database**. As such, old Mikado databases **have to be regenerated**, otherwise runs will fail.


One of the major highlights of this release is the completion of the "padding" functionality.
Briefly, if instructed to do so, Mikado can now make the ends of transcripts within a single locus uniform (similar to what was done for the latest _Arabidopsis thaliana_ annotation release).
The behaviour is controlled by the "pad" boolean switch, and by the "ts_max_splices" and "ts_distance" parameters under "pick".
@@ -12,6 +15,8 @@ Bugfixes and improvements:

- Fixed a bug which caused some loci to crash at the last part of the picking stage
- Made logging more sensible and informative for all three steps of the pipeline (prepare, serialise, pick)
- For the external scores, Mikado can now accept any type of numerical or boolean value. Mikado will determine at serialisation time whether a particular score can be used raw (i.e. all of its values fall strictly between 0 and 1) or whether it has to be forcibly scaled (see the sketch after this file's changes).
- This allows Mikado to use e.g. transcript expression as a valid metric.
- Coding and non-coding transcripts will now be placed in different loci.
- Mikado prepare can now accept models that lack exon features but still have valid CDS/UTR features - this is necessary for some protein prediction tools.
- Fixed [#139](https://github.com/lucventurini/mikado/issues/139): Mikado was reverse complementing non-uppercase letters incorrectly.
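As a concrete illustration of the raw-versus-scaled decision described in the changelog entry above, here is a minimal sketch, not Mikado's actual implementation: the column names and the `can_use_raw` helper are hypothetical. A column qualifies as raw only when it is numeric or boolean and every value already lies within [0, 1]; anything else (such as TPM values) is min-max rescaled at picking time.

```python
import pandas as pd

def can_use_raw(values: pd.Series) -> bool:
    """True if the column is numeric or boolean and bounded within [0, 1]."""
    if values.dtype.kind not in ("b", "i", "u", "f"):
        return False
    return 0 <= values.min() <= values.max() <= 1

scores = pd.DataFrame({
    "tid": ["mikado.1.1", "mikado.1.2"],
    "fraction_support": [0.8, 0.3],   # already in [0, 1]: usable raw
    "tpm": [153.2, 12.7],             # outside [0, 1]: will be min-max scaled
}).set_index("tid")

for column in scores.columns:
    print(column, "raw" if can_use_raw(scores[column]) else "scaled")
```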
22 changes: 18 additions & 4 deletions Mikado/loci/abstractlocus.py
@@ -835,7 +835,7 @@ def print_metrics(self):
row[key] = "NA"
for source in self.transcripts[tid].external_scores:
# Each score from external files also contains a multiplier.
row["external.{}".format(source)] = self.transcripts[tid].external_scores.get(source)
row["external.{}".format(source)] = self.transcripts[tid].external_scores.get(source)[0]

assert row != {}
yield row
@@ -1148,10 +1148,15 @@ def _calculate_score(self, param):
use_raw = self.json_conf["scoring"][param]["use_raw"]
multiplier = self.json_conf["scoring"][param]["multiplier"]

metrics = dict((tid, getattr(self.transcripts[tid], param)) for tid in self.transcripts)
if param.startswith("external"):
# For external metrics, we have a tuple - first item is score, second item is usable_raw
metrics = dict((tid, getattr(self.transcripts[tid], param)[0]) for tid in self.transcripts)
else:
metrics = dict((tid, getattr(self.transcripts[tid], param)) for tid in self.transcripts)

for tid in self.transcripts.keys():
tid_metric = metrics[tid]

if ("filter" in self.json_conf["scoring"][param] and
self.json_conf["scoring"][param]["filter"] != {}):
if "metric" not in self.json_conf["scoring"][param]["filter"]:
@@ -1171,8 +1176,14 @@ def _calculate_score(self, param):
for tid in self.transcripts:
self.scores[tid][param] = 0
else:
if param.startswith("external"):
# Take any transcript and verify
usable_raw = getattr(self.transcripts[list(self.transcripts.keys())[0]], param)[1]
else:
usable_raw = getattr(Transcript, param).usable_raw

if use_raw is True and not param.startswith("external") and getattr(Transcript, param).usable_raw is False:
assert usable_raw in (False, True)
if use_raw is True and usable_raw is False:
self.logger.warning("The \"%s\" metric cannot be used as a raw score for %s, switching to False",
param, self.id)
use_raw = False
@@ -1191,7 +1202,10 @@ def _calculate_score(self, param):
elif use_raw is True and rescaling == "min":
denominator = -1
else:
denominator = (max(metrics.values()) - min(metrics.values()))
try:
denominator = (max(metrics.values()) - min(metrics.values()))
except TypeError:
raise TypeError([param, metrics])
if denominator == 0:
denominator = 1

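The hunks above wire external metrics (stored as `(score, usable_raw)` tuples) into the scoring machinery. Below is a simplified sketch of the rescaling that `_calculate_score` performs, assuming only the "max"/"min" rescaling modes and no filters; the exact score formulas are an assumption based on the visible `denominator` branches, and the real method also applies the filters and targets from the scoring configuration.

```python
def calculate_score(metrics, rescaling, use_raw, multiplier):
    """Score one metric across all transcripts of a locus (sketch)."""
    if use_raw:
        # Raw metrics already live in [0, 1]; "min" flips the sign,
        # mirroring the `denominator = -1` branch in the diff above.
        denominator = -1 if rescaling == "min" else 1
        return {tid: multiplier * value / denominator
                for tid, value in metrics.items()}
    lowest, highest = min(metrics.values()), max(metrics.values())
    denominator = highest - lowest
    if denominator == 0:
        denominator = 1  # every transcript ties: avoid division by zero
    if rescaling == "max":
        return {tid: multiplier * (value - lowest) / denominator
                for tid, value in metrics.items()}
    # rescaling == "min": the smallest value gets the full score
    return {tid: multiplier * (highest - value) / denominator
            for tid, value in metrics.items()}

print(calculate_score({"t1": 10.0, "t2": 30.0}, "max", False, 5))
# {'t1': 0.0, 't2': 5.0}
```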
13 changes: 12 additions & 1 deletion Mikado/loci/superlocus.py
@@ -537,7 +537,18 @@ def _create_data_dict(self, engine, tid_keys):
external = []

for ext in external:
data_dict["external"][ext.query][ext.source] = ext.score
if ext.rtype == "int":
score = int(ext.score)
elif ext.rtype == "float":
score = float(ext.score)
elif ext.rtype == "complex":
score = complex(ext.score)
elif ext.rtype == "bool":
score = bool(int(ext.score))
else:
raise ValueError("Invalid rtype: {}".format(ext.rtype))

data_dict["external"][ext.query][ext.source] = (score, ext.valid_raw)

# Load the ORFs from the table
if query_ids:
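Because scores are now stored in the database as strings (see the serializer changes below), retrieval has to cast them back according to the recorded `rtype`. A self-contained sketch of that round trip follows; the `restore_score` helper is hypothetical, as the actual logic is inlined in `_create_data_dict` above.

```python
# Scores are persisted as strings alongside an rtype tag; cast them back on
# the way out. Booleans were stored as "0"/"1", hence the int() hop.
def restore_score(raw: str, rtype: str):
    casters = {
        "int": int,
        "float": float,
        "complex": complex,
        "bool": lambda value: bool(int(value)),
    }
    try:
        return casters[rtype](raw)
    except KeyError:
        raise ValueError("Invalid rtype: {}".format(rtype))

assert restore_score("153.2", "float") == 153.2
assert restore_score("1", "bool") is True
```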
87 changes: 69 additions & 18 deletions Mikado/serializers/external.py
@@ -9,30 +9,49 @@

import os
import pyfaidx
from sqlalchemy import Column, String, Integer, ForeignKey, Float
from sqlalchemy import Column, String, Integer, ForeignKey, Float, Boolean
from sqlalchemy.sql.schema import PrimaryKeyConstraint
from sqlalchemy.orm import column_property
from sqlalchemy.orm.session import Session # sessionmaker
from sqlalchemy import select
from ..utilities.dbutils import DBBASE, Inspector, connect
from .blast_serializer import Query
from ..utilities.log_utils import check_logger, create_default_logger
from csv import DictReader
import numbers
import sqlalchemy.exc
import pandas as pd
import numpy as np
import re
from collections import Counter
import sqlalchemy.exc


class ExternalSource(DBBASE):

__tablename__ = "external_sources"

source_id = Column(Integer, primary_key=True)
source = Column(String)
source = Column(String, unique=True)
rtype = Column(String, unique=False)
valid_raw = Column(Boolean)

def __init__(self, source):
def __init__(self, source, rtype, valid_raw):

self.source = source
if valid_raw not in (True, False):
raise ValueError("\"Valid raw\" flags must be boolean!")
if np.dtype("bool") == rtype:
rtype = "bool"
elif np.dtype("int") == rtype:
rtype = "int"
elif np.dtype("float") == rtype:
rtype = "float"
elif np.dtype("complex") == rtype:
rtype = "complex"
else:
raise ValueError("Invalid source rtype: {}".format(rtype))

self.rtype = rtype
self.valid_raw = valid_raw


class External(DBBASE):
@@ -46,20 +65,28 @@ class External(DBBASE):
ext_constraint = PrimaryKeyConstraint("query_id", "source_id", name="source_key")
source = column_property(select([ExternalSource.source]).where(
ExternalSource.source_id == source_id))
score = Column(Float)
score = Column(String, nullable=False)

query = column_property(select([Query.query_name]).where(
Query.query_id == query_id))

valid_raw = column_property(select([ExternalSource.valid_raw]).where(
ExternalSource.source_id == source_id))

rtype = column_property(select([ExternalSource.rtype]).where(
ExternalSource.source_id == source_id))

__table_args__ = (ext_constraint, )

def __init__(self, query_id, source_id, score):

self.query_id = query_id
self.source_id = source_id
if not isinstance(score, numbers.Number):
raise sqlalchemy.exc.ArgumentError("This class only accepts numeric scores")
self.score = score
raise sqlalchemy.exc.ArgumentError("Invalid score for external values: {}".format(type(score)))
score = str(score)
assert score.strip()
self.score = str(score)


class ExternalSerializer:
@@ -127,15 +154,6 @@ def __init__(self, handle,
raise error

self.data.fillna(0, inplace=True)
for column in self.data.columns:
try:
self.data[column].astype("float")
except ValueError:
exc = ValueError("Invalid non-numeric values in external table, for column {}. Aborting".format(
column
))
self.logger.critical(exc)
raise

self.engine = connect(json_conf, logger=logger)

@@ -157,9 +175,36 @@ def serialize(self):

sources = dict()
self.session.begin(subtransactions=True)

# Check columns
cols = []
for col in self.data.columns:
cols.append(re.sub("\.[0-9]*", '', str(col)))

cols = Counter(cols)

if cols.most_common()[0][1] > 1: # IE the most common element is present more than one time
raise IndexError("Duplicated values in the external table: {}".format(
",".join([_[0] for _ in cols.most_common() if _[1] > 1])
))

for source in self.data.columns:
source = ExternalSource(source)

if ((self.data[source].dtype == np.dtype("float") or
self.data[source].dtype == np.dtype("int")) and
0 <= self.data[source].min() <= self.data[source].max() <= 1):
valid_raw = True
else:
valid_raw = False

rtype = self.data[source].dtype
if rtype == np.dtype("bool"):
# We have to cast booleans as integers otherwise the conversion after extraction will fail
self.data[source] = self.data[source].astype(int)

source = ExternalSource(source, rtype=rtype, valid_raw=valid_raw)
self.session.add(source)

self.session.commit()

# Now retrieve the values from the dictionary
@@ -218,6 +263,12 @@ def close(self):
:return:
"""

if hasattr(self, "fasta_index") and self.fasta_index is not None:
self.fasta_index.close()
if hasattr(self, "session"):
self.session.close()
if hasattr(self, "engine"):
self.engine.dispose()
return

def load_fasta(self, cache):
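Two of the new checks in this serializer are worth spelling out: mapping a pandas column dtype to the stored `rtype` tag, and rejecting duplicated column headers (pandas silently renames clashing headers to `name.1`, `name.2`, …). A runnable sketch under those assumptions follows; the helper names are hypothetical, and the regex is anchored here unlike in the original.

```python
import re
from collections import Counter
import numpy as np
import pandas as pd

def dtype_to_rtype(dtype) -> str:
    # np.dtype("bool") == dtype is True for boolean columns, and so on;
    # this mirrors the chain of comparisons in ExternalSource.__init__.
    for name in ("bool", "int", "float", "complex"):
        if np.dtype(name) == dtype:
            return name
    raise ValueError("Invalid source rtype: {}".format(dtype))

def check_duplicates(columns):
    # Strip the ".N" suffix pandas appends to clashing headers, then fail
    # loudly if any base name appears more than once.
    stripped = Counter(re.sub(r"\.[0-9]*$", "", str(col)) for col in columns)
    duplicated = [name for name, count in stripped.items() if count > 1]
    if duplicated:
        raise IndexError("Duplicated values in the external table: {}".format(
            ",".join(duplicated)))

data = pd.DataFrame({"tpm": [1.5, 2.0], "is_complete": [True, False]})
check_duplicates(data.columns)
print({col: dtype_to_rtype(data[col].dtype) for col in data.columns})
# {'tpm': 'float', 'is_complete': 'bool'}
```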
17 changes: 9 additions & 8 deletions Mikado/subprograms/serialise.py
@@ -156,10 +156,10 @@ def load_external(args, logger):
return
else:
logger.info("Starting to load external data")
serializer = external.ExternalSerializer(args.json_conf["serialise"]["files"]["external_scores"],
with external.ExternalSerializer(args.json_conf["serialise"]["files"]["external_scores"],
json_conf=args.json_conf,
logger=logger)
serializer()
logger=logger) as serializer:
serializer()
logger.info("Finished loading external data")


@@ -192,7 +192,7 @@ def setup(args):
setattr(args, file_key, getattr(args, file_key).split(","))
args.json_conf["serialise"]["files"][file_key] = getattr(args, file_key)
elif key in ("SimpleComment", "Comment"):
# Necesarry for JSON configurations
# Necessary for JSON configurations
continue
else:
if getattr(args, key, None) or getattr(args, key, None) == 0:
@@ -287,7 +287,7 @@ def setup(args):
args.json_conf["serialise"]["procs"],
args.json_conf["serialise"]["single_thread"])

return args, logger
return args, logger, sql_logger


def serialise(args):
@@ -300,9 +300,9 @@
:return:
"""

args, logger = setup(args)
args, logger, sql_logger = setup(args)

logger.info("Command line: %s", " ".join(sys.argv))
# logger.info("Command line: %s", " ".join(sys.argv))

if args.json_conf["serialise"]["force"] is True:
if args.json_conf["db_settings"]["dbtype"] == "sqlite" and os.path.exists(args.json_conf["db_settings"]["db"]):
@@ -324,10 +324,10 @@
engine.execute("VACUUM")
dbutils.DBBASE.metadata.create_all(engine)

load_external(args, logger)
load_junctions(args, logger)
load_orfs(args, logger)
load_blast(args, logger)
load_external(args, logger)
logger.info("Finished")
try:
return 0
@@ -336,6 +336,7 @@
except Exception as exc:
logger.exception(exc)
finally:
logging.shutdown()
return 0


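`load_external` above now drives the serializer through a `with … as serializer:` block, which presumes `ExternalSerializer` implements the context-manager protocol on top of the new `close()` clean-up. A minimal sketch of that protocol, assuming `close()` is the method extended in `Mikado/serializers/external.py` (the mixin name is hypothetical):

```python
class SerializerContextMixin:
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always release the FASTA index, session and engine, even on error.
        self.close()
        return False  # do not suppress exceptions
```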
Binary file modified Mikado/tests/mikado.db
Binary file not shown.
6 changes: 3 additions & 3 deletions Mikado/tests/test_db_utils.py
@@ -54,11 +54,11 @@ def test_content(self):
# Simple tests based on the static content of the dictionary
self.assertEqual(session.query(Mikado.serializers.junction.Junction).count(), 371,
self.json["db_settings"])
self.assertEqual(session.query(Mikado.serializers.orf.Orf).count(), 80)
self.assertEqual(session.query(Mikado.serializers.orf.Orf).count(), 169)
self.assertEqual(session.query(Mikado.serializers.blast_serializer.Target).count(), 38909)
self.assertEqual(session.query(Mikado.serializers.blast_serializer.Query).count(), 93)
self.assertEqual(session.query(Mikado.serializers.blast_serializer.Hit).count(), 344)
self.assertEqual(session.query(Mikado.serializers.blast_serializer.Hsp).count(), 410)
self.assertEqual(session.query(Mikado.serializers.blast_serializer.Hit).count(), 562)
self.assertEqual(session.query(Mikado.serializers.blast_serializer.Hsp).count(), 669)

first_query = session.query(Mikado.serializers.blast_serializer.Query).limit(1).one()
astup = first_query.as_tuple()
12 changes: 4 additions & 8 deletions Mikado/tests/test_serialise_external.py
@@ -1,16 +1,12 @@
import logging
import os
import unittest
import Mikado
import tempfile
import sqlalchemy.orm
from sqlalchemy import and_ # , or_
from pkg_resources import resource_stream
import pandas as pd
import numpy as np
from Mikado.utilities.dbutils import DBBASE
from Mikado.serializers.external import External, ExternalSource, ExternalSerializer
from Mikado.serializers.orf import Query
from Mikado.utilities.log_utils import create_default_logger
import sqlalchemy.exc


@@ -37,15 +33,15 @@ def __create_session(self):

def test_serialize_source(self):

source = ExternalSource("foo")
source = ExternalSource("foo", np.dtype("float"), False)
self.session.add(source)
self.session.commit()

self.assertEqual(self.session.query(Mikado.serializers.external.ExternalSource).count(), 1)

def test_add_score(self):

source = ExternalSource("cdna_length")
source = ExternalSource("cdna_length", np.dtype("int"), False)
query = Query("foo.1", 200)
self.session.add(source)
self.session.add(query)
@@ -63,7 +59,7 @@ def test_add_score(self):
self.assertEqual(self.session.query(Mikado.serializers.external.External).count(), 1)

def test_wrong_score(self):
source = ExternalSource("cdna_length")
source = ExternalSource("cdna_length", np.dtype("int"), False)
query = Query("foo.1", 200)
self.session.add(source)
self.session.add(query)
5 changes: 3 additions & 2 deletions Mikado/tests/test_system_calls.py
@@ -801,14 +801,15 @@ def test_single_proc(self):
json_conf["log_settings"]["log_level"] = "WARNING"

pick_caller = picker.Picker(json_conf=json_conf)
with self.assertRaises(SystemExit), self.assertLogs("main_logger", "INFO"):
with self.assertRaises(SystemExit), self.assertLogs("main_logger", "INFO") as cm:
pick_caller()
self.assertTrue(os.path.exists(os.path.join(tempfile.gettempdir(), "mikado.monoproc.loci.gff3")))
with to_gff(os.path.join(tempfile.gettempdir(), "mikado.monoproc.loci.gff3")) as inp_gff:
lines = [_ for _ in inp_gff if not _.header is True]
self.assertGreater(len(lines), 0)
self.assertGreater(len([_ for _ in lines if _.is_transcript is True]), 0)
self.assertGreater(len([_ for _ in lines if _.feature == "mRNA"]), 0)
self.assertGreater(len([_ for _ in lines if _.feature == "mRNA"]), 0,
[_ for _ in cm.output if "WARNING" in _])
self.assertGreater(len([_ for _ in lines if _.feature == "CDS"]), 0)

[os.remove(_) for _ in glob.glob(os.path.join(tempfile.gettempdir(), "mikado.monoproc.") + "*")]
