Commit
Development (#184)
* This should address #173 (both configuration file and docs) and #158

* Fix #181 and small bug fix for parsing Mikado annotations.

* Progress for #142 - this should fix the wrong ORF calculation in cases where the CDS was open at the 5' end.

* Fixed the previous commit (again for #142)

* #142: corrected and tested the issue with one-off exons, for padding.

* This should fix and test #142 for good.

* Removed spurious warning/error messages

* #142: solved a bug which caused truncated transcripts at the 5' end not to be padded.

* #142: solved a problem which caused a false abort for transcripts on the - strand with a changed stop codon.

* #142: fixing previous commit

* Pushing the fix for #182 onto the development branch

* Fix #183

* Fix #183 and previous commit

* #183: now Mikado configure will set a seed when generating the configuration file. The seed will be explicitly mentioned in the log.

* #177: made ORF loading slightly faster with pysam. Also made XML serialisation much faster using SQL sessions and multiprocessing.Pool instead of queues.

* Solved an annoying bug that caused Mikado to crash with TAIR GFF3s.
lucventurini authored Jun 18, 2019
1 parent 275ff65 commit 28898f1
Showing 29 changed files with 577 additions and 475 deletions.
14 changes: 10 additions & 4 deletions Mikado/configuration/configuration_blueprint.json
@@ -4,15 +4,17 @@
     "type": "object",
     "properties": {
         "SimpleComment": {"type": "object", "properties": {},
-            "SimpleComment": ["Configuration file for Mikado. Sections:",
+            "SimpleComment": ["Configuration file for Mikado. Please note that absent values, e.g. if a field is deleted, will be imputed by the default values for the program.",
+            "Sections:",
             "- log_settings: settings related to the verbosity of logs.",
             "- db_settings: Database settings, for Mikado serialise and pick.",
             "- reference: Settings related to the genome reference.",
             "- prepare: settings related to the Mikado prepare stage",
             "- serialise: settings related to the Mikado serialise stage",
             "- pick: settings related to the Mikado pick stage",
             "- multiprocessing_method: which method (fork, spawn, forkserver) Mikado should use for multiprocessing."],
-            "Comment": ["Configuration file for Mikado. Sections:",
+            "Comment": ["Configuration file for Mikado. Please note that absent values, e.g. if a field is deleted, will be imputed by the default values for the program.",
+            "Sections:",
             "- log_settings: settings related to the verbosity of logs.",
             "- db_settings: Database settings, for Mikado serialise and pick.",
             "- reference: Settings related to the genome reference.",
@@ -21,6 +23,10 @@
             "- pick: settings related to the Mikado pick stage",
             "- multiprocessing_method: which method (fork, spawn, forkserver) Mikado should use for multiprocessing."]
         },
+        "seed": {
+            "type": ["integer", "null"],
+            "default": null
+        },
         "multiprocessing_method": {
             "type": "string",
             "default": "",
@@ -420,7 +426,7 @@
         },
         "ts_distance": {
             "type": "integer",
-            "default": 300,
+            "default": 1000,
             "minimum": 0
         },
         "pad": {
@@ -430,7 +436,7 @@
         "ts_max_splices": {
             "type": "integer",
             "minimum": 0,
-            "default": 1
+            "default": 2
         }
     }
 },
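Aside: the amended schema comment above promises that absent keys are imputed from the blueprint defaults. A hedged sketch of how such imputation can work, with a toy schema standing in for the real blueprint (the helper and the layout are hypothetical, not Mikado's actual code):

# Hypothetical helper: fill missing configuration keys from a JSON schema's
# "default" values, mimicking the behaviour described in the comment above.
def impute_defaults(config, schema):
    for key, subschema in schema.get("properties", {}).items():
        if key not in config and "default" in subschema:
            config[key] = subschema["default"]
        elif isinstance(config.get(key), dict):
            impute_defaults(config[key], subschema)
    return config

toy_schema = {"properties": {
    "seed": {"type": ["integer", "null"], "default": None},
    "pick": {"properties": {
        "ts_distance": {"type": "integer", "default": 1000},
        "ts_max_splices": {"type": "integer", "default": 2}}}}}
print(impute_defaults({"pick": {}}, toy_schema))
# {'pick': {'ts_distance': 1000, 'ts_max_splices': 2}, 'seed': None}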
15 changes: 15 additions & 0 deletions Mikado/configuration/configurator.py
@@ -23,6 +23,8 @@
 from ..exceptions import InvalidJson, UnrecognizedRescaler
 from ..utilities import merge_dictionaries
 from ..utilities.log_utils import create_default_logger
+import sys
+import random


 __author__ = "Luca Venturini"
@@ -603,6 +605,13 @@ def check_json(json_conf, simple=False, external_dict=None, logger=None):
     logger.debug("Scoring parameters: {}".format("\n".join(["\n"] + [
         "{}: {}".format(_, json_conf["scoring"][_]) for _ in json_conf["scoring"].keys()])))

+    seed = json_conf.get("seed", None)
+    if seed is None:
+        seed = random.randint(0, sys.maxsize)
+        logger.info("Random seed: {}", seed)
+    json_conf["seed"] = seed
+    random.seed(seed)
+
     return json_conf
@@ -644,4 +653,10 @@ def to_json(string, simple=False, logger=None):
     except Exception as exc:
         raise OSError((exc, string))

+    seed = json_dict.get("seed", None)
+    if seed is None:
+        seed = random.randint(0, sys.maxsize)
+        logger.info("Random seed: {}", seed)
+    random.seed(seed)
+
     return json_dict
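The effect of the seed handling added above: once the (possibly freshly drawn) seed is stored and fed to random.seed(), every later stochastic decision replays identically on a re-run. A minimal standalone sketch of the same pattern (illustrative, not the Mikado functions themselves):

import random
import sys

def init_seed(conf):
    # Draw a seed if none was given, keep it in the configuration so it can
    # be logged and reused, then seed the global RNG.
    seed = conf.get("seed")
    if seed is None:
        seed = random.randint(0, sys.maxsize)
        conf["seed"] = seed
    random.seed(seed)
    return seed

conf = {"seed": None}
init_seed(conf)
first_draw = random.random()
init_seed(conf)                        # same stored seed ...
assert random.random() == first_draw   # ... same sequence of draws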
3 changes: 2 additions & 1 deletion Mikado/loci/abstractlocus.py
@@ -193,10 +193,11 @@ def __getstate__(self):

         if hasattr(self, "json_conf"):
             # This removes unpicklable compiled attributes, eg in "requirements" or "as_requirements"
-            state["json_conf"] = self.json_conf.copy()
+            if "json_conf" not in state:
+                state["json_conf"] = self.json_conf.copy()
             for key in self.json_conf:
                 if (isinstance(self.json_conf[key], dict) and
                         self.json_conf[key].get("compiled", None) is not None):
                     assert "json_conf" in state
                     assert key in state["json_conf"]
                     del state["json_conf"][key]["compiled"]
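Background for the guard above: the "compiled" entries hold results of compile(), i.e. code objects, which pickle rejects, so they must be stripped from the state without mutating the live configuration. A toy version of the same pattern (hypothetical class, not Mikado's):

import pickle

class Holder:
    def __init__(self):
        self.json_conf = {"requirements": {
            "expression": "x > 5",
            "compiled": compile("x > 5", "<expression>", "eval")}}

    def __getstate__(self):
        state = self.__dict__.copy()
        # Copy the outer dict so the live object keeps its compiled entry.
        state["json_conf"] = dict(self.json_conf)
        for key, value in self.json_conf.items():
            if isinstance(value, dict) and value.get("compiled") is not None:
                state["json_conf"][key] = {
                    k: v for k, v in value.items() if k != "compiled"}
        return state

blob = pickle.dumps(Holder())   # works: no code object reaches pickle
assert b"compiled" not in blob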
108 changes: 66 additions & 42 deletions Mikado/loci/locus.py
@@ -709,7 +709,10 @@ def pad_transcripts(self) -> set:
         """

         try:
-            self.fai = pysam.FastaFile(self.json_conf["reference"]["genome"])
+            if isinstance(self.json_conf["reference"]["genome"], pysam.FastaFile):
+                self.fai = self.json_conf["reference"]["genome"]
+            else:
+                self.fai = pysam.FastaFile(self.json_conf["reference"]["genome"])
         except KeyError:
             raise KeyError(self.json_conf.keys())
@@ -730,13 +733,25 @@
                 templates.add(__to_modify[tid][1].id)

             self.logger.debug("Expanding %s to have start %s (from %s) and end %s (from %s)",
-                              tid, __to_modify[tid][0],
-                              self[tid].start, __to_modify[tid][1], self[tid].end)
-            new_transcript = expand_transcript(self[tid].deepcopy(),
-                                               __to_modify[tid][0],
-                                               __to_modify[tid][1],
-                                               self.fai,
-                                               self.logger)
+                              tid, __to_modify[tid][0] if not __to_modify[tid][0] else __to_modify[tid][0].start,
+                              self[tid].start,
+                              __to_modify[tid][1] if not __to_modify[tid][1] else __to_modify[tid][1].end,
+                              self[tid].end)
+            try:
+                new_transcript = expand_transcript(self[tid].deepcopy(),
+                                                   __to_modify[tid][0],
+                                                   __to_modify[tid][1],
+                                                   self.fai,
+                                                   self.logger)
+            except KeyboardInterrupt:
+                raise
+            except Exception as exc:
+                self.logger.exception(exc)
+                raise
+            if (new_transcript.start == self.transcripts[tid].end) and (new_transcript.end == self.transcripts[tid].end):
+                self.logger.debug("No expansion took place for %s!", tid)
+            else:
+                self.logger.debug("Expansion took place for %s!", tid)
             self.transcripts[tid] = new_transcript

         self.exons = set()
@@ -755,6 +770,7 @@ def define_graph(self, objects: dict, inters=None, three_prime=False):
             inters = self._share_extreme

         for obj, other_obj in combinations(objects.keys(), 2):
+            self.logger.debug("Comparing %s to %s (%s')", obj, other_obj, "5" if not three_prime else "3")
             if obj == other_obj:
                 continue
             else:
@@ -847,9 +863,11 @@ def _share_five_prime(self, first: Transcript, second: Transcript):
         first, second = sorted([first, second], key=operator.attrgetter("start"))
         # Now let us check whether the second falls within an intron
         matched = first.segmenttree.find(second.exons[0][0], second.exons[0][1])
-        if matched[0].value == "intron":
+        self.logger.debug("{second.id} last exon {second.exons[0]} intersects in {first.id}: {matched}".format(
+            **locals()))
+        if matched[0].value == "intron" or second.exons[0][0] < matched[0].start:
             decision = False
-            reason = "{second} first exon ends within an intron of {first}".format(**locals())
+            reason = "{second.id} first exon ends within an intron of {first.id}".format(**locals())
         else:
             upstream = [_ for _ in first.find_upstream(second.exons[0][0], second.exons[0][1])
                         if _.value == "exon" and _ not in matched]
@@ -868,8 +886,8 @@
             decision = (ts_distance <= self.ts_distance) and (ts_splices <= self.ts_max_splices)
             if decision:
                 decision = (second, first)
-            reason = "{first.id} {doesit} overlap {second.id} (distance {ts_distance} max {self.ts_distance}, splices {ts_splices} max {self.ts_max_splices})".format(
-                doesit="does" if decision else "does not", **locals())
+            reason = "{first.id} {doesit} overlap {second.id} (distance {ts_distance} max {self.ts_distance}, splices \
+{ts_splices} max {self.ts_max_splices})".format(doesit="does" if decision else "does not", **locals())
         self.logger.debug(reason)
         return decision

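The decision computed above reduces to the two thresholds whose blueprint defaults this commit raises: ts_distance (300 to 1000) and ts_max_splices (1 to 2). Restated schematically (illustrative only, not the actual method):

def may_share_extreme(ts_distance, ts_splices,
                      max_distance=1000, max_splices=2):
    # Two transcripts may be padded to a common extreme only if the extra
    # distance walked along the template and the splice sites crossed on
    # the way both stay within the configured limits.
    return ts_distance <= max_distance and ts_splices <= max_splices

print(may_share_extreme(800, 1))   # True: close enough, one splice crossed
print(may_share_extreme(800, 3))   # False: too many splice sites crossed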
@@ -886,7 +904,7 @@ def _share_three_prime(self, first: Transcript, second: Transcript):
         first, second = sorted([first, second], key=operator.attrgetter("end"), reverse=False)
         # Now let us check whether the second falls within an intron
         matched = second.segmenttree.find(first.exons[-1][0], first.exons[-1][1])
-        if matched[-1].value == "intron":
+        if matched[-1].value == "intron" or first.exons[-1][1] > matched[-1].end:
             decision = False
             reason = "{second.id} last exon ends within an intron of {first.id}".format(**locals())
         else:
@@ -1083,17 +1101,16 @@ def expand_transcript(transcript: Transcript,
         start_transcript, end_transcript = end_transcript, start_transcript

     # Make a backup copy of the transcript
+    logger.debug("Starting expansion of %s", transcript.id)
     backup = transcript.deepcopy()

     # First get the ORFs
     transcript.logger = logger
     # Remove the CDS and unfinalize
-    logger.debug("Starting expansion of %s", transcript.id)
     strand = transcript.strand
     transcript.strip_cds()
     transcript.unfinalize()

-    assert strand == transcript.strand
-
     downstream = 0
     down_exons = []
@@ -1114,6 +1131,7 @@

     new_exons = up_exons + down_exons
     if not new_exons:
+        logger.debug("%s does not need to be expanded, exiting", transcript.id)
         return backup

     transcript.add_exons(new_exons)
@@ -1123,43 +1141,52 @@
     if transcript.strand == "-":
         downstream, upstream = upstream, downstream

-    if up_exons or down_exons:
-        seq = check_expanded(transcript, backup, start_transcript, end_transcript,
-                             fai, upstream, downstream, logger)
-        transcript = enlarge_orfs(transcript, backup, seq, upstream, downstream, logger)
-        transcript.finalize()
+    if (up_exons or down_exons):
+        if backup.is_coding:
+            seq = check_expanded(transcript, backup, start_transcript, end_transcript,
+                                 fai, upstream, downstream, logger)
+            transcript = enlarge_orfs(transcript, backup, seq, upstream, downstream, logger)
+            transcript.finalize()
     else:
         return backup

     # Now finalize again
-    if upstream > 0 or downstream > 0:
+    logger.debug("%s: start (before %s, now %s, %s), end (before %s, now %s, %s)",
+                 transcript.id,
+                 backup.start, transcript.start, transcript.start < backup.start,
+                 backup.end, transcript.end, transcript.end > backup.end)
+    if transcript.start < backup.start or transcript.end > backup.end:
         transcript.attributes["padded"] = True

     # Now check that we have a valid expansion
     if backup.is_coding and not transcript.is_coding:
         # Something has gone wrong. Just return the original transcript.
+        assert new_exons
-        logger.info("Padding %s would lead to an invalid CDS. Aborting.",
+        logger.info("Padding %s would lead to an invalid CDS (up exons: %s). Aborting.",
                     transcript.id, up_exons)
         return backup
-    elif (backup.is_coding and ((backup.strand == "-" and backup.combined_cds_end < transcript.combined_cds_end) or
-                                (backup.combined_cds_end > transcript.combined_cds_end))):
-        message = "Padding %s would lead to an in-frame stop codon (%s to %s, vs original %s to %s. Aborting." % (
-            transcript.id, transcript.combined_cds_start, transcript.combined_cds_end,
-            backup.combined_cds_start, backup.combined_cds_end
-        )
-        logger.info(message)
-        return backup
-    else:
-        message = "{transcript.id} has now start {transcript.start}, end {transcript.end}"
-        if (backup.is_coding and ((backup.combined_cds_end != transcript.combined_cds_end) or
-                (backup.combined_cds_start != transcript.combined_cds_start))):
+    elif backup.is_coding:
+        abort = False
+        if backup.strand == "-" and backup.combined_cds_end < transcript.combined_cds_end:
+            abort = True
+        elif backup.strand != "-" and backup.combined_cds_end > transcript.combined_cds_end:
+            abort = True
+        if abort is True:
+            msg = "Padding {} (strand: {}) would lead to an in-frame stop codon ({} to {}, vs original {} to {}.\
+Aborting.".format(transcript.id, backup.strand, transcript.combined_cds_start, transcript.combined_cds_end,
+                  backup.combined_cds_start, backup.combined_cds_end)
+            logger.info(msg)
+            return backup

+    message = "{transcript.id} has now start {transcript.start}, end {transcript.end}"
+    if (backup.is_coding and ((backup.combined_cds_end != transcript.combined_cds_end) or
+                              (backup.combined_cds_start != transcript.combined_cds_start))):
         transcript.attributes["cds_padded"] = True
         message += "; CDS moved to {transcript.combined_cds_start}, end {transcript.combined_cds_end}"
-    else:
-        transcript.attributes["cds_padded"] = False
-        message += "."
-    logger.info(message.format(**locals()))
+    elif backup.is_coding:
+        transcript.attributes["cds_padded"] = False
+        message += "."
+    logger.info(message.format(**locals()))

     return transcript

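The strand-aware abort above is the core of the fix for transcripts on the - strand with a changed stop codon: genomic coordinates run against the reading direction on the minus strand, so the comparison sign flips. A schematic restatement (illustrative, not the actual implementation):

def stop_codon_moved(strand, old_cds_end, new_cds_end):
    # Padding must never pull the stop codon upstream of the original one.
    # On "+" that shows as a smaller genomic CDS end; on "-" as a larger one.
    if strand == "-":
        return new_cds_end > old_cds_end
    return new_cds_end < old_cds_end

print(stop_codon_moved("+", 5000, 4700))   # True: premature stop, abort
print(stop_codon_moved("-", 2000, 1500))   # False: stop kept or extended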
@@ -1401,14 +1428,11 @@ def enlarge_orfs(transcript: Transcript,
             internal_orfs = []
     else:
         internal_orfs = []
-        internal_orfs = []

     if not internal_orfs:
         return transcript

-    logger.debug("Enlarging the ORFs for TID %s", transcript.id)
     new_orfs = []

     for orf in internal_orfs:
-        logger.debug("Old ORF: %s", str(orf))
         try:
4 changes: 4 additions & 0 deletions Mikado/loci/reference_gene.py
@@ -5,6 +5,7 @@
 Minimal checks.
 """

+import re
 import copy
 import logging
 import operator
@@ -157,6 +158,9 @@ def add_exon(self, row):

         for parent in (_ for _ in row.parent if _ not in self.transcripts):
             found = False
+            if parent.endswith("-Protein") and re.sub("-Protein", "", parent) in self.transcripts:
+                continue
+
             for tid in self.transcripts:
                 if parent in self.transcripts[tid].derived_children:
                     found = True
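This is the fix for the TAIR GFF3 crash from the commit message: TAIR files attach protein features whose Parent is the mRNA ID plus a "-Protein" suffix, and that alias must not be mistaken for an unknown transcript. The check in isolation (hypothetical IDs):

import re

transcripts = {"AT1G01010.1"}

def is_protein_alias(parent):
    # TAIR GFF3s use "<mRNA id>-Protein" as the Parent of protein features;
    # if the bare mRNA ID is already known, skip the feature instead of
    # creating a phantom transcript.
    return parent.endswith("-Protein") and \
        re.sub("-Protein", "", parent) in transcripts

print(is_protein_alias("AT1G01010.1-Protein"))   # True: alias of a known mRNA
print(is_protein_alias("AT1G01010.2-Protein"))   # False: no matching mRNA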
