From 28898f121e1c88761c871a2a0cc1ff7bbdc349c1 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Tue, 18 Jun 2019 17:59:55 -0400 Subject: [PATCH] Development (#184) * This should address #173 (both configuration file and docs) and #158 * Fix #181 and small bug fix for parsing Mikado annotations. * Progress for #142 - this should fix the wrong ORF calculation for cases when the CDS was open at the 5' end. * Fixed previous commit (always for #142) * #142: corrected and tested the issue with one-off exons, for padding. * This should fix and test #142 for good. * Removed spurious warning/error messages * #142: solved a bug which caused truncated transcripts at the 5' end not to be padded. * #142: solved a problem which caused a false abort for transcripts on the - strand with changed stop codon. * #142: fixing previous commit * Pushing the fix for #182 onto the development branch * Fix #183 * Fix #183 and previous commit * #183: now Mikado configure will set a seed when generating the configuration file. The seed will be explicitly mentioned in the log. * #177: made ORF loading slightly faster with pysam. Also made XML serialisation much faster using SQL sessions and multiprocessing.Pool instead of queues. * Solved annoying bug that caused Mikado to crash with TAIR GFF3s. --- .../configuration_blueprint.json | 14 +- Mikado/configuration/configurator.py | 15 + Mikado/loci/abstractlocus.py | 3 +- Mikado/loci/locus.py | 108 +++-- Mikado/loci/reference_gene.py | 4 + Mikado/parsers/bed12.py | 73 +-- Mikado/parsers/blast_utils.py | 8 +- Mikado/parsers/gfannotation.py | 2 + Mikado/picking/picker.py | 1 + Mikado/serializers/blast_serializer/utils.py | 35 +- .../blast_serializer/xml_serialiser.py | 429 ++++++------------ Mikado/serializers/orf.py | 22 +- Mikado/subprograms/configure.py | 14 +- Mikado/subprograms/pick.py | 3 + Mikado/subprograms/prepare.py | 8 + Mikado/subprograms/serialise.py | 29 +- Mikado/tests/locus_test.py | 138 +++++- Mikado/tests/pad_three_neg.bed12 | 2 + Mikado/tests/pad_utr.bed12 | 2 + Mikado/tests/padding_test.bed12 | 6 +- Mikado/tests/phasing_padding.bed12 | 2 + Mikado/tests/test_system_calls.py | 117 +++-- Mikado/transcripts/transcript.py | 2 +- Mikado/utilities/dbutils.py | 2 +- Mikado/utilities/log_utils.py | 4 +- docs/Usage/Configure.rst | 4 +- environment.yml | 4 +- sample_data/chr5.fas.gz.fai | 1 + sample_data/chr5.fas.gz.gzi | Bin 0 -> 6696 bytes 29 files changed, 577 insertions(+), 475 deletions(-) create mode 100644 Mikado/tests/pad_three_neg.bed12 create mode 100644 Mikado/tests/pad_utr.bed12 create mode 100644 Mikado/tests/phasing_padding.bed12 create mode 100644 sample_data/chr5.fas.gz.fai create mode 100644 sample_data/chr5.fas.gz.gzi diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json index a7176a517..966c70caf 100644 --- a/Mikado/configuration/configuration_blueprint.json +++ b/Mikado/configuration/configuration_blueprint.json @@ -4,7 +4,8 @@ "type": "object", "properties": { "SimpleComment": {"type": "object", "properties": {}, - "SimpleComment": ["Configuration file for Mikado. Sections:", + "SimpleComment": ["Configuration file for Mikado. Please note that absent values, e.g. 
if a field is deleted, will be imputed by the default values for the program.", + "Sections:", "- log_settings: settings related to the verbosity of logs.", "- db_settings: Database settings, for Mikado serialise and pick.", "- reference: Settings related to the genome reference.", @@ -12,7 +13,8 @@ "- serialise: settings related to the Mikado serialise stage", "- pick: settings related to the Mikado pick stage", "- multiprocessing_method: which method (fork, spawn, forkserver) Mikado should use for multiprocessing."], - "Comment": ["Configuration file for Mikado. Sections:", + "Comment": ["Configuration file for Mikado. Please note that absent values, e.g. if a field is deleted, will be imputed by the default values for the program.", + "Sections:", "- log_settings: settings related to the verbosity of logs.", "- db_settings: Database settings, for Mikado serialise and pick.", "- reference: Settings related to the genome reference.", @@ -21,6 +23,10 @@ "- pick: settings related to the Mikado pick stage", "- multiprocessing_method: which method (fork, spawn, forkserver) Mikado should use for multiprocessing."] }, + "seed": { + "type": ["integer", "null"], + "default": null + }, "multiprocessing_method": { "type": "string", "default": "", @@ -420,7 +426,7 @@ }, "ts_distance": { "type": "integer", - "default": 300, + "default": 1000, "minimum": 0 }, "pad": { @@ -430,7 +436,7 @@ "ts_max_splices": { "type": "integer", "minimum": 0, - "default": 1 + "default": 2 } } }, diff --git a/Mikado/configuration/configurator.py b/Mikado/configuration/configurator.py index 109020e2b..e49bea291 100644 --- a/Mikado/configuration/configurator.py +++ b/Mikado/configuration/configurator.py @@ -23,6 +23,8 @@ from ..exceptions import InvalidJson, UnrecognizedRescaler from ..utilities import merge_dictionaries from ..utilities.log_utils import create_default_logger +import sys +import random __author__ = "Luca Venturini" @@ -603,6 +605,13 @@ def check_json(json_conf, simple=False, external_dict=None, logger=None): logger.debug("Scoring parameters: {}".format("\n".join(["\n"] + [ "{}: {}".format(_, json_conf["scoring"][_]) for _ in json_conf["scoring"].keys()]))) + seed = json_conf.get("seed", None) + if seed is None: + seed = random.randint(0, sys.maxsize) + logger.info("Random seed: {}", seed) + json_conf["seed"] = seed + random.seed(seed) + return json_conf @@ -644,4 +653,10 @@ def to_json(string, simple=False, logger=None): except Exception as exc: raise OSError((exc, string)) + seed = json_dict.get("seed", None) + if seed is None: + seed = random.randint(0, sys.maxsize) + logger.info("Random seed: {}", seed) + random.seed(seed) + return json_dict diff --git a/Mikado/loci/abstractlocus.py b/Mikado/loci/abstractlocus.py index d770a1b30..29471e624 100644 --- a/Mikado/loci/abstractlocus.py +++ b/Mikado/loci/abstractlocus.py @@ -193,10 +193,11 @@ def __getstate__(self): if hasattr(self, "json_conf"): # This removes unpicklable compiled attributes, eg in "requirements" or "as_requirements" + if "json_conf" not in state: + state["json_conf"] = self.json_conf.copy() for key in self.json_conf: if (isinstance(self.json_conf[key], dict) and self.json_conf[key].get("compiled", None) is not None): - assert "json_conf" in state assert key in state["json_conf"] del state["json_conf"][key]["compiled"] diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py index 858edbb75..f0cdb26d4 100644 --- a/Mikado/loci/locus.py +++ b/Mikado/loci/locus.py @@ -709,7 +709,10 @@ def pad_transcripts(self) -> set: """ try: - self.fai = 
pysam.FastaFile(self.json_conf["reference"]["genome"]) + if isinstance(self.json_conf["reference"]["genome"], pysam.FastaFile): + self.fai = self.json_conf["reference"]["genome"] + else: + self.fai = pysam.FastaFile(self.json_conf["reference"]["genome"]) except KeyError: raise KeyError(self.json_conf.keys()) @@ -730,13 +733,25 @@ def pad_transcripts(self) -> set: templates.add(__to_modify[tid][1].id) self.logger.debug("Expanding %s to have start %s (from %s) and end %s (from %s)", - tid, __to_modify[tid][0], - self[tid].start, __to_modify[tid][1], self[tid].end) - new_transcript = expand_transcript(self[tid].deepcopy(), - __to_modify[tid][0], - __to_modify[tid][1], - self.fai, - self.logger) + tid, __to_modify[tid][0] if not __to_modify[tid][0] else __to_modify[tid][0].start, + self[tid].start, + __to_modify[tid][1] if not __to_modify[tid][1] else __to_modify[tid][1].end, + self[tid].end) + try: + new_transcript = expand_transcript(self[tid].deepcopy(), + __to_modify[tid][0], + __to_modify[tid][1], + self.fai, + self.logger) + except KeyboardInterrupt: + raise + except Exception as exc: + self.logger.exception(exc) + raise + if (new_transcript.start == self.transcripts[tid].end) and (new_transcript.end == self.transcripts[tid].end): + self.logger.debug("No expansion took place for %s!", tid) + else: + self.logger.debug("Expansion took place for %s!", tid) self.transcripts[tid] = new_transcript self.exons = set() @@ -755,6 +770,7 @@ def define_graph(self, objects: dict, inters=None, three_prime=False): inters = self._share_extreme for obj, other_obj in combinations(objects.keys(), 2): + self.logger.debug("Comparing %s to %s (%s')", obj, other_obj, "5" if not three_prime else "3") if obj == other_obj: continue else: @@ -847,9 +863,11 @@ def _share_five_prime(self, first: Transcript, second: Transcript): first, second = sorted([first, second], key=operator.attrgetter("start")) # Now let us check whether the second falls within an intron matched = first.segmenttree.find(second.exons[0][0], second.exons[0][1]) - if matched[0].value == "intron": + self.logger.debug("{second.id} last exon {second.exons[0]} intersects in {first.id}: {matched}".format( + **locals())) + if matched[0].value == "intron" or second.exons[0][0] < matched[0].start: decision = False - reason = "{second} first exon ends within an intron of {first}".format(**locals()) + reason = "{second.id} first exon ends within an intron of {first.id}".format(**locals()) else: upstream = [_ for _ in first.find_upstream(second.exons[0][0], second.exons[0][1]) if _.value == "exon" and _ not in matched] @@ -868,8 +886,8 @@ def _share_five_prime(self, first: Transcript, second: Transcript): decision = (ts_distance <= self.ts_distance) and (ts_splices <= self.ts_max_splices) if decision: decision = (second, first) - reason = "{first.id} {doesit} overlap {second.id} (distance {ts_distance} max {self.ts_distance}, splices {ts_splices} max {self.ts_max_splices})".format( - doesit="does" if decision else "does not", **locals()) + reason = "{first.id} {doesit} overlap {second.id} (distance {ts_distance} max {self.ts_distance}, splices \ +{ts_splices} max {self.ts_max_splices})".format(doesit="does" if decision else "does not", **locals()) self.logger.debug(reason) return decision @@ -886,7 +904,7 @@ def _share_three_prime(self, first: Transcript, second: Transcript): first, second = sorted([first, second], key=operator.attrgetter("end"), reverse=False) # Now let us check whether the second falls within an intron matched = 
second.segmenttree.find(first.exons[-1][0], first.exons[-1][1]) - if matched[-1].value == "intron": + if matched[-1].value == "intron" or first.exons[-1][1] > matched[-1].end: decision = False reason = "{second.id} last exon ends within an intron of {first.id}".format(**locals()) else: @@ -1083,17 +1101,16 @@ def expand_transcript(transcript: Transcript, start_transcript, end_transcript = end_transcript, start_transcript # Make a backup copy of the transcript + logger.debug("Starting expansion of %s", transcript.id) backup = transcript.deepcopy() # First get the ORFs - transcript.logger = logger # Remove the CDS and unfinalize + logger.debug("Starting expansion of %s", transcript.id) strand = transcript.strand transcript.strip_cds() transcript.unfinalize() - assert strand == transcript.strand - downstream = 0 down_exons = [] @@ -1114,6 +1131,7 @@ def expand_transcript(transcript: Transcript, new_exons = up_exons + down_exons if not new_exons: + logger.debug("%s does not need to be expanded, exiting", transcript.id) return backup transcript.add_exons(new_exons) @@ -1123,43 +1141,52 @@ def expand_transcript(transcript: Transcript, if transcript.strand == "-": downstream, upstream = upstream, downstream - if up_exons or down_exons: - seq = check_expanded(transcript, backup, start_transcript, end_transcript, - fai, upstream, downstream, logger) - transcript = enlarge_orfs(transcript, backup, seq, upstream, downstream, logger) - transcript.finalize() + if (up_exons or down_exons): + if backup.is_coding: + seq = check_expanded(transcript, backup, start_transcript, end_transcript, + fai, upstream, downstream, logger) + transcript = enlarge_orfs(transcript, backup, seq, upstream, downstream, logger) + transcript.finalize() else: return backup # Now finalize again - if upstream > 0 or downstream > 0: + logger.debug("%s: start (before %s, now %s, %s), end (before %s, now %s, %s)", + transcript.id, + backup.start, transcript.start, transcript.start < backup.start, + backup.end, transcript.end, transcript.end > backup.end) + if transcript.start < backup.start or transcript.end > backup.end: transcript.attributes["padded"] = True # Now check that we have a valid expansion if backup.is_coding and not transcript.is_coding: # Something has gone wrong. Just return the original transcript. assert new_exons - logger.info("Padding %s would lead to an invalid CDS. Aborting.", + logger.info("Padding %s would lead to an invalid CDS (up exons: %s). Aborting.", transcript.id, up_exons) return backup - elif (backup.is_coding and ((backup.strand == "-" and backup.combined_cds_end < transcript.combined_cds_end) or - (backup.combined_cds_end > transcript.combined_cds_end))): - message = "Padding %s would lead to an in-frame stop codon (%s to %s, vs original %s to %s. Aborting." 
% ( - transcript.id, transcript.combined_cds_start, transcript.combined_cds_end, - backup.combined_cds_start, backup.combined_cds_end - ) - logger.info(message) - return backup - else: - message = "{transcript.id} has now start {transcript.start}, end {transcript.end}" - if (backup.is_coding and ((backup.combined_cds_end != transcript.combined_cds_end) or - (backup.combined_cds_start != transcript.combined_cds_start))): + elif backup.is_coding: + abort = False + if backup.strand == "-" and backup.combined_cds_end < transcript.combined_cds_end: + abort = True + elif backup.strand != "-" and backup.combined_cds_end > transcript.combined_cds_end: + abort = True + if abort is True: + msg = "Padding {} (strand: {}) would lead to an in-frame stop codon ({} to {}, vs original {} to {}.\ +Aborting.".format(transcript.id, backup.strand, transcript.combined_cds_start, transcript.combined_cds_end, + backup.combined_cds_start, backup.combined_cds_end) + logger.info(msg) + return backup + + message = "{transcript.id} has now start {transcript.start}, end {transcript.end}" + if (backup.is_coding and ((backup.combined_cds_end != transcript.combined_cds_end) or + (backup.combined_cds_start != transcript.combined_cds_start))): transcript.attributes["cds_padded"] = True message += "; CDS moved to {transcript.combined_cds_start}, end {transcript.combined_cds_end}" - else: - transcript.attributes["cds_padded"] = False - message += "." - logger.info(message.format(**locals())) + elif backup.is_coding: + transcript.attributes["cds_padded"] = False + message += "." + logger.info(message.format(**locals())) return transcript @@ -1401,14 +1428,11 @@ def enlarge_orfs(transcript: Transcript, internal_orfs = [] else: internal_orfs = [] - internal_orfs = [] if not internal_orfs: return transcript - logger.debug("Enlarging the ORFs for TID %s", transcript.id) new_orfs = [] - for orf in internal_orfs: logger.debug("Old ORF: %s", str(orf)) try: diff --git a/Mikado/loci/reference_gene.py b/Mikado/loci/reference_gene.py index d12344f2d..d8ccab075 100644 --- a/Mikado/loci/reference_gene.py +++ b/Mikado/loci/reference_gene.py @@ -5,6 +5,7 @@ Minimal checks. 
""" +import re import copy import logging import operator @@ -157,6 +158,9 @@ def add_exon(self, row): for parent in (_ for _ in row.parent if _ not in self.transcripts): found = False + if parent.endswith("-Protein") and re.sub("-Protein", "", parent) in self.transcripts: + continue + for tid in self.transcripts: if parent in self.transcripts[tid].derived_children: found = True diff --git a/Mikado/parsers/bed12.py b/Mikado/parsers/bed12.py index 4f0044ed5..eff67adfe 100644 --- a/Mikado/parsers/bed12.py +++ b/Mikado/parsers/bed12.py @@ -393,7 +393,6 @@ def __check_validity(self, transcriptomic, fasta_index, sequence): self.has_stop_codon = False if transcriptomic is True and self.coding is True and (fasta_index is not None or sequence is not None): - orig_phase = self.phase self.validity_checked = True if sequence is not None: self.fasta_length = len(sequence) @@ -417,7 +416,7 @@ def __check_validity(self, transcriptomic, fasta_index, sequence): (self.thick_start - 1 if not self.phase else self.start + self.phase - 1):self.thick_end] else: orf_sequence = sequence[(self.thick_start - 1):( - self.thick_end if not self.phase else self.end - self.phase)].reverse_complement() + self.thick_end if not self.phase else self.end - (3 - self.phase) % 3)].reverse_complement() self.start_codon = str(orf_sequence)[:3].upper() self.stop_codon = str(orf_sequence[-3:]).upper() @@ -464,10 +463,10 @@ def _adjust_start(self, sequence, orf_sequence): self.thick_start -= 3 else: self.thick_end += 3 - if sequence[pos -3:pos] in self.table.start_codons: + if sequence[pos - 3:pos] in self.table.start_codons: # We have found a valid methionine. break - elif sequence[pos -3:pos] in self.table.stop_codons: + elif sequence[pos - 3:pos] in self.table.stop_codons: if self.strand == "+": self.thick_start += 3 else: @@ -500,14 +499,14 @@ def _adjust_start(self, sequence, orf_sequence): self.phase = 0 else: if self.end - self.thick_end <= 2: - self.phase = self.end - self.thick_end + new_phase = max(self.end - self.thick_end, 0) + self.phase = new_phase self.thick_end = self.end else: self.phase = 0 else: self.phase = 0 - if self.invalid: raise ValueError(self.invalid_reason) @@ -811,42 +810,54 @@ def expand(self, sequence, upstream, downstream, expand_orf=False, logger=create self.fasta_length = len(sequence) # I presume that the sequence is already in the right orientation - - self.start_codon = str( - old_sequence[self.thick_start + self.phase - 1:self.thick_start + self.phase + 2]).upper() - - last_codon_start = (self.thick_end - 3) - ((self.thick_end - self.thick_start - self.phase - 2) % 3) - self.stop_codon = str(old_sequence[last_codon_start:last_codon_start + 3]).upper() - - assert 0 < len(self.stop_codon) <= 3, self.stop_codon - - logger.debug("%s: start codon %s, old start %s; stop codon %s, old start %s", - self.name, self.start_codon, self.thick_start + self.phase, - self.stop_codon, self.thick_end - ) + old_start_pos = self.thick_start + self.phase - 1 + old_end_pos = self.thick_end - (self.thick_end - old_start_pos) % 3 + old_orf = old_sequence[old_start_pos:old_end_pos].upper() + logger.debug("Old sequence of %s (%s bps): %s[...]%s", self.id, len(old_sequence), + old_sequence[:10], old_sequence[-10:]) + logger.debug("Old ORF of %s (%s bps, phase %s): %s[...]%s", self.id, len(old_orf), self.phase, + old_orf[:10], old_orf[-10:]) + assert len(old_orf) > 0, (old_start_pos, old_end_pos) + assert len(old_orf) % 3 == 0, (old_start_pos, old_end_pos) + old_pep = Seq.Seq(old_orf).translate(self.table, gap="N") + if 
"*" in old_pep and old_pep.find("*") < len(old_pep) - 1: + logger.error("Stop codon found within the ORF of %s (pos %s of %s; phase %s). This is invalid!", + self.id, old_pep.find("*"), len(old_pep), self.phase) + + self.start_codon = old_orf[:3] + self.stop_codon = old_orf[-3:] + logger.debug("%s: start codon %s, old start %s (%s); stop codon %s, old stop %s (%s)", + self.name, self.start_codon, self.thick_start + self.phase, + (self.thick_start + self.phase + upstream), + self.stop_codon, self.thick_end, (self.thick_end + upstream)) # Now expand self.end = len(sequence) self.thick_start += upstream self.thick_end += upstream - if expand_orf is True: - if str(self.start_codon).upper() not in self.table.start_codons: - for pos in range(self.thick_start - self.phase, + self.has_start_codon = (str(self.start_codon).upper() in self.table.start_codons) + self.has_stop_codon = (str(self.stop_codon).upper() in self.table.stop_codons) + if expand_orf is True and not (self.has_start_codon and self.has_stop_codon): + if not self.has_start_codon: + for pos in range(old_start_pos + upstream, 0, -3): - codon = sequence[pos -1:pos + 2].upper() - self.thick_start = pos + codon = sequence[pos:pos + 3].upper() + + self.thick_start = pos + 1 if codon in self.table.start_codons: # self.thick_start = pos self.start_codon = codon self.__has_start = True + logger.debug("Position %d, codon %s. Start codon found.", pos, codon) break - - if self.start_codon not in self.table.start_codons: - self.phase = self.thick_start % 3 - self.thick_start = 1 - else: - self.phase = 0 - self.__has_start = True + if self.start_codon not in self.table.start_codons: + self.phase = (self.thick_start - 1) % 3 + logger.debug("No start codon found for %s. Thick start %s, new phase: %s", + self.id, self.thick_start, self.phase) + self.thick_start = 1 + else: + self.phase = 0 + self.__has_start = True coding_seq = Seq.Seq(sequence[self.thick_start + self.phase - 1:self.end]) if len(coding_seq) % 3 != 0: diff --git a/Mikado/parsers/blast_utils.py b/Mikado/parsers/blast_utils.py index 7975b386f..35f078b28 100644 --- a/Mikado/parsers/blast_utils.py +++ b/Mikado/parsers/blast_utils.py @@ -7,7 +7,6 @@ import os import subprocess import gzip -import operator import multiprocessing import io import collections @@ -18,9 +17,10 @@ from . 
import HeaderError from ..utilities.log_utils import create_null_logger # from Bio.SearchIO.BlastIO.blast_xml import BlastXmlParser as xparser -import Bio.SearchIO -import functools -xparser = functools.partial(Bio.SearchIO.parse, format="blast-xml") +from Bio.Blast.NCBIXML import parse as xparser +# import Bio.SearchIO +# import functools +# xparser = functools.partial(Bio.SearchIO.parse, format="blast-xml") from ..utilities import overlap import xml.etree.ElementTree import numpy as np diff --git a/Mikado/parsers/gfannotation.py b/Mikado/parsers/gfannotation.py index 39b3b5387..61215c7ec 100644 --- a/Mikado/parsers/gfannotation.py +++ b/Mikado/parsers/gfannotation.py @@ -240,6 +240,8 @@ def is_gene(self): elif self.id is not None and self.id.startswith("gene:"): # Hack for EnsEMBL return True + elif self.feature in ("sublocus", "monosublocus", "monosublocusholder"): + return True return False @property diff --git a/Mikado/picking/picker.py b/Mikado/picking/picker.py index a3af47f3f..e41f8c5a9 100644 --- a/Mikado/picking/picker.py +++ b/Mikado/picking/picker.py @@ -97,6 +97,7 @@ def __init__(self, json_conf, commandline=""): force=True) # self.setup_logger() + self.logger.info("Random seed: %s", self.json_conf["seed"]) self.logger.debug("Multiprocessing method: %s", self.json_conf["multiprocessing_method"]) diff --git a/Mikado/serializers/blast_serializer/utils.py b/Mikado/serializers/blast_serializer/utils.py index 30ae0b1d2..f92c62df9 100644 --- a/Mikado/serializers/blast_serializer/utils.py +++ b/Mikado/serializers/blast_serializer/utils.py @@ -25,7 +25,7 @@ def prepare_hsp(hsp, counter, qmultiplier=1, tmultiplier=1): :param hsp: An HSP object from Bio.Blast.NCBIXML # :type hsp: Bio.Blast.Record.HSP - :type hsp: Bio.SearchIO._model.hsp.HSP + :type hsp: Bio.Blast.Record.HSP :param counter: a digit that indicates the priority of the HSP in the hit :return: hsp_dict, identical_positions, positives :rtype: (dict, set, set) @@ -37,16 +37,16 @@ def prepare_hsp(hsp, counter, qmultiplier=1, tmultiplier=1): hsp_dict["counter"] = counter + 1 hsp_dict["query_hsp_start"] = hsp.query_start hsp_dict["query_hsp_end"] = hsp.query_end - hsp_dict["query_frame"] = hsp.query_frame - hsp_dict["target_hsp_start"] = hsp.hit_start - hsp_dict["target_hsp_end"] = hsp.hit_end - hsp_dict["target_frame"] = hsp.hit_frame - hsp_dict["hsp_identity"] = hsp.ident_num / hsp.aln_span * 100 - hsp_dict["hsp_positives"] = hsp.pos_num / hsp.aln_span * 100 + hsp_dict["query_frame"] = hsp.frame[0] + hsp_dict["target_hsp_start"] = hsp.sbjct_start + hsp_dict["target_hsp_end"] = hsp.sbjct_end + hsp_dict["target_frame"] = hsp.frame[1] + hsp_dict["hsp_identity"] = hsp.identities / hsp.align_length * 100 + hsp_dict["hsp_positives"] = hsp.positives / hsp.align_length * 100 hsp_dict["match"] = match - hsp_dict["hsp_length"] = hsp.aln_span - hsp_dict["hsp_bits"] = hsp.bitscore - hsp_dict["hsp_evalue"] = hsp.evalue + hsp_dict["hsp_length"] = hsp.align_length + hsp_dict["hsp_bits"] = hsp.score + hsp_dict["hsp_evalue"] = hsp.expect return hsp_dict, identical_positions, positives @@ -57,12 +57,12 @@ def _prepare_aln_strings(hsp, qmultiplier=1, tmultiplier=1): identical_positions, positives = set(), set() # for query_aa, middle_aa, target_aa in zip(hsp.query, hsp.match, hsp.sbjct): - query_pos, target_pos = hsp.query_start, hsp.hit_start + query_pos, target_pos = hsp.query_start - 1, hsp.sbjct_start - 1 match = "" - zipper = zip(hsp.aln_annotation["similarity"], *list(hsp.aln)) + # zipper = zip(hsp.aln_annotation["similarity"], 
*list(hsp.aln)) - for middle_aa, query_aa, target_aa in zipper: + for middle_aa, query_aa, target_aa in zip(hsp.match, hsp.query, hsp.sbjct): if middle_aa in valid_matches or middle_aa == "+": if middle_aa != "+": identical_positions.update(set(range(query_pos, query_pos + qmultiplier))) @@ -89,7 +89,9 @@ def _prepare_aln_strings(hsp, qmultiplier=1, tmultiplier=1): query_pos += qmultiplier target_pos += tmultiplier - assert query_pos <= hsp.query_end and target_pos <= hsp.hit_end + assert query_pos <= hsp.query_end and target_pos <= hsp.sbjct_end, ((query_pos, hsp.query_end), + (target_pos, hsp.sbjct_end), + hsp.match, hsp.query, hsp.sbjct) return match, identical_positions, positives @@ -100,7 +102,7 @@ def prepare_hit(hit, query_id, target_id, **kwargs): global_identity: the identity rate for the global hit *using the query perspective* :param hit: the hit to parse. - :type hit: Bio.SearchIO._model.hit.Hit + :type hit: Bio.Blast.Record.Alignment :param query_id: the numeric ID of the query in the database. Necessary for serialisation. :type query_id: int @@ -114,7 +116,6 @@ def prepare_hit(hit, query_id, target_id, **kwargs): hit_dict = dict() hsp_dict_list = [] - # hit_dict["global_identity"] = [] q_intervals = [] t_intervals = [] @@ -154,7 +155,7 @@ def hsp_sorter(val): hsp_dict_list.append(hsp_dict) q_intervals.append((hsp.query_start, hsp.query_end)) # t_intervals.append((hsp.sbjct_start, hsp.sbjct_end)) - t_intervals.append((hsp.hit_start, hsp.hit_end)) + t_intervals.append((hsp.sbjct_start, hsp.sbjct_end)) q_merged_intervals, q_aligned = merge(q_intervals) assert isinstance(q_aligned, np.int), (q_merged_intervals, q_aligned, type(q_aligned)) diff --git a/Mikado/serializers/blast_serializer/xml_serialiser.py b/Mikado/serializers/blast_serializer/xml_serialiser.py index 82faa2f6b..9c5bffe5e 100644 --- a/Mikado/serializers/blast_serializer/xml_serialiser.py +++ b/Mikado/serializers/blast_serializer/xml_serialiser.py @@ -3,28 +3,23 @@ """ import os -import functools import logging.handlers as logging_handlers import logging import tempfile -try: - import ujson as json -except ImportError: - import json +import ujson as json import sqlite3 -# import pickle import sqlalchemy import sqlalchemy.exc from sqlalchemy.orm.session import Session from ...utilities.dbutils import DBBASE -import pyfaidx +import pysam from ...utilities.dbutils import connect from ...parsers.blast_utils import BlastOpener # , XMLMerger from ...utilities.log_utils import create_null_logger, check_logger from . import Query, Target, Hsp, Hit, prepare_hit, InvalidHit from xml.parsers.expat import ExpatError import xml -from queue import Empty +# from queue import Empty import multiprocessing @@ -34,188 +29,75 @@ # A serialisation class must have a ton of attributes ... 
# pylint: disable=too-many-instance-attributes -class _XmlPickler(multiprocessing.Process): - - def __init__(self, - queries, - targets, - filequeue: multiprocessing.Queue, - returnqueue, - default_header, - identifier, - logging_queue, - level="WARN", - max_target_seqs=10, - maxobjects=20000): - - super().__init__() - self.queries = queries - self.targets = targets - self.level = level - self.logging_queue = logging_queue - self.handler = logging_handlers.QueueHandler(logging_queue) - self.handler.setLevel(self.level) - self.__identifier = identifier - self.name = self._name = "_XmlPickler-{0}".format(self.identifier) - self.logger = logging.getLogger(self.name) - self.logger.addHandler(self.handler) - self.logger.setLevel(self.level) - self.filequeue = filequeue - self.returnqueue = returnqueue - self.default_header = default_header - self.maxobjects = maxobjects - self.__max_target_seqs = max_target_seqs - self.logger.debug("Started %s", self.name) - def __getstate__(self): - - state = self.__dict__.copy() - - state["logger"].removeHandler(state["handler"]) - state["handler"].close() - state["handler"] = None - - # state["_pickler"] = None - state["logger"] = None - return state - - def __setstate__(self, state): - - self.__dict__.update(state) - self.handler = logging_handlers.QueueHandler(self.logging_queue) - self.handler.setLevel(self.level) - self.logger = logging.getLogger(self.name) - self.logger.addHandler(self.handler) - self.logger.setLevel(self.level) - - def _create_db(self, filename): - - """Private method to create a DB for serialisation. - :param filename: the name of the file to serialise - :returns dbname, cursor: the name of the database and the SQLite cursor +def _create_xml_db(filename): + """Private method to create a DB for serialisation. 
+ :param filename: the name of the file to serialise + :returns dbname, cursor: the name of the database and the SQLite cursor - """ - - directory = os.path.dirname(filename) - try: - dbname = tempfile.mktemp(suffix=".db", dir=directory) - conn = sqlite3.connect(dbname) - except (OSError, PermissionError, sqlite3.OperationalError): - dbname = tempfile.mktemp(suffix=".db") - conn = sqlite3.connect(dbname) - cursor = conn.cursor() - creation_string = "CREATE TABLE dump (query_counter integer, hits blob, hsps blob)" - try: - cursor.execute( # TODO: change - creation_string) - except sqlite3.OperationalError: - # Table already exists - self.logger.error( - "Temporary db %s already exists (maybe from a previous aborted run?), dropping its contents", - dbname) - cursor.close() - conn.close() - os.remove(dbname) - conn = sqlite3.connect(dbname) - cursor = conn.cursor() - cursor.execute(creation_string) - cursor.execute("CREATE INDEX idx ON dump (query_counter)") - self.logger.debug("Created tables for shelf %s", dbname) - - return dbname, conn, cursor - - def _pickler(self, filename): - - # Check the header is alright - valid, _, exc = BlastOpener(filename).sniff(default_header=self.default_header) - if not valid: - self.logger.warning("Invalid BLAST file: %s", filename) - return [] - - self.logger.debug("Starting to pickle %s", filename) - # hits, hsps = [], [] - - pickle_count = 0 - query_counter = 0 - # Create the database - dbname, conn, cursor = self._create_db(filename) - - try: - with BlastOpener(filename) as opened: - try: - for query_counter, record in enumerate(opened, start=1): - hits, hsps = objectify_record(self, record, [], [], - max_target_seqs=self.__max_target_seqs) - - cursor.execute("INSERT INTO dump VALUES (?, ?, ?)", - (query_counter, json.dumps(hits), json.dumps(hsps)) - ) - if query_counter % self.maxobjects and query_counter > 0: - conn.commit() - cursor.close() - conn.close() - yield dbname - dbname, conn, cursor = self._create_db(filename) - pickle_count += 1 + """ - except ExpatError: - self.logger.error("%s is an invalid BLAST file, sending back anything salvageable", - filename) - raise - except xml.etree.ElementTree.ParseError: - self.logger.error("%s is an invalid BLAST file, sending back anything salvageable", - filename) - raise - except ValueError: - self.logger.error("Invalid BLAST entry") - raise - - self.logger.debug("Finished serialising %s in %s subsection", filename, pickle_count) + directory = os.path.dirname(filename) + try: + dbname = tempfile.mktemp(suffix=".db", dir=directory) + conn = sqlite3.connect(dbname) + except (OSError, PermissionError, sqlite3.OperationalError): + dbname = tempfile.mktemp(suffix=".db") + conn = sqlite3.connect(dbname) + cursor = conn.cursor() + creation_string = "CREATE TABLE dump (query_counter integer, hits blob, hsps blob)" + try: + cursor.execute( # TODO: change + creation_string) + except sqlite3.OperationalError: + # Table already exists cursor.close() - conn.commit() conn.close() - yield dbname - # del records - - def run(self): - """ - While running, the process will get the filenames to analyse from the first queue - and return them through the second one. - """ - - while True: - try: - number, filename = self.filequeue.get(timeout=10) - except multiprocessing.TimeoutError: - self.logger.error( - "Something has gone awry in %s, no data received from the queue after waiting 10s. 
Aborting.", - self._name) - # self.filequeue.put("EXIT") - # return 0 - raise - except Empty: - continue - - if filename == "EXIT": - self.logger.debug("Process %s received EXIT signal, terminating", - self._name) - self.filequeue.put((number, filename)) - return 0 + os.remove(dbname) + conn = sqlite3.connect(dbname) + cursor = conn.cursor() + cursor.execute(creation_string) + cursor.execute("CREATE INDEX idx ON dump (query_counter)") + return dbname, conn, cursor + + +def xml_pickler(json_conf, filename, default_header, + max_target_seqs=10): + valid, _, exc = BlastOpener(filename).sniff(default_header=default_header) + engine = connect(json_conf) + session = Session(bind=engine) + + if not valid: + err = "Invalid BLAST file: %s" % filename + raise TypeError(err) + dbname, conn, cursor = _create_xml_db(filename) + try: + with BlastOpener(filename) as opened: try: - for pickled in self._pickler(filename): - self.logger.debug("Sending serialised information in {}".format(pickled)) - self.returnqueue.put((number, [pickled])) - except Exception as exc: - self.logger.error("Error encountered in %s, blocking the program.", self.identifier) - self.returnqueue.put((number, "FINISHED")) - self.logger.exception(exc) - raise - - self.returnqueue.put((number, "FINISHED")) - - @property - def identifier(self): - return self.__identifier + for query_counter, record in enumerate(opened, start=1): + hits, hsps = objectify_record(session, record, [], [], + max_target_seqs=max_target_seqs) + + cursor.execute("INSERT INTO dump VALUES (?, ?, ?)", + (query_counter, json.dumps(hits), json.dumps(hsps)) + ) + except ExpatError as err: + # logger.error("%s is an invalid BLAST file, sending back anything salvageable", filename) + raise ExpatError("{} is an invalid BLAST file, sending back anything salvageable.\n{}".format(filename, + err)) + except xml.etree.ElementTree.ParseError as err: + # logger.error("%s is an invalid BLAST file, sending back anything salvageable", filename) + raise xml.etree.ElementTree.ParseError( + "{} is an invalid BLAST file, sending back anything salvageable.\n{}".format(filename, err)) + except ValueError as err: + # logger.error("Invalid BLAST entry") + raise ValueError( + "{} is an invalid BLAST file, sending back anything salvageable.\n{}".format(filename, err)) + + cursor.close() + conn.commit() + conn.close() + return dbname class XmlSerializer: @@ -329,7 +211,7 @@ def __determine_sequences(self, query_seqs, target_seqs): if isinstance(query_seqs, str): assert os.path.exists(query_seqs) - self.query_seqs = pyfaidx.Fasta(query_seqs) + self.query_seqs = pysam.FastaFile(query_seqs) elif query_seqs is None: self.query_seqs = None else: @@ -341,7 +223,7 @@ def __determine_sequences(self, query_seqs, target_seqs): for target in target_seqs: if not os.path.exists(target): raise ValueError("{} not found!".format(target)) - self.target_seqs.append(pyfaidx.Fasta(target)) + self.target_seqs.append(pysam.FastaFile(target)) return @@ -354,20 +236,20 @@ def __serialize_queries(self, queries): counter = 0 self.logger.info("Started to serialise the queries") objects = [] - for record in self.query_seqs.records: + for record, length in zip(self.query_seqs.references, self.query_seqs.lengths): if not record: continue if record in queries and queries[record][1] is not None: continue elif record in queries: self.session.query(Query).filter(Query.query_name == record).update( - {"query_length": len(self.query_seqs[record])}) - queries[record] = (queries[record][0], len(self.query_seqs[record])) + 
{"query_length": length}) + queries[record] = (queries[record][0], length) continue objects.append({ "query_name": record, - "query_length": len(self.query_seqs[record]) + "query_length": length }) if len(objects) >= self.maxobjects: @@ -413,18 +295,18 @@ def __serialize_targets(self, targets): objects = [] self.logger.info("Started to serialise the targets") for target in self.target_seqs: - for record in target.records: + for record, length in zip(target.references, target.lengths): if record in targets and targets[record][1] is True: continue elif record in targets: self.session.query(Target).filter(Target.target_name == record).update( - {"target_length": len(self.target_seqs[record])}) + {"target_length": length}) targets[record] = (targets[record][0], True) continue objects.append({ "target_name": record, - "target_length": len(target[record]) + "target_length": length }) counter += 1 # @@ -535,10 +417,6 @@ def serialize(self): else: assert isinstance(self.xml, (list, set)) - # Create the function that will retrieve the query_id given the name - # self.get_query = functools.partial(self.__get_query_for_blast, - # **{"queries": self.queries}) - hits, hsps = [], [] hit_counter, record_counter = 0, 0 @@ -550,7 +428,7 @@ def serialize(self): self.logger.error(exc) self.xml.remove(filename) - if self.single_thread is True: + if self.single_thread is True or self.procs == 1: for filename in self.xml: valid, _, exc = BlastOpener(filename).sniff(default_header=self.header) if not valid: @@ -563,86 +441,53 @@ def serialize(self): record_counter += 1 if record_counter > 0 and record_counter % 10000 == 0: self.logger.info("Parsed %d queries", record_counter) - hits, hsps = objectify_record(self, record, hits, hsps, - max_target_seqs=self.__max_target_seqs) + hits, hsps = objectify_record(self.session, + record, hits, hsps, + max_target_seqs=self.__max_target_seqs, logger=self.logger) hits, hsps = load_into_db(self, hits, hsps, force=False) self.logger.debug("Finished %s", filename) except ExpatError: - self.logger.error("%s is an invalid BLAST file, saving what's available", - filename) - + self.logger.error("%s is an invalid BLAST file, saving what's available", filename) _, _ = self.__load_into_db(hits, hsps, force=True) else: self.logger.debug("Creating a pool with %d processes", - min(self.procs, len(self.xml))) - - filequeue = multiprocessing.Queue(-1) - returnqueue = multiprocessing.Queue(-1) - - procs = [_XmlPickler( - self.queries, - self.targets, - filequeue, - returnqueue, - self.header, - _, - logging_queue=self.logging_queue, - # level=self.logger.level, - level=self.json_conf["log_settings"]["log_level"], - maxobjects=int(self.maxobjects/self.procs), - max_target_seqs=self.__max_target_seqs - ) - for _ in range(min([self.procs, len(self.xml)])) - ] - - self.logger.debug("Starting to pickle and serialise %d files", len(self.xml)) - [_.start() for _ in procs] # Start processes - for number, xml_name in enumerate(self.xml): - filequeue.put((number, xml_name)) + min(self.procs, len(self.xml))) + + pool = multiprocessing.Pool(self.procs) + results = [] + for num, filename in enumerate(self.xml): + args = (self.json_conf, filename, self.header) + kwds = {"max_target_seqs": self.__max_target_seqs} + pool.apply_async(xml_pickler, args=args, kwds=kwds, callback=results.append) + pool.close() + pool.join() + + for dbfile in results: + conn = sqlite3.connect(dbfile) + cursor = conn.cursor() + for query_counter, __hits, __hsps in cursor.execute("SELECT * FROM dump"): + record_counter += 1 + 
__hits = json.loads(__hits) + __hsps = json.loads(__hsps) + hit_counter += len(__hits) + hits.extend(__hits) + hsps.extend(__hsps) + hits, hsps = load_into_db(self, hits, hsps, force=False) + if record_counter > 0 and record_counter % 10000 == 0: + self.logger.debug("Parsed %d queries", record_counter) + cursor.close() + conn.close() + os.remove(dbfile) self.logger.debug("Finished sending off the data for serialisation") - - filequeue.put((None, "EXIT")) - returned = [] - while len(returned) != len(self.xml): - number, result = returnqueue.get() - if result == "FINISHED": - self.logger.debug("Finished receiving pickles for %d", number) - returned.append(number) - continue - if result == "EXIT": - continue - for dbfile in result: - conn = sqlite3.connect(dbfile) - cursor = conn.cursor() - for query_counter, __hits, __hsps in cursor.execute("SELECT * FROM dump"): - record_counter += 1 - __hits = json.loads(__hits) - __hsps = json.loads(__hsps) - hit_counter += len(__hits) - hits.extend(__hits) - hsps.extend(__hsps) - hits, hsps = load_into_db(self, hits, hsps, force=False) - if record_counter > 0 and record_counter % 10000 == 0: - self.logger.debug("Parsed %d queries", record_counter) - cursor.close() - conn.close() - os.remove(dbfile) - [_.join() for _ in procs] # Wait for processes to join - self.logger.info("All %d children finished", len(procs)) - del procs - _, _ = self.__load_into_db(hits, hsps, force=True) - returnqueue.close() - filequeue.close() self.logger.info("Loaded %d alignments for %d queries", hit_counter, record_counter) self.logger.info("Finished loading blast hits") - # [_.close() for _ in self.logger.handlers] if hasattr(self, "logging_queue"): self.logging_queue.close() @@ -659,13 +504,14 @@ def get_multipliers(record): Private quick method to determine the multipliers for a BLAST alignment according to the application present in the record. :param record: + :type record: Bio.Blast.Record.Blast :return: """ q_mult, h_mult = 1, 1 # application = record.application.upper() - application = record.program.upper() + application = record.application.upper() if application in ("BLASTN", "TBLASTX", "BLASTP"): q_mult = 1 @@ -722,7 +568,7 @@ def load_into_db(self, hits, hsps, force=False): return hits, hsps -def _get_query_for_blast(self, record): +def _get_query_for_blast(session: sqlalchemy.orm.session.Session, record): """ This private method formats the name of the query recovered from the BLAST hit. It will cause an exception if the target is not present in the dictionary. @@ -730,23 +576,14 @@ def _get_query_for_blast(self, record): :return: current_query (ID in the database), name """ - if record.id in self.queries: - name = record.id - else: - name = record.id.split()[0] - if name not in self.queries: - raise KeyError("{} not found in the queries!".format(record)) - - self.logger.debug("Started with %s", name) - - if self.queries[name][1] is False: - raise KeyError("{} not found in the queries!".format(record)) - - current_query = self.queries[name][0] - return current_query, name + got = session.query(Query).filter(sqlalchemy.or_( + Query.query_name == record.query, + Query.query_name == record.query.split()[0], + )).one() + return got.query_id, got.query_name -def _get_target_for_blast(self, alignment): +def _get_target_for_blast(session, alignment): """ This private method retrieves the correct target_id key for the target of the BLAST. If the entry is not present in the database, it will be created on the fly. 
@@ -756,22 +593,20 @@ def _get_target_for_blast(self, alignment): :return: current_target (ID in the database), targets """ - if alignment.accession in self.targets: - accession = alignment.accession - elif alignment.id in self.targets: - accession = alignment.id - else: - raise KeyError("{} not found in the targets!".format(alignment.accession)) + got = session.query(Target).filter(sqlalchemy.or_( + Target.target_name == alignment.accession, + Target.target_name == alignment.hit_id)).one() - current_target = self.targets[accession][0] - return current_target + # current_target = targets[accession][0] + return got.target_id -def objectify_record(self, record, hits, hsps, max_target_seqs=10000): +def objectify_record(session, record, hits, hsps, max_target_seqs=10000, logger=create_null_logger()): """ Private method to serialise a single record into the DB. :param record: The BLAST record to load into the DB. + :type record: Bio.Blast.Record.Blast :param hits: Cache of hits to load into the DB. :type hits: list @@ -782,29 +617,27 @@ def objectify_record(self, record, hits, hsps, max_target_seqs=10000): :rtype: (list, list) """ - if len(record.hits) == 0: + if len(record.alignments) == 0: return hits, hsps - current_query, name = _get_query_for_blast(self, record) + current_query, name = _get_query_for_blast(session, record) current_evalue = -1 current_counter = 0 # for ccc, alignment in enumerate(record.alignments): - for ccc, alignment in enumerate(record.hits): + for ccc, alignment in enumerate(record.alignments): if ccc + 1 > max_target_seqs: break - self.logger.debug("Started the hit %s vs. %s", - # name, record.alignments[ccc].accession) - name, record.hits[ccc].id) - current_target = _get_target_for_blast(self, alignment) + logger.debug("Started the hit %s vs. 
%s", name, record.alignments[ccc].hit_id) + current_target = _get_target_for_blast(session, alignment) hit_dict_params = dict() (hit_dict_params["query_multiplier"], hit_dict_params["target_multiplier"]) = XmlSerializer.get_multipliers(record) - hit_evalue = min(_.evalue for _ in record.hits[ccc].hsps) - hit_bs = max(_.bitscore for _ in record.hits[ccc].hsps) + hit_evalue = min(_.expect for _ in record.alignments[ccc].hsps) + hit_bs = max(_.score for _ in record.alignments[ccc].hsps) if current_evalue < hit_evalue: current_counter += 1 current_evalue = hit_evalue @@ -818,7 +651,7 @@ def objectify_record(self, record, hits, hsps, max_target_seqs=10000): hit, hit_hsps = prepare_hit(alignment, current_query, current_target, **hit_dict_params) except InvalidHit as exc: - self.logger.error(exc) + logger.error(exc) continue hits.append(hit) hsps.extend(hit_hsps) diff --git a/Mikado/serializers/orf.py b/Mikado/serializers/orf.py index 302961084..704c1b72e 100644 --- a/Mikado/serializers/orf.py +++ b/Mikado/serializers/orf.py @@ -7,7 +7,7 @@ import os import sqlite3 -import pyfaidx +import pysam from sqlalchemy import Column, String, Integer, ForeignKey, CHAR, Index, Float, Boolean import sqlalchemy.exc from sqlalchemy.orm import relationship, backref, column_property @@ -173,17 +173,19 @@ def __init__(self, fasta_index = json_conf["serialise"]["files"]["transcripts"] self._max_regression = json_conf["serialise"]["max_regression"] self._table = json_conf["serialise"]["codon_table"] + self.procs = json_conf["serialise"]["procs"] + self.single_thread = json_conf["serialise"]["single_thread"] if isinstance(fasta_index, str): assert os.path.exists(fasta_index) - self.fasta_index = pyfaidx.Fasta(fasta_index) + self.fasta_index = pysam.FastaFile(fasta_index) # self.fasta_index = SeqIO.index(fasta_index, "fasta") elif fasta_index is None: exc = ValueError("A fasta index is needed for the serialization!") self.logger.exception(exc) return else: - assert isinstance(fasta_index, pyfaidx.Fasta) + assert isinstance(fasta_index, pysam.FastaFile) self.fasta_index = fasta_index if isinstance(handle, str): @@ -226,16 +228,16 @@ def load_fasta(self, cache): if self.fasta_index is not None: done = 0 self.logger.debug("%d entries already present in db, %d in the index", - len([fasta_key for fasta_key in self.fasta_index if + len([fasta_key for fasta_key in self.fasta_index.references if fasta_key not in cache]), - len(self.fasta_index.keys())) + self.fasta_index.nreferences) found = set() - for record in self.fasta_index.keys(): - if record in cache: + for ref, length in zip(self.fasta_index.references, self.fasta_index.lengths): + if ref in cache: continue - objects.append(Query(record, len(self.fasta_index[record]))) - assert record not in found, record - found.add(record) + objects.append(Query(ref, length)) + assert ref not in found, ref + found.add(ref) if len(objects) >= self.maxobjects: done += len(objects) self.session.begin(subtransactions=True) diff --git a/Mikado/subprograms/configure.py b/Mikado/subprograms/configure.py index c8a23683f..34de41dd4 100644 --- a/Mikado/subprograms/configure.py +++ b/Mikado/subprograms/configure.py @@ -48,7 +48,7 @@ def get_key(new_dict, key, default): return new_dict -def create_simple_config(): +def create_simple_config(seed=None): """ Method to create a stripped down configuration dictionary @@ -67,7 +67,7 @@ def create_simple_config(): new_dict = dict() composite_keys = [(ckey[1:]) for ckey in check_has_requirements(default, - validator.schema["properties"])] + 
validator.schema["properties"])] + [["seed"]] # Sort the composite keys by depth for ckey in sorted(composite_keys, key=len, reverse=True): @@ -84,6 +84,9 @@ def create_simple_config(): new_dict = configurator.merge_dictionaries(new_dict, val) + if seed is not None: + new_dict["seed"] = seed + return new_dict @@ -102,7 +105,7 @@ def create_config(args): del default["as_requirements"] config = default else: - config = create_simple_config() + config = create_simple_config(seed=args.seed) if len(args.mode) > 1: args.daijin = True @@ -160,6 +163,9 @@ def create_config(args): # del external_conf["mikado"][key] config = configurator.merge_dictionaries(config, external_conf) + if args.seed is not None: + config["seed"] = args.seed + if args.reference is not None: config["reference"]["genome"] = args.reference @@ -333,6 +339,8 @@ def configure_parser(): parser = argparse.ArgumentParser(description="Configuration utility for Mikado", formatter_class=argparse.RawTextHelpFormatter) parser.add_argument("--full", action="store_true", default=False) + parser.add_argument("--seed", type=int, default=None, + help="Random seed number.") scoring = parser.add_argument_group("Options related to the scoring system") scoring.add_argument("--scoring", type=str, default=None, help="Scoring file to use. Mikado provides the following:\n{}".format( diff --git a/Mikado/subprograms/pick.py b/Mikado/subprograms/pick.py index a65845492..560f19d90 100644 --- a/Mikado/subprograms/pick.py +++ b/Mikado/subprograms/pick.py @@ -9,6 +9,7 @@ from ..picking import Picker from ..configuration.configurator import to_json, check_json from ..utilities.log_utils import create_default_logger, create_null_logger +import random def check_log_settings(args): @@ -301,6 +302,8 @@ def pick_parser(): either of the ORFs lacks a BLAST hit (but not both). - permissive: like lenient, but also split when both ORFs lack BLAST hits - split: split multi-orf transcripts regardless of what BLAST data is available.""") + parser.add_argument("--seed", type=int, default=None, + help="Random seed number.") # parser.formatter_class = argparse.HelpFormatter parser.add_argument("gff", nargs="?", default=None) parser.set_defaults(func=pick) diff --git a/Mikado/subprograms/prepare.py b/Mikado/subprograms/prepare.py index 07db8d08f..a5a982cfb 100644 --- a/Mikado/subprograms/prepare.py +++ b/Mikado/subprograms/prepare.py @@ -15,6 +15,7 @@ from ..preparation.prepare import prepare from ..configuration.configurator import to_json, check_json from Mikado.exceptions import InvalidJson +import random __author__ = 'Luca Venturini' @@ -54,6 +55,10 @@ def setup(args): args.log.close() args.json_conf["prepare"]["files"]["log"] = args.log.name + if args.seed is not None: + args.json_conf["seed"] = args.seed + random.seed(args.seed, version=2) + if args.json_conf["prepare"]["files"]["log"]: try: _ = open(path_join( @@ -79,6 +84,7 @@ def setup(args): assert logger.handlers == [handler] logger.propagate = False logger.info("Command line: %s", " ".join(sys.argv)) + logger.info("Random seed: %s", args.json_conf["seed"]) if args.verbose is True: args.json_conf["log_settings"]["log_level"] = "DEBUG" @@ -285,6 +291,8 @@ def positive(string): parser.add_argument("-k", "--keep-redundant", default=None, dest="keep_redundant", action="store_true", help="Boolean flag. 
If invoked, Mikado prepare will retain redundant models.") + parser.add_argument("--seed", type=int, default=None, + help="Random seed number.") parser.add_argument("gff", help="Input GFF/GTF file(s).", nargs="*") parser.set_defaults(func=prepare_launcher) return parser diff --git a/Mikado/subprograms/serialise.py b/Mikado/subprograms/serialise.py index 0fb4604d4..e8280f279 100644 --- a/Mikado/subprograms/serialise.py +++ b/Mikado/subprograms/serialise.py @@ -22,7 +22,8 @@ from ..serializers import external from ..exceptions import InvalidJson import pyfaidx -# from csv import DictReader +import random + __author__ = 'Luca Venturini' @@ -156,9 +157,10 @@ def load_external(args, logger): return else: logger.info("Starting to load external data") - with external.ExternalSerializer(args.json_conf["serialise"]["files"]["external_scores"], - json_conf=args.json_conf, - logger=logger) as serializer: + with external.ExternalSerializer( + args.json_conf["serialise"]["files"]["external_scores"], + json_conf=args.json_conf, + logger=logger) as serializer: serializer() logger.info("Finished loading external data") @@ -205,6 +207,10 @@ def setup(args): args.json_conf["db_settings"]["db"] = args.db args.json_conf["dbtype"] = "sqlite" + if args.seed is not None: + args.json_conf["seed"] = args.seed + random.seed(args.seed, version=2) + if args.output_dir is not None: args.json_conf["serialise"]["files"]["output_dir"] = args.output_dir if args.json_conf["db_settings"]["dbtype"] == "sqlite": @@ -271,6 +277,7 @@ def setup(args): logger.setLevel("INFO") logger.info("Command line: %s", " ".join(sys.argv)) + logger.info("Random seed: %s", args.json_conf["seed"]) logger.setLevel(args.log_level) # Add sqlalchemy logging @@ -406,11 +413,12 @@ def serialise_parser(): junctions.add_argument("--genome_fai", default=None) junctions.add_argument("--junctions", type=str, default=None) - external = parser.add_argument_group() - external.add_argument("--external-scores", dest="external_scores", - help="""Tabular file containing external scores for the transcripts. - Each column should have a distinct name, and transcripts have to be listed - on the first column.""") + external_args = parser.add_argument_group() + external_args.add_argument( + "--external-scores", + dest="external_scores", + help="""Tabular file containing external scores for the transcripts. + Each column should have a distinct name, and transcripts have to be listed on the first column.""") generic = parser.add_argument_group() generic.add_argument("-mo", "--max-objects", dest="max_objects", @@ -437,6 +445,7 @@ def serialise_parser(): generic.add_argument("db", type=str, default=None, nargs='?', help="Optional output database. 
Default: derived from json_conf") - + generic.add_argument("--seed", type=int, default=None, + help="Random seed number.") parser.set_defaults(func=serialise) return parser diff --git a/Mikado/tests/locus_test.py b/Mikado/tests/locus_test.py index 215ab296d..afe892010 100644 --- a/Mikado/tests/locus_test.py +++ b/Mikado/tests/locus_test.py @@ -105,7 +105,7 @@ def test_serialisation(self): for child in [Superlocus, Sublocus, Monosublocus, Locus]: child1 = child(self.transcript1) # Check compiled in dictionary - assert isinstance(child1.json_conf, dict) + self.assertIsInstance(child1.json_conf, dict) assert any((isinstance(child1.json_conf[_], dict) and child1.json_conf[_].get("compiled", None) is not None) or not isinstance(child1.json_conf[_], dict) for _ in child1.json_conf.keys()) obj = pickle.dumps(child1) @@ -2299,6 +2299,98 @@ def load_from_bed(manager, resource): transcripts[transcript.id] = transcript return transcripts + def test_pad_utr(self): + logger = create_default_logger(inspect.getframeinfo(inspect.currentframe())[2], level="WARNING") + transcripts = self.load_from_bed("Mikado.tests", "pad_utr.bed12") + locus = Locus(transcripts["mikado.Chr5G2.1"], logger=logger) + locus.json_conf["reference"]["genome"] = self.fai + # We need to pad + locus.json_conf["pick"]["alternative_splicing"]["pad"] = True + locus.json_conf["pick"]["alternative_splicing"]["ts_distance"] = 10000 + locus.json_conf["pick"]["alternative_splicing"]["ts_max_splices"] = 10 + locus.json_conf["pick"]["alternative_splicing"]["ts_distance"] = 1000 + locus.json_conf["pick"]["alternative_splicing"]["ts_max_splices"] = 10 + locus.json_conf["pick"]["alternative_splicing"]["only_confirmed_introns"] = False + locus.json_conf["pick"]["alternative_splicing"]["min_cdna_overlap"] = 0.1 + locus.add_transcript_to_locus(transcripts["mikado.Chr5G2.2"]) + self.assertIn("mikado.Chr5G2.1", locus.transcripts) + self.assertIn("mikado.Chr5G2.2", locus.transcripts) + locus.pad_transcripts() + for tid in locus: + self.assertEqual(locus[tid].end, locus.end, tid) + + def test_ad_three_prime(self): + logger = create_default_logger(inspect.getframeinfo(inspect.currentframe())[2], level="WARNING") + transcripts = self.load_from_bed("Mikado.tests", "pad_three_neg.bed12") + locus = Locus(transcripts["mikado.Chr5G486.1"], logger=logger) + locus.json_conf["reference"]["genome"] = self.fai + locus.add_transcript_to_locus(transcripts["mikado.Chr5G486.2"]) + # We need to pad + locus.json_conf["pick"]["alternative_splicing"]["pad"] = True + locus.json_conf["pick"]["alternative_splicing"]["ts_distance"] = 10000 + locus.json_conf["pick"]["alternative_splicing"]["ts_max_splices"] = 10 + locus.json_conf["pick"]["alternative_splicing"]["ts_distance"] = 1000 + locus.json_conf["pick"]["alternative_splicing"]["ts_max_splices"] = 10 + locus.json_conf["pick"]["alternative_splicing"]["only_confirmed_introns"] = False + locus.json_conf["pick"]["alternative_splicing"]["min_cdna_overlap"] = 0.1 + self.assertIn("mikado.Chr5G486.1", locus.transcripts) + self.assertIn("mikado.Chr5G486.2", locus.transcripts) + locus.logger.setLevel("DEBUG") + locus.pad_transcripts() + for tid in locus: + self.assertEqual(locus[tid].start, locus.start, tid) + + + def test_one_off(self): + logger = create_default_logger(inspect.getframeinfo(inspect.currentframe())[2], level="WARNING") + for strand in ("+", "-"): + with self.subTest(strand=strand): + logger.setLevel("WARNING") + t1 = Transcript() + t1.chrom, t1.strand, t1.start, t1.end, t1.id, t1.parent = ["Chr5", strand, 100, 1000, 
"t1", "loc"] + t1.add_exons([(100, 200), (300, 500), (700, 800), (900, 1000)]) + t1.finalize() + loc = Locus(t1, logger=logger) + loc.json_conf["reference"]["genome"] = self.fai + # We need these to be padded + loc.json_conf["pick"]["alternative_splicing"]["ts_distance"] = 1000 + loc.json_conf["pick"]["alternative_splicing"]["ts_max_splices"] = 10 + loc.json_conf["pick"]["alternative_splicing"]["only_confirmed_introns"] = False + loc.json_conf["pick"]["alternative_splicing"]["min_cdna_overlap"] = 0.1 + t2 = Transcript() + t2.chrom, t2.strand, t2.start, t2.end, t2.id, t2.parent = ["Chr5", strand, 299, 1000, "t2", "loc"] + t2.add_exons([(299, 400), (700, 800), (900, 1000)]) + t2.finalize() + loc.add_transcript_to_locus(t2) + self.assertIn(t2.id, loc) + t3 = Transcript() + t3.chrom, t3.strand, t3.start, t3.end, t3.id, t3.parent = ["Chr5", strand, 100, 801, "t3", "loc"] + t3.add_exons([(100, 150), (350, 500), (700, 801)]) + t3.finalize() + loc.add_transcript_to_locus(t3) + self.assertIn(t3.id, loc) + t4 = Transcript() + t4.chrom, t4.strand, t4.start, t4.end, t4.id, t4.parent = ["Chr5", strand, 300, 1000, "t4", "loc"] + t4.add_exons([(300, 320), (600, 800), (900, 1000)]) + t4.finalize() + self.assertGreaterEqual(t4.cdna_length, 300) + loc.add_transcript_to_locus(t4) + self.assertIn(t4.id, loc) + t5 = Transcript() + t5.chrom, t5.strand, t5.start, t5.end, t5.id, t5.parent = ["Chr5", strand, 100, 800, "t5", "loc"] + t5.add_exons([(100, 140), (360, 650), (700, 800)]) + t5.finalize() + loc.add_transcript_to_locus(t5) + loc._load_scores({"t1": 20, "t2": 10, "t3": 10, "t4": 15, "t5": 15}) + self.assertIn(t5.id, loc) + self.assertIn(t1.id, loc) + self.assertEqual(loc["t1"].score, 20) + loc.pad_transcripts() + self.assertEqual(loc["t4"].start, 100) + self.assertEqual(loc["t5"].end, 1000) + self.assertEqual(loc["t2"].start, 299) + self.assertEqual(loc["t3"].end, 801) + @mark.slow def test_complete_padding(self): @@ -2348,11 +2440,12 @@ def test_complete_padding(self): cds_coordinates[transcript] = ( locus[transcript].combined_cds_start, locus[transcript].combined_cds_end) - logger = create_default_logger("logger", level="WARNING") locus.logger = logger locus.json_conf["pick"]["alternative_splicing"]["ts_distance"] = pad_distance locus.json_conf["pick"]["alternative_splicing"]["ts_max_splices"] = max_splice + # locus.logger.setLevel("DEBUG") locus.pad_transcripts() + locus.logger.setLevel("WARNING") self.assertEqual(locus[best].start, transcripts["AT5G01030.2"].start) self.assertIn(best, locus) @@ -2391,7 +2484,7 @@ def test_complete_padding(self): self.assertEqual(locus["AT5G01030.4"].combined_cds_end, transcripts["AT5G01030.2"].combined_cds_end) - @mark.triage + @mark.slow def test_negative_padding(self): genome = pkg_resources.resource_filename("Mikado.tests", "neg_pad.fa") transcripts = self.load_from_bed("Mikado.tests", "neg_pad.bed12") @@ -2493,7 +2586,7 @@ def test_negative_padding(self): self.assertNotEqual(locus[corr[3]].start, locus[corr[1]].start, pado.output) - @mark.triage + @mark.slow def test_padding(self): genome = pkg_resources.resource_filename("Mikado.tests", "padding_test.fa") transcripts = self.load_from_bed("Mikado.tests", "padding_test.bed12") @@ -2511,6 +2604,7 @@ def test_padding(self): print(params) + logger = create_default_logger(inspect.getframeinfo(inspect.currentframe())[2], level="INFO") for pad_distance, max_splice, coding, best in itertools.product((200, 1000, 1200, 5000), (1, 1, 5), (True, False), ("mikado.44G2.1", "mikado.44G2.5")): @@ -2542,7 +2636,6 @@ def 
test_padding(self): cds_coordinates[transcript] = ( locus[transcript].combined_cds_start, locus[transcript].combined_cds_end) - logger = create_default_logger("logger", level="WARNING") locus.logger = logger locus.json_conf["pick"]["alternative_splicing"]["ts_distance"] = pad_distance locus.json_conf["pick"]["alternative_splicing"]["ts_max_splices"] = max_splice @@ -2570,7 +2663,10 @@ def test_padding(self): if trans in params.keys(): continue self.assertFalse(locus[trans].attributes.get("padded", False), - (locus[trans].id, best, locus[trans].end, pad_distance, max_splice, + ((locus[trans].id, locus[trans].start, locus[trans].end, + transcripts[trans].start, transcripts[trans].end), + best, + pad_distance, max_splice, params[best], {item for item in locus[trans].attributes.items() if "ts" in item[0]})) @@ -2583,6 +2679,36 @@ def test_padding(self): self.assertEqual(locus[transcript].combined_cds_start, cds_coordinates[transcript][0]) self.assertEqual(locus[transcript].combined_cds_end, cds_coordinates[transcript][1]) + @mark.slow + def test_phasing(self): + + transcripts = self.load_from_bed("Mikado.tests", "phasing_padding.bed12") + # We have to test that the CDS is reconstructed correctly even when considering the phasing + genome = self.fai + logger = create_null_logger("test_phasing", level="INFO") + for phase in (0, 1, 2): + with self.subTest(phase=phase): + locus = Locus(transcripts["AT5G01030.1"], logger=logger) + locus.json_conf["reference"]["genome"] = genome + other = transcripts["AT5G01030.2"].deepcopy() + self.assertNotEqual(other.start, locus["AT5G01030.1"].start) + other.unfinalize() + other.start += (3 - phase) % 3 + other.remove_exon((10644, 12665)) + other.add_exon((other.start, 12665)) + other.add_exon((other.start, 12665), feature="CDS", phase=phase) + other.finalize() + self.assertTrue(other.is_coding) + self.assertEqual(other.phases[(other.start, 12665)], phase) + self.assertEqual(other.combined_cds_start, other.start) + locus.add_transcript_to_locus(other) + self.assertIn(other.id, locus.transcripts) + locus.logger.setLevel("DEBUG") + locus.pad_transcripts() + # self.assertEqual("", locus[other.id].format("bed12")) + self.assertEqual(locus[other.id].start, transcripts["AT5G01030.1"].start, phase) + self.assertEqual(locus[other.id].combined_cds_start, transcripts["AT5G01030.1"].combined_cds_start) + def test_pad_monoexonic(self): transcript = Transcript() diff --git a/Mikado/tests/pad_three_neg.bed12 b/Mikado/tests/pad_three_neg.bed12 new file mode 100644 index 000000000..9af38195f --- /dev/null +++ b/Mikado/tests/pad_three_neg.bed12 @@ -0,0 +1,2 @@ +Chr5 1964545 1967483 ID=mikado.Chr5G486.1;coding=True;phase=0;alias=trn-0-sta-combined-0_trinity19110.mrna1 14.0 - 1964640 1966807 0 9 725,103,131,100,90,40,109,122,791 0,813,998,1211,1406,1573,1710,1929,2147 +Chr5 1964735 1967483 ID=mikado.Chr5G486.2;coding=True;phase=0;alias=stn-0-sta-combined-0_Stringtie_STAR.16976.1 15.0 - 1964735 1966807 0 10 535,103,131,100,90,40,109,122,131,524 0,623,808,1021,1216,1383,1520,1739,1957,2224 \ No newline at end of file diff --git a/Mikado/tests/pad_utr.bed12 b/Mikado/tests/pad_utr.bed12 new file mode 100644 index 000000000..358a881e2 --- /dev/null +++ b/Mikado/tests/pad_utr.bed12 @@ -0,0 +1,2 @@ +Chr5 26419810 26421561 ID=mikado.Chr5G2.2;coding=True;phase=2;alias=cls-0-sta-combined-0_Chr5.5721.5 18.0 + 26419810 26421118 0 7 38,98,98,130,188,86,252 0,302,494,713,924,1234,1499 +Chr5 26420109 26421591 
ID=mikado.Chr5G2.1;coding=True;phase=0;alias=stn-0-sta-combined-0_Stringtie_STAR.21649.2 9.0 + 26420157 26421118 0 6 101,98,130,188,86,282 0,195,414,625,935,1200 \ No newline at end of file diff --git a/Mikado/tests/padding_test.bed12 b/Mikado/tests/padding_test.bed12 index ccc9575e4..f47e0d191 100644 --- a/Mikado/tests/padding_test.bed12 +++ b/Mikado/tests/padding_test.bed12 @@ -1,5 +1,5 @@ -44 1 61560 ID=mikado.44G2.2;coding=True;phase=0 85.0 + 70 61341 0 22 140,119,137,89,218,261,240,141,156,112,160,165,132,72,126,75,168,105,201,87,114,309 0,19637,20638,24654,26061,27238,28273,28974,29420,30474,32004,33291,34188,35466,42587,43055,44730,46583,50405,53556,59448,61250 -44 1 61560 ID=mikado.44G2.3;coding=True;phase=0 79.0 + 70 61341 0 21 140,119,137,89,218,261,240,141,141,112,160,165,132,126,75,168,105,201,87,114,309 0,19637,20638,24654,26061,27238,28273,28974,29435,30474,32004,33291,34188,42587,43055,44730,46583,50405,53556,59448,61250 -44 1 61560 ID=mikado.44G2.4;coding=True;phase=0 72.0 + 70 61341 0 21 140,119,137,89,218,261,240,171,141,112,160,165,132,126,75,168,105,201,87,114,309 0,19637,20638,24654,26061,27238,28273,28944,29435,30474,32004,33291,34188,42587,43055,44730,46583,50405,53556,59448,61250 +44 1 61560 ID=mikado.44G2.2;coding=True;phase=0 85.0 + 70 61341 0 22 140,119,137,89,218,261,240,141,156,112,160,165,132,72,126,75,168,105,201,87,114,309 0,19637,20638,24654,26061,27238,28273,28974,29420,30474,32004,33291,34188,35466,42587,43055,44730,46583,50405,53556,59448,61250 +44 1 61560 ID=mikado.44G2.3;coding=True;phase=0 79.0 + 70 61341 0 21 140,119,137,89,218,261,240,141,141,112,160,165,132,126,75,168,105,201,87,114,309 0,19637,20638,24654,26061,27238,28273,28974,29435,30474,32004,33291,34188,42587,43055,44730,46583,50405,53556,59448,61250 +44 1 61560 ID=mikado.44G2.4;coding=True;phase=0 72.0 + 70 61341 0 21 140,119,137,89,218,261,240,171,141,112,160,165,132,126,75,168,105,201,87,114,309 0,19637,20638,24654,26061,27238,28273,28944,29435,30474,32004,33291,34188,42587,43055,44730,46583,50405,53556,59448,61250 44 1 62511 ID=mikado.44G2.1;coding=True;phase=0 85.0 + 70 61341 0 22 140,119,137,89,218,261,240,141,141,112,160,165,132,72,126,75,168,105,201,87,114,1260 0,19637,20638,24654,26061,27238,28273,28974,29435,30474,32004,33291,34188,35466,42587,43055,44730,46583,50405,53556,59448,61250 44 1 67118 ID=mikado.44G2.5;coding=True;phase=0 83.0 + 70 61341 0 24 140,119,137,89,218,261,240,141,141,112,160,165,132,72,126,75,168,105,201,87,114,1186,59,430 0,19637,20638,24654,26061,27238,28273,28974,29435,30474,32004,33291,34188,35466,42587,43055,44730,46583,50405,53556,59448,61250,63052,66687 diff --git a/Mikado/tests/phasing_padding.bed12 b/Mikado/tests/phasing_padding.bed12 new file mode 100644 index 000000000..dc565edf7 --- /dev/null +++ b/Mikado/tests/phasing_padding.bed12 @@ -0,0 +1,2 @@ +Chr5 10643 13235 ID=AT5G01030.2;coding=True;phase=0 0 + 10643 13003 0 2 2022,433 0,2159 +Chr5 9929 13235 ID=AT5G01030.1;coding=True;phase=0 0 + 10637 13003 0 3 243,2046,439 0,690,2867 diff --git a/Mikado/tests/test_system_calls.py b/Mikado/tests/test_system_calls.py index ebd4df140..15cb66f3b 100644 --- a/Mikado/tests/test_system_calls.py +++ b/Mikado/tests/test_system_calls.py @@ -120,6 +120,7 @@ def setUpClass(cls): def setUp(self): self.conf = configurator.to_json(None) + self.conf["seed"] = 1066 self.conf["reference"]["genome"] = self.fai.filename.decode() assert isinstance(self.conf["reference"]["genome"], str) self.logger = create_null_logger("prepare") @@ -858,19 +859,7 @@ class 
ConfigureCheck(unittest.TestCase): @classmethod def setUpClass(cls): - # cls.__genomefile__ = tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".fa.gz", - # prefix="configure") - # cls.__genomefile__.write(pkg_resources.resource_stream("Mikado.tests", "chr5.fas.gz").read()) - # cls.__genomefile__.flush() cls.fai = pysam.FastaFile(pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz")) - # cls.__genomefile__.flush() - - # @classmethod - # def tearDownClass(cls): - # """""" - # - # cls.__genomefile__.close() - # os.remove(cls.__genomefile__.name) def test_mikado_config(self): namespace = Namespace(default=False) @@ -883,6 +872,7 @@ def test_mikado_config(self): namespace.blast_targets = [] namespace.junctions = [] namespace.new_scoring = None + namespace.seed = None dir = tempfile.TemporaryDirectory() out = os.path.join(dir.name, "configuration.yaml") with open(out, "w") as out_handle: @@ -895,6 +885,47 @@ def test_mikado_config(self): self.assertNotIn("asm_methods", conf) dir.cleanup() + def test_seed(self): + namespace = Namespace(default=False) + namespace.scoring = None + namespace.intron_range = None + namespace.reference = "" + namespace.external = None + namespace.threads = 1 + namespace.blast_targets = [] + namespace.junctions = [] + namespace.new_scoring = None + dir = tempfile.TemporaryDirectory() + out = os.path.join(dir.name, "configuration.yaml") + for trial in (None, 1066, 175108): + with self.subTest(trial=trial): + namespace.mode = ["permissive"] + namespace.seed = trial + with open(out, "w") as out_handle: + namespace.out = out_handle + sub_configure.create_config(namespace) + self.assertGreater(os.stat(out).st_size, 0) + conf = configuration.configurator.to_json(out) + conf = configuration.configurator.check_json(conf) + conf = configuration.configurator.check_json(conf) + self.assertNotIn("asm_methods", conf) + if trial is not None: + self.assertEqual(conf["seed"], trial) + else: + self.assertNotEqual(conf["seed"], trial) + self.assertIsInstance(conf["seed"], int) + + for mistake in (False, "hello", 10.5, b"890"): + with self.subTest(mistake=mistake): + namespace.mode = ["permissive"] + with self.assertRaises(OSError): + namespace.seed = mistake + with open(out, "w") as out_handle: + namespace.out = out_handle + sub_configure.create_config(namespace) + + dir.cleanup() + def test_mikado_config_full(self): namespace = Namespace(default=False) namespace.scoring = None @@ -908,6 +939,7 @@ def test_mikado_config_full(self): namespace.new_scoring = None namespace.full = True namespace.daijin = False + namespace.seed = None dir = tempfile.TemporaryDirectory() out = os.path.join(dir.name, "configuration.yaml") with open(out, "w") as out_handle: @@ -933,6 +965,7 @@ def test_mikado_config_daijin(self): namespace.new_scoring = None namespace.full = True namespace.daijin = True + namespace.seed = None dir = tempfile.TemporaryDirectory() out = os.path.join(dir.name, "configuration.yaml") with open(out, "w") as out_handle: @@ -959,6 +992,7 @@ def test_mikado_config_daijin_set_from_mode(self): namespace.new_scoring = None namespace.full = True namespace.daijin = False + namespace.seed = None dir = tempfile.TemporaryDirectory() out = os.path.join(dir.name, "configuration.yaml") with open(out, "w") as out_handle: @@ -996,6 +1030,7 @@ def test_daijin_config(self): namespace.name = "Daijin" namespace.threads = 1 namespace.full = False + namespace.seed = None for iteration in range(20): with self.subTest(iteration=iteration): @@ -1017,7 +1052,7 @@ def 
test_daijin_config(self): dir.cleanup() -# @mark.slow +@mark.slow class PickTest(unittest.TestCase): """This unit test will check that pick functions correctly.""" @@ -1041,6 +1076,7 @@ def tearDown(self): thread._wait_for_tstate_lock(block=True, timeout=0.00001) thread._stop() + @mark.slow def test_single_proc(self): self.json_conf["pick"]["run_options"]["procs"] = 1 @@ -1071,6 +1107,7 @@ def test_single_proc(self): dir.cleanup() + @mark.slow def test_multi_proc(self): self.json_conf["pick"]["run_options"]["procs"] = 2 self.json_conf["pick"]["files"]["input"] = pkg_resources.resource_filename("Mikado.tests", @@ -1128,7 +1165,7 @@ def test_subprocess(self): sub_configure.print_config(yaml.dump(self.json_conf, default_flow_style=False), json_handle) - sys.argv = ["mikado", "pick", "--json-conf", json_file] + sys.argv = ["mikado", "pick", "--json-conf", json_file, "--seed", "1078"] with self.assertRaises(SystemExit): pkg_resources.load_entry_point("Mikado", "console_scripts", "mikado")() @@ -1168,7 +1205,7 @@ def test_different_scoring(self): json_file = os.path.join(self.json_conf["pick"]["files"]["output_dir"], "mikado.yaml") with open(json_file, "wt") as json_handle: sub_configure.print_config(yaml.dump(self.json_conf, default_flow_style=False), json_handle) - sys.argv = ["mikado", "pick", "--json-conf", json_file, "--single"] + sys.argv = ["mikado", "pick", "--json-conf", json_file, "--single", "--seed", "1078"] with self.assertRaises(SystemExit): pkg_resources.load_entry_point("Mikado", "console_scripts", "mikado")() @@ -1209,7 +1246,7 @@ def test_different_scoring_2(self): self.json_conf["pick"]["files"]["output_dir"] = os.path.join(outdir.name) scoring_file = pkg_resources.resource_filename("Mikado.tests", "scoring_only_cds.yaml") sys.argv = ["mikado", "pick", "--json-conf", json_file, "--single", - "--scoring-file", scoring_file] + "--scoring-file", scoring_file, "--seed", "1078"] with self.assertRaises(SystemExit): pkg_resources.load_entry_point("Mikado", "console_scripts", "mikado")() @@ -1266,6 +1303,7 @@ def __get_purgeable_gff(self): return gtf, dir, temp_gtf, scoring + @mark.slow def test_purging1(self): # Now the scoring @@ -1319,6 +1357,7 @@ def test_purging1(self): temp_gtf.close() dir.cleanup() + @mark.slow def test_purging2(self): gtf, dir, temp_gtf, scoring = self.__get_purgeable_gff() @@ -1377,6 +1416,7 @@ def test_purging2(self): temp_gtf.close() dir.cleanup() + @mark.slow def test_purging3(self): gtf, dir, temp_gtf, scoring = self.__get_purgeable_gff() @@ -1460,7 +1500,6 @@ def test_subprocess_multi(self): orfs = pkg_resources.resource_filename("Mikado.tests", "transcripts.fasta.prodigal.gff3") uniprot = pkg_resources.resource_filename("Mikado.tests", "uniprot_sprot_plants.fasta.gz") mobjects = 300 # Let's test properly the serialisation for BLAST - procs = 3 dir = tempfile.TemporaryDirectory() json_file = os.path.join(dir.name, "mikado.yaml") @@ -1474,27 +1513,29 @@ def test_subprocess_multi(self): sub_configure.print_config(yaml.dump(self.json_conf, default_flow_style=False), json_handle) # Set up the command arguments - - sys.argv = [str(_) for _ in ["mikado", "serialise", "--json-conf", json_file, - "--transcripts", transcripts, "--blast_targets", uni_out, - "--orfs", orfs, "--junctions", junctions, "--xml", xml, - "-p", procs, "-mo", mobjects, db, "--log", log]] - - pkg_resources.load_entry_point("Mikado", "console_scripts", "mikado")() - - self.assertTrue(os.path.exists(db)) - conn = sqlite3.connect(db) - cursor = conn.cursor() - 
self.assertEqual(cursor.execute("select count(*) from hit").fetchall()[0][0], 562) - self.assertEqual(cursor.execute("select count(*) from hsp").fetchall()[0][0], 669) - self.assertEqual(cursor.execute("select count(distinct(query_id)) from hsp").fetchall()[0][0], 71) - self.assertEqual(cursor.execute("select count(distinct(query_id)) from hit").fetchall()[0][0], 71) - self.assertEqual(cursor.execute("select count(distinct(target_id)) from hsp").fetchall()[0][0], 32) - self.assertEqual(cursor.execute("select count(distinct(target_id)) from hit").fetchall()[0][0], 32) - self.assertEqual(cursor.execute("select count(*) from junctions").fetchall()[0][0], 372) - self.assertEqual(cursor.execute("select count(distinct(chrom_id)) from junctions").fetchall()[0][0], 2) - self.assertEqual(cursor.execute("select count(*) from orf").fetchall()[0][0], 169) - self.assertEqual(cursor.execute("select count(distinct(query_id)) from orf").fetchall()[0][0], 81) + for procs in (1, 3): + with self.subTest(proc=procs): + sys.argv = [str(_) for _ in ["mikado", "serialise", "--json-conf", json_file, + "--transcripts", transcripts, "--blast_targets", uni_out, + "--orfs", orfs, "--junctions", junctions, "--xml", xml, + "-p", procs, "-mo", mobjects, db, "--log", log, "--seed", "1078"]] + pkg_resources.load_entry_point("Mikado", "console_scripts", "mikado")() + logged = [_.rstrip() for _ in open(log)] + + self.assertTrue(os.path.exists(db)) + conn = sqlite3.connect(db) + cursor = conn.cursor() + self.assertEqual(cursor.execute("select count(*) from hit").fetchall()[0][0], 562, logged) + self.assertEqual(cursor.execute("select count(*) from hsp").fetchall()[0][0], 669) + self.assertEqual(cursor.execute("select count(distinct(query_id)) from hsp").fetchall()[0][0], 71) + self.assertEqual(cursor.execute("select count(distinct(query_id)) from hit").fetchall()[0][0], 71) + self.assertEqual(cursor.execute("select count(distinct(target_id)) from hsp").fetchall()[0][0], 32) + self.assertEqual(cursor.execute("select count(distinct(target_id)) from hit").fetchall()[0][0], 32) + self.assertEqual(cursor.execute("select count(*) from junctions").fetchall()[0][0], 372) + self.assertEqual(cursor.execute("select count(distinct(chrom_id)) from junctions").fetchall()[0][0], 2) + self.assertEqual(cursor.execute("select count(*) from orf").fetchall()[0][0], 169) + self.assertEqual(cursor.execute("select count(distinct(query_id)) from orf").fetchall()[0][0], 81) + os.remove(db) dir.cleanup() diff --git a/Mikado/transcripts/transcript.py b/Mikado/transcripts/transcript.py index 090e1b7f6..d200e7e31 100644 --- a/Mikado/transcripts/transcript.py +++ b/Mikado/transcripts/transcript.py @@ -1500,7 +1500,7 @@ def logger(self, logger): else: pass else: - assert isinstance(logger, logging.Logger) + assert isinstance(logger, logging.Logger), type(logger) self.__logger = logger self.__logger.propagate = False diff --git a/Mikado/utilities/dbutils.py b/Mikado/utilities/dbutils.py index 012c04297..35b21fc6b 100644 --- a/Mikado/utilities/dbutils.py +++ b/Mikado/utilities/dbutils.py @@ -51,7 +51,7 @@ def create_connector(json_conf, logger=None): func = None if db_settings["dbtype"] == "sqlite": if not database_exists("sqlite:///{}".format(db_settings["db"])): - logger.warning("No database found, creating a mock one!") + logger.debug("No database found, creating a mock one") create_database("sqlite:///{}".format(db_settings["db"])) if json_conf["pick"]["run_options"]['shm'] is False: logger.debug("Connecting to %s", db_settings["db"]) diff --git 
a/Mikado/utilities/log_utils.py b/Mikado/utilities/log_utils.py index 359031853..507c9ff50 100644 --- a/Mikado/utilities/log_utils.py +++ b/Mikado/utilities/log_utils.py @@ -34,13 +34,11 @@ def create_null_logger(*args, **kwargs): null_special_logger = logging.getLogger(args[0]) null_special_handler = logging.NullHandler() null_special_handler.setFormatter(formatter) + null_special_logger.handlers = [null_special_handler] if "level" in kwargs: null_special_logger.setLevel(kwargs["level"]) else: null_special_logger.setLevel(logging.CRITICAL) - for _, handler in enumerate(null_special_logger.handlers): - null_special_logger.removeHandler(handler) - null_special_logger.addHandler(null_special_handler) return null_special_logger return null_logger diff --git a/docs/Usage/Configure.rst b/docs/Usage/Configure.rst index dd016ba03..535ab9997 100644 --- a/docs/Usage/Configure.rst +++ b/docs/Usage/Configure.rst @@ -7,7 +7,9 @@ Mikado configure ================ -This utility prepares the configuration file that will be used throughout the pipeline stages. While the most important options can be set at runtime through the command line, many algorithmic details can be accessed and intervened upon only through the file produced through this command. +This utility prepares the configuration file that will be used throughout the pipeline stages. +While the most important options can be set at runtime through the command line, many algorithmic details can be accessed and intervened upon only through the file produced through this command. +.. important:: Please note that any value absent from the configuration at runtime **will be imputed to the default value for Mikado, as specified internally**. Usage ~~~~~ diff --git a/environment.yml b/environment.yml index 423729758..112b8cfdb 100644 --- a/environment.yml +++ b/environment.yml @@ -17,7 +17,7 @@ dependencies: - conda-forge::pandas>=0.24.2 - conda-forge::networkx>=2.2 - conda-forge::sqlalchemy>=1.3.2 - - conda-forge::sqlalchemy-utils>=0.33.11 + - conda-forge::sqlalchemy-utils>=0.34.0,!=0.33 - conda-forge::biopython>=1.73 - conda-forge::intervaltree>=3.0.2 - conda-forge::pytest>=4.4.0 @@ -33,7 +33,7 @@ dependencies: - bioconda::pysam>=0.15.2 - bioconda::samtools>=1.9 - bioconda::transdecoder>=5.5.0 - - bioconda::portcullis>=1.1.2 + - bioconda::portcullis>=1.2.0 - bioconda::prodigal>=2.6.3 - bioconda::diamond>=0.9.24 - bioconda::blast>=2.7.1 diff --git a/sample_data/chr5.fas.gz.fai b/sample_data/chr5.fas.gz.fai new file mode 100644 index 000000000..9253f98eb --- /dev/null +++ b/sample_data/chr5.fas.gz.fai @@ -0,0 +1 @@ +Chr5 26975502 75 79 80 diff --git a/sample_data/chr5.fas.gz.gzi b/sample_data/chr5.fas.gz.gzi new file mode 100644 index 0000000000000000000000000000000000000000..3c9e803b1d68f9d063335b53c074653b6267819d GIT binary patch literal 6696 zcmXZgcU;Zw9|rK-9vPtsNu_KmA)}%tqlk>kC=F?78A&Chg*ast8piV^>q)XHk6DC_ zN+Bwzwnw@B8~Y?QbIR(_kx8{E;~*g>ICE+#33KQ~aVm zii5wJ!M~&FpUvThV*0-p@Dgn~CmLG9z5MAPt>D`^^!L_qdRzH-#kV$c-huww7XFk@ z*DJw~ztCUW!J6jsb%@XHVdHrEQwP}VDgCh{Y&A%}PVqq*4hW&&cY>4h>AKGFM^*Vg z#JetVnH&991#Xv3*Q&zlLcUK?(-l^mN>_J-cc##9)L^Bz^y}`jKU#h+;#Cj0G@5?d z6BZ@(3w2nhkNn)k^Iotto=LmU116Yh&S8 zTj{IDaD63x#RT50C)cUCYzo_j(U-=-rUmrBWu(2I|-U>dMMxUDix2dDgTEoM}%6%cuOoTl)(5ENC8m089$#9mA+&ALn6zL%P zgbm#EGM#M;@97}-l{h{XzBZ4}ng+)mrjOad89(UE>9B>h+=t?y8L;{e`lvnJ`UQPt zCLCrU_o+BM3qBc6ADRu970?Ig!1-$2#~BW=lLvjk5&oP_r#r#Nh1}=j@40ZPE&bO# zxH_5MKOZ)%q4zC-lZ@oKAoe=LS0m{?F7W;bblO7rxu!ffM5-(7=|`uy!NYUtWOw** zTY0XC-5&5`2YS~cSUH2<=?N#)(@Bfr<`$Yrk?4gqNT7Fk!+W36+kN2D!Sb9E+kD~G 
ztLUvu;0HJ81V8vyS9y+!cz?LP2dxNzZBEcz0^yJ*@|+W!gWyfm=(wfu%M^OkGWcW- zy>U6b)kvO;V#5l!Dw2*3hWi%LF)QJtn)2Kf(IK#xA04#{-k(E9hQe3c$a7V!4}*8j zq1T1Omg)4`)$qyB^qL6x)OdLgi-$N+gxi?H>udoUdu39DbD zS4F{hl;t@tLZadJ3+Ry!u>N*>*(Nxuie4HA_tTU2 z1rf9vjtHd#x4`{x&;bf~s;azi2>*E4)|K{4fahn?OSZx@8fo8caO)}ZJ|ld#!!=2? z_YU~(E7~g&o@+Q1DHbOo_pYHmcf#t0^rBs`RZl(H_t=fRyO?%QhQA)C-BRGfM%pzM z_ME7X6bsXkify#Z9(Z>p?YtNMGsFNX7VJZg4W;Mrhb!~xd4Iv>D#K)d?%&9{uC!A+ zeCrtPcmTfni+0F>drUS&ia7_7-ID0phv4X!^sK}18-wApKl2E(ID)o63g5`1XZ!Rb`?lH0W`VTL3D`21o^ldy^O2r>3SK(c z2q`9=M*i}kC!T>T6twkOxbt;-!Z|oxX%tdeokyzJ)0P+DnpD~%2X6O{HqV8VMvq1c zvx`WF7<&A_u<1Q|+$A`t*BGQQy^J*Rp-ry9W@l;Rt8i@dv9dq*8q#$-J?1+6QbCW- zgX7N8qw-<9mc~e7bOWg}lOA~!u1=#z6u>5L>EXBF1NtUNVR#$q7Df-d0}sDR8x+Fp zRZV4I|1NTuJ*{^ScHT=5y$`3?(L)}<;bX=j#o!`jW(+;37M!7f{VZj_=SyVFa=OPW_|Rp#`)fF{qm}Hd zy+KyYqq|kZwuk7hHL&_uTD2CQHGTq8sJulgZKk`tgKZwuo$FvHU2ECz^d4CdNGpGU z6&LA_A7MYGiL&3}6Ec1_-TpH?F_muj1rDmAmFnTsBPJn5+pkF7^>mwWaO^$0^>_Gp zugS9C>Ic%$hi=&br=6u+{0CRGm?Hble%h;OImrGaUHITMx~wm(kxG~L zgR5%khyCHYQI2xHWB}|KLl+N(uawY5gWwn1PO|@CFnlP0zCQ%MeUZL56n1PkSN8Af z!QM`Ep+0PTkiKI8i*NMpVQ?#pd2;@iA#9XD7Yv7+R?s&`z@vuDm;D^tCZ?X%>BTEPS_-zG4h-w{e#9mrdZayXi}&uxmB_?>IPQ zq>Jod91r)1qI1n)^CCLO9QM*$DEk*IU|)awyd|ugOP{lX^^{y?|Lg>~(1AW<4UaxR zpPmR0sHaa&g5AyBrYxF=MT?>%M$5Bvtad?^ugJ1iGio=XUu_XBj^JT z@b^1(x+8q6=VICa+X+7HMgKJyc0NPzp9d?NdddF2`S74=^xg$6H`q_hNA`ES!y5kdE)RHUF1>RR?4jf<`$?X#nj@XK7`}Ia z-r)s1e5JR0!{ruBd5I5~xmT>)>brDKBOD5F3*AH5Pj8bwEi zz=|R|auw{Z6(sxXL*dJQ^tv$kTn@c99Ddz)sqC*=4Zm}sBO>4<>GbL~u+tYhd@ZbI zwoJ~4t%H>nbm)3`*i(8{Bpf(!x$K8T!D1!7G8)#+ql05$N0k+_zakdSa;2AVfNitr zWgFoOjr7t@aA(_KIUf`UuTG`|H^Yx==zuM7ozY6!_gBE@qG-Q(_-YZoBmqv+43T}` zt#HE<+GiVl`vUE~9WHLOO7^{Wz|nK)#fh??PJ1T7-u3jNo$xpFP&x0h3wBAM-FL&b q&uF(~IACy??7ODG%B$#wsj$fn+9eHE?HVro&U@fK?(~Aa@c#f;v8n3- literal 0 HcmV?d00001