From 6175eb2f6b8f777918651756c28433886ccd76e5 Mon Sep 17 00:00:00 2001
From: Luca Venturini
Date: Fri, 12 Oct 2018 14:46:42 +0100
Subject: [PATCH] Modified error messages for Mikado prepare, serialise and
 pick. Now the default level "INFO" should give appropriate results. Also
 correcting the codon table issue for #34

---
 .../configuration_blueprint.json              |  7 +--
 Mikado/loci/locus.py                          | 10 ++++
 Mikado/loci/superlocus.py                     |  3 ++
 Mikado/picking/picker.py                      | 30 ++++++------
 Mikado/preparation/annotation_parser.py       |  2 +-
 Mikado/preparation/prepare.py                 |  8 +++-
 .../blast_serializer/xml_serialiser.py        | 46 +++++++++----------
 Mikado/serializers/orf.py                     | 12 ++---
 Mikado/subprograms/serialise.py               | 16 +++++--
 .../transcript_methods/splitting.py           |  1 +
 10 files changed, 81 insertions(+), 54 deletions(-)

diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json
index a92896aaf..99ae75e6e 100644
--- a/Mikado/configuration/configuration_blueprint.json
+++ b/Mikado/configuration/configuration_blueprint.json
@@ -31,7 +31,7 @@
       "Settings related to the logs. Keys:",
       "- sql_level: verbosity for SQL calls. Default: WARNING.",
       "  In decreasing order: 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'",
-      "- log_level: verbosity. Default: WARNING.",
+      "- log_level: verbosity. Default: INFO.",
       "  In decreasing order: 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'"
       ],
       "type": "object",
@@ -45,7 +45,7 @@
             "ERROR",
             "CRITICAL"
           ],
-          "default": "WARNING"
+          "default": "INFO"
         },
         "sql_level": {
           "type": "string",
@@ -202,7 +202,8 @@
           "Scenedesmus obliquus Mitochondrial", "Thraustochytrium Mitochondrial",
           "Pterobranchia Mitochondrial", "Candidate Division SR1", "Gracilibacteria",
           "Pachysolen tannophilus Nuclear", "Karyorelict Nuclear", "Condylostoma Nuclear",
-          "Mesodinium Nuclear", "Peritrich Nuclear", "Blastocrithidia Nuclear"]
+          "Mesodinium Nuclear", "Peritrich Nuclear", "Blastocrithidia Nuclear"],
+          "default": 0
         }
       }
     },
diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py
index 906ed0802..f850198ca 100644
--- a/Mikado/loci/locus.py
+++ b/Mikado/loci/locus.py
@@ -938,5 +938,15 @@ def expand_transcript(transcript, new_start, new_end, fai, logger):
         logger.info("Padding %s would lead to an in-frame stop codon. Aborting.",
                     transcript.id)
         return backup
+    else:
+        message = "{transcript.id} now has start {transcript.start} and end {transcript.end}"
+        if ((backup.combined_cds_end != transcript.combined_cds_end) or
+                (backup.combined_cds_start != transcript.combined_cds_start)):
+            transcript.attributes["cds_padded"] = True
+            message += "; the CDS now starts at {transcript.combined_cds_start} and ends at {transcript.combined_cds_end}"
+        else:
+            transcript.attributes["cds_padded"] = False
+            message += "."
+        logger.info(message.format(**locals()))
 
     return transcript
diff --git a/Mikado/loci/superlocus.py b/Mikado/loci/superlocus.py
index dd996ee10..6db8fe271 100644
--- a/Mikado/loci/superlocus.py
+++ b/Mikado/loci/superlocus.py
@@ -444,6 +444,9 @@ def load_transcript_data(self, tid, data_dict):
         if len(new_tr) > 1:
             to_add.update(new_tr)
             to_remove = True
+            self.logger.info("%s has been split into %d different transcripts.",
+                             tid, len(new_tr))
+        del data_dict
         return to_remove, to_add
 
     # @profile
diff --git a/Mikado/picking/picker.py b/Mikado/picking/picker.py
index ab9d50b86..67b597698 100644
--- a/Mikado/picking/picker.py
+++ b/Mikado/picking/picker.py
@@ -96,7 +96,7 @@ def __init__(self, json_conf, commandline=""):
                           force=True)
         # self.setup_logger()
 
-        self.logger.info("Multiprocessing method: %s",
+        self.logger.debug("Multiprocessing method: %s",
                          self.json_conf["multiprocessing_method"])
 
     # pylint: enable=no-member
@@ -232,11 +232,11 @@ def setup_shm_db(self):
         This method will copy the SQLite input DB into memory.
         """
 
-        self.main_logger.info("Copy into a SHM db: %s",
+        self.main_logger.debug("Copy into a SHM db: %s",
                               self.json_conf["pick"]["run_options"]["shm"])
         if self.json_conf["pick"]["run_options"]["shm"] is True:
             self.json_conf["pick"]["run_options"]["shm_shared"] = False
-            self.main_logger.info("Copying the DB into memory")
+            self.main_logger.debug("Copying the DB into memory")
             assert self.json_conf["db_settings"]["dbtype"] == "sqlite"
             self.json_conf["pick"]["run_options"]["preload"] = False
             if self.json_conf["pick"]["run_options"]["shm_db"] is not None:
@@ -253,7 +253,7 @@ def setup_shm_db(self):
                 self.json_conf["pick"]["run_options"]["shm_db"] = temp
         if self.json_conf["pick"]["run_options"]["shm"]:
             if not os.path.exists(self.json_conf["pick"]["run_options"]["shm_db"]):
-                self.main_logger.info("Copying {0} into {1}".format(
+                self.main_logger.debug("Copying {0} into {1}".format(
                     self.json_conf["db_settings"]["db"],
                     self.json_conf["pick"]["run_options"]["shm_db"]))
                 try:
@@ -265,7 +265,7 @@ def setup_shm_db(self):
                         Back to using the DB on disk.""")
                     self.json_conf["pick"]["run_options"]["shm"] = False
             else:
-                self.main_logger.info("%s exists already. Doing nothing.",
+                self.main_logger.debug("%s exists already. Doing nothing.",
                                       self.json_conf["pick"]["run_options"]["shm_db"])
 
         self.main_logger.info("DB copied into memory")
@@ -548,7 +548,7 @@ def __preload_blast(self, engine, queries):
                     hsps[hsp.query_id] = collections.defaultdict(list)
                 hsps[hsp.query_id][hsp.target_id].append(hsp)
 
-        self.main_logger.info("{0} HSPs prepared".format(len(hsps)))
+        self.main_logger.debug("{0} HSPs prepared".format(len(hsps)))
 
         targets = dict((x.target_id, x) for x in
                        engine.execute("select * from target"))
@@ -598,7 +598,7 @@ def __preload_blast(self, engine, queries):
         del hsps
         assert len(hits_dict) <= len(queries)
-        self.main_logger.info("%d BLAST hits loaded for %d queries",
+        self.main_logger.debug("%d BLAST hits loaded for %d queries",
                               hit_counter,
                               len(hits_dict))
 
         self.main_logger.debug("%s",
@@ -634,7 +634,7 @@ def preload(self):
 
             # data_dict["junctions"] = self.manager.dict(data_dict["junctions"], lock=False)
 
-            self.main_logger.info("%d junctions loaded",
+            self.main_logger.debug("%d junctions loaded",
                                   len(data_dict["junctions"]))
             self.main_logger.debug("Example junctions:\n{0}".format(
                 "\n".join(str(junc) for junc in list(
@@ -658,7 +658,7 @@ def preload(self):
 
             #     data_dict['orf'] = self.manager.dict(orfs, lock=False)
 
-            self.main_logger.info("%d ORFs loaded",
+            self.main_logger.debug("%d ORFs loaded",
                                   len(data_dict["orfs"]))
             self.main_logger.debug(",".join(
                 list(data_dict["orfs"].keys())[:10]
@@ -800,7 +800,7 @@ def __submit_multi_threading(self, data_dict):
         """
 
         intron_range = self.json_conf["pick"]["run_options"]["intron_range"]
-        self.logger.info("Intron range: %s", intron_range)
+        self.logger.debug("Intron range: %s", intron_range)
 
         current_locus = None
         current_transcript = None
@@ -826,7 +826,7 @@ def __submit_multi_threading(self, data_dict):
         # tempdir = os.path.join(self.json_conf["pick"]["files"]["output_dir"], "mikado_pick_tmp")
         # os.makedirs(tempdir, exist_ok=True)
 
-        self.logger.info("Creating the worker processes")
+        self.logger.debug("Creating the worker processes")
         conn, cursor = self._create_temporary_store(tempdir)
         working_processes = [LociProcesser(self.json_conf,
                                            data_dict,
@@ -838,7 +838,7 @@ def __submit_multi_threading(self, data_dict):
                              for _ in range(1, self.procs+1)]
         # Start all processes
         [_.start() for _ in working_processes]
-        self.logger.info("Started all %d workers", self.procs)
+        self.logger.debug("Started all %d workers", self.procs)
         # No sense in keeping this data available on the main thread now
         del data_dict
 
@@ -927,7 +927,7 @@ def __submit_multi_threading(self, data_dict):
                           current_locus.id, counter,
                           ", ".join(list(current_locus.transcripts.keys())))
         locus_queue.put(("EXIT", ))
-        self.logger.info("Joining children processes")
+        self.logger.debug("Joining children processes")
         [_.join() for _ in working_processes]
         conn.close()
         self.logger.info("Joined children processes; starting to merge partial files")
@@ -988,7 +988,7 @@ def __submit_single_threaded(self, data_dict):
         logger.debug("Begun single-threaded run")
 
         intron_range = self.json_conf["pick"]["run_options"]["intron_range"]
-        logger.info("Intron range: %s", intron_range)
+        logger.debug("Intron range: %s", intron_range)
 
         handles = self.__get_output_files()
 
@@ -1170,7 +1170,7 @@ def __call__(self):
 
         # Clean up the DB copied to SHM
         if (self.json_conf["pick"]["run_options"]["shm"] is True and
                 self.json_conf["pick"]["run_options"]["shm_shared"] is False):
-            self.main_logger.info("Removing shared memory DB %s",
+            self.main_logger.debug("Removing shared memory DB %s",
                                   self.json_conf["pick"]["run_options"]["shm_db"])
             os.remove(self.json_conf["pick"]["run_options"]["shm_db"])
diff --git a/Mikado/preparation/annotation_parser.py b/Mikado/preparation/annotation_parser.py
index 718c4e3eb..b38480b9d 100644
--- a/Mikado/preparation/annotation_parser.py
+++ b/Mikado/preparation/annotation_parser.py
@@ -248,7 +248,7 @@ def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True
 
         # Discard transcript under a certain size
         if tlength < min_length:
-            logger.debug("Discarding %s because its size (%d) is under the minimum of %d",
+            logger.info("Discarding %s because its size (%d) is under the minimum of %d",
                         tid, tlength, min_length)
             continue
 
diff --git a/Mikado/preparation/prepare.py b/Mikado/preparation/prepare.py
index 0d59bfbb4..1beba5203 100644
--- a/Mikado/preparation/prepare.py
+++ b/Mikado/preparation/prepare.py
@@ -121,8 +121,12 @@ def store_transcripts(shelf_stacks, logger, keep_redundant=False):
                     logger.debug("The following transcripts are redundant: %s",
                                  ",".join([_[0] for _ in cds_list]))
                     to_keep = random.choice(cds_list)
-                    logger.debug("Keeping only %s out of the list",
-                                 to_keep)
+                    for tid in cds_list:
+                        if tid != to_keep:
+                            logger.info("Discarding %s as redundant", tid[0])
+                        else:
+                            logger.info("Keeping %s amongst redundant transcripts", tid[0])
+
                     tids.append(to_keep)
                 else:
                     tids.extend(tid_list)
diff --git a/Mikado/serializers/blast_serializer/xml_serialiser.py b/Mikado/serializers/blast_serializer/xml_serialiser.py
index 8817488c0..3ce4e79bb 100644
--- a/Mikado/serializers/blast_serializer/xml_serialiser.py
+++ b/Mikado/serializers/blast_serializer/xml_serialiser.py
@@ -91,7 +91,7 @@ def _pickler(self, filename):
                 self.logger.warning("Invalid BLAST file: %s", filename)
                 return []
 
-        self.logger.info("Starting to pickle %s", filename)
+        self.logger.debug("Starting to pickle %s", filename)
         hits, hsps = [], []
         pickle_count = 0
 
@@ -131,7 +131,7 @@ def _pickler(self, filename):
                 yield pickle_temp[1]
                 pickle_count += 1
                 # pfiles.append()
-        self.logger.info("Finished pickling %s in %s subsection", filename, pickle_count)
+        self.logger.debug("Finished pickling %s in %s subsection", filename, pickle_count)
         # del records
 
     def run(self):
@@ -154,7 +154,7 @@ def run(self):
                 continue
 
             if filename == "EXIT":
-                self.logger.info("Process %s received EXIT signal, terminating",
+                self.logger.debug("Process %s received EXIT signal, terminating",
                                  self._name)
                 self.filequeue.put((number, filename))
                 return 0
@@ -205,7 +205,7 @@ def __init__(self, xml_name,
             self.logger = check_logger(logger)
         else:
             raise ValueError("No logger provided!")
-        self.logger.info("Started to serialise %s, log level: %s",
+        self.logger.debug("Started to serialise %s, log level: %s",
                          xml_name, self.logger.level)
 
         # Runtime arguments
@@ -242,7 +242,7 @@ def __init__(self, xml_name,
         DBBASE.metadata.create_all(self.engine)  # @UndefinedVariable
         session = Session(bind=self.engine, autocommit=True, autoflush=True)
         self.session = session  # session()
-        self.logger.info("Created the session")
+        self.logger.debug("Created the session")
         # Load sequences if necessary
         self.__determine_sequences(query_seqs, target_seqs)
         self.xml = xml_name
@@ -257,7 +257,7 @@ def __init__(self, xml_name,
 
         self.queries, self.targets = dict(), dict()
 
-        self.logger.info("Finished __init__")
+        self.logger.debug("Finished __init__")
 
     def __getstate__(self):
 
@@ -321,7 +321,7 @@ def __serialize_queries(self, queries):
                 })
 
             if len(objects) >= self.maxobjects:
-                self.logger.info("Loading %d objects into the \"query\" table (total %d)",
+                self.logger.debug("Loading %d objects into the \"query\" table (total %d)",
                                  self.maxobjects, counter)
 
                 # pylint: disable=no-member
@@ -335,7 +335,7 @@ def __serialize_queries(self, queries):
                 objects = []
 
         if len(objects) > 0:
-            self.logger.info("Loading %d objects into the \"query\" table (total %d)",
+            self.logger.debug("Loading %d objects into the \"query\" table (total %d)",
                              len(objects), counter+len(objects))
             # pylint: disable=no-member
             counter += len(objects)
@@ -345,7 +345,7 @@ def __serialize_queries(self, queries):
             # pylint: enable=no-member
             self.session.commit()
         # pylint: enable=no-member
-        self.logger.info("Loaded %d objects into the \"query\" table", counter)
+        self.logger.debug("Loaded %d objects into the \"query\" table", counter)
         for query in self.session.query(Query):
             queries[query.query_name] = (query.query_id, query.query_length)
         self.logger.info("%d in queries", len(queries))
@@ -381,7 +381,7 @@ def __serialize_targets(self, targets):
             #     objects.append(Target(record, len(self.target_seqs[record])))
             if len(objects) >= self.maxobjects:
                 # counter += len(objects)
-                self.logger.info("Loading %d objects into the \"target\" table",
+                self.logger.debug("Loading %d objects into the \"target\" table",
                                  counter)
                 # self.session.bulk_insert_mappings(Target, objects)
                 self.session.begin(subtransactions=True)
@@ -393,7 +393,7 @@ def __serialize_targets(self, targets):
         #     self.logger.info("Loaded %d objects into the \"target\" table",
         #                      len(objects))
         # objects = []
-        self.logger.info("Loading %d objects into the \"target\" table, (total %d)",
+        self.logger.debug("Loading %d objects into the \"target\" table, (total %d)",
                          len(objects), counter)
         # pylint: disable=no-member
         self.session.begin(subtransactions=True)
@@ -404,7 +404,7 @@ def __serialize_targets(self, targets):
         self.logger.info("Loaded %d objects into the \"target\" table", counter)
         for target in self.session.query(Target):
             targets[target.target_name] = (target.target_id, target.target_length is not None)
-        self.logger.info("%d in targets", len(targets))
+        self.logger.debug("%d in targets", len(targets))
         return targets
 
     def __serialise_sequences(self):
@@ -415,15 +415,15 @@ def __serialise_sequences(self):
 
         targets = dict()
        queries = dict()
-        self.logger.info("Loading previous IDs")
+        self.logger.debug("Loading previous IDs")
         for query in self.session.query(Query):
             queries[query.query_name] = (query.query_id, (query.query_length is not None))
         for target in self.session.query(Target):
             targets[target.target_name] = (target.target_id, (target.target_length is not None))
-        self.logger.info("Loaded previous IDs; %d for queries, %d for targets",
+        self.logger.debug("Loaded previous IDs; %d for queries, %d for targets",
                          len(queries), len(targets))
 
-        self.logger.info("Started the sequence serialisation")
+        self.logger.debug("Started the sequence serialisation")
         if self.target_seqs:
             targets = self.__serialize_targets(targets)
             assert len(targets) > 0
@@ -456,7 +456,7 @@ def __load_into_db(self, hits, hsps, force=False):
 
         if tot_objects >= self.maxobjects or force:
             # Bulk load
-            self.logger.info("Loading %d BLAST objects into database", tot_objects)
+            self.logger.debug("Loading %d BLAST objects into database", tot_objects)
 
             try:
                 # pylint: disable=no-member
@@ -469,7 +469,7 @@ def __load_into_db(self, hits, hsps, force=False):
                 self.logger.critical("Failed to serialise BLAST!")
                 self.logger.exception(err)
                 raise err
-            self.logger.info("Loaded %d BLAST objects into database", tot_objects)
+            self.logger.debug("Loaded %d BLAST objects into database", tot_objects)
             hits, hsps = [], []
         return hits, hsps
@@ -525,7 +525,7 @@ def serialize(self):
             _, _ = self.__load_into_db(hits, hsps, force=True)
 
         else:
-            self.logger.info("Creating a pool with %d processes",
+            self.logger.debug("Creating a pool with %d processes",
                              min(self.procs, len(self.xml)))
 
             filequeue = multiprocessing.Queue(-1)
@@ -547,12 +547,12 @@ def serialize(self):
                 for _ in range(min([self.procs, len(self.xml)]))
             ]
 
-            self.logger.info("Starting to pickle and serialise %d files", len(self.xml))
+            self.logger.debug("Starting to pickle and serialise %d files", len(self.xml))
             [_.start() for _ in procs]  # Start processes
             for number, xml_name in enumerate(self.xml):
                 filequeue.put((number, xml_name))
 
-            self.logger.info("Finished sending off the data for serialisation")
+            self.logger.debug("Finished sending off the data for serialisation")
             filequeue.put((None, "EXIT"))
 
             returned = []
@@ -575,7 +575,7 @@ def serialize(self):
                     hsps.extend(__hsps)
                     hits, hsps = load_into_db(self, hits, hsps, force=False)
                     if record_counter > 0 and record_counter % 10000 == 0:
-                        self.logger.info("Parsed %d queries", record_counter)
+                        self.logger.debug("Parsed %d queries", record_counter)
                 os.remove(pickle_file)
             [_.join() for _ in procs]  # Wait for processes to join
             self.logger.info("All %d children finished", len(procs))
@@ -720,7 +720,7 @@ def load_into_db(self, hits, hsps, force=False):
 
     if tot_objects >= self.maxobjects or force:
         # Bulk load
-        self.logger.info("Loading %d BLAST objects into database", tot_objects)
+        self.logger.debug("Loading %d BLAST objects into database", tot_objects)
 
         try:
             # pylint: disable=no-member
@@ -733,7 +733,7 @@ def load_into_db(self, hits, hsps, force=False):
             self.logger.critical("Failed to serialise BLAST!")
             self.logger.exception(err)
             raise err
-        self.logger.info("Loaded %d BLAST objects into database", tot_objects)
+        self.logger.debug("Loaded %d BLAST objects into database", tot_objects)
         hits, hsps = [], []
     return hits, hsps
 
diff --git a/Mikado/serializers/orf.py b/Mikado/serializers/orf.py
index 14c51c9d0..302961084 100644
--- a/Mikado/serializers/orf.py
+++ b/Mikado/serializers/orf.py
@@ -225,7 +225,7 @@ def load_fasta(self, cache):
 
         if self.fasta_index is not None:
             done = 0
-            self.logger.info("%d entries already present in db, %d in the index",
+            self.logger.debug("%d entries already present in db, %d in the index",
                              len([fasta_key for fasta_key in self.fasta_index if
                                   fasta_key not in cache]),
                              len(self.fasta_index.keys()))
@@ -241,7 +241,7 @@ def load_fasta(self, cache):
                     self.session.begin(subtransactions=True)
                     self.session.bulk_save_objects(objects)
                     self.session.commit()
-                    self.logger.info(
+                    self.logger.debug(
                         "Loaded %d transcripts into query table", done)
                     objects = []
 
@@ -249,7 +249,7 @@ def load_fasta(self, cache):
             self.session.begin(subtransactions=True)
             self.session.bulk_save_objects(objects)
             self.session.commit()
-            self.logger.info(
+            self.logger.debug(
                 "Finished loading %d transcripts into query table", done)
         return cache
 
@@ -269,10 +269,10 @@ def serialize(self):
         done = 0
         cache = self.load_fasta(cache)
 
-        self.logger.info("Loading IDs into the cache")
+        self.logger.debug("Loading IDs into the cache")
         for record in self.session.query(Query):
             cache[record.query_name] = record.query_id
-        self.logger.info("Finished loading IDs into the cache")
+        self.logger.debug("Finished loading IDs into the cache")
 
         for row in self.bed12_parser:
             if row.header is True:
@@ -298,7 +298,7 @@ def serialize(self):
                     self.session.begin(subtransactions=True)
                     self.session.bulk_save_objects(objects)
                     self.session.commit()
-                    self.logger.info("Loaded %d ORFs into the database", done)
+                    self.logger.debug("Loaded %d ORFs into the database", done)
                     objects = []
                 done += len(objects)
 
diff --git a/Mikado/subprograms/serialise.py b/Mikado/subprograms/serialise.py
index 1ca2720f6..83a77f63c 100644
--- a/Mikado/subprograms/serialise.py
+++ b/Mikado/subprograms/serialise.py
@@ -80,7 +80,7 @@ def load_junctions(args, logger):
     for junction_file in iter(
             j_file for j_file in args.json_conf["serialise"]["files"]["junctions"]
             if j_file != ''):
-        logger.info("Loading junctions: %s", junction_file)
+        logger.debug("Loading junctions: %s", junction_file)
         serializer = junction.JunctionSerializer(
             junction_file,
             json_conf=args.json_conf,
@@ -137,7 +137,7 @@ def load_orfs(args, logger):
     if len(args.json_conf["serialise"]["files"]["orfs"]) > 0:
         logger.info("Starting to load ORF data")
         for orf_file in args.json_conf["serialise"]["files"]["orfs"]:
-            logger.info("Starting to load ORFs from %s", orf_file)
+            logger.debug("Starting to load ORFs from %s", orf_file)
             serializer = orf.OrfSerializer(orf_file,
                                            json_conf=args.json_conf,
                                            logger=logger)
@@ -263,6 +263,11 @@ def setup(args):
     if args.max_regression is not None:
         args.json_conf["serialise"]["max_regression"] = args.max_regression
 
+    if args.codon_table is not None:
+        args.json_conf["serialise"]["codon_table"] = args.codon_table
+    else:
+        assert "codon_table" in args.json_conf["serialise"]
+
     logger.setLevel("INFO")
     logger.info("Command line: %s", " ".join(sys.argv))
 
@@ -300,15 +305,15 @@ def serialise(args):
     logger.info("Command line: %s", " ".join(sys.argv))
 
     if args.json_conf["serialise"]["force"] is True:
-        logger.warn("Removing old data because force option in place")
         if args.json_conf["db_settings"]["dbtype"] == "sqlite" and os.path.exists(args.json_conf["db_settings"]["db"]):
+            logger.warn("Removing old data because force option in place")
             os.remove(args.json_conf["db_settings"]["db"])
 
         engine = dbutils.connect(args.json_conf)
         meta = sqlalchemy.MetaData(bind=engine)
         meta.reflect(engine)
         for tab in reversed(meta.sorted_tables):
-            logger.warn("Dropping %s", tab)
+            logger.debug("Dropping %s", tab)
             tab.drop()
             if args.json_conf["db_settings"]["dbtype"] == "mysql":
                 engine.execute("OPTIMIZE TABLE {}".format(tab.name))
@@ -369,6 +374,9 @@ def serialise_parser():
                       type=float, default=None,
                       help=""""Amount of sequence in the ORF (in %%) to backtrack in order
                       to find a valid START codon, if one is absent. Default: %(default)s""")
+    orfs.add_argument("--codon-table", dest="codon_table", default=None,
+                      help="""Codon table to use. Default: 0 (ie Standard, NCBI #1, but only ATG
+                      is considered a valid start codon).""")
 
     blast = parser.add_argument_group()
     blast.add_argument("--max_target_seqs", type=int, default=None,
diff --git a/Mikado/transcripts/transcript_methods/splitting.py b/Mikado/transcripts/transcript_methods/splitting.py
index 5b57d625f..8cbc36242 100644
--- a/Mikado/transcripts/transcript_methods/splitting.py
+++ b/Mikado/transcripts/transcript_methods/splitting.py
@@ -825,6 +825,7 @@ def split_by_cds(transcript):
 
         new_transc.verified_introns = set.intersection(set(new_transc.introns),
                                                        transcript.verified_introns)
+        new_transc.attributes["split"] = True
         yield new_transc
 
     return