From 4dd547a44f0961145318cf76aa9ff151b5222033 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Tue, 31 Jan 2017 10:03:20 +0000 Subject: [PATCH 01/47] Deprecated the "discard_definition" flag in serialise and simplified the BLAST loading code. --- .../configuration_blueprint.json | 4 +- .../blast_serializer/xml_serialiser.py | 454 ++++++------------ Mikado/subprograms/serialise.py | 3 - docs/Library/Mikado.configuration.rst | 30 ++ docs/Library/Mikado.daijin.rst | 10 + docs/Library/Mikado.loci.rst | 109 +++++ .../Mikado.loci.transcript_methods.rst | 46 ++ docs/Library/Mikado.parsers.rst | 62 +++ docs/Library/Mikado.picking.rst | 30 ++ docs/Library/Mikado.preparation.rst | 38 ++ docs/Library/Mikado.rst | 39 ++ docs/Library/Mikado.scales.rst | 62 +++ .../Mikado.serializers.blast_serializer.rst | 62 +++ docs/Library/Mikado.serializers.rst | 45 ++ docs/Library/Mikado.subprograms.rst | 61 +++ docs/Library/Mikado.subprograms.util.rst | 70 +++ docs/Library/Mikado.tests.rst | 214 +++++++++ docs/Library/Mikado.utilities.rst | 46 ++ docs/Library/modules.rst | 7 + 19 files changed, 1088 insertions(+), 304 deletions(-) create mode 100644 docs/Library/Mikado.configuration.rst create mode 100644 docs/Library/Mikado.daijin.rst create mode 100644 docs/Library/Mikado.loci.rst create mode 100644 docs/Library/Mikado.loci.transcript_methods.rst create mode 100644 docs/Library/Mikado.parsers.rst create mode 100644 docs/Library/Mikado.picking.rst create mode 100644 docs/Library/Mikado.preparation.rst create mode 100644 docs/Library/Mikado.rst create mode 100644 docs/Library/Mikado.scales.rst create mode 100644 docs/Library/Mikado.serializers.blast_serializer.rst create mode 100644 docs/Library/Mikado.serializers.rst create mode 100644 docs/Library/Mikado.subprograms.rst create mode 100644 docs/Library/Mikado.subprograms.util.rst create mode 100644 docs/Library/Mikado.tests.rst create mode 100644 docs/Library/Mikado.utilities.rst create mode 100644 docs/Library/modules.rst diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json index 0813d074b..d062d4c62 100644 --- a/Mikado/configuration/configuration_blueprint.json +++ b/Mikado/configuration/configuration_blueprint.json @@ -127,9 +127,7 @@ " in the first 30 bps (10% of the cDNA).", "- max_target_seqs: equivalently to BLAST, it indicates the maximum number of targets to keep", " per blasted sequence.", - "- discard_definition: Boolean. Used to indicate whether Mikado should use the definition", - " rather than the ID for BLAST sequences. Necessary as in some instances BLAST XMLs will have", - " a mock identifier rather than the original sequence ID (eg lcl|1). Default: false.", + "- discard_definition: Boolean. **Deprecated**, it was used for specifying how to load BLAST files.", "- procs: Number of processors to use. 
Default: 1.", "- single_thread: if true, Mikado prepare will force the usage of a single thread in this step."], "SimpleComment": ["Options related to serialisation", diff --git a/Mikado/serializers/blast_serializer/xml_serialiser.py b/Mikado/serializers/blast_serializer/xml_serialiser.py index f193991b0..c84adca38 100644 --- a/Mikado/serializers/blast_serializer/xml_serialiser.py +++ b/Mikado/serializers/blast_serializer/xml_serialiser.py @@ -41,13 +41,11 @@ def __init__(self, logging_queue, level="WARN", max_target_seqs=10, - maxobjects=20000, - discard_definition=False): + maxobjects=20000): super().__init__() self.queries = queries self.targets = targets - self.discard_definition = discard_definition self.level = level self.logging_queue = logging_queue self.handler = logging_handlers.QueueHandler(logging_queue) @@ -64,12 +62,6 @@ def __init__(self, self.__max_target_seqs = max_target_seqs self.logger.debug("Started %s", self.name) - # self._pickler = functools.partial(_pickle_xml, - # **{"default_header": self.default_header, - # "maxobjects": self.maxobjects, - # "logging_queue": self.logging_queue, - # "level": self.level}) - def __getstate__(self): state = self.__dict__.copy() @@ -90,11 +82,6 @@ def __setstate__(self, state): self.logger = logging.getLogger(self.name) self.logger.addHandler(self.handler) self.logger.setLevel(self.level) - # self._pickler = functools.partial(_pickle_xml, - # **{"default_header": self.default_header, - # "maxobjects": self.maxobjects, - # "logging_queue": self.logging_queue, - # "level": self.level}) def _pickler(self, filename): @@ -114,7 +101,8 @@ def _pickler(self, filename): try: for record in opened: query_counter += 1 - hits, hsps = self.objectify_record(record, hits, hsps) + hits, hsps = objectify_record(self, record, hits, hsps, + max_target_seqs=self.__max_target_seqs) if len(hits) + len(hsps) > self.maxobjects: pickle_temp = tempfile.mkstemp(suffix=".pickle", @@ -146,64 +134,6 @@ def _pickler(self, filename): self.logger.info("Finished pickling %s in %s subsection", filename, pickle_count) # del records - def objectify_record(self, record, hits, hsps): - """ - Private method to serialise a single record into the DB. - - :param record: The BLAST record to load into the DB. - :param hits: Cache of hits to load into the DB. - :type hits: list - - :param hsps: Cache of hsps to load into the DB. - :type hsps: list - - :returns: hits, hsps - :rtype: (list, list) - """ - - if len(record.hits) == 0: - return hits, hsps - - current_query, name = _get_query_for_blast(self, record) - - current_evalue = -1 - current_counter = 0 - - # for ccc, alignment in enumerate(record.alignments): - for ccc, alignment in enumerate(record.hits): - if ccc + 1 > self.__max_target_seqs: - break - - self.logger.debug("Started the hit %s vs. 
%s", - # name, record.alignments[ccc].accession) - name, record.hits[ccc].id) - current_target = _get_target_for_blast(self, alignment) - - hit_dict_params = dict() - (hit_dict_params["query_multiplier"], - hit_dict_params["target_multiplier"]) = XmlSerializer.get_multipliers(record) - hit_evalue = min(_.evalue for _ in record.hits[ccc].hsps) - hit_bs = max(_.bitscore for _ in record.hits[ccc].hsps) - if current_evalue < hit_evalue: - current_counter += 1 - current_evalue = hit_evalue - - hit_dict_params["hit_number"] = current_counter - hit_dict_params["evalue"] = hit_evalue - hit_dict_params["bits"] = hit_bs - - # Prepare for bulk load - try: - hit, hit_hsps = prepare_hit(alignment, current_query, - current_target, **hit_dict_params) - except InvalidHit as exc: - self.logger.error(exc) - continue - hits.append(hit) - hsps.extend(hit_hsps) - - return hits, hsps - def run(self): """ While running, the process will get the filenames to analyse from the first queue @@ -296,7 +226,6 @@ def __init__(self, xml_name, self.log_writer = logging_handlers.QueueListener(self.logging_queue, self.logger) self.log_writer.start() - self.discard_definition = json_conf["serialise"]["discard_definition"] self.__max_target_seqs = json_conf["serialise"]["max_target_seqs"] self.maxobjects = json_conf["serialise"]["max_objects"] target_seqs = json_conf["serialise"]["files"]["blast_targets"] @@ -318,7 +247,7 @@ def __init__(self, xml_name, self.__determine_sequences(query_seqs, target_seqs) self.xml = xml_name # Just a mock definition - self.get_query = functools.partial(self.__get_query_for_blast) + # self.get_query = functools.partial(self.__get_query_for_blast) self.not_pickable = ["manager", "printer_process", "context", "logger_queue_handler", "queue_logger", "log_writer", @@ -326,6 +255,8 @@ def __init__(self, xml_name, "log_handler", "log_writer", "logger", "session", "get_query", "engine", "query_seqs", "target_seqs"] + self.queries, self.targets = dict(), dict() + self.logger.info("Finished __init__") def __getstate__(self): @@ -362,17 +293,6 @@ def __determine_sequences(self, query_seqs, target_seqs): raise ValueError("{} not found!".format(target)) self.target_seqs.append(pyfaidx.Fasta(target)) - # if isinstance(target_seqs, str): - # assert os.path.exists(target_seqs) - # self.target_seqs = pyfaidx.Fasta(target_seqs) - # elif target_seqs is None: - # self.target_seqs = None - # else: - # self.logger.warn("Target (%s) type: %s", - # target_seqs, - # type(target_seqs)) - # # assert "SeqIO.index" in repr(target_seqs) - # self.target_seqs = target_seqs return def __serialize_queries(self, queries): @@ -399,8 +319,7 @@ def __serialize_queries(self, queries): "query_name": record, "query_length": len(self.query_seqs[record]) }) - # - # objects.append(Target(record, len(self.target_seqs[record]))) + if len(objects) >= self.maxobjects: self.logger.info("Loading %d objects into the \"query\" table (total %d)", self.maxobjects, counter) @@ -414,25 +333,16 @@ def __serialize_queries(self, queries): self.session.commit() counter += len(objects) objects = [] - # pylint: disable=no-member - # # pylint: enable=no-member - # self.logger.info("Loaded %d objects into the \"target\" table", - # len(objects)) - # objects = [] + if len(objects) > 0: self.logger.info("Loading %d objects into the \"query\" table (total %d)", len(objects), counter+len(objects)) # pylint: disable=no-member - # self.engine.execute(Target.__table__.insert(), - # [{"target_name": obj.target_name, - # "target_length": obj.target_length} for obj 
in objects]) counter += len(objects) # pylint: disable=no-member self.session.begin() - # self.session.bulk_insert_mappings(Query, objects) self.engine.execute(Query.__table__.insert(), objects) # pylint: enable=no-member - # self.session.bulk_insert_mappings(Query, objects) self.session.commit() # pylint: enable=no-member self.logger.info("Loaded %d objects into the \"query\" table", counter) @@ -520,6 +430,7 @@ def __serialise_sequences(self): if self.query_seqs is not None: queries = self.__serialize_queries(queries) assert len(queries) > 0 + return queries, targets def __load_into_db(self, hits, hsps, force=False): @@ -562,104 +473,21 @@ def __load_into_db(self, hits, hsps, force=False): hits, hsps = [], [] return hits, hsps - def __serialise_record(self, record, hits, hsps, targets): - """ - Private method to serialise a single record into the DB. - - :param record: The BLAST record to load into the DB. - :param hits: Cache of hits to load into the DB. - :type hits: list - - :param hsps: Cache of hsps to load into the DB. - :type hsps: list - - :param targets: dictionary which holds the relationship target ID/name. - It can be updated in place if a target has not been serialised already. - - :returns: hits, hsps, hit_counter, targets - :rtype: (list, list, int, dict) - """ - - hit_counter = 0 - - if len(record.hits) == 0: - return hits, hsps, hit_counter, targets - - current_query, name = self.get_query(record) - - current_evalue = -1 - current_counter = 0 - - # for ccc, alignment in enumerate(record.alignments): - for ccc, alignment in enumerate(record.hits): - if ccc + 1 > self.__max_target_seqs: - break - - self.logger.debug("Started the hit %s vs. %s", - # name, record.alignments[ccc].accession) - name, record.hits[ccc].id) - try: - current_target, targets = self.__get_target_for_blast(alignment, - targets) - except sqlalchemy.exc.IntegrityError as exc: - self.session.rollback() - self.logger.exception(exc) - continue - - hit_counter += 1 - - hit_dict_params = dict() - (hit_dict_params["query_multiplier"], - hit_dict_params["target_multiplier"]) = self.get_multipliers(record) - hit_evalue = min(_.evalue for _ in record.hits[ccc].hsps) - hit_bs = max(_.bitscore for _ in record.hits[ccc].hsps) - if current_evalue < hit_evalue: - current_counter += 1 - current_evalue = hit_evalue - - hit_dict_params["hit_number"] = current_counter - hit_dict_params["evalue"] = hit_evalue - hit_dict_params["bits"] = hit_bs - - # Prepare for bulk load - try: - hit, hit_hsps = prepare_hit(alignment, - current_query, - current_target, - **hit_dict_params) - except InvalidHit as exc: - self.logger.error(exc) - continue - - hits.append(hit) - hsps.extend(hit_hsps) - - hits, hsps = self.__load_into_db(hits, hsps, force=False) - - return hits, hsps, hit_counter, targets - def serialize(self): """Method to serialize the BLAST XML file into a database provided with the __init__ method """ # Load sequences in DB, precache IDs - queries, targets = self.__serialise_sequences() + self.queries, self.targets = self.__serialise_sequences() if isinstance(self.xml, str): self.xml = [self.xml] - # self.xml_parser = xparser(create_opener(self.xml)) else: assert isinstance(self.xml, (list, set)) - # if len(self.xml) < 1: - # raise ValueError("No input file provided!") - # elif len(self.xml) == 1: - # self.xml_parser = xparser(create_opener(list(self.xml)[0])) - # else: - # self.xml_parser = xparser(XMLMerger(self.xml)) # Merge in memory # Create the function that will retrieve the query_id given the name - self.get_query 
= functools.partial(self.__get_query_for_blast, - **{"queries": queries}) + # self.get_query = functools.partial(self.__get_query_for_blast, + # **{"queries": self.queries}) hits, hsps = [], [] hit_counter, record_counter = 0, 0 @@ -685,16 +513,10 @@ def serialize(self): record_counter += 1 if record_counter > 0 and record_counter % 10000 == 0: self.logger.info("Parsed %d queries", record_counter) - (hits, - hsps, - partial_hit_counter, - targets) = self.__serialise_record(record, - hits, - hsps, - targets) - hit_counter += partial_hit_counter - if hit_counter > 0 and hit_counter % 10000 == 0: - self.logger.info("Serialized %d alignments", hit_counter) + hits, hsps = objectify_record(self, record, hits, hsps, + max_target_seqs=self.__max_target_seqs) + + hits, hsps = load_into_db(self, hits, hsps, force=False) self.logger.debug("Finished %s", filename) except ExpatError: self.logger.error("%s is an invalid BLAST file, saving what's available", @@ -706,21 +528,12 @@ def serialize(self): self.logger.info("Creating a pool with %d processes", min(self.procs, len(self.xml))) - # pool = multiprocessing.Pool(min(self.procs, len(self.xml))) - # args = zip( - # self.xml, - # [self.header] * len(self.xml), - # [self.maxobjects] * len(self.xml), - # [self.logging_queue] * len(self.xml), - # [self.json_conf["log_settings"]["log_level"]] * len(self.xml) - # ) - filequeue = multiprocessing.Queue(-1) returnqueue = multiprocessing.Queue(-1) procs = [_XmlPickler( - queries, - targets, + self.queries, + self.targets, filequeue, returnqueue, self.header, @@ -728,7 +541,6 @@ def serialize(self): logging_queue=self.logging_queue, # level=self.logger.level, level=self.json_conf["log_settings"]["log_level"], - discard_definition=self.discard_definition, maxobjects=int(self.maxobjects/self.procs), max_target_seqs=self.__max_target_seqs ) @@ -764,22 +576,6 @@ def serialize(self): hits, hsps = load_into_db(self, hits, hsps, force=False) if record_counter > 0 and record_counter % 10000 == 0: self.logger.info("Parsed %d queries", record_counter) - # for record in pickle.load(pickled): - # record_counter += 1 - # if - # - # - # # (hits, - # # hsps, - # # partial_hit_counter, - # # targets) = self.__serialise_record(record, - # # hits, - # # hsps, - # # targets) - # - # # hit_counter += partial_hit_counter - # if hit_counter > 0 and hit_counter % 10000 == 0: - # self.logger.info("Serialized %d alignments", hit_counter) os.remove(pickle_file) [_.join() for _ in procs] # Wait for processes to join self.logger.info("All %d children finished", len(procs)) @@ -830,74 +626,74 @@ def get_multipliers(record): return q_mult, h_mult - def __get_query_for_blast(self, record, queries): - - """ This private method formats the name of the query - recovered from the BLAST hit, verifies whether it is present or not - in the DB, and moreover whether the data can be updated (e.g. 
- by adding the query length) - :param record: - :param queries: - :return: current_query (ID in the database), name - """ - - if self.discard_definition is False: - name = record.id.split()[0] - else: - name = record.id - self.logger.debug("Started with %s", name) - - if name in queries: - try: - current_query = queries[name][0] - except TypeError as exc: - raise TypeError("{0}, {1}".format(exc, name)) - if queries[name][1] is False: - self.session.query(Query).filter(Query.query_name == name).update( - {"query_length": record.query_length}) - self.session.commit() - else: - self.logger.warn("%s not found among queries, adding to the DB now", - name) - current_query = Query(name, record.query_length) - self.session.add(current_query) - self.session.commit() - queries[name] = (current_query.query_id, True) - current_query = current_query.query_id - return current_query, name - - def __get_target_for_blast(self, alignment, targets): - - """ This private method retrieves the correct target_id - key for the target of the BLAST. If the entry is not present - in the database, it will be created on the fly. - The method returns the index of the current target and - and an updated target dictionary. - :param alignment: an alignment child of a BLAST record object - :param targets: dictionary caching the known targets - :return: current_target (ID in the database), targets - """ - - if alignment.accession in targets: - current_target = targets[alignment.accession][0] - if targets[alignment.accession][1] is False: - self.session.query(Target).filter( - Target.target_name == alignment.accession).\ - update({"target_length": alignment.length}) - self.session.commit() - targets[alignment.accession] = (targets[alignment.accession][0], - True) - else: - current_target = Target(alignment.accession, - alignment.length) - self.logger.warn("%s not found among targets, adding to the DB now", - alignment.accession) - self.session.add(current_target) - self.session.commit() - assert isinstance(current_target.target_id, int) - targets[alignment.accession] = (current_target.target_id, True) - current_target = current_target.target_id - return current_target, targets + # def __get_query_for_blast(self, record, queries): + # + # """ This private method formats the name of the query + # recovered from the BLAST hit, verifies whether it is present or not + # in the DB, and moreover whether the data can be updated (e.g. + # by adding the query length) + # :param record: + # :param queries: + # :return: current_query (ID in the database), name + # """ + # + # if self.discard_definition is False: + # name = record.id.split()[0] + # else: + # name = record.id + # self.logger.debug("Started with %s", name) + # + # if name in queries: + # try: + # current_query = queries[name][0] + # except TypeError as exc: + # raise TypeError("{0}, {1}".format(exc, name)) + # if queries[name][1] is False: + # self.session.query(Query).filter(Query.query_name == name).update( + # {"query_length": record.query_length}) + # self.session.commit() + # else: + # self.logger.warn("%s not found among queries, adding to the DB now", + # name) + # current_query = Query(name, record.query_length) + # self.session.add(current_query) + # self.session.commit() + # queries[name] = (current_query.query_id, True) + # current_query = current_query.query_id + # return current_query, name + # + # def __get_target_for_blast(self, alignment, targets): + # + # """ This private method retrieves the correct target_id + # key for the target of the BLAST. 
If the entry is not present + # in the database, it will be created on the fly. + # The method returns the index of the current target and + # and an updated target dictionary. + # :param alignment: an alignment child of a BLAST record object + # :param targets: dictionary caching the known targets + # :return: current_target (ID in the database), targets + # """ + # + # if alignment.accession in targets: + # current_target = targets[alignment.accession][0] + # if targets[alignment.accession][1] is False: + # self.session.query(Target).filter( + # Target.target_name == alignment.accession).\ + # update({"target_length": alignment.length}) + # self.session.commit() + # targets[alignment.accession] = (targets[alignment.accession][0], + # True) + # else: + # current_target = Target(alignment.accession, + # alignment.length) + # self.logger.warn("%s not found among targets, adding to the DB now", + # alignment.accession) + # self.session.add(current_target) + # self.session.commit() + # assert isinstance(current_target.target_id, int) + # targets[alignment.accession] = (current_target.target_id, True) + # current_target = current_target.target_id + # return current_target, targets # pylint: enable=too-many-instance-attributes @@ -950,14 +746,18 @@ def _get_query_for_blast(self, record): :return: current_query (ID in the database), name """ - if self.discard_definition is False: - name = record.id.split()[0] - else: + if record.id in self.queries: name = record.id + else: + name = record.id.split()[0] + if name not in self.queries: + raise KeyError("{} not found in the queries!".format(record)) + self.logger.debug("Started with %s", name) - if name not in self.queries or self.queries[name][1] is False: + if self.queries[name][1] is False: raise KeyError("{} not found in the queries!".format(record)) + current_query = self.queries[name][0] return current_query, name @@ -972,13 +772,71 @@ def _get_target_for_blast(self, alignment): :return: current_target (ID in the database), targets """ - if self.discard_definition: + if alignment.accession in self.targets: accession = alignment.accession - else: + elif alignment.id in self.targets: accession = alignment.id - - if accession not in self.targets: + else: raise KeyError("{} not found in the targets!".format(alignment.accession)) current_target = self.targets[accession][0] return current_target + + +def objectify_record(self, record, hits, hsps, max_target_seqs=10000): + """ + Private method to serialise a single record into the DB. + + :param record: The BLAST record to load into the DB. + :param hits: Cache of hits to load into the DB. + :type hits: list + + :param hsps: Cache of hsps to load into the DB. + :type hsps: list + + :returns: hits, hsps + :rtype: (list, list) + """ + + if len(record.hits) == 0: + return hits, hsps + + current_query, name = _get_query_for_blast(self, record) + + current_evalue = -1 + current_counter = 0 + + # for ccc, alignment in enumerate(record.alignments): + for ccc, alignment in enumerate(record.hits): + if ccc + 1 > max_target_seqs: + break + + self.logger.debug("Started the hit %s vs. 
%s", + # name, record.alignments[ccc].accession) + name, record.hits[ccc].id) + current_target = _get_target_for_blast(self, alignment) + + hit_dict_params = dict() + (hit_dict_params["query_multiplier"], + hit_dict_params["target_multiplier"]) = XmlSerializer.get_multipliers(record) + hit_evalue = min(_.evalue for _ in record.hits[ccc].hsps) + hit_bs = max(_.bitscore for _ in record.hits[ccc].hsps) + if current_evalue < hit_evalue: + current_counter += 1 + current_evalue = hit_evalue + + hit_dict_params["hit_number"] = current_counter + hit_dict_params["evalue"] = hit_evalue + hit_dict_params["bits"] = hit_bs + + # Prepare for bulk load + try: + hit, hit_hsps = prepare_hit(alignment, current_query, + current_target, **hit_dict_params) + except InvalidHit as exc: + self.logger.error(exc) + continue + hits.append(hit) + hsps.extend(hit_hsps) + + return hits, hsps diff --git a/Mikado/subprograms/serialise.py b/Mikado/subprograms/serialise.py index be4d37248..df026fc63 100644 --- a/Mikado/subprograms/serialise.py +++ b/Mikado/subprograms/serialise.py @@ -372,9 +372,6 @@ def serialise_parser(): help="Maximum number of target sequences.") blast.add_argument("--blast_targets", default=[], type=comma_split, help="Target sequences") - blast.add_argument("--discard-definition", action="store_true", default=False, - help="""Flag. If set, the sequences IDs instead of their definition - will be used for serialisation.""") blast.add_argument("--xml", type=str, help="""XML file(s) to parse. They can be provided in three ways: - a comma-separated list diff --git a/docs/Library/Mikado.configuration.rst b/docs/Library/Mikado.configuration.rst new file mode 100644 index 000000000..979f878ca --- /dev/null +++ b/docs/Library/Mikado.configuration.rst @@ -0,0 +1,30 @@ +Mikado.configuration package +============================ + +Submodules +---------- + +Mikado.configuration.configurator module +---------------------------------------- + +.. automodule:: Mikado.configuration.configurator + :members: + :undoc-members: + :show-inheritance: + +Mikado.configuration.daijin_configurator module +----------------------------------------------- + +.. automodule:: Mikado.configuration.daijin_configurator + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: Mikado.configuration + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/Library/Mikado.daijin.rst b/docs/Library/Mikado.daijin.rst new file mode 100644 index 000000000..fa3726248 --- /dev/null +++ b/docs/Library/Mikado.daijin.rst @@ -0,0 +1,10 @@ +Mikado.daijin package +===================== + +Module contents +--------------- + +.. automodule:: Mikado.daijin + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/Library/Mikado.loci.rst b/docs/Library/Mikado.loci.rst new file mode 100644 index 000000000..f6444520f --- /dev/null +++ b/docs/Library/Mikado.loci.rst @@ -0,0 +1,109 @@ +Mikado.loci package +=================== + +Subpackages +----------- + +.. toctree:: + + Mikado.loci.transcript_methods + +Submodules +---------- + +Mikado.loci.abstractlocus module +-------------------------------- + +.. automodule:: Mikado.loci.abstractlocus + :members: + :undoc-members: + :show-inheritance: + +Mikado.loci.clique_methods module +--------------------------------- + +.. automodule:: Mikado.loci.clique_methods + :members: + :undoc-members: + :show-inheritance: + +Mikado.loci.excluded module +--------------------------- + +.. 
automodule:: Mikado.loci.excluded + :members: + :undoc-members: + :show-inheritance: + +Mikado.loci.locus module +------------------------ + +.. automodule:: Mikado.loci.locus + :members: + :undoc-members: + :show-inheritance: + +Mikado.loci.monosublocus module +------------------------------- + +.. automodule:: Mikado.loci.monosublocus + :members: + :undoc-members: + :show-inheritance: + +Mikado.loci.monosublocusholder module +------------------------------------- + +.. automodule:: Mikado.loci.monosublocusholder + :members: + :undoc-members: + :show-inheritance: + +Mikado.loci.reference_gene module +--------------------------------- + +.. automodule:: Mikado.loci.reference_gene + :members: + :undoc-members: + :show-inheritance: + +Mikado.loci.sublocus module +--------------------------- + +.. automodule:: Mikado.loci.sublocus + :members: + :undoc-members: + :show-inheritance: + +Mikado.loci.superlocus module +----------------------------- + +.. automodule:: Mikado.loci.superlocus + :members: + :undoc-members: + :show-inheritance: + +Mikado.loci.transcript module +----------------------------- + +.. automodule:: Mikado.loci.transcript + :members: + :undoc-members: + :show-inheritance: + +Mikado.loci.transcriptchecker module +------------------------------------ + +.. automodule:: Mikado.loci.transcriptchecker + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: Mikado.loci + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/Library/Mikado.loci.transcript_methods.rst b/docs/Library/Mikado.loci.transcript_methods.rst new file mode 100644 index 000000000..c5054b2e8 --- /dev/null +++ b/docs/Library/Mikado.loci.transcript_methods.rst @@ -0,0 +1,46 @@ +Mikado.loci.transcript_methods package +====================================== + +Submodules +---------- + +Mikado.loci.transcript_methods.finalizing module +------------------------------------------------ + +.. automodule:: Mikado.loci.transcript_methods.finalizing + :members: + :undoc-members: + :show-inheritance: + +Mikado.loci.transcript_methods.printing module +---------------------------------------------- + +.. automodule:: Mikado.loci.transcript_methods.printing + :members: + :undoc-members: + :show-inheritance: + +Mikado.loci.transcript_methods.retrieval module +----------------------------------------------- + +.. automodule:: Mikado.loci.transcript_methods.retrieval + :members: + :undoc-members: + :show-inheritance: + +Mikado.loci.transcript_methods.splitting module +----------------------------------------------- + +.. automodule:: Mikado.loci.transcript_methods.splitting + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: Mikado.loci.transcript_methods + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/Library/Mikado.parsers.rst b/docs/Library/Mikado.parsers.rst new file mode 100644 index 000000000..5e25ce83f --- /dev/null +++ b/docs/Library/Mikado.parsers.rst @@ -0,0 +1,62 @@ +Mikado.parsers package +====================== + +Submodules +---------- + +Mikado.parsers.GFF module +------------------------- + +.. automodule:: Mikado.parsers.GFF + :members: + :undoc-members: + :show-inheritance: + +Mikado.parsers.GTF module +------------------------- + +.. automodule:: Mikado.parsers.GTF + :members: + :undoc-members: + :show-inheritance: + +Mikado.parsers.bed12 module +--------------------------- + +.. 
automodule:: Mikado.parsers.bed12 + :members: + :undoc-members: + :show-inheritance: + +Mikado.parsers.blast_utils module +--------------------------------- + +.. automodule:: Mikado.parsers.blast_utils + :members: + :undoc-members: + :show-inheritance: + +Mikado.parsers.blast_xml module +------------------------------- + +.. automodule:: Mikado.parsers.blast_xml + :members: + :undoc-members: + :show-inheritance: + +Mikado.parsers.gfannotation module +---------------------------------- + +.. automodule:: Mikado.parsers.gfannotation + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: Mikado.parsers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/Library/Mikado.picking.rst b/docs/Library/Mikado.picking.rst new file mode 100644 index 000000000..8a6aa202c --- /dev/null +++ b/docs/Library/Mikado.picking.rst @@ -0,0 +1,30 @@ +Mikado.picking package +====================== + +Submodules +---------- + +Mikado.picking.loci_processer module +------------------------------------ + +.. automodule:: Mikado.picking.loci_processer + :members: + :undoc-members: + :show-inheritance: + +Mikado.picking.picker module +---------------------------- + +.. automodule:: Mikado.picking.picker + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: Mikado.picking + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/Library/Mikado.preparation.rst b/docs/Library/Mikado.preparation.rst new file mode 100644 index 000000000..912b09a08 --- /dev/null +++ b/docs/Library/Mikado.preparation.rst @@ -0,0 +1,38 @@ +Mikado.preparation package +========================== + +Submodules +---------- + +Mikado.preparation.annotation_parser module +------------------------------------------- + +.. automodule:: Mikado.preparation.annotation_parser + :members: + :undoc-members: + :show-inheritance: + +Mikado.preparation.checking module +---------------------------------- + +.. automodule:: Mikado.preparation.checking + :members: + :undoc-members: + :show-inheritance: + +Mikado.preparation.prepare module +--------------------------------- + +.. automodule:: Mikado.preparation.prepare + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: Mikado.preparation + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/Library/Mikado.rst b/docs/Library/Mikado.rst new file mode 100644 index 000000000..291967ecf --- /dev/null +++ b/docs/Library/Mikado.rst @@ -0,0 +1,39 @@ +Mikado package +============== + +Subpackages +----------- + +.. toctree:: + + Mikado.configuration + Mikado.daijin + Mikado.loci + Mikado.parsers + Mikado.picking + Mikado.preparation + Mikado.scales + Mikado.serializers + Mikado.subprograms + Mikado.tests + Mikado.utilities + +Submodules +---------- + +Mikado.exceptions module +------------------------ + +.. automodule:: Mikado.exceptions + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: Mikado + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/Library/Mikado.scales.rst b/docs/Library/Mikado.scales.rst new file mode 100644 index 000000000..b57a77dc5 --- /dev/null +++ b/docs/Library/Mikado.scales.rst @@ -0,0 +1,62 @@ +Mikado.scales package +===================== + +Submodules +---------- + +Mikado.scales.accountant module +------------------------------- + +.. 
automodule:: Mikado.scales.accountant + :members: + :undoc-members: + :show-inheritance: + +Mikado.scales.assigner module +----------------------------- + +.. automodule:: Mikado.scales.assigner + :members: + :undoc-members: + :show-inheritance: + +Mikado.scales.compare module +---------------------------- + +.. automodule:: Mikado.scales.compare + :members: + :undoc-members: + :show-inheritance: + +Mikado.scales.contrast module +----------------------------- + +.. automodule:: Mikado.scales.contrast + :members: + :undoc-members: + :show-inheritance: + +Mikado.scales.f1 module +----------------------- + +.. automodule:: Mikado.scales.f1 + :members: + :undoc-members: + :show-inheritance: + +Mikado.scales.resultstorer module +--------------------------------- + +.. automodule:: Mikado.scales.resultstorer + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: Mikado.scales + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/Library/Mikado.serializers.blast_serializer.rst b/docs/Library/Mikado.serializers.blast_serializer.rst new file mode 100644 index 000000000..319259a67 --- /dev/null +++ b/docs/Library/Mikado.serializers.blast_serializer.rst @@ -0,0 +1,62 @@ +Mikado.serializers.blast_serializer package +=========================================== + +Submodules +---------- + +Mikado.serializers.blast_serializer.hit module +---------------------------------------------- + +.. automodule:: Mikado.serializers.blast_serializer.hit + :members: + :undoc-members: + :show-inheritance: + +Mikado.serializers.blast_serializer.hsp module +---------------------------------------------- + +.. automodule:: Mikado.serializers.blast_serializer.hsp + :members: + :undoc-members: + :show-inheritance: + +Mikado.serializers.blast_serializer.query module +------------------------------------------------ + +.. automodule:: Mikado.serializers.blast_serializer.query + :members: + :undoc-members: + :show-inheritance: + +Mikado.serializers.blast_serializer.target module +------------------------------------------------- + +.. automodule:: Mikado.serializers.blast_serializer.target + :members: + :undoc-members: + :show-inheritance: + +Mikado.serializers.blast_serializer.utils module +------------------------------------------------ + +.. automodule:: Mikado.serializers.blast_serializer.utils + :members: + :undoc-members: + :show-inheritance: + +Mikado.serializers.blast_serializer.xml_serialiser module +--------------------------------------------------------- + +.. automodule:: Mikado.serializers.blast_serializer.xml_serialiser + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: Mikado.serializers.blast_serializer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/Library/Mikado.serializers.rst b/docs/Library/Mikado.serializers.rst new file mode 100644 index 000000000..41cc2b499 --- /dev/null +++ b/docs/Library/Mikado.serializers.rst @@ -0,0 +1,45 @@ +Mikado.serializers package +========================== + +Subpackages +----------- + +.. toctree:: + + Mikado.serializers.blast_serializer + +Submodules +---------- + +Mikado.serializers.external module +---------------------------------- + +.. automodule:: Mikado.serializers.external + :members: + :undoc-members: + :show-inheritance: + +Mikado.serializers.junction module +---------------------------------- + +.. 
automodule:: Mikado.serializers.junction + :members: + :undoc-members: + :show-inheritance: + +Mikado.serializers.orf module +----------------------------- + +.. automodule:: Mikado.serializers.orf + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: Mikado.serializers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/Library/Mikado.subprograms.rst b/docs/Library/Mikado.subprograms.rst new file mode 100644 index 000000000..73c9472fb --- /dev/null +++ b/docs/Library/Mikado.subprograms.rst @@ -0,0 +1,61 @@ +Mikado.subprograms package +========================== + +Subpackages +----------- + +.. toctree:: + + Mikado.subprograms.util + +Submodules +---------- + +Mikado.subprograms.compare module +--------------------------------- + +.. automodule:: Mikado.subprograms.compare + :members: + :undoc-members: + :show-inheritance: + +Mikado.subprograms.configure module +----------------------------------- + +.. automodule:: Mikado.subprograms.configure + :members: + :undoc-members: + :show-inheritance: + +Mikado.subprograms.pick module +------------------------------ + +.. automodule:: Mikado.subprograms.pick + :members: + :undoc-members: + :show-inheritance: + +Mikado.subprograms.prepare module +--------------------------------- + +.. automodule:: Mikado.subprograms.prepare + :members: + :undoc-members: + :show-inheritance: + +Mikado.subprograms.serialise module +----------------------------------- + +.. automodule:: Mikado.subprograms.serialise + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: Mikado.subprograms + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/Library/Mikado.subprograms.util.rst b/docs/Library/Mikado.subprograms.util.rst new file mode 100644 index 000000000..556ecd047 --- /dev/null +++ b/docs/Library/Mikado.subprograms.util.rst @@ -0,0 +1,70 @@ +Mikado.subprograms.util package +=============================== + +Submodules +---------- + +Mikado.subprograms.util.awk_gtf module +-------------------------------------- + +.. automodule:: Mikado.subprograms.util.awk_gtf + :members: + :undoc-members: + :show-inheritance: + +Mikado.subprograms.util.convert module +-------------------------------------- + +.. automodule:: Mikado.subprograms.util.convert + :members: + :undoc-members: + :show-inheritance: + +Mikado.subprograms.util.grep module +----------------------------------- + +.. automodule:: Mikado.subprograms.util.grep + :members: + :undoc-members: + :show-inheritance: + +Mikado.subprograms.util.merge_blast module +------------------------------------------ + +.. automodule:: Mikado.subprograms.util.merge_blast + :members: + :undoc-members: + :show-inheritance: + +Mikado.subprograms.util.metrics module +-------------------------------------- + +.. automodule:: Mikado.subprograms.util.metrics + :members: + :undoc-members: + :show-inheritance: + +Mikado.subprograms.util.stats module +------------------------------------ + +.. automodule:: Mikado.subprograms.util.stats + :members: + :undoc-members: + :show-inheritance: + +Mikado.subprograms.util.trim module +----------------------------------- + +.. automodule:: Mikado.subprograms.util.trim + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. 
automodule:: Mikado.subprograms.util + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/Library/Mikado.tests.rst b/docs/Library/Mikado.tests.rst new file mode 100644 index 000000000..d04e8a3cb --- /dev/null +++ b/docs/Library/Mikado.tests.rst @@ -0,0 +1,214 @@ +Mikado.tests package +==================== + +Submodules +---------- + +Mikado.tests.assigner_tester module +----------------------------------- + +.. automodule:: Mikado.tests.assigner_tester + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.intervaltree_tests module +-------------------------------------- + +.. automodule:: Mikado.tests.intervaltree_tests + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.locus_tester module +-------------------------------- + +.. automodule:: Mikado.tests.locus_tester + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.orf_tester module +------------------------------ + +.. automodule:: Mikado.tests.orf_tester + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.parser_testing module +---------------------------------- + +.. automodule:: Mikado.tests.parser_testing + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.test_abstractlocus module +-------------------------------------- + +.. automodule:: Mikado.tests.test_abstractlocus + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.test_blast_related module +-------------------------------------- + +.. automodule:: Mikado.tests.test_blast_related + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.test_clique_methods module +--------------------------------------- + +.. automodule:: Mikado.tests.test_clique_methods + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.test_db_utils module +--------------------------------- + +.. automodule:: Mikado.tests.test_db_utils + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.test_excluded module +--------------------------------- + +.. automodule:: Mikado.tests.test_excluded + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.test_external_scores module +---------------------------------------- + +.. automodule:: Mikado.tests.test_external_scores + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.test_gene module +----------------------------- + +.. automodule:: Mikado.tests.test_gene + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.test_invalid_orfs module +------------------------------------- + +.. automodule:: Mikado.tests.test_invalid_orfs + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.test_json module +----------------------------- + +.. automodule:: Mikado.tests.test_json + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.test_metrics module +-------------------------------- + +.. automodule:: Mikado.tests.test_metrics + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.test_modifications module +-------------------------------------- + +.. automodule:: Mikado.tests.test_modifications + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.test_serialisation module +-------------------------------------- + +.. automodule:: Mikado.tests.test_serialisation + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.test_splitting module +---------------------------------- + +.. 
automodule:: Mikado.tests.test_splitting + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.test_system_calls module +------------------------------------- + +.. automodule:: Mikado.tests.test_system_calls + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.test_transcript_checker module +------------------------------------------- + +.. automodule:: Mikado.tests.test_transcript_checker + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.test_transcript_methods module +------------------------------------------- + +.. automodule:: Mikado.tests.test_transcript_methods + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.transcript_tester_negative module +---------------------------------------------- + +.. automodule:: Mikado.tests.transcript_tester_negative + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.transcript_tester_positive module +---------------------------------------------- + +.. automodule:: Mikado.tests.transcript_tester_positive + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.transcript_tester_single module +-------------------------------------------- + +.. automodule:: Mikado.tests.transcript_tester_single + :members: + :undoc-members: + :show-inheritance: + +Mikado.tests.utilities_tester module +------------------------------------ + +.. automodule:: Mikado.tests.utilities_tester + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: Mikado.tests + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/Library/Mikado.utilities.rst b/docs/Library/Mikado.utilities.rst new file mode 100644 index 000000000..dde72e6bc --- /dev/null +++ b/docs/Library/Mikado.utilities.rst @@ -0,0 +1,46 @@ +Mikado.utilities package +======================== + +Submodules +---------- + +Mikado.utilities.dbutils module +------------------------------- + +.. automodule:: Mikado.utilities.dbutils + :members: + :undoc-members: + :show-inheritance: + +Mikado.utilities.intervaltree module +------------------------------------ + +.. automodule:: Mikado.utilities.intervaltree + :members: + :undoc-members: + :show-inheritance: + +Mikado.utilities.log_utils module +--------------------------------- + +.. automodule:: Mikado.utilities.log_utils + :members: + :undoc-members: + :show-inheritance: + +Mikado.utilities.overlap module +------------------------------- + +.. automodule:: Mikado.utilities.overlap + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: Mikado.utilities + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/Library/modules.rst b/docs/Library/modules.rst new file mode 100644 index 000000000..342ce1103 --- /dev/null +++ b/docs/Library/modules.rst @@ -0,0 +1,7 @@ +Mikado +====== + +.. 
toctree:: + :maxdepth: 4 + + Mikado From bf0108c21766788b3defb24d4f55202eb5c6e192 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Tue, 31 Jan 2017 10:18:22 +0000 Subject: [PATCH 02/47] Minor mod for serialise --- Mikado/subprograms/serialise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Mikado/subprograms/serialise.py b/Mikado/subprograms/serialise.py index df026fc63..83bcc829e 100644 --- a/Mikado/subprograms/serialise.py +++ b/Mikado/subprograms/serialise.py @@ -22,7 +22,7 @@ from ..serializers import external from ..exceptions import InvalidJson import pyfaidx -from csv import DictReader +# from csv import DictReader __author__ = 'Luca Venturini' From 8bfc0aa7a4c47259d409c9202f8689db154d9423 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 1 Feb 2017 14:16:06 +0000 Subject: [PATCH 03/47] Re-written the find_retained_introns method, with unittests and new documentation included. --- CHANGELOG.md | 9 + .../configuration/scoring_files/plants.yaml | 2 +- Mikado/loci/abstractlocus.py | 226 ++++++++++++------ Mikado/loci/locus.py | 2 + Mikado/loci/transcript.py | 30 ++- Mikado/loci/transcript_methods/finalizing.py | 5 +- Mikado/subprograms/serialise.py | 2 +- Mikado/tests/intervaltree_tests.py | 34 ++- Mikado/tests/locus_tester.py | 125 +++++++--- Mikado/utilities/intervaltree.pyx | 50 +++- Mikado/utilities/overlap.pyx | 10 +- 11 files changed, 369 insertions(+), 126 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b286c864..03cdc70ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +#Version 1.0.0beta10 + +Changes in this release: + +- Deprecated the "discard_definition" flag in Mikado serialise. Now Mikado will infer on its own whether to use the definition or the ID for serialising BLAST results. +- Re-written the "find_retained_introns" method of AbstractLocus, to solve some bugs found during the utilisation of last beta. As a corollary, expanded the intervaltree module to allow searches for "tagged" intervals. + + + #Version 1.0.0beta9 - "External scores" Changes in this release: diff --git a/Mikado/configuration/scoring_files/plants.yaml b/Mikado/configuration/scoring_files/plants.yaml index f9f71c7fe..3a090a14a 100644 --- a/Mikado/configuration/scoring_files/plants.yaml +++ b/Mikado/configuration/scoring_files/plants.yaml @@ -22,7 +22,7 @@ as_requirements: three_utr_length: {operator: le, value: 2500} suspicious_splicing: {operator: ne, value: true} not_fragmentary: - expression: [is_complete and ((exon_num.multi and (cdna_length.multi or combined_cds_length.multi)), or, (exon_num.mono and combined_cds_length.mono))] + expression: [((exon_num.multi and (cdna_length.multi or combined_cds_length.multi)), or, (exon_num.mono and combined_cds_length.mono))] parameters: is_complete: {operator: eq, value: true} exon_num.multi: {operator: gt, value: 1} diff --git a/Mikado/loci/abstractlocus.py b/Mikado/loci/abstractlocus.py index c3513393b..f55a1b35c 100644 --- a/Mikado/loci/abstractlocus.py +++ b/Mikado/loci/abstractlocus.py @@ -485,103 +485,187 @@ def remove_transcript_from_locus(self, tid: str): for tid in self.transcripts: self.transcripts[tid].parent = self.id - def find_retained_introns(self, transcript): + @staticmethod + def _exon_to_be_considered(exon, + transcript, + consider_truncated=False): + """Private static method to evaluate whether an exon should be considered for being a retained intron. + + :param exon: the exon to be considered. 
+        :type exon: (tuple|Interval)
+
+        :param transcript: the candidate transcript from which the exon comes.
+        :type transcript: Transcript
+
+        :param consider_truncated: boolean flag. If set, terminal exons can also be considered for retained intron
+        events.
+        :type consider_truncated: bool
+
+        :returns: a boolean flag (True if the exon has to be considered, False otherwise) and a list of the
+        non-coding sections of the exon.
+        """
+
+        cds_segments = sorted(transcript.cds_tree.search(*exon))
+        terminal = bool(set.intersection(
+            {*exon},
+            {transcript.start, transcript.end, transcript.combined_cds_end, transcript.combined_cds_start}))
+        if cds_segments == [Interval(*exon)]:
+            # It is completely coding
+            if terminal is False:
+                return False, []
+            elif not consider_truncated:
+                return False, []
+            else:
+                return True, cds_segments
+        else:
+            frags = []
+            if cds_segments:
+                if cds_segments[0].start > exon[0]:
+                    frags.append((exon[0], cds_segments[0].start - 1))
+                for before, after in zip(cds_segments[:-1], cds_segments[1:]):
+                    frags.append((before.end + 1, max(after.start - 1, before.end + 1)))
+                if cds_segments[-1].end < exon[1]:
+                    frags.append((cds_segments[-1].end + 1, exon[1]))
+            else:
+                frags = [Interval(*exon)]
+            return True, frags
+
+    @staticmethod
+    def _is_exon_retained_in_transcript(exon: tuple,
+                                        frags: list,
+                                        candidate: Transcript,
+                                        consider_truncated=False,
+                                        terminal=False):
+
+        """Private static method to verify whether a given exon is a retained intron of the candidate Transcript.
+        :param exon: the exon to be considered.
+        :type exon: (tuple|Interval)
+
+        :param frags: a list of intervals that are non-coding within the exon.
+        :type frags: list[(tuple|Interval)]
+
+        :param candidate: a transcript to be evaluated to verify whether the exon is a retained intron event.
+        :type candidate: Transcript
+
+        :param consider_truncated: boolean flag. If set, terminal exons can also be considered for retained intron
+        events.
+        :type consider_truncated: bool
+
+        :param terminal: whether the exon is at the 3' end. 
+ :type terminal: bool + + :rtype: bool + """ + + found_exons = sorted( + candidate.segmenttree.find(exon[0], exon[1], strict=False, value="exon"), + reverse=(candidate.strand == "-")) + found_introns = sorted( + candidate.segmenttree.find(exon[0], exon[1], strict=not consider_truncated, value="intron"), + reverse=(candidate.strand == "-")) + + if len(found_exons) == 0 or len(found_introns) == 0: + return False + elif len(found_exons) == 1 and len(found_introns) == 1: + found_exons = found_exons.pop() + found_introns = found_introns.pop() + if candidate.strand != "-" and found_exons[1] + 1 == found_introns[0]: + return consider_truncated and terminal + elif candidate.strand == "-" and found_exons[0] - 1 == found_introns[1]: + return consider_truncated and terminal + else: + return False + else: + # Now we have to check whether the matched introns contain both coding and non-coding parts + assert len(found_exons) >= 2, (found_exons, found_introns) + for index, exon in enumerate(found_exons[:-1]): + intron = found_introns[index] + if candidate.strand == "-": + assert intron[1] == exon[0] - 1 + else: + assert exon[1] == intron[0] - 1 + for frag in frags: + # The fragment is just a sub-section of the exon + if (overlap(frag, exon) < exon[1] - exon[0] and + overlap(frag, exon, positive=True) == 0 and + overlap(frag, intron, positive=True)): + return True + elif overlap(frag, exon) == exon[1] - exon[0]: + return True + + return False + + def find_retained_introns(self, transcript: Transcript): """This method checks the number of exons that are possibly retained introns for a given transcript. A retained intron is defined as an exon which: - - - spans completely an intron of another model *between coding exons* + - spans completely an intron of another model *between coding exons* + - is not completely coding itself + - if the model is coding, the exon has *part* of the non-coding section lying inside the intron (ie the non-coding section must not be starting in the exonic part). + + If the "pick/run_options/consider_truncated_for_retained" flag in the configuration is set to true, + an exon will be considered as a retained intron event also if: + - it is the last exon of the transcript + - it ends *within* an intron of another model *between coding exons* - is not completely coding itself - - has *part* of the non-coding section lying inside the intron - - The results are stored inside the transcript instance, - in the "retained_introns" tuple. + - if the model is coding, the exon has *part* of the non-coding section lying inside the intron (ie the non-coding section must not be starting in the exonic part). + The results are stored inside the transcript instance, in the "retained_introns" tuple. :param transcript: a Transcript instance :type transcript: Transcript - :returns : None - :rtype : None - """ + :rtype : None""" self.logger.debug("Starting to calculate retained introns for %s", transcript.id) - if len(self.introns) == 0 or len(self._cds_introntree) == 0: + if len(self.introns) == 0: transcript.retained_introns = tuple() self.logger.debug("No introns in the locus to check against. Exiting.") return + transcript.logger = self.logger + transcript.finalize() + + # A retained intron is defined as an exon which + # - is not completely coding + # - EITHER spans completely the intron of another transcript. 
+ # - OR is the last exon of the transcript and it ends within the intron of another transcript retained_introns = [] + consider_truncated = self.json_conf["pick"]["run_options"]["consider_truncated_for_retained"] + for exon in transcript.exons: + is_retained = False + to_consider, frags = self._exon_to_be_considered( + exon, transcript, consider_truncated=consider_truncated) + if not to_consider: + continue - if transcript.cds_tree is None: - # Enlarge the CDS segments if they are of length 1 - transcript.cds_tree = IntervalTree.from_tuples( - [(cds[0], max(cds[1], cds[0] + 1)) for cds in transcript.combined_cds]) - - for exon in iter(_ for _ in transcript.exons if - (_ not in transcript.combined_cds or - (_ in transcript.combined_cds and - (_[0] == transcript.combined_cds_end or _[1] == transcript.combined_cds_end)))): - # Ignore stuff that is at the 5' - if (exon not in transcript.combined_cds or - not self.json_conf["pick"]["run_options"]["consider_truncated_for_retained"]): - strict = True + if exon[0] == transcript.start and transcript.strand == "-": + terminal = True + elif exon[1] == transcript.end: + terminal = True else: - strict = False + terminal = False - if transcript.combined_cds_length > 0: - if transcript.strand == "+" and exon[1] < transcript.combined_cds_start: + for candidate in (_ for _ in self.transcripts.values()): + if candidate == transcript: continue - elif transcript.strand == "-" and exon[0] > transcript.combined_cds_start: + elif candidate.strand != transcript.strand and None not in (transcript.strand, candidate.strand): continue - cds_segments = sorted(transcript.cds_tree.search(*exon)) - if not cds_segments or not strict: - frags = [exon] - else: - frags = [] - if cds_segments[0].start > exon[0]: - frags.append((exon[0], cds_segments[0].start -1)) - for before, after in zip(cds_segments[:-1], cds_segments[1:]): - frags.append((before.end + 1, max(after.start -1, before.end + 1))) - if cds_segments[-1].end < exon[1]: - frags.append((cds_segments[-1].end + 1, exon[1])) - - if not frags: - continue - - is_retained = False - for tid in self.transcripts: + is_retained = self._is_exon_retained_in_transcript(exon, + frags, + # transcript, + candidate, + terminal=terminal, + consider_truncated=consider_truncated) if is_retained: + self.logger.debug("Exon %s of %s is a retained intron of %s", + exon, transcript.id, candidate.id) + retained_introns.append(exon) break - if tid == transcript.id or transcript.strand != self.transcripts[tid].strand: - # We cannot call retained introns against oneself or against stuff on the opposite strand - continue - cds_introns = self.transcripts[tid]._cds_introntree.find(exon[0], - exon[1], - strict=strict) - # cds_introns = [_ for _ in cds_introns if _.start >= exon[0] and _.end <= exon[1]] - if len(cds_introns) > 0: - for frag in frags: - if is_retained: - break - for intr in cds_introns: - if overlap((frag[0], frag[1]), (intr[0], intr[1])) > 0: - self.logger.debug("Exon %s of %s is a retained intron", - exon, transcript.id) - is_retained = True - break - - if is_retained: - retained_introns.append(exon) - - # Sort the exons marked as retained introns - # self.logger.info("Finished calculating retained introns for %s", transcript.id) transcript.retained_introns = tuple(sorted(retained_introns)) - transcript.logger = self.logger - # self.logger.info("Returning retained introns for %s", transcript.id) - # return transcript + return def print_metrics(self): diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py index 
0c1c8b944..61e74a530 100644
--- a/Mikado/loci/locus.py
+++ b/Mikado/loci/locus.py
@@ -270,6 +270,7 @@ def add_transcript_to_locus(self, transcript: Transcript, **kwargs):
                 to_be_added = False
             else:
                 transcript.attributes["ccode"] = ccode
+                self.logger.debug("%s is a valid splicing isoform; Ccode: %s", transcript.id, ccode)
         if self.json_conf["pick"]["alternative_splicing"]["min_cdna_overlap"] > 0:
             overlap = comparison.n_recall[0]
             if overlap < self.json_conf["pick"]["alternative_splicing"]["min_cdna_overlap"]:
@@ -295,6 +296,7 @@ def add_transcript_to_locus(self, transcript: Transcript, **kwargs):
                     self.json_conf["as_requirements"]["parameters"][key])
             # pylint: disable=eval-used
             if eval(self.json_conf["as_requirements"]["compiled"]) is False:
+                self.logger.debug("%s fails the minimum requirements for AS events", transcript.id)
                 to_be_added = False
 
         if to_be_added and transcript.combined_utr_length > max_utr_lenghts["total"]:
diff --git a/Mikado/loci/transcript.py b/Mikado/loci/transcript.py
index 2b186b5c5..cba7d62db 100644
--- a/Mikado/loci/transcript.py
+++ b/Mikado/loci/transcript.py
@@ -313,9 +313,11 @@ def __init__(self, *args,
         self.loaded_bed12 = []
         self.engine, self.session, self.sessionmaker = None, None, None
         # Initialisation of the CDS segments used for finding retained introns
-        self.__cds_tree = None
+        self.__cds_tree = IntervalTree()
         self.__expandable = False
+        self.__segmenttree = IntervalTree()
         self.__cds_introntree = IntervalTree()
+        self.__introntree = IntervalTree()
         self._possibly_without_exons = False
         # self.query_id = None
 
@@ -1702,6 +1704,18 @@ def _cds_introntree(self):
                 [(_[0], _[1] + 1) for _ in self.combined_cds_introns])
         return self.__cds_introntree
 
+    @property
+    def _introntree(self):
+
+        """
+        :rtype: intervaltree.IntervalTree
+        """
+
+        if len(self.__introntree) != len(self.introns):
+            self.__introntree = IntervalTree.from_tuples(
+                [(_[0], _[1] + 1) for _ in self.introns])
+        return self.__introntree
+
     @property
     def selected_cds(self):
         """This property return the CDS exons of the ORF selected as best
@@ -1780,6 +1794,7 @@ def cds_tree(self):
         Used to calculate the non-coding parts of the CDS.
         :rtype: intervaltree.Intervaltree
         """
+
         return self.__cds_tree
 
     @cds_tree.setter
@@ -1792,7 +1807,7 @@ def cds_tree(self, segments):
         """
 
         if segments is None:
-            pass
+            segments = IntervalTree()
         elif isinstance(segments, IntervalTree):
             assert len(segments) == len(self.combined_cds)
         else:
@@ -1801,6 +1816,17 @@ def cds_tree(self, segments):
 
         self.__cds_tree = segments
 
+    @property
+    def segmenttree(self):
+
+        """An IntervalTree containing all the exons and introns of the transcript,
+        each tagged with its feature type ("exon" or "intron").
+        :rtype: intervaltree.IntervalTree
+        """
+
+        if len(self.__segmenttree) != self.exon_num + len(self.introns):
+
+            self.__segmenttree = IntervalTree.from_intervals(
+                [Interval(*_, value="exon") for _ in self.exons] + [Interval(*_, value="intron") for _ in self.introns]
+            )
+
+        return self.__segmenttree
+
     @property
     def derived_children(self):
         """
diff --git a/Mikado/loci/transcript_methods/finalizing.py b/Mikado/loci/transcript_methods/finalizing.py
index df03c8e52..bb053a948 100644
--- a/Mikado/loci/transcript_methods/finalizing.py
+++ b/Mikado/loci/transcript_methods/finalizing.py
@@ -3,6 +3,8 @@
 e.g. reliability of the CDS/UTR, sanity of borders, etc.
""" + +from Mikado.utilities.intervaltree import IntervalTree import intervaltree import operator from ...exceptions import InvalidCDS, InvalidTranscript @@ -607,7 +609,8 @@ def finalize(transcript): internal_cds[0] == "CDS") # Create the interval tree - transcript.cds_tree = None + transcript.cds_tree = IntervalTree.from_tuples( + [(cds[0], max(cds[1], cds[0] + 1)) for cds in transcript.combined_cds]) # BUG somewhere ... I am not sorting this properly before (why?) transcript.exons = sorted(transcript.exons) diff --git a/Mikado/subprograms/serialise.py b/Mikado/subprograms/serialise.py index 83bcc829e..2a4fcae8c 100644 --- a/Mikado/subprograms/serialise.py +++ b/Mikado/subprograms/serialise.py @@ -195,7 +195,7 @@ def setup(args): # Necesarry for JSON configurations continue else: - if getattr(args, key) or getattr(args, key) == 0: + if getattr(args, key, None) or getattr(args, key, None) == 0: if getattr(args, key) is False or getattr(args, key) is None: continue else: diff --git a/Mikado/tests/intervaltree_tests.py b/Mikado/tests/intervaltree_tests.py index 9c523af33..7ed1a5f40 100644 --- a/Mikado/tests/intervaltree_tests.py +++ b/Mikado/tests/intervaltree_tests.py @@ -47,6 +47,7 @@ def get_right_start(b10): r = iv.right(i-1, max_dist=10, n=1) self.assertEqual(r[0].start, i) + class UpDownStreamTestCase(unittest.TestCase): def setUp(self): @@ -115,8 +116,6 @@ def setUp(self): iv = iv.insert( 0, 1, Interval(0, 1) ) self.intervals = iv - - def test_count(self): iv = self.intervals @@ -133,7 +132,6 @@ def test_count(self): u = iv.right(1, n=9999, max_dist=99999) self.assertEqual(len(u), 9999) - def test_max_dist(self): iv = self.intervals r = iv.right(1, max_dist=0, n=10) @@ -207,6 +205,36 @@ def test_public_interval(self): fn = lambda ival: self.assertTrue(ival.interval) self.iv.traverse(fn) + def test_multiple_values(self): + + iv = IntervalTree() + exons = [(100, 300), (501, 800), (1001, 1300), (1501, 1800)] + for index, exon in enumerate(exons): + interval = Interval(*exon, value="exon") + iv.insert_interval(interval) + if index < len(exons) - 1: + intron = Interval(exon[1] + 1, exons[index+1][0] - 1, value="intron") + iv.insert_interval(intron) + + self.assertEqual(iv.find(200, 600), + [Interval(100, 300, value="exon"), + Interval(301, 500, value="intron"), + Interval(501, 800, value="exon")]) + self.assertEqual(iv.find(200, 600, strict=True), + [Interval(301, 500, value="intron")]) + self.assertEqual(iv.find(200, 600, strict=True, value="exon"), + []) + self.assertEqual(iv.find(200, 600, strict=False, value="exon"), + [Interval(100, 300, value="exon"), + # Interval(301, 500, value="intron"), + Interval(501, 800, value="exon")] + ) + self.assertEqual(iv.find(200, 600, strict=False, value="intron"), + # [Interval(100, 300, value="exon"), + [Interval(301, 500, value="intron")] + # Interval(501, 800, value="exon")] + ) + if __name__ == "__main__": unittest.main() diff --git a/Mikado/tests/locus_tester.py b/Mikado/tests/locus_tester.py index 57b21fbd7..1c5b4b375 100644 --- a/Mikado/tests/locus_tester.py +++ b/Mikado/tests/locus_tester.py @@ -14,6 +14,7 @@ from Mikado.loci import Transcript, Superlocus, Abstractlocus, Locus, MonosublocusHolder, Sublocus from Mikado.utilities.log_utils import create_null_logger, create_default_logger from Mikado.utilities import overlap +from Mikado.utilities.intervaltree import Interval import Mikado.loci import pickle @@ -921,14 +922,23 @@ def test_real_retained_pos(self): ], features="CDS") t2.finalize() - sup = Superlocus(t1, json_conf=self.my_json) - 
sup.add_transcript_to_locus(t2) + t3 = Transcript() + t3.chrom, t3.strand, t3.id = 1, "+", "t3" + t3.add_exons([(101, 500), (801, 970), (1100, 1180)]) + t3.add_exons([(101, 500), (801, 970), (1100, 1130)], features="CDS") + t3.finalize() - sup.find_retained_introns(t2) + for pred, retained in [(t2, True), (t3, False)]: + with self.subTest(pred=pred, retained=retained): + sup = Superlocus(t1, json_conf=self.my_json) + sup.add_transcript_to_locus(pred) + sup.json_conf["pick"]["run_options"]["consider_truncated_for_retained"] = True + sup.find_retained_introns(pred) + self.assertEqual((len(sup.transcripts[pred.id].retained_introns) > 0), + retained) - self.assertEqual(sup.transcripts["t2"].retained_introns, ((1201, 1600),)) - def test_real_retained_pos_truncated(self): + def test_retained_pos_truncated(self): """Here we verify that a real retained intron is called as such, even when the transcript is truncated.""" @@ -952,13 +962,20 @@ def test_real_retained_pos_truncated(self): t2.finalize() self.assertEqual(t2.combined_cds_end, 1420) - sup = Superlocus(t1, json_conf=self.my_json) - sup.add_transcript_to_locus(t2) - sup.json_conf["pick"]["run_options"]["consider_truncated_for_retained"] = True - - sup.find_retained_introns(t2) + t3 = Transcript() + t3.chrom, t3.strand, t3.id = 1, "+", "t3" + t3.add_exons([(101, 500), (801, 970), (1100, 1130)]) + t3.add_exons([(101, 500), (801, 970), (1100, 1130)], features="CDS") + t3.finalize() - self.assertEqual(sup.transcripts["t2"].retained_introns, ((1201, 1420),)) + for pred, retained in [(t2, True), (t3, False)]: + with self.subTest(pred=pred, retained=retained): + sup = Superlocus(t1, json_conf=self.my_json) + sup.add_transcript_to_locus(pred) + sup.json_conf["pick"]["run_options"]["consider_truncated_for_retained"] = True + sup.find_retained_introns(pred) + self.assertEqual((len(sup.transcripts[pred.id].retained_introns) > 0), + retained) def test_real_retained_pos_truncated_skip(self): """Here we verify that a real retained intron is *NOT* called as such when @@ -1016,13 +1033,25 @@ def test_real_retained_neg_truncated(self): t2.finalize() self.assertEqual(t2.combined_cds_end, 601) - sup = Superlocus(t1, json_conf=self.my_json) - sup.add_transcript_to_locus(t2) - sup.json_conf["pick"]["run_options"]["consider_truncated_for_retained"] = True - - sup.find_retained_introns(t2) + t3 = Transcript() + t3.chrom, t3.strand, t3.id = 1, "-", "t3" + t3.add_exons([(551, 580), (801, 1000), (1201, 1300), (1501, 1800)]) + t3.add_exons([(551, 580), + (801, 1000), # 200 + (1201, 1300), #100 + (1501, 1530) # 30 + ], features="CDS") + t3.finalize() + self.assertEqual(t3.combined_cds_end, 551) - self.assertEqual(sup.transcripts["t2"].retained_introns, ((601, 1000),)) + for pred, retained in [(t2, True), (t3, False)]: + with self.subTest(pred=pred, retained=retained): + sup = Superlocus(t1, json_conf=self.my_json) + sup.add_transcript_to_locus(pred) + sup.json_conf["pick"]["run_options"]["consider_truncated_for_retained"] = True + sup.find_retained_introns(pred) + self.assertEqual((len(sup.transcripts[pred.id].retained_introns) > 0), + retained) def test_real_retained_neg_truncated_skip(self): """Here we verify that a real retained intron is *NOT* called as such when @@ -1072,10 +1101,6 @@ def test_real_retained_pos_noCDS(self): t2 = Transcript() t2.chrom, t2.strand, t2.id = 1, "+", "t2" t2.add_exons([(101, 500), (801, 1000), (1201, 1600)]) - # t2.add_exons([(201, 500), # 300 - # (801, 1000), # 200 - # (1201, 1420), # 220 - # ], features="CDS") t2.finalize() sup = 
Superlocus(t1, json_conf=self.my_json) @@ -1108,12 +1133,18 @@ def test_not_retained_pos(self): ], features="CDS") t2.finalize() - sup = Superlocus(t1, json_conf=self.my_json) - sup.add_transcript_to_locus(t2) - - sup.find_retained_introns(t2) + t3 = Transcript() + t3.chrom, t3.strand, t3.id = 1, "+", "t3" + t3.add_exons([(101, 500), (801, 970), (1100, 1130)]) + t3.add_exons([(101, 500), (801, 970), (1100, 1130)], features="CDS") + t3.finalize() - self.assertEqual(sup.transcripts["t2"].retained_intron_num, 0) + for pred in [t2, t3]: + with self.subTest(pred=pred): + sup = Superlocus(t1, json_conf=self.my_json) + sup.add_transcript_to_locus(pred) + sup.find_retained_introns(pred) + self.assertEqual(sup.transcripts[pred.id].retained_intron_num, 0) def test_real_retained_neg(self): """Here we verify that a real retained intron is called as such""" @@ -1166,18 +1197,29 @@ def test_not_real_retained_neg(self): ], features="CDS") t2.finalize() - sup = Superlocus(t1, json_conf=self.my_json) - sup.add_transcript_to_locus(t2) + t3 = Transcript() + t3.chrom, t3.strand, t3.id = 1, "-", "t3" + t3.add_exons([(401, 1000), (1201, 1300), (1501, 1800)]) + t3.add_exons([(831, 1000), # 200 + (1201, 1300), + (1501, 1530) + ], features="CDS") + t3.finalize() - sup.find_retained_introns(t2) + self.assertFalse( + Abstractlocus._is_exon_retained_in_transcript((401, 1000), [Interval(401, 830)], t1)) - self.assertEqual(sup.transcripts["t2"].retained_intron_num, 0, - sup.transcripts["t2"].retained_introns) + for alt in [t2, t3]: + with self.subTest(alt=alt): + sup = Superlocus(t1, json_conf=self.my_json) + sup.find_retained_introns(alt) + + self.assertEqual(alt.retained_intron_num, 0, + alt.retained_introns) def test_not_retained_neg(self): """Here we verify that a false retained intron is not called as such""" - t1 = Transcript() t1.chrom, t1.strand, t1.id = 1, "-", "t1" t1.add_exons([(101, 500), (801, 1000), (1201, 1300), (1501, 1800)]) @@ -1200,9 +1242,21 @@ def test_not_retained_neg(self): sup = Superlocus(t1, json_conf=self.my_json) sup.add_transcript_to_locus(t2) + self.assertEqual(t2.cds_tree.find(301, 1000), + [Interval(471, 1000)]) + + self.assertEqual(Abstractlocus._exon_to_be_considered((301, 1000), t2), + (True, [(301, 470)]), + Abstractlocus._exon_to_be_considered((301, 1000), t2)) + + self.assertFalse(Abstractlocus._is_exon_retained_in_transcript((301, 1000), + [(301, 470)], + t1)) + sup.find_retained_introns(t2) - self.assertEqual(sup.transcripts["t2"].retained_intron_num, 0) + self.assertEqual(sup.transcripts["t2"].retained_intron_num, 0, + sup.transcripts["t2"].retained_introns) def test_exon_switching_pos(self): @@ -1282,13 +1336,15 @@ def test_exon_switching_neg(self): t2 = Transcript() t2.chrom, t2.strand, t2.id = 1, "-", "t2" - t2.add_exons([(101, 500), (1701, 2000), (2201, 2300), (1501, 1800)]) + t2.add_exons([(101, 500), (1701, 2000), (2201, 2300), (2501, 2800)]) t2.add_exons([ (1801, 2000), # 200 (2201, 2300), # 100 (2501, 2530) # 30 ], features="CDS") t2.finalize() + self.assertEqual(len(t2.cds_tree), len(t2.combined_cds)) + self.assertEqual(len(t2.cds_tree), 3) sup = Superlocus(t1, json_conf=self.my_json) sup.add_transcript_to_locus(t2) @@ -1297,7 +1353,6 @@ def test_exon_switching_neg(self): self.assertEqual(sup.transcripts["t2"].retained_intron_num, 0) - def test_exon_switching_neg_noCDS(self): """Checking that an exon switching is treated correctly as a NON-retained intron even when the CDS is absent. 
Positive strand case"""
diff --git a/Mikado/utilities/intervaltree.pyx b/Mikado/utilities/intervaltree.pyx
index 413661aaf..8ea7cbb6f 100644
--- a/Mikado/utilities/intervaltree.pyx
+++ b/Mikado/utilities/intervaltree.pyx
@@ -253,6 +253,7 @@ cdef class IntervalNode:
         cdef list results = []
         # use end + 1 because .right() assumes strictly right-of
         self._seek_right(position + 1, results, n, max_dist)
+        if len(results) == n: return results
         r = results
         r.sort(key=operator.attrgetter('start'))
@@ -307,19 +308,19 @@ cdef class Interval:
     def __richcmp__(self, other, op):
         if op == 0:
             # <
-            return self.start < other.start or self.end < other.end
+            return self[0] < other[0] or self[1] < other[1]
         elif op == 1:
             # <=
             return self == other or self < other
         elif op == 2:
             # ==
-            return self.start == other.start and self.end == other.end
+            return self[0] == other[0] and self[1] == other[1]
         elif op == 3:
             # !=
-            return self.start != other.start or self.end != other.end
+            return self[0] != other[0] or self[1] != other[1]
         elif op == 4:
             # >
-            return self.start > other.start or self.end > other.end
+            return self[0] > other[0] or self[1] > other[1]
         elif op == 5:
             # >=
             return self == other or self > other
@@ -329,8 +330,16 @@ cdef class Interval:
             return self.start
         elif index == 1:
             return self.end
+        elif index == 2:
+            return self.value
         else:
-            raise IndexError("Intervals only have starts and ends!")
+            return [self.start, self.end, self.value][index]
+            # raise IndexError("Intervals only have starts and ends!")
+
+    def __iter__(self):
+
+        return iter([self.start, self.end])
+
 
 cdef class IntervalTree:
     """
@@ -406,9 +415,11 @@ cdef class IntervalTree:
 
     add = insert
 
-    def find(self, int start, int end, bint strict=0, int max_distance=0, int num_intervals=100):
+    def find(self, int start, int end, bint strict=0, int max_distance=0, int num_intervals=100, object value=None):
         """
         Return a sorted list of all intervals overlapping [start,end).
+        If strict is set to True, only intervals which are completely contained
+        within [start, end) are returned; if set to False, partially overlapping
+        intervals are returned as well.
+        If value is not None, only intervals tagged with that value are returned.
         """
 
         if self.root is None:
@@ -430,6 +441,13 @@ cdef class IntervalTree:
                     new_found.append(_)
             found = new_found
 
+        if value is not None:
+            new_found = []
+            for _ in found:
+                if _.value == value:
+                    new_found.append(_)
+            found = new_found
+
         return found
 
     search = find
@@ -439,23 +457,23 @@ cdef class IntervalTree:
 
         return self.num_intervals
 
-    def before( self, position, num_intervals=1, max_dist=2000 ):
+    def before( self, position, num_intervals=1, max_dist=2000):
         """
         Find `num_intervals` intervals that lie before `position` and are no
         further than `max_dist` positions away
        """
         if self.root is None:
             return []
-        return self.root.left( position, num_intervals, max_dist )
+        return self.root.left( position, num_intervals, max_dist)
 
-    def after( self, position, num_intervals=1, max_dist=25000 ):
+    def after( self, position, num_intervals=1, max_dist=25000):
         """
         Find `num_intervals` intervals that lie after `position` and are no
         further than `max_dist` positions away
         """
         if self.root is None:
             return []
-        return self.root.right( position, num_intervals, max_dist )
+        return self.root.right( position, num_intervals, max_dist)
 
     # ---- Interval-like object based interfaces -----------------------------
 
@@ -531,5 +549,17 @@ cdef class IntervalTree:
             tree.insert_interval(Interval(*iv))
         return tree
 
+    @classmethod
+    def from_intervals(cls, intervals):
+        """
+        Create a new IntervalTree from an iterable of Interval instances.
+ """ + + tree = IntervalTree() + for iv in intervals: + tree.insert_interval(iv) + return tree + + # For backward compatibility Intersecter = IntervalTree \ No newline at end of file diff --git a/Mikado/utilities/overlap.pyx b/Mikado/utilities/overlap.pyx index 2fc78ac68..6e8511b0e 100644 --- a/Mikado/utilities/overlap.pyx +++ b/Mikado/utilities/overlap.pyx @@ -8,8 +8,14 @@ cpdef long overlap(first, second, long flank=0, bint positive=0): cdef long start, end, ostart, oend - start, end = first[:2] - ostart, oend = second[:2] + if hasattr(first, "start"): + start, end = first.start, first.end + else: + start, end = first[:2] + if hasattr(second, "start"): + ostart, oend = second.start, second.end + else: + ostart, oend = second[:2] return c_overlap(start, end, ostart, oend, flank=flank, positive=positive) From ed6d0bd8f200f9ddf06293f3ee92d77930d4e90d Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 1 Feb 2017 14:19:44 +0000 Subject: [PATCH 04/47] Version bump --- Mikado/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Mikado/__init__.py b/Mikado/__init__.py index a9cbd34d4..aaec218f5 100755 --- a/Mikado/__init__.py +++ b/Mikado/__init__.py @@ -10,7 +10,7 @@ __author__ = 'Luca Venturini' __license__ = 'GPL3' __copyright__ = 'Copyright 2015-2016 Luca Venturini' -__version__ = "1.0.0b9" +__version__ = "1.0.0b10" __all__ = ["configuration", "exceptions", From 5619490b8f64ca3f1e376a6c7fbce81417c55a66 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 1 Feb 2017 16:08:28 +0000 Subject: [PATCH 05/47] Unified the printing mechanism for picking. Now the monoloci stage prints out the monosubloci_holder stage, not the simple monosublocus stage. --- CHANGELOG.md | 2 +- Mikado/loci/locus.py | 29 +-- Mikado/loci/monosublocus.py | 71 +++---- Mikado/loci/monosublocusholder.py | 46 ++++- Mikado/loci/superlocus.py | 37 +++- Mikado/picking/loci_processer.py | 302 +++++++++++++++++------------- Mikado/picking/picker.py | 93 +-------- 7 files changed, 288 insertions(+), 292 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 03cdc70ba..9e88a1a34 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ Changes in this release: - Deprecated the "discard_definition" flag in Mikado serialise. Now Mikado will infer on its own whether to use the definition or the ID for serialising BLAST results. - Re-written the "find_retained_introns" method of AbstractLocus, to solve some bugs found during the utilisation of last beta. As a corollary, expanded the intervaltree module to allow searches for "tagged" intervals. - +- Now the "monoloci_out" files contain the Monosublocus**Holder** step, not the Monosublocus step. This should help during fine-tuning. 
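
As a quick illustration of the tagged-interval search mentioned in the entry above (a minimal Python sketch mirroring the expectations encoded in the new test_multiple_values test; the import paths are the ones used elsewhere in this patch series):

    from Mikado.utilities.intervaltree import Interval, IntervalTree
    from Mikado.utilities import overlap

    tree = IntervalTree.from_intervals([
        Interval(100, 300, value="exon"),
        Interval(301, 500, value="intron"),
        Interval(501, 800, value="exon")])

    # strict=False reports any overlapping interval, optionally filtered by tag
    assert tree.find(200, 600, strict=False, value="exon") == [
        Interval(100, 300), Interval(501, 800)]
    # strict=True keeps only intervals completely contained in [200, 600)
    assert tree.find(200, 600, strict=True) == [Interval(301, 500)]
    # overlap() now accepts Interval objects as well as plain (start, end) tuples
    assert overlap((200, 600), Interval(301, 500)) == 199

Note that Interval equality, as redefined in this patch series, compares coordinates only, so the assertions above do not depend on the tags carried by the returned intervals.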
#Version 1.0.0beta9 - "External scores" diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py index 61e74a530..0fb110bdb 100644 --- a/Mikado/loci/locus.py +++ b/Mikado/loci/locus.py @@ -173,7 +173,7 @@ def finalize_alternative_splicing(self): self.scores_calculated = False self.calculate_scores() - self.logger.debug("Now checking the retained introns") + self.logger.debug("Now checking the retained introns for %s", self.id) while True: to_remove = set() for tid, transcript in self.transcripts.items(): @@ -234,17 +234,6 @@ def add_transcript_to_locus(self, transcript: Transcript, **kwargs): _ = kwargs to_be_added = True - # Total, 5', 3' - max_utr_lenghts = { - "total": self.json_conf["pick"]["alternative_splicing"]["max_utr_length"], - "five": self.json_conf["pick"]["alternative_splicing"]["max_fiveutr_length"], - "three": self.json_conf["pick"]["alternative_splicing"]["max_threeutr_length"]} - # max_isoforms = self.json_conf["pick"]["alternative_splicing"]["max_isoforms"] - # - # if len(self.transcripts) >= max_isoforms: - # self.logger.debug("%s not added because the Locus has already too many transcripts.", - # transcript.id) - # to_be_added = False if to_be_added and transcript.strand != self.strand: self.logger.debug("%s not added because it has a different strand from %s (%s vs. %s)", @@ -299,22 +288,6 @@ def add_transcript_to_locus(self, transcript: Transcript, **kwargs): self.logger.debug("%s fails the minimum requirements for AS events", transcript.id) to_be_added = False - if to_be_added and transcript.combined_utr_length > max_utr_lenghts["total"]: - self.logger.debug("%s not added because it has too much UTR (%d).", - transcript.id, - transcript.combined_utr_length) - to_be_added = False - if to_be_added and transcript.five_utr_length > max_utr_lenghts["five"]: - self.logger.debug("%s not added because it has too much 5'UTR (%d).", - transcript.id, - transcript.five_utr_length) - to_be_added = False - if to_be_added and transcript.three_utr_length > max_utr_lenghts["three"]: - self.logger.debug("%s not added because it has too much 3'UTR (%d).", - transcript.id, - transcript.three_utr_length) - to_be_added = False - if to_be_added and self.json_conf["pick"]["alternative_splicing"]["min_cds_overlap"] > 0: if self.primary_transcript.combined_cds_length > 0: tr_nucls = set(itertools.chain( diff --git a/Mikado/loci/monosublocus.py b/Mikado/loci/monosublocus.py index c31a9bed4..dfdba1902 100644 --- a/Mikado/loci/monosublocus.py +++ b/Mikado/loci/monosublocus.py @@ -9,7 +9,6 @@ """ from .abstractlocus import Abstractlocus -from ..parsers.GFF import GffLine # pylint: disable=too-many-instance-attributes,too-many-public-methods @@ -38,39 +37,43 @@ def __init__(self, transcript_instance, logger=None): # pylint: disable=arguments-differ def __str__(self, print_cds=True, source_in_name=True): - lines = [] - - self_line = GffLine('') - for attr in ["chrom", 'feature', 'source', 'start', 'end', 'strand']: - setattr(self_line, attr, getattr(self, attr)) - self_line.phase, self_line.score = None, self.score - if source_in_name is True: - self_line.id = "{0}_{1}".format(self.source, self.id) - else: - self_line.id = self.id - self_line.name = self.name - self_line.parent = self.parent - self_line.attributes.update(self.attributes) - self_line.attributes["multiexonic"] = (not self.monoexonic) - lines.append(str(self_line)) - - for tid in self.transcripts: - transcript_instance = self.transcripts[tid] - transcript_instance.source = self.source - transcript_instance.parent = self_line.id - 
self.logger.debug(self.attributes)
-            for attribute in self.attributes:
-                if attribute not in transcript_instance.attributes:
-                    if attribute == "is_fragment" and self.attributes[attribute] is False:
-                        continue
-                    transcript_instance.attributes[attribute] = self.attributes[attribute]
-
-            lines.append(transcript_instance.format(
-                "gff",
-                all_orfs=self.json_conf["pick"]["output_format"]["report_all_orfs"],
-                with_cds=print_cds).rstrip())
-
-        return "\n".join(lines)
+        raise NotImplementedError(
+            """This is a container used for computational purposes only,
+            it should not be printed out directly!""")
+
+        # lines = []
+        #
+        # self_line = GffLine('')
+        # for attr in ["chrom", 'feature', 'source', 'start', 'end', 'strand']:
+        #     setattr(self_line, attr, getattr(self, attr))
+        # self_line.phase, self_line.score = None, self.score
+        # if source_in_name is True:
+        #     self_line.id = "{0}_{1}".format(self.source, self.id)
+        # else:
+        #     self_line.id = self.id
+        # self_line.name = self.name
+        # self_line.parent = self.parent
+        # self_line.attributes.update(self.attributes)
+        # self_line.attributes["multiexonic"] = (not self.monoexonic)
+        # lines.append(str(self_line))
+        #
+        # for tid in self.transcripts:
+        #     transcript_instance = self.transcripts[tid]
+        #     transcript_instance.source = self.source
+        #     transcript_instance.parent = self_line.id
+        #     self.logger.debug(self.attributes)
+        #     for attribute in self.attributes:
+        #         if attribute not in transcript_instance.attributes:
+        #             if attribute == "is_fragment" and self.attributes[attribute] is False:
+        #                 continue
+        #             transcript_instance.attributes[attribute] = self.attributes[attribute]
+        #
+        #     lines.append(transcript_instance.format(
+        #         "gff",
+        #         all_orfs=self.json_conf["pick"]["output_format"]["report_all_orfs"],
+        #         with_cds=print_cds).rstrip())
+        #
+        # return "\n".join(lines)
 
     # pylint: enable=arguments-differ
 
     # ########## Class instance methods ##############
diff --git a/Mikado/loci/monosublocusholder.py b/Mikado/loci/monosublocusholder.py
index 65294f1b2..6659360b6 100644
--- a/Mikado/loci/monosublocusholder.py
+++ b/Mikado/loci/monosublocusholder.py
@@ -19,6 +19,7 @@
 from ..utilities import overlap
 from ..utilities.log_utils import create_null_logger
 import logging
+from ..parsers.GFF import GffLine
 
 # Resolution order is important here!
 # pylint: disable=too-many-instance-attributes
@@ -48,11 +49,14 @@ def __init__(self, monosublocus_instance: Monosublocus, json_conf=None, logger=N
         self.json_conf = json_conf
         self.excluded = None
         self.purge = self.json_conf["pick"]["run_options"]["purge"]
+        self.feature = "MonosublocusHolder"
+        self.score = monosublocus_instance.score
         self.scores_calculated = False
         # Add the transcript to the Locus
         self.locus_verified_introns = set()
         self.add_monosublocus(monosublocus_instance)
         self.loci = SortedDict()
+        self.attributes = dict()
 
     # Overriding is correct here
     # pylint: disable=arguments-differ
@@ -93,15 +97,49 @@ def add_monosublocus(self, monosublocus_instance: Monosublocus):
             self.add_transcript_to_locus(monosublocus_instance.transcripts[tid],
                                          check_in_locus=check_in_locus)
 
-    def __str__(self, print_cds=False):
+    def __str__(self, print_cds=False, source_in_name=True):
-        """This special method is explicitly *not* implemented;
-        this Locus object is not meant for printing, only for computation!
-        :param print_cds: flag. Ignored.
+        """Print the MonosublocusHolder, with its transcripts, in GFF format.
+        :param print_cds: flag. If set, the CDS lines will be printed as well.
+        :param source_in_name: flag. If set, the source will be prepended to the feature ID.
""" - raise NotImplementedError( - """This is a container used for computational purposes only, - it should not be printed out directly!""") + + lines = [] + + self_line = GffLine('') + for attr in ["chrom", 'feature', 'source', 'start', 'end', 'strand']: + setattr(self_line, attr, getattr(self, attr)) + self.calculate_scores() + self.score = max([_.score for _ in self.transcripts.values()]) + + self_line.phase, self_line.score = None, self.score + if source_in_name is True: + self_line.id = "{0}_{1}".format(self.source, self.id) + else: + self_line.id = self.id + self_line.name = self.name + self_line.parent = self.parent + self_line.attributes.update(self.attributes) + self_line.attributes["multiexonic"] = (not self.monoexonic) + lines.append(str(self_line)) + + for tid in self.transcripts: + transcript_instance = self.transcripts[tid] + transcript_instance.source = self.source + transcript_instance.parent = self_line.id + self.logger.debug(self.attributes) + for attribute in self.attributes: + if attribute not in transcript_instance.attributes: + if attribute == "is_fragment" and self.attributes[attribute] is False: + continue + transcript_instance.attributes[attribute] = self.attributes[attribute] + + lines.append(transcript_instance.format( + "gff", + all_orfs=self.json_conf["pick"]["output_format"]["report_all_orfs"], + with_cds=print_cds).rstrip()) + + return "\n".join(lines) def define_monosubloci(self, purge=False, excluded=None): """Overriden and set to NotImplemented to avoid cross-calling it when inappropriate. diff --git a/Mikado/loci/superlocus.py b/Mikado/loci/superlocus.py index f1376fe0e..713f46caf 100644 --- a/Mikado/loci/superlocus.py +++ b/Mikado/loci/superlocus.py @@ -174,6 +174,33 @@ def __create_locus_lines(self, superlocus_line, new_id, print_cds=True): lines.append(locus_instance.__str__(print_cds=print_cds).rstrip()) return lines + def __create_monolocus_holder_lines(self, superlocus_line, new_id, print_cds=True): + + """ + Private method to prepare the lines for printing out monosubloci + into GFF/GTF files. 
+ """ + + lines = [] + self.define_monosubloci() + if len(self.monoholders) > 0: + source = "{0}_monosubloci".format(self.source) + superlocus_line.source = source + lines.append(str(superlocus_line)) + found = dict() + for monosublocus_instance in self.monoholders: + monosublocus_instance.source = source + monosublocus_instance.parent = new_id + if monosublocus_instance.id in found: + found[monosublocus_instance.id] += 1 + monosublocus_instance.counter = found[monosublocus_instance.id] + else: + found[monosublocus_instance.id] = 0 + + lines.append(monosublocus_instance.__str__(print_cds=print_cds).rstrip()) + + return lines + def __create_monolocus_lines(self, superlocus_line, new_id, print_cds=True): """ @@ -286,9 +313,12 @@ def __str__(self, level=None, print_cds=True): print_cds=print_cds ) elif level == "monosubloci" or (level is None and self.monosubloci_defined is True): - lines = self.__create_monolocus_lines(superlocus_line, - new_id, - print_cds=print_cds) + lines = self.__create_monolocus_holder_lines(superlocus_line, + new_id, + print_cds=print_cds) + # lines = self.__create_monolocus_lines(superlocus_line, + # new_id, + # print_cds=print_cds) elif level == "subloci" or (level is None and self.monosubloci_defined is False): lines = self.__create_sublocus_lines(superlocus_line, new_id, @@ -1162,6 +1192,7 @@ def calculate_mono_metrics(self): self.monoholders.append(holder) for monoholder in self.monoholders: + monoholder.scores_calculated = False if self.regressor is not None: monoholder.regressor = self.regressor monoholder.calculate_scores() diff --git a/Mikado/picking/loci_processer.py b/Mikado/picking/loci_processer.py index b5a9ca6c4..51afa8fe9 100644 --- a/Mikado/picking/loci_processer.py +++ b/Mikado/picking/loci_processer.py @@ -212,6 +212,114 @@ def merge_loci_gff(gff_filenames, gff_handle, prefix=""): return gid_to_new, tid_to_new +def print_locus(stranded_locus, + gene_counter, + handles, + counter=None, + logger=None, + json_conf=None): + """ + Method that handles a single superlocus for printing. + It also detects and flags/discard fragmentary loci. 
:param stranded_locus: the stranded locus to analyse
+    :param gene_counter: a counter used to rename the genes/transcripts progressively
+    :param handles: the list of handle triplets (locus, sublocus, monolocus) to print to
+    :param counter: optional index to prepend to the printed lines
+    :param logger: logger instance
+    :param json_conf: the configuration dictionary
+    :return: the updated gene_counter
+    """
+
+    locus_metrics, locus_scores, locus_out = handles[0]
+    sub_metrics, sub_scores, sub_out = handles[1]
+    mono_metrics, mono_scores, mono_out = handles[2]
+
+    if json_conf is None:
+        from ..configuration.configurator import to_json
+        json_conf = to_json(None)
+
+    stranded_locus.logger = logger
+    if sub_out is not None:  # Skip this section if no sub_out is defined
+        sub_lines = stranded_locus.__str__(
+            level="subloci",
+            print_cds=not json_conf["pick"]["run_options"]["exclude_cds"])
+        if sub_lines != '':
+            if counter is not None:
+                sub_lines = "\n".join(
+                    ["{0}/{1}".format(counter, line) for line in sub_lines.split("\n")])
+            print(sub_lines, file=sub_out)
+        sub_metrics_rows = [_ for _ in stranded_locus.print_subloci_metrics()
+                            if _ != {} and "tid" in _]
+        sub_scores_rows = [_ for _ in stranded_locus.print_subloci_scores()
+                           if _ != {} and "tid" in _]
+        for row in sub_metrics_rows:
+            if counter is not None:
+                row["tid"] = "{0}/{1}".format(counter, row["tid"])
+            sub_metrics.writerow(row)
+        for row in sub_scores_rows:
+            if counter is not None:
+                row["tid"] = "{0}/{1}".format(counter, row["tid"])
+            sub_scores.writerow(row)
+    if mono_out is not None:
+        mono_lines = stranded_locus.__str__(
+            level="monosubloci",
+            print_cds=not json_conf["pick"]["run_options"]["exclude_cds"])
+        if mono_lines != '':
+            if counter is not None:
+                mono_lines = "\n".join(
+                    ["{0}/{1}".format(counter, line) for line in mono_lines.split("\n")])
+            print(mono_lines, file=mono_out)
+        mono_metrics_rows = [_ for _ in stranded_locus.print_monoholder_metrics()
+                             if _ != {} and "tid" in _]
+        mono_scores_rows = [_ for _ in stranded_locus.print_monoholder_scores()
+                            if _ != {} and "tid" in _]
+        for row in mono_metrics_rows:
+            if counter is not None:
+                row["tid"] = "{0}/{1}".format(counter, row["tid"])
+            mono_metrics.writerow(row)
+        for row in mono_scores_rows:
+            if counter is not None:
+                row["tid"] = "{0}/{1}".format(counter, row["tid"])
+            mono_scores.writerow(row)
+
+    for locus in stranded_locus.loci:
+        fragment_test = (
+            json_conf["pick"]["run_options"]["remove_overlapping_fragments"]
+            is True and stranded_locus.loci[locus].is_fragment is True)
+
+        if fragment_test is True:
+            continue
+        gene_counter += 1
+        new_id = "{0}.{1}G{2}".format(
+            json_conf["pick"]["output_format"]["id_prefix"],
+            stranded_locus.chrom, gene_counter)
+        stranded_locus.loci[locus].logger = logger
+        stranded_locus.loci[locus].id = new_id
+
+        locus_lines = stranded_locus.__str__(
+            print_cds=not json_conf["pick"]["run_options"]["exclude_cds"],
+            level="loci")
+
+        locus_metrics_rows = [x for x in stranded_locus.print_loci_metrics()]
+        locus_scores_rows = [x for x in stranded_locus.print_loci_scores()]
+
+        if locus_lines:
+            assert len(locus_metrics_rows) > 0
+            if counter is not None:
+                locus_lines = "\n".join(
+                    ["{0}/{1}".format(counter, line) for line in locus_lines.split("\n")])
+            print(locus_lines, file=locus_out)
+
+        # assert len(locus_metrics_rows) == len(locus_scores_rows)
+
+        for row in locus_metrics_rows:
+            if counter is not None:
+                row["tid"] = "{0}/{1}".format(counter, row["tid"])
+            locus_metrics.writerow(row)
+        for row in locus_scores_rows:
+            if counter is not None:
+                row["tid"] = "{0}/{1}".format(counter, row["tid"])
+            locus_scores.writerow(row)
+    # Necessary to flush out all the files
+    [handle.flush() for group in handles for handle in group if hasattr(handle, "flush")]
+    return gene_counter
+
+
 def merge_loci(num_temp, out_handles, prefix="", tempdir="mikado_pick_tmp"):
     """
     Function to merge the temporary
loci files into single output files, @@ -531,7 +639,8 @@ def __init__(self, self._create_handles(self.__output_files) self.__gene_counter = 0 - assert self.locus_out is not None + assert len(self._handles) > 0 + self.logger.debug("Starting Process %s", self.name) self.logger.debug("Starting the pool for {0}".format(self.name)) @@ -575,18 +684,15 @@ def __getstate__(self): return state def terminate(self): - # [_.flush() for _ in self._handles if hasattr(_, "flush") and _.closed is False] - # [_.close() for _ in self._handles if hasattr(_, "close") and _.closed is False] - # if self.engine is not None: - # self.engine.dispose() self.__close_handles() super().terminate() def __close_handles(self): """Private method to flush and close all handles.""" - [_.flush() for _ in self._handles if hasattr(_, "flush") and _.closed is False] - [_.close() for _ in self._handles if hasattr(_, "close") and _.closed is False] + for group in self._handles: + [_.flush() for _ in group if hasattr(_, "flush") and _.closed is False] + [_.close() for _ in group if hasattr(_, "close") and _.closed is False] if self.engine is not None: self.engine.dispose() @@ -615,16 +721,52 @@ def __setstate__(self, state): engine=self.engine, logging_queue=self.logging_queue) - def _create_handles(self, handles): + def __create_step_handles(self, handles, metrics, score_keys): + + """Private method to create the handles for a given step (eg Locus). + + :param handles: the list with the filename prefixes + :type handles: [list|tuple] + + :param metrics: list of metrics name, to be used as header for the metrics file + :type metrics: list + + :param score_keys: list of metrics names used for scoring, to be used as header + for the score file + :type score_keys: list + + :returns: a list of handles to be used for writing + :rtype: list + """ (locus_metrics_file, locus_scores_file, locus_out_file) = [os.path.join(self._tempdir, "{0}-{1}".format(os.path.basename(_), self.identifier)) - for _ in handles[0]] + for _ in handles] locus_metrics_handle = open(locus_metrics_file, "a") locus_scores_handle = open(locus_scores_file, "a") + locus_metrics = csv.DictWriter( + locus_metrics_handle, + metrics, + delimiter="\t") + locus_metrics.handle = locus_metrics_handle + locus_metrics.close = locus_metrics.handle.close + locus_metrics.closed = locus_metrics.handle.closed + locus_metrics.flush = locus_metrics.handle.flush + + locus_scores = csv.DictWriter(locus_scores_handle, score_keys, delimiter="\t") + locus_scores.handle = locus_scores_handle + locus_scores.close = locus_scores.handle.close + locus_scores.closed = locus_scores.handle.closed + locus_scores.flush = locus_scores.handle.flush + + locus_out = open(locus_out_file, 'w') + + return [locus_metrics, locus_scores, locus_out] + + def _create_handles(self, handles): if self.regressor is None: score_keys = sorted(list(self.json_conf["scoring"].keys())) @@ -646,51 +788,22 @@ def _create_handles(self, handles): metrics.extend(["external.{}".format(_.source) for _ in session.query(ExternalSource.source).all()]) metrics = Superlocus.available_metrics[:3] + sorted(metrics) - self.locus_metrics = csv.DictWriter( - locus_metrics_handle, - metrics, - delimiter="\t") - - self.locus_scores = csv.DictWriter(locus_scores_handle, score_keys, delimiter="\t") - - self.locus_out = open(locus_out_file, 'w') - self._handles.extend((locus_metrics_handle, locus_scores_handle, self.locus_out)) + self._handles.append(self.__create_step_handles(handles[0], + metrics, score_keys)) + # Subloci if handles[1][0]: - 
(sub_metrics_file, - sub_scores_file, - sub_out_file) = [os.path.join(self._tempdir, - "{0}-{1}".format(os.path.basename(_), - self.identifier)) - for _ in handles[1]] - sub_metrics_handle = open(sub_metrics_file, "w") - sub_scores_handle = open(sub_scores_file, "w") - self.sub_metrics = csv.DictWriter( - sub_metrics_handle, - metrics, - delimiter="\t") - self.sub_scores = csv.DictWriter( - sub_scores_handle, score_keys, delimiter="\t") - self.sub_out = open(sub_out_file, "w") - self._handles.extend([sub_metrics_handle, sub_scores_handle, self.sub_out]) + self._handles.append(self.__create_step_handles(handles[1], + metrics, score_keys)) + else: + self._handles.append([None, None, None]) + # Monoloci if handles[2][0]: - (mono_metrics_file, - mono_scores_file, - mono_out_file) = [os.path.join(self._tempdir, - "{0}-{1}".format(os.path.basename(_), - self.identifier)) - for _ in handles[2]] - mono_metrics_handle = open(mono_metrics_file, "w") - mono_scores_handle = open(mono_scores_file, "w") - self.mono_metrics = csv.DictWriter( - mono_metrics_handle, - metrics, - delimiter="\t") - self.mono_scores = csv.DictWriter( - mono_scores_handle, score_keys, delimiter="\t") - self.mono_out = open(mono_out_file, "w") - self._handles.extend([mono_metrics_handle, mono_scores_handle, self.mono_out]) + self._handles.append(self.__create_step_handles(handles[2], + metrics, score_keys)) + else: + self._handles.append([None, None, None]) return @@ -721,90 +834,11 @@ def run(self): stranded_loci = self.analyse_locus(slocus, counter) else: stranded_loci = [] - for stranded_locus in stranded_loci: - self._print_locus(stranded_locus, counter) - return - - def _print_locus(self, stranded_locus, counter): - - """ - Private method that handles a single superlocus for printing. - It also detects and flags/discard fragmentary loci. 
- :param stranded_locus: the stranded locus to analyse - :return: - """ - - if self.sub_out is not None: # Skip this section if no sub_out is defined - sub_lines = stranded_locus.__str__( - level="subloci", - print_cds=not self.json_conf["pick"]["run_options"]["exclude_cds"]) - if sub_lines != '': - sub_lines = "\n".join( - ["{0}/{1}".format(counter, line) for line in sub_lines.split("\n")]) - print(sub_lines, file=self.sub_out) - sub_metrics_rows = [x for x in stranded_locus.print_subloci_metrics() - if x != {} and "tid" in x] - sub_scores_rows = [x for x in stranded_locus.print_subloci_scores() - if x != {} and "tid" in x] - for row in sub_metrics_rows: - row["tid"] = "{0}/{1}".format(counter, row["tid"]) - self.sub_metrics.writerow(row) - for row in sub_scores_rows: - row["tid"] = "{0}/{1}".format(counter, row["tid"]) - self.sub_scores.writerow(row) - if self.mono_out is not None: - mono_lines = stranded_locus.__str__( - level="monosubloci", - print_cds=not self.json_conf["pick"]["run_options"]["exclude_cds"]) - if mono_lines != '': - mono_lines = "\n".join( - ["{0}/{1}".format(counter, line) for line in mono_lines.split("\n")]) - print(mono_lines, file=self.mono_out) - mono_metrics_rows = [x for x in stranded_locus.print_subloci_metrics() - if x != {} and "tid" in x] - mono_scores_rows = [x for x in stranded_locus.print_subloci_scores() - if x != {} and "tid" in x] - for row in mono_metrics_rows: - row["tid"] = "{0}/{1}".format(counter, row["tid"]) - self.mono_metrics.writerow(row) - for row in mono_scores_rows: - row["tid"] = "{0}/{1}".format(counter, row["tid"]) - self.mono_scores.writerow(row) - for locus in stranded_locus.loci: - fragment_test = ( - self.json_conf["pick"]["run_options"]["remove_overlapping_fragments"] - is True and stranded_locus.loci[locus].is_fragment is True) - - if fragment_test is True: - continue - self.__gene_counter += 1 - new_id = "{0}.{1}G{2}".format( - self.json_conf["pick"]["output_format"]["id_prefix"], - stranded_locus.chrom, self.__gene_counter) - stranded_locus.loci[locus].logger = self.logger - stranded_locus.loci[locus].id = new_id - - locus_lines = stranded_locus.__str__( - print_cds=not self.json_conf["pick"]["run_options"]["exclude_cds"], - level="loci") - - locus_metrics_rows = [x for x in stranded_locus.print_loci_metrics()] - locus_scores_rows = [x for x in stranded_locus.print_loci_scores()] - - if locus_lines: - assert len(locus_metrics_rows) > 0 - locus_lines = "\n".join( - ["{0}/{1}".format(counter, line) for line in locus_lines.split("\n")]) - print(locus_lines, file=self.locus_out) + for stranded_locus in stranded_loci: + self.__gene_counter = print_locus( + stranded_locus, self.__gene_counter, self._handles, + counter=counter, logger=self.logger, json_conf=self.json_conf) - # assert len(locus_metrics_rows) == len(locus_scores_rows) + return - for row in locus_metrics_rows: - row["tid"] = "{0}/{1}".format(counter, row["tid"]) - self.locus_metrics.writerow(row) - for row in locus_scores_rows: - row["tid"] = "{0}/{1}".format(counter, row["tid"]) - self.locus_scores.writerow(row) - # Necessary to flush out all the files - [_.flush() for _ in self._handles if hasattr(_, "close")] diff --git a/Mikado/picking/picker.py b/Mikado/picking/picker.py index 55a49f6c7..51b72efb9 100644 --- a/Mikado/picking/picker.py +++ b/Mikado/picking/picker.py @@ -33,7 +33,7 @@ from ..configuration.configurator import to_json, check_json # Necessary for nosetests from ..utilities import dbutils, merge_partial from ..exceptions import UnsortedInput, InvalidJson, 
InvalidTranscript -from .loci_processer import analyse_locus, LociProcesser, merge_loci +from .loci_processer import analyse_locus, LociProcesser, merge_loci, print_locus import multiprocessing.managers from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier import pickle @@ -490,90 +490,6 @@ def __get_output_files(self): # actually file handlers. I cannot trim them down for now. # pylint: disable=too-many-locals - def _print_locus(self, stranded_locus, gene_counter, logger=None, handles=()): - - """ - Private method that handles a single superlocus for printing. - It also detects and flags/discard fragmentary loci. - :param stranded_locus: the stranded locus to analyse - :param gene_counter: A counter used to rename the genes/transcripts progressively - :param logger: logger instance - :param handles: the handles to print to - :return: - """ - - locus_metrics, locus_scores, locus_out = handles[0] - sub_metrics, sub_scores, sub_out = handles[1] - mono_metrics, mono_scores, mono_out = handles[2] - - stranded_locus.logger = logger - if self.sub_out != '': # Skip this section if no sub_out is defined - sub_lines = stranded_locus.__str__( - level="subloci", - print_cds=not self.json_conf["pick"]["run_options"]["exclude_cds"]) - if sub_lines != '': - print(sub_lines, file=sub_out) - # sub_out.flush() - sub_metrics_rows = [x for x in stranded_locus.print_subloci_metrics() - if x != {} and "tid" in x] - sub_scores_rows = [x for x in stranded_locus.print_subloci_scores() - if x != {} and "tid" in x] - for row in sub_metrics_rows: - sub_metrics.writerow(row) - # sub_metrics.flush() - for row in sub_scores_rows: - sub_scores.writerow(row) - # sub_scores.flush() - if self.monolocus_out != '': - mono_lines = stranded_locus.__str__( - level="monosubloci", - print_cds=not self.json_conf["pick"]["run_options"]["exclude_cds"]) - if mono_lines != '': - print(mono_lines, file=mono_out) - # mono_out.flush() - mono_metrics_rows = [x for x in stranded_locus.print_subloci_metrics() - if x != {} and "tid" in x] - mono_scores_rows = [x for x in stranded_locus.print_subloci_scores() - if x != {} and "tid" in x] - for row in mono_metrics_rows: - mono_metrics.writerow(row) - # mono_metrics.flush() - for row in mono_scores_rows: - mono_scores.writerow(row) - # mono_scores.flush() - - for locus in stranded_locus.loci: - gene_counter += 1 - fragment_test = ( - self.json_conf["pick"]["run_options"]["remove_overlapping_fragments"] - is True and stranded_locus.loci[locus].is_fragment is True) - - if fragment_test is True: - continue - new_id = "{0}.{1}G{2}".format( - self.json_conf["pick"]["output_format"]["id_prefix"], - stranded_locus.chrom, gene_counter) - stranded_locus.loci[locus].id = new_id - stranded_locus.loci[locus].logger = self.logger - - locus_lines = stranded_locus.__str__( - print_cds=not self.json_conf["pick"]["run_options"]["exclude_cds"], - level="loci") - locus_metrics_rows = [_ for _ in stranded_locus.print_loci_metrics()] - locus_scores_rows = [_ for _ in stranded_locus.print_loci_scores()] - - for row in locus_metrics_rows: - locus_metrics.writerow(row) - # locus_metrics.flush() - for row in locus_scores_rows: - locus_scores.writerow(row) - # locus_scores.flush() - - if locus_lines != '': - print(locus_lines, file=locus_out) - # locus_out.flush() - return gene_counter - def __getstate__(self): state = self.__dict__.copy() @@ -1014,9 +930,11 @@ def __submit_single_threaded(self, data_dict): handles = self.__get_output_files() - locus_printer = functools.partial(self._print_locus, + 
locus_printer = functools.partial(print_locus,
+                                          handles=handles,
                                           logger=logger,
-                                          handles=handles)
+                                          counter=None,
+                                          json_conf=self.json_conf)
 
         # last_printed = -1
 
         curr_chrom = None
@@ -1202,6 +1120,5 @@ def __call__(self):
 
         self.main_logger.info("Finished analysis of %s", self.input_file)
-        sys.exit(0)
 
 # pylint: enable=too-many-instance-attributes

From 90753d4b38ee21a39dcb44217d452c9a5c6a97e6 Mon Sep 17 00:00:00 2001
From: Luca Venturini
Date: Wed, 1 Feb 2017 16:20:22 +0000
Subject: [PATCH 06/47] Now Travis is configured to also run Python 3.4 tests

---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index a803e1a9c..c5abe3ab1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,6 @@
 language: python
 python:
+  - "3.4"
   - "3.5"
 install: "pip install -r requirements.txt"
 script:
 - "python3 setup.py nosetests -exe;"
\ No newline at end of file

From 6752ba221bcb686fb2131865f16ac26569dcced8 Mon Sep 17 00:00:00 2001
From: Luca Venturini
Date: Wed, 1 Feb 2017 16:20:42 +0000
Subject: [PATCH 07/47] Now Travis is configured to also run Python 3.6 tests

---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index c5abe3ab1..7f6bb9b46 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,6 +2,7 @@ language: python
 python:
   - "3.4"
   - "3.5"
+  - "3.6"
 install: "pip install -r requirements.txt"
 script:
 - "python3 setup.py nosetests -exe;"
\ No newline at end of file

From 8cd72fc944db85a33bde15457d1d192b9d1ca991 Mon Sep 17 00:00:00 2001
From: Luca Venturini
Date: Wed, 1 Feb 2017 18:00:31 +0000
Subject: [PATCH 08/47] Added support for Python3.6.

---
 .travis.yml                       | 19 ++++++++++++++--
 CHANGELOG.md                      |  1 +
 Mikado/loci/abstractlocus.py      |  2 +-
 Mikado/loci/transcript.py         | 38 ++++++-----------------------
 Mikado/subprograms/util/stats.py  |  2 +-
 Mikado/tests/test_system_calls.py |  2 +-
 Mikado/tests/trinity_stats.txt    |  8 +++----
 Mikado/utilities/intervaltree.pyx | 29 +++++++++++++++++++++++
 8 files changed, 63 insertions(+), 38 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 7f6bb9b46..d68ef5201 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,6 +3,21 @@ python:
   - "3.4"
   - "3.5"
   - "3.6"
-install: "pip install -r requirements.txt"
+# Setup anaconda, see https://gist.github.com/dan-blanchard/7045057
+before_install:
+  - wget https://repo.continuum.io/miniconda/Miniconda3-4.2.12-Linux-x86_64.sh -O miniconda.sh
+  - chmod +x miniconda.sh
+  - ./miniconda.sh -b
+  - export PATH=/home/travis/miniconda/bin:$PATH
+  - conda update --yes conda
+  # The next couple lines fix a crash with multiprocessing on Travis and are not specific to using Miniconda
+  - sudo rm -rf /dev/shm
+  - sudo ln -s /run/shm /dev/shm
+# Install packages
+install:
+  - conda create --yes -n env_name python=$TRAVIS_PYTHON_VERSION
+  - source activate env_name
+  - conda install --yes setuptools cython atlas numpy scipy scikit-learn biopython
+  - pip install -r requirements.txt
 script:
-  - "python3 setup.py nosetests -exe;"
+  - python setup.py nosetests -exe;
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9e88a1a34..2b4a20d02 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@ Changes in this release:
 - Deprecated the "discard_definition" flag in Mikado serialise. Now Mikado will infer on its own whether to use the definition or the ID for serialising BLAST results.
 - Re-written the "find_retained_introns" method of AbstractLocus, to solve some bugs found during the utilisation of last beta. As a corollary, expanded the intervaltree module to allow searches for "tagged" intervals.
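
A side note on the Python 3.4 compatibility fix in the abstractlocus.py hunk below: extended unpacking inside set displays, such as {*exon}, is a PEP 448 feature that only parses from Python 3.5 onwards, while the equivalent set() call is accepted by every interpreter version Travis now tests. A minimal sketch, with a hypothetical exon:

    exon = (101, 500)
    # Builds {101, 500} on any supported Python version
    terminal_coords = set(exon)
    # The set display with unpacking, {*exon}, is a SyntaxError on Python 3.4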
- Now the "monoloci_out" files contain the Monosublocus**Holder** step, not the Monosublocus step. This should help during fine-tuning. +- Mikado now supports also Python3.6. #Version 1.0.0beta9 - "External scores" diff --git a/Mikado/loci/abstractlocus.py b/Mikado/loci/abstractlocus.py index f55a1b35c..0f6cfe9f9 100644 --- a/Mikado/loci/abstractlocus.py +++ b/Mikado/loci/abstractlocus.py @@ -507,7 +507,7 @@ def _exon_to_be_considered(exon, cds_segments = sorted(transcript.cds_tree.search(*exon)) terminal = bool(set.intersection( - {*exon}, + set(exon), {transcript.start, transcript.end, transcript.combined_cds_end, transcript.combined_cds_start})) if cds_segments == [Interval(*exon)]: # It is completely coding diff --git a/Mikado/loci/transcript.py b/Mikado/loci/transcript.py index cba7d62db..6841588d5 100644 --- a/Mikado/loci/transcript.py +++ b/Mikado/loci/transcript.py @@ -317,7 +317,6 @@ def __init__(self, *args, self.__expandable = False self.__segmenttree = IntervalTree() self.__cds_introntree = IntervalTree() - self.__introntree = IntervalTree() self._possibly_without_exons = False # self.query_id = None @@ -461,7 +460,11 @@ def __getstate__(self): logger = self.logger del self.logger - state = copy.deepcopy(self.__dict__) + + state = copy.deepcopy(dict((key, val) for key, val in self.__dict__.items() + if key not in ("_Transcript__segmenttree", + "_Transcript__cds_introntree", + "_Transcript__cds_tree"))) self.logger = logger if hasattr(self, "json_conf") and self.json_conf is not None: @@ -485,25 +488,14 @@ def __getstate__(self): del state["blast_baked"] del state["query_baked"] - # import pickle - # try: - # _ = pickle.dumps(state) - # except (pickle.PicklingError, TypeError): - # failed = [] - # for obj in state: - # try: - # _ = pickle.dumps(state[obj]) - # except (pickle.PicklingError, TypeError): - # failed.append(obj) - # - # raise pickle.PicklingError("Failed to serialise {}, because of the following fields: {}".format( - # self.id, - # "\t\n".join([""]+failed))) - return state def __setstate__(self, state): self.__dict__.update(state) + self.__cds_tree = IntervalTree() + self.__segmenttree = IntervalTree() + self.__cds_introntree = IntervalTree() + # Set the logger to NullHandler self.logger = None @@ -1704,18 +1696,6 @@ def _cds_introntree(self): [(_[0], _[1] + 1) for _ in self.combined_cds_introns]) return self.__cds_introntree - @property - def _introntree(self): - - """ - :rtype: intervaltree.IntervalTree - """ - - if len(self.__introntree) != len(self.introns): - self.__cds_introntree = IntervalTree.from_tuples( - [(_[0], _[1] + 1) for _ in self.introns]) - return self.__introntree - @property def selected_cds(self): """This property return the CDS exons of the ORF selected as best diff --git a/Mikado/subprograms/util/stats.py b/Mikado/subprograms/util/stats.py index 4b85d9d9e..954343aa9 100644 --- a/Mikado/subprograms/util/stats.py +++ b/Mikado/subprograms/util/stats.py @@ -366,7 +366,7 @@ def get_stats(row: dict, array: numpy.array) -> dict: moder = array[sorter][weights[sorter].searchsorted(weights.max()):] except TypeError as exc: raise TypeError((exc, array, weights, sorter)) - row["Mode"] = ";".join(str(x) for x in moder) + row["Mode"] = ";".join(str(x) for x in sorted(moder)) else: row["Average"] = "NA" row["Mode"] = "NA" diff --git a/Mikado/tests/test_system_calls.py b/Mikado/tests/test_system_calls.py index d66d581d5..4fe051efa 100644 --- a/Mikado/tests/test_system_calls.py +++ b/Mikado/tests/test_system_calls.py @@ -356,7 +356,7 @@ def 
test_mikado_config(self): conf = Mikado.configuration.configurator.check_json(conf) os.remove(out) - @unittest.skipUnless((sys.version_info.minor>4), + @unittest.skipUnless((sys.version_info.minor > 4), "Due to a bug in JSONSCHEMA, Daijin configure fails with Python versions lower than 3.5.") def test_daijin_config(self): diff --git a/Mikado/tests/trinity_stats.txt b/Mikado/tests/trinity_stats.txt index 56d25d028..76bc1ab9d 100644 --- a/Mikado/tests/trinity_stats.txt +++ b/Mikado/tests/trinity_stats.txt @@ -4,18 +4,18 @@ Number of genes (coding) 0 NA NA NA NA NA NA NA NA NA NA NA NA NA Number of monoexonic genes 14 NA NA NA NA NA NA NA NA NA NA NA NA NA Transcripts per gene 38 1.00 1 1 1 1 1 1 1 1 1 1 1 1 Coding transcripts per gene 0 0.00 0 0 0 0 0 0 0 0 0 0 0 0 -CDNA lengths 40199 1,057.87 417;383 203 204 212 261 335 490 1,565 1,938 4,730 4,914 4,945 +CDNA lengths 40199 1,057.87 383;417 203 204 212 261 335 490 1,565 1,938 4,730 4,914 4,945 CDNA lengths (mRNAs) 0.0 NA NA NA NA NA NA NA NA NA NA NA NA NA CDS lengths 0 0.00 0 0 0 0 0 0 0 0 0 0 0 0 CDS lengths (mRNAs) NA NA NA NA NA NA NA NA NA NA NA NA NA NA CDS/cDNA ratio NA NA NA NA NA NA NA NA NA NA NA NA NA NA -Monoexonic transcripts 14 457.57 608;320;475;292;550;243;458;203;269;206;1295;449;659;379 203 203 205 217 275 414 531 644 882 1,212 1,295 +Monoexonic transcripts 14 457.57 203;206;243;269;292;320;379;449;458;475;550;608;659;1295 203 203 205 217 275 414 531 644 882 1,212 1,295 MonoCDS transcripts 0 NA NA NA NA NA NA NA NA NA NA NA NA NA Exons per transcript 144 3.79 1 1 1 1 1 1 2 5 9 11 12 12 Exons per transcript (mRNAs) 0 NA NA NA NA NA NA NA NA NA NA NA NA NA Exon lengths NA 279.16 59 18 20 22 45 72 120 315 644 838 2,716 2,865 Exon lengths (mRNAs) NA NA NA NA NA NA NA NA NA NA NA NA NA NA -Intron lengths NA 643.35 93;74;87 71 74 74 77 83 96 152 874 3,778 7,938 7,938 +Intron lengths NA 643.35 74;87;93 71 74 74 77 83 96 152 874 3,778 7,938 7,938 Intron lengths (mRNAs) NA NA NA NA NA NA NA NA NA NA NA NA NA NA CDS exons per transcript 0 0.00 0 0 0 0 0 0 0 0 0 0 0 0 CDS exons per transcript (mRNAs) 0 NA NA NA NA NA NA NA NA NA NA NA NA NA @@ -26,5 +26,5 @@ CDS Intron lengths 0.0 NA NA NA NA NA NA NA NA NA NA NA NA NA 5'UTR length 0.0 NA NA NA NA NA NA NA NA NA NA NA NA NA 3'UTR length 0.0 NA NA NA NA NA NA NA NA NA NA NA NA NA Stop distance from junction NA NA NA NA NA NA NA NA NA NA NA NA NA NA -Intergenic distances NA -1,942.43 -21;-6249 -12,446 -11,573 -9,862 -8,766 -4,204 -21 299 1,764 2,107 5,443 6,980 +Intergenic distances NA -1,942.43 -6249;-21 -12,446 -11,573 -9,862 -8,766 -4,204 -21 299 1,764 2,107 5,443 6,980 Intergenic distances (coding) NA NA NA NA NA NA NA NA NA NA NA NA NA NA diff --git a/Mikado/utilities/intervaltree.pyx b/Mikado/utilities/intervaltree.pyx index 8ea7cbb6f..e6217007a 100644 --- a/Mikado/utilities/intervaltree.pyx +++ b/Mikado/utilities/intervaltree.pyx @@ -99,6 +99,16 @@ cdef class IntervalNode: self.cright = EmptyNode self.croot = EmptyNode + def __getstate__(self): + state = [] + state.append(self.priority) + state.append(self.start) + + + + def __setstate__(self, state): + pass + cpdef IntervalNode insert(IntervalNode self, int start, int end, object interval): """ Insert a new IntervalNode into the tree of which this node is @@ -340,6 +350,14 @@ cdef class Interval: return iter([self.start, self.end]) + def __getstate__(self): + + return [self.start, self.end, self.value, self.chrom, self.strand] + + def __setstate__(self, state): + + self.start, self.end, self.value, self.chrom, self.strand = 
state + cdef class IntervalTree: """ @@ -401,6 +419,17 @@ cdef class IntervalTree: root = None num_intervals = 0 + # ---- Pickling ------ + + def __getstate__(self): + + return [self.root, self.num_intervals] + + def __setstate__(self, state): + + self.root, self.num_intervals = state + + # ---- Position based interfaces ----------------------------------------- def insert( self, int start, int end, object value=None ): From 53bb1e40aba59adea1323bb076ee34ad5eb25d1a Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 1 Feb 2017 18:02:35 +0000 Subject: [PATCH 09/47] BF for Travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index d68ef5201..5c7b82243 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,7 +8,7 @@ before_install: - wget https://repo.continuum.io/miniconda/Miniconda3-4.2.12-Linux-x86_64.sh -O miniconda.sh - chmod +x miniconda.sh - ./miniconda.sh -b - - export PATH=/home/travis/miniconda/bin:$PATH + - export PATH=/home/travis/miniconda3/bin:$PATH - conda update --yes conda # The next couple lines fix a crash with multiprocessing on Travis and are not specific to using Miniconda - sudo rm -rf /dev/shm From 80679e81c0ebcc612a0a4af09cdcb461db6b9acc Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Thu, 2 Feb 2017 18:17:43 +0000 Subject: [PATCH 10/47] BF for a bug in cleaning up a database during serialise (for python 3.6) --- .travis.yml | 2 +- Mikado/loci/abstractlocus.py | 15 ++++++++------- Mikado/subprograms/serialise.py | 5 ++++- Mikado/utilities/dbutils.py | 2 +- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5c7b82243..846ab9cfa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,7 +5,7 @@ python: - "3.6" # Setup anaconda, see https://gist.github.com/dan-blanchard/7045057 before_install: - - wget https://repo.continuum.io/miniconda/Miniconda3-4.2.12-Linux-x86_64.sh -O miniconda.sh + - wget https://repo.continuum.io/miniconda/Miniconda3-4.2.12-Linux-x86_64.sh -O miniconda.sh - chmod +x miniconda.sh - ./miniconda.sh -b - export PATH=/home/travis/miniconda3/bin:$PATH diff --git a/Mikado/loci/abstractlocus.py b/Mikado/loci/abstractlocus.py index 0f6cfe9f9..841d1ec91 100644 --- a/Mikado/loci/abstractlocus.py +++ b/Mikado/loci/abstractlocus.py @@ -166,18 +166,19 @@ def __getitem__(self, item): return self.transcripts[item] - # #### Static methods ####### + + import typing + @staticmethod - def overlap(first_interval: tuple([int, int]), - second_interval: tuple([int, int]), flank=0) -> int: - """ + def overlap(first_interval: (int, int), + second_interval: (int, int), flank=0) -> int: - :param first_interval: a tuple of integers - :type first_interval: (int,int) + """:param first_interval: a tuple of integers + :type first_interval: [int,int] :param second_interval: a tuple of integers - :type second_interval: (int,int | intervaltree.Interval) + :type second_interval: [int,int | intervaltree.Interval] :param flank: an optional extending parameter to check for neighbours :type flank: int diff --git a/Mikado/subprograms/serialise.py b/Mikado/subprograms/serialise.py index 2a4fcae8c..1ca2720f6 100644 --- a/Mikado/subprograms/serialise.py +++ b/Mikado/subprograms/serialise.py @@ -301,6 +301,9 @@ def serialise(args): if args.json_conf["serialise"]["force"] is True: logger.warn("Removing old data because force option in place") + if args.json_conf["db_settings"]["dbtype"] == "sqlite" and os.path.exists(args.json_conf["db_settings"]["db"]): + 
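+            # For SQLite, "force" now removes the stale database file outright and lets
+            # dbutils.DBBASE.metadata.create_all(engine) below rebuild the schema from scratch;
+            # this is part of the database clean-up fix for Python 3.6 described in this commit.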
os.remove(args.json_conf["db_settings"]["db"]) + engine = dbutils.connect(args.json_conf) meta = sqlalchemy.MetaData(bind=engine) meta.reflect(engine) @@ -312,7 +315,7 @@ def serialise(args): if args.json_conf["db_settings"]["dbtype"] == "mysql": engine.execute("") # This would fail in MySQL as it uses the OPTIMIZE TABLE syntax above - if args.json_conf["db_settings"]["dbtype"] != "mysql": + elif args.json_conf["db_settings"]["dbtype"] != "sqlite": engine.execute("VACUUM") dbutils.DBBASE.metadata.create_all(engine) diff --git a/Mikado/utilities/dbutils.py b/Mikado/utilities/dbutils.py index 8e5cebaba..7427f0369 100644 --- a/Mikado/utilities/dbutils.py +++ b/Mikado/utilities/dbutils.py @@ -63,7 +63,7 @@ def create_connector(json_conf, logger=None): func = sqlite3.connect( database=json_conf["pick"]["run_options"]["shm_db"], - check_same_thread=False) + check_same_thread=False, isolation_level=None) elif db_settings["dbtype"] in ("mysql", "postgresql"): if db_settings["dbpasswd"] != '': passwd = ":{0}".format(db_settings["dbpasswd"]) From ad8af52604b5b4bf9c75ee19d7f35cedc5e65970 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Fri, 3 Feb 2017 15:00:19 +0000 Subject: [PATCH 11/47] Moved transcript objects to their own submodule. Implemented the new MonosublocusHolder algorithm. Now locus-like objects will check at runtime for the correctness of the configuration. --- CHANGELOG.md | 1 + Mikado/configuration/configurator.py | 19 +- Mikado/loci/__init__.py | 16 +- Mikado/loci/abstractlocus.py | 38 ++-- Mikado/loci/locus.py | 15 +- Mikado/loci/monosublocus.py | 36 +--- Mikado/loci/monosublocusholder.py | 164 ++++++++++++++++-- Mikado/loci/reference_gene.py | 6 +- Mikado/loci/sublocus.py | 8 +- Mikado/loci/superlocus.py | 37 ++-- Mikado/preparation/annotation_parser.py | 2 +- Mikado/preparation/checking.py | 14 +- Mikado/preparation/prepare.py | 3 +- Mikado/scales/accountant.py | 9 +- Mikado/scales/assigner.py | 23 +-- Mikado/scales/compare.py | 10 +- Mikado/serializers/orf.py | 2 +- Mikado/subprograms/__init__.py | 4 +- Mikado/subprograms/util/awk_gtf.py | 6 +- Mikado/subprograms/util/convert.py | 2 +- Mikado/subprograms/util/trim.py | 1 - Mikado/tests/locus_tester.py | 161 ++++++++++++----- Mikado/tests/test_abstractlocus.py | 6 - Mikado/tests/test_clique_methods.py | 9 +- Mikado/tests/test_excluded.py | 15 -- Mikado/tests/test_external_scores.py | 6 +- Mikado/tests/test_invalid_orfs.py | 11 +- Mikado/tests/test_splitting.py | 8 +- Mikado/tests/test_system_calls.py | 38 ++-- Mikado/tests/test_transcript_checker.py | 18 +- Mikado/tests/test_transcript_methods.py | 10 +- Mikado/tests/transcript_tester_single.py | 16 +- Mikado/transcripts/__init__.py | 8 + .../{loci => transcripts}/clique_methods.py | 0 Mikado/{loci => transcripts}/transcript.py | 31 ++-- .../transcript_methods/__init__.py | 0 .../transcript_methods/finalizing.py | 2 +- .../transcript_methods/printing.py | 6 +- .../transcript_methods/retrieval.py | 13 +- .../transcript_methods/splitting.py | 6 +- .../transcriptchecker.py | 0 Mikado/utilities/__init__.py | 4 +- util/bam2gtf.py | 5 +- util/gffjunc_to_bed12.py | 7 +- 44 files changed, 489 insertions(+), 307 deletions(-) delete mode 100644 Mikado/tests/test_abstractlocus.py delete mode 100644 Mikado/tests/test_excluded.py create mode 100644 Mikado/transcripts/__init__.py rename Mikado/{loci => transcripts}/clique_methods.py (100%) rename Mikado/{loci => transcripts}/transcript.py (99%) rename Mikado/{loci => transcripts}/transcript_methods/__init__.py (100%) rename Mikado/{loci => 
transcripts}/transcript_methods/finalizing.py (99%) rename Mikado/{loci => transcripts}/transcript_methods/printing.py (99%) rename Mikado/{loci => transcripts}/transcript_methods/retrieval.py (99%) rename Mikado/{loci => transcripts}/transcript_methods/splitting.py (99%) rename Mikado/{loci => transcripts}/transcriptchecker.py (100%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b4a20d02..6780ab34f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ Changes in this release: - Deprecated the "discard_definition" flag in Mikado serialise. Now Mikado will infer on its own whether to use the definition or the ID for serialising BLAST results. +- Now AbstractLocus implementation will check at runtime that the configuration is correct. - Re-written the "find_retained_introns" method of AbstractLocus, to solve some bugs found during the utilisation of last beta. As a corollary, expanded the intervaltree module to allow searches for "tagged" intervals. - Now the "monoloci_out" files contain the Monosublocus**Holder** step, not the Monosublocus step. This should help during fine-tuning. - Mikado now supports also Python3.6. diff --git a/Mikado/configuration/configurator.py b/Mikado/configuration/configurator.py index e383f5005..387f2432d 100644 --- a/Mikado/configuration/configurator.py +++ b/Mikado/configuration/configurator.py @@ -7,21 +7,22 @@ while existing values are checked for type and consistency. """ -import os.path import io -import re -import yaml -import pkg_resources -from ..exceptions import InvalidJson, UnrecognizedRescaler -from ..loci.transcript import Transcript -from ..utilities import merge_dictionaries import json -import jsonschema +import os.path +import pickle +import re from multiprocessing import get_start_method +import jsonschema +import pkg_resources +import yaml from pkg_resources import resource_stream, resource_filename -import pickle from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier +from ..transcripts.transcript import Transcript +from ..exceptions import InvalidJson, UnrecognizedRescaler +from ..utilities import merge_dictionaries from ..utilities.log_utils import create_default_logger + # from frozendict import frozendict __author__ = "Luca Venturini" diff --git a/Mikado/loci/__init__.py b/Mikado/loci/__init__.py index 068cca8fe..fdcd4a4d4 100644 --- a/Mikado/loci/__init__.py +++ b/Mikado/loci/__init__.py @@ -9,24 +9,12 @@ The creation of the loci is delegated to the "Creator" class. """ -from .clique_methods import * +from ..transcripts import Transcript, TranscriptChecker from .abstractlocus import Abstractlocus -from .transcript import Transcript from .excluded import Excluded from .locus import Locus from .monosublocusholder import MonosublocusHolder -from .superlocus import Superlocus, Sublocus, Monosublocus from .reference_gene import Gene - -# from .picker import Picker -# from . import abstractlocus -# from . import picker -# from . import excluded -# from . import locus -# from . import monosublocus -# from . import sublocus -# from . import superlocus -# from . import transcript -# from . import transcriptchecker +from .superlocus import Superlocus, Sublocus, Monosublocus __title__ = "loci" diff --git a/Mikado/loci/abstractlocus.py b/Mikado/loci/abstractlocus.py index 841d1ec91..10a4d5676 100644 --- a/Mikado/loci/abstractlocus.py +++ b/Mikado/loci/abstractlocus.py @@ -4,20 +4,24 @@ Module that defines the blueprint for all loci classes. 
""" -import operator import abc -import random -import logging import itertools +import logging +import operator +import random from sys import maxsize -from .clique_methods import find_cliques, find_communities, define_graph + import networkx +from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier + +from ..transcripts.clique_methods import find_cliques, find_communities, define_graph +from ..transcripts.transcript import Transcript +from ..configuration.configurator import to_json, check_json from ..exceptions import NotInLocusError from ..utilities import overlap, merge_ranges -from ..utilities.log_utils import create_null_logger from ..utilities.intervaltree import Interval, IntervalTree -from .transcript import Transcript -from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier +from ..utilities.log_utils import create_null_logger + # I do not care that there are too many attributes: this IS a massive class! # pylint: disable=too-many-instance-attributes,too-many-public-methods @@ -32,6 +36,8 @@ class Abstractlocus(metaclass=abc.ABCMeta): # ##### Special methods ######### + __json_conf = to_json(None) + @abc.abstractmethod def __init__(self, source="", verified_introns=None): @@ -54,7 +60,7 @@ def __init__(self, source="", verified_introns=None): self.__locus_verified_introns = set() if verified_introns is not None: self.locus_verified_introns = verified_introns - self.json_conf = dict() + self.__cds_introntree = IntervalTree() self.__regressor = None self.session = None @@ -168,8 +174,6 @@ def __getitem__(self, item): # #### Static methods ####### - import typing - @staticmethod def overlap(first_interval: (int, int), second_interval: (int, int), flank=0) -> int: @@ -859,6 +863,20 @@ def is_intersecting(cls, *args, **kwargs): # ##### Properties ####### + @property + def json_conf(self): + return self.__json_conf + + @json_conf.setter + def json_conf(self, conf): + if conf is None or isinstance(conf, (str, bytes)): + conf = to_json(conf) + elif isinstance(conf, dict): + conf = check_json(conf) + else: + raise TypeError("Unrecognized type for configuration: {}".format(type(conf))) + self.__json_conf = conf + @property def stranded(self): """This property determines whether a Monosublocus will consider diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py index 0fb110bdb..57e11b02b 100644 --- a/Mikado/loci/locus.py +++ b/Mikado/loci/locus.py @@ -5,20 +5,19 @@ i.e. the locus. 
""" +import collections import itertools import operator -# import functools from collections import deque -from .transcript import Transcript -from .transcriptchecker import TranscriptChecker -from ..scales.assigner import Assigner -from .sublocus import Sublocus +from sys import version_info +import pyfaidx +from ..transcripts.transcript import Transcript +from ..transcripts.transcriptchecker import TranscriptChecker from .abstractlocus import Abstractlocus +from .sublocus import Sublocus from ..parsers.GFF import GffLine -import collections +from ..scales.assigner import Assigner from ..utilities import overlap -import pyfaidx -from sys import version_info if version_info.minor < 5: from sortedcontainers import SortedDict else: diff --git a/Mikado/loci/monosublocus.py b/Mikado/loci/monosublocus.py index dfdba1902..d7645ff47 100644 --- a/Mikado/loci/monosublocus.py +++ b/Mikado/loci/monosublocus.py @@ -19,7 +19,7 @@ class Monosublocus(Abstractlocus): # ########## Special methods ############ - def __init__(self, transcript_instance, logger=None): + def __init__(self, transcript_instance, json_conf=None, logger=None): self.counter = 0 # simple tag to avoid collisions Abstractlocus.__init__(self) @@ -33,6 +33,7 @@ def __init__(self, transcript_instance, logger=None): self.tid = transcript_instance.id self.logger = logger self.attributes = dict() + self.json_conf = json_conf # pylint: disable=arguments-differ def __str__(self, print_cds=True, source_in_name=True): @@ -41,39 +42,6 @@ def __str__(self, print_cds=True, source_in_name=True): """This is a container used for computational purposes only, it should not be printed out directly!""") - # lines = [] - # - # self_line = GffLine('') - # for attr in ["chrom", 'feature', 'source', 'start', 'end', 'strand']: - # setattr(self_line, attr, getattr(self, attr)) - # self_line.phase, self_line.score = None, self.score - # if source_in_name is True: - # self_line.id = "{0}_{1}".format(self.source, self.id) - # else: - # self_line.id = self.id - # self_line.name = self.name - # self_line.parent = self.parent - # self_line.attributes.update(self.attributes) - # self_line.attributes["multiexonic"] = (not self.monoexonic) - # lines.append(str(self_line)) - # - # for tid in self.transcripts: - # transcript_instance = self.transcripts[tid] - # transcript_instance.source = self.source - # transcript_instance.parent = self_line.id - # self.logger.debug(self.attributes) - # for attribute in self.attributes: - # if attribute not in transcript_instance.attributes: - # if attribute == "is_fragment" and self.attributes[attribute] is False: - # continue - # transcript_instance.attributes[attribute] = self.attributes[attribute] - # - # lines.append(transcript_instance.format( - # "gff", - # all_orfs=self.json_conf["pick"]["output_format"]["report_all_orfs"], - # with_cds=print_cds).rstrip()) - # - # return "\n".join(lines) # pylint: enable=arguments-differ # ########## Class instance methods ############## diff --git a/Mikado/loci/monosublocusholder.py b/Mikado/loci/monosublocusholder.py index 6659360b6..3bdb6009f 100644 --- a/Mikado/loci/monosublocusholder.py +++ b/Mikado/loci/monosublocusholder.py @@ -5,21 +5,24 @@ before the definition of real loci. 
""" -from sys import version_info -if version_info.minor < 5: - from sortedcontainers import SortedDict -else: - from collections import OrderedDict as SortedDict import itertools -from .transcript import Transcript +import logging +from sys import version_info + +from ..transcripts.transcript import Transcript from .abstractlocus import Abstractlocus -from .sublocus import Sublocus from .locus import Locus from .monosublocus import Monosublocus +from .sublocus import Sublocus +from ..parsers.GFF import GffLine +from ..scales.contrast import compare as c_compare from ..utilities import overlap from ..utilities.log_utils import create_null_logger -import logging -from ..parsers.GFF import GffLine + +if version_info.minor < 5: + from sortedcontainers import SortedDict +else: + from collections import OrderedDict as SortedDict # Resolution order is important here! # pylint: disable=too-many-instance-attributes @@ -205,7 +208,144 @@ def define_loci(self, purge=False, excluded=None): return @classmethod - def is_intersecting(cls, transcript, other, cds_only=False, logger=None, simple_overlap=False): + def is_intersecting(cls, + transcript, + other, + cds_only=False, + logger=None, + min_cdna_overlap=0.2, + min_cds_overlap=0.2) -> bool: + """ + Implementation of the is_intersecting method. Now that we are comparing transcripts that + by definition span multiple subloci, we have to be less strict in our definition of what + counts as an intersection. + Criteria: + - 1 splice site in common (splice, not junction) + - If one or both of the transcript is monoexonic OR + one or both lack an ORF, check for any exonic overlap + - Otherwise, check for any CDS overlap. + + :param transcript + :type transcript; Transcript + + :param other: + :type other: Transcript + + :param cds_only: boolean flag. If set to True, only + the CDS component of the transcripts will be considered to determine + whether they are intersecting or not. + :type cds_only: bool + + :param min_cdna_overlap: float. This is the minimum cDNA overlap for two transcripts to be considered as intersecting, + even when all other conditions fail. + :type min_cdna_overlap: float + + :param min_cds_overlap: float. This is the minimum CDS overlap for two transcripts to be considered as intersecting, + even when all other conditions fail. + :type min_cds_overlap: float + + :param logger: either None or a logger instance. If None, a null logger will be created. + + :rtype : bool + """ + + if logger is None or not isinstance(logger, logging.Logger): + logger = create_null_logger("MSH") + + if transcript.id == other.id or transcript.strand != other.strand: + logger.debug("Cannot intersect with itself (%s vs %s)", + transcript.id, other.id) + return False + + if cds_only is True and (transcript.is_coding and other.is_coding): + logger.debug("Considering only the CDS of %s and %s, as they are both coding; stripping the UTR", + transcript.id, other.id) + transcript = transcript.deepcopy() + transcript.remove_utrs() + other = other.deepcopy() + other.remove_utrs() + logger.debug("New coordinates: %s (%d-%d), %s (%d-%d)", + transcript.id, transcript.start, transcript.end, + other.id, other.start, other.end) + + # Calculate the relationship between the transcripts + comparison, _ = c_compare(other, transcript) + + logger.debug("Starting to check %s and %s" ,transcript.id, other.id) + if comparison.n_f1[0] == 0: + # No overlap. Return False + logger.debug("No genomic overlap between %s and %s. 
Comparison: %s", + transcript.id, other.id, comparison) + return False # We do not want intersection with oneself + elif min(transcript.exon_num, other.exon_num) == 1: + logger.debug("%s and %s intersect (%s %s monoexonic); class code: %s", + transcript.id, other.id, + transcript.id if ( + transcript.exon_num == 1 and other.exon_num > 1) else + other.id if transcript.exon_num > 1 else "both", + "is" if max(transcript.exon_num, other.exon_num) > 1 else "are", + comparison.ccode) + return True + elif comparison.j_f1[0] > 0 or comparison.ccode[0] == "h": + # Simple case: they do intersect! + logger.debug("%s and %s intersect; class code: %s", transcript.id, other.id, comparison.ccode) + return True + elif comparison.ccode[0] == "o": + # Is at least one intron completely contained? + if cls._intron_contained_in_exon(transcript, other): + logger.debug("At least 1 intron of %s is completely contained within an exon of %s\n%s\n%s", + transcript.id, + other.id, + transcript.combined_cds_introns, other.coding_exons) + return True + elif cls._intron_contained_in_exon(other, transcript): + logger.debug("At least 1 intron of %s is completely contained within an exon of %s", + other.id, transcript.id) + return True + + cdna_overlap = max(comparison.n_recall[0], comparison.n_prec[0]) / 100 + if cds_only is True and (transcript.is_coding and other.is_coding): + if cdna_overlap >= max(min_cds_overlap, min_cdna_overlap): + logger.debug("%s and %s have %s of CDS overlap (considering only the coding portion), they intersect.", + transcript.id, other.id, cdna_overlap) + return True + else: + logger.debug( + """Considering only the CDS, %s and %s do not have enough overlap (tot %s), they do not intersect. + Comparison: %s""", + transcript.id, other.id, cdna_overlap, comparison) + return False + elif not (cds_only is True and (transcript.is_coding and other.is_coding)): + logger.debug("Considering also the CDS of %s and %s, for the overlap.", + transcript.id, other.id) + cds_transcript = transcript.deepcopy() + cds_transcript.remove_utrs() + cds_other = other.deepcopy() + cds_other.remove_utrs() + cds_comparison, _ = c_compare(cds_other, cds_transcript) + cds_overlap = max(cds_comparison.n_recall[0], cds_comparison.n_prec[0]) / 100 + if cdna_overlap >= min_cdna_overlap and cds_overlap >= min_cds_overlap: + logger.debug("%s and %s have enough of CDS (%s) and cDNA (%s) overlap, intersecting.", + transcript.id, other.id, cds_overlap, cdna_overlap) + return True + else: + logger.debug("%s and %s do not have enough of CDS (%s) and cDNA (%s) overlap, not intersecting.", + transcript.id, other.id, cds_overlap, cdna_overlap) + return False + else: + logger.debug("%s and %s fail to meet the requirements to intersect.") + return False + + @staticmethod + def _intron_contained_in_exon(transcript: Transcript, other: Transcript) -> bool: + + """Mini-method to assess whether at least one intron of "transcript" is **completely** contained + within an exon of "other".""" + + return any((overlap(*_) == (_[0][1] - _[0][0])) for _ in itertools.product(transcript.introns, other.exons)) + + @classmethod + def is_intersecting_old(cls, transcript, other, cds_only=False, logger=None, simple_overlap=False): """ Implementation of the is_intersecting method. 
Now that we are comparing transcripts that by definition span multiple subloci, we have to be less strict in our definition of what @@ -372,8 +512,8 @@ def in_locus(cls, monosublocus: Abstractlocus, is_in_locus = cls.is_intersecting(tran, transcript, logger=logger, - cds_only=cds_only, - simple_overlap=simple_overlap) + cds_only=cds_only) + # simple_overlap=simple_overlap) if is_in_locus is True: break return is_in_locus diff --git a/Mikado/loci/reference_gene.py b/Mikado/loci/reference_gene.py index a3a238e76..411c5b605 100644 --- a/Mikado/loci/reference_gene.py +++ b/Mikado/loci/reference_gene.py @@ -5,15 +5,15 @@ Minimal checks. """ +import copy import logging import operator -import copy -from .transcript import Transcript +from sys import intern +from ..transcripts.transcript import Transcript from ..exceptions import InvalidTranscript, InvalidCDS from ..parsers.GFF import GffLine from ..parsers.GTF import GtfLine from ..utilities.log_utils import create_null_logger -from sys import intern class Gene: diff --git a/Mikado/loci/sublocus.py b/Mikado/loci/sublocus.py index bec5459a7..bdaf91d99 100644 --- a/Mikado/loci/sublocus.py +++ b/Mikado/loci/sublocus.py @@ -7,14 +7,14 @@ """ import itertools +from sys import version_info +import numpy +from sklearn.ensemble import RandomForestClassifier +from ..transcripts.transcript import Transcript from .abstractlocus import Abstractlocus from .excluded import Excluded from .monosublocus import Monosublocus -from .transcript import Transcript from ..parsers.GFF import GffLine -from sys import version_info -import numpy -from sklearn.ensemble import RandomForestClassifier if version_info.minor < 5: from sortedcontainers import SortedDict else: diff --git a/Mikado/loci/superlocus.py b/Mikado/loci/superlocus.py index 713f46caf..72f22d928 100644 --- a/Mikado/loci/superlocus.py +++ b/Mikado/loci/superlocus.py @@ -10,24 +10,25 @@ import collections from sys import version_info import networkx +from sqlalchemy import bindparam from sqlalchemy.engine import Engine -from ..utilities import dbutils, grouper +from sqlalchemy.ext import baked from sqlalchemy.orm.session import sessionmaker from sqlalchemy.sql.expression import and_ -from sqlalchemy import bindparam -from sqlalchemy.ext import baked -from ..serializers.junction import Junction, Chrom -from ..serializers.blast_serializer import Hit, Query, Target -from ..serializers.external import External -from ..serializers.orf import Orf +from ..transcripts.transcript import Transcript from .abstractlocus import Abstractlocus -from .monosublocus import Monosublocus from .excluded import Excluded -from .transcript import Transcript -from .sublocus import Sublocus +from .monosublocus import Monosublocus from .monosublocusholder import MonosublocusHolder -from ..parsers.GFF import GffLine +from .sublocus import Sublocus from ..exceptions import NoJsonConfigError, NotInLocusError +from ..parsers.GFF import GffLine +from ..serializers.blast_serializer import Hit, Query, Target +from ..serializers.external import External +from ..serializers.junction import Junction, Chrom +from ..serializers.orf import Orf +from ..utilities import dbutils, grouper + if version_info.minor < 5: from sortedcontainers import SortedDict else: @@ -678,10 +679,12 @@ def __prefilter_transcripts(self): self.logger.debug("No transcripts to be excluded for %s", self.id) return else: - self.logger.debug("%d transcript%s do not pass the requirements for %s", + self.logger.debug("""%d transcript%s do not pass the requirements for %s; 
+expression: %s""", len(not_passing), "" if len(not_passing) == 1 else "s", - self.id) + self.id, + self.json_conf["requirements"]["expression"]) if self.purge is True: self.logger.debug("Purging %d transcript%s from %s", @@ -1136,8 +1139,8 @@ def define_alternative_splicing(self): t_graph = self.define_graph(self.transcripts, inters=MonosublocusHolder.is_intersecting, cds_only=cds_only, - logger=self.logger, - simple_overlap=simple_overlap) + logger=self.logger) + # simple_overlap=simple_overlap) cliques = self.find_cliques(t_graph) @@ -1178,8 +1181,8 @@ def calculate_mono_metrics(self): if MonosublocusHolder.in_locus(holder, monosublocus_instance, logger=self.logger, - cds_only=cds_only, - simple_overlap=simple_overlap): + cds_only=cds_only): + # simple_overlap=simple_overlap): holder.add_monosublocus(monosublocus_instance) found_holder = True break diff --git a/Mikado/preparation/annotation_parser.py b/Mikado/preparation/annotation_parser.py index 0ea44184b..e79baf29a 100644 --- a/Mikado/preparation/annotation_parser.py +++ b/Mikado/preparation/annotation_parser.py @@ -1,5 +1,5 @@ import multiprocessing -from ..utilities import to_gff +from ..parsers import to_gff from ..utilities.log_utils import create_queue_logger import logging import logging.handlers diff --git a/Mikado/preparation/checking.py b/Mikado/preparation/checking.py index 89f6a3ea4..c09f1f069 100644 --- a/Mikado/preparation/checking.py +++ b/Mikado/preparation/checking.py @@ -1,11 +1,13 @@ -from ..loci import Transcript -from ..loci.transcriptchecker import TranscriptChecker -from ..utilities.log_utils import create_null_logger, create_queue_logger -import pyfaidx +import functools +import multiprocessing import os + +import pyfaidx + +from Mikado.transcripts.transcriptchecker import TranscriptChecker from .. import exceptions -import multiprocessing -import functools +from ..loci import Transcript +from ..utilities.log_utils import create_null_logger, create_queue_logger __author__ = 'Luca Venturini' diff --git a/Mikado/preparation/prepare.py b/Mikado/preparation/prepare.py index 387cf3926..7c49ffff5 100644 --- a/Mikado/preparation/prepare.py +++ b/Mikado/preparation/prepare.py @@ -3,6 +3,7 @@ import gc from .checking import create_transcript, CheckingProcess from .annotation_parser import AnnotationParser, load_from_gtf, load_from_gff +from ..parsers import to_gff import operator import collections import io @@ -15,7 +16,7 @@ import multiprocessing.sharedctypes import pyfaidx import logging -from ..utilities import path_join, to_gff, merge_partial +from ..utilities import path_join, merge_partial from collections import Counter import sqlite3 try: diff --git a/Mikado/scales/accountant.py b/Mikado/scales/accountant.py index 2f18ffbf6..c5b84cd3b 100644 --- a/Mikado/scales/accountant.py +++ b/Mikado/scales/accountant.py @@ -6,13 +6,14 @@ """ import argparse -import operator import collections -from logging import handlers as log_handlers import logging -from ..loci.transcript import Transcript -from .resultstorer import ResultStorer +import operator +from logging import handlers as log_handlers + +from Mikado.transcripts.transcript import Transcript from . import calc_f1 +from .resultstorer import ResultStorer # noinspection PyPropertyAccess,PyPropertyAccess,PyPropertyAccess diff --git a/Mikado/scales/assigner.py b/Mikado/scales/assigner.py index 0c39d09ae..446a92a6f 100644 --- a/Mikado/scales/assigner.py +++ b/Mikado/scales/assigner.py @@ -4,23 +4,24 @@ This class is the main workhorse of the compare.py utility. 
""" -import sys +import argparse +import collections import csv -from ..utilities.intervaltree import IntervalTree -from logging import handlers as log_handlers -import queue +import gzip import logging -import collections -import argparse import operator +import queue +import sys from collections import namedtuple -from .resultstorer import ResultStorer -from ..loci.transcript import Transcript -from ..exceptions import InvalidTranscript, InvalidCDS +from functools import partial +from logging import handlers as log_handlers + +from Mikado.transcripts.transcript import Transcript from .accountant import Accountant from .contrast import compare as c_compare -from functools import partial -import gzip +from .resultstorer import ResultStorer +from ..exceptions import InvalidTranscript, InvalidCDS +from ..utilities.intervaltree import IntervalTree # noinspection PyPropertyAccess,PyPropertyAccess diff --git a/Mikado/scales/compare.py b/Mikado/scales/compare.py index 1e83238d4..6b2d30c8f 100644 --- a/Mikado/scales/compare.py +++ b/Mikado/scales/compare.py @@ -7,22 +7,22 @@ """ import collections +import csv import logging import multiprocessing import os import re import sys -import csv from logging import handlers as log_handlers -from ..loci.reference_gene import Gene +from Mikado.transcripts.transcript import Transcript from .accountant import Accountant from .assigner import Assigner from .resultstorer import ResultStorer -from ..loci.transcript import Transcript +from ..exceptions import CorruptIndex +from ..loci.reference_gene import Gene from ..parsers.GFF import GFF3 +from ..parsers import to_gff from ..utilities.log_utils import create_default_logger, formatter -from ..exceptions import CorruptIndex -from ..utilities import to_gff try: import ujson as json except ImportError: diff --git a/Mikado/serializers/orf.py b/Mikado/serializers/orf.py index b99676f78..c2b164953 100644 --- a/Mikado/serializers/orf.py +++ b/Mikado/serializers/orf.py @@ -17,7 +17,7 @@ from ..parsers import bed12 # , GFF from .blast_serializer import Query from ..utilities.log_utils import create_null_logger, check_logger -from ..utilities import to_gff +from ..parsers import to_gff # from ..loci import Transcript # from Bio import SeqIO diff --git a/Mikado/subprograms/__init__.py b/Mikado/subprograms/__init__.py index b4c667d82..20692f4f2 100644 --- a/Mikado/subprograms/__init__.py +++ b/Mikado/subprograms/__init__.py @@ -2,9 +2,7 @@ """This module contains the subprograms launched by the Mikado suite""" -from ..parsers import GTF, GFF -from ..utilities import to_gff - +from ..parsers import GTF, GFF, to_gff # noinspection PyPep8 from . import configure # noinspection PyPep8 diff --git a/Mikado/subprograms/util/awk_gtf.py b/Mikado/subprograms/util/awk_gtf.py index 2990eba44..32ec26b8f 100644 --- a/Mikado/subprograms/util/awk_gtf.py +++ b/Mikado/subprograms/util/awk_gtf.py @@ -5,11 +5,11 @@ Script to extract features from a GTF with certain coordinates. 
""" -import sys import argparse -from ...parsers.GTF import GTF -from ...loci.transcript import Transcript +import sys +from Mikado.transcripts.transcript import Transcript +from ...parsers.GTF import GTF __author__ = 'Luca Venturini' diff --git a/Mikado/subprograms/util/convert.py b/Mikado/subprograms/util/convert.py index 1cedc9537..34438c6f4 100644 --- a/Mikado/subprograms/util/convert.py +++ b/Mikado/subprograms/util/convert.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import argparse import sys -from ...utilities import to_gff +from ...parsers import to_gff from ...loci import Transcript, Gene diff --git a/Mikado/subprograms/util/trim.py b/Mikado/subprograms/util/trim.py index 2e213058d..e17062947 100644 --- a/Mikado/subprograms/util/trim.py +++ b/Mikado/subprograms/util/trim.py @@ -5,7 +5,6 @@ import argparse import sys - from .. import to_gff from ...exceptions import InvalidTranscript from ...loci import Transcript diff --git a/Mikado/tests/locus_tester.py b/Mikado/tests/locus_tester.py index 1c5b4b375..ce1cd063d 100644 --- a/Mikado/tests/locus_tester.py +++ b/Mikado/tests/locus_tester.py @@ -11,12 +11,13 @@ from Mikado import exceptions from Mikado.parsers import GFF # ,GTF, bed12 from Mikado.parsers.GTF import GtfLine -from Mikado.loci import Transcript, Superlocus, Abstractlocus, Locus, MonosublocusHolder, Sublocus +from Mikado.loci import Transcript, Superlocus, Abstractlocus, Locus, Monosublocus, MonosublocusHolder, Sublocus from Mikado.utilities.log_utils import create_null_logger, create_default_logger from Mikado.utilities import overlap from Mikado.utilities.intervaltree import Interval import Mikado.loci import pickle +import inspect class OverlapTester(unittest.TestCase): @@ -35,7 +36,7 @@ def test_overlap(self): class LocusTester(unittest.TestCase): - logger = create_null_logger("locus_tester") + logger = create_null_logger(inspect.getframeinfo(inspect.currentframe())[2]) def setUp(self): @@ -88,7 +89,7 @@ def setUp(self): def test_locus(self): """Basic testing of the Locus functionality.""" - logger = create_null_logger("test_locus") + logger = create_null_logger(inspect.getframeinfo(inspect.currentframe())[2]) logger.setLevel("WARNING") logger.info("Started") self.transcript1.logger = logger @@ -171,7 +172,7 @@ def test_verified_introns(self): def test_boolean_requirement(self): - logger = create_null_logger("test_boolean_requirement") + logger = create_null_logger(inspect.getframeinfo(inspect.currentframe())[2]) logger.setLevel("DEBUG") logger.info("Started") @@ -203,44 +204,34 @@ def test_boolean_requirement(self): jconf["requirements"] = dict() jconf["requirements"]["parameters"] = dict() - jconf["requirements"]["expression"] = "evaluated['suspicious_splicing']" + jconf["requirements"]["expression"] = ["suspicious_splicing"] jconf["requirements"]["parameters"]["suspicious_splicing"] = dict() jconf["requirements"]["parameters"]["suspicious_splicing"]["operator"] = "ne" jconf["requirements"]["parameters"]["suspicious_splicing"]["name"] = "suspicious_splicing" jconf["requirements"]["parameters"]["suspicious_splicing"]["value"] = True - if "compiled" in jconf["requirements"]: - del jconf["requirements"]["compiled"] jconf["pick"]["alternative_splicing"]["report"] = False - - loc = Superlocus(t1, json_conf=jconf) - loc.add_transcript_to_locus(t2) - loc.add_transcript_to_locus(t3) - - loc.define_subloci() - - self.assertEqual(len(loc.transcripts), 3) - - # Set it as suspicious - t2.attributes["canonical_on_reverse_strand"] = True - self.assertTrue(t2.suspicious_splicing) - - 
jconf["requirements"]["expression"] = "evaluated['suspicious_splicing']" - jconf["requirements"]["parameters"]["suspicious_splicing"] = dict() - jconf["requirements"]["parameters"]["suspicious_splicing"]["operator"] = "ne" - jconf["requirements"]["parameters"]["suspicious_splicing"]["name"] = "suspicious_splicing" - jconf["requirements"]["parameters"]["suspicious_splicing"]["value"] = True - if "compiled" in jconf["requirements"]: - del jconf["requirements"]["compiled"] - jconf["pick"]["alternative_splicing"]["report"] = False - - loc = Superlocus(t1, json_conf=jconf, logger=logger) - loc.add_transcript_to_locus(t2) - loc.add_transcript_to_locus(t3) - - loc.define_subloci() - - self.assertEqual(len(loc.transcripts), 2) + # Necessary to make sure that the externally-specified requirements are taken in + configurator.check_all_requirements(jconf) + self.assertEqual( + jconf["requirements"]["expression"], + "evaluated[\"suspicious_splicing\"]") + + jconf = configurator.check_json(jconf) + self.assertEqual( + jconf["requirements"]["expression"], + "evaluated[\"suspicious_splicing\"]") + + logger = create_default_logger(inspect.getframeinfo(inspect.currentframe())[2]) + for suspicious in (False, True): + with self.subTest(suspicious=suspicious): + loc = Superlocus(t1, json_conf=jconf, logger=logger) + t2.attributes["canonical_on_reverse_strand"] = suspicious + loc.add_transcript_to_locus(t2) + loc.add_transcript_to_locus(t3) + self.assertEqual(len(loc.transcripts), 3) + loc.define_subloci() + self.assertEqual(len(loc.transcripts), 3 if not suspicious else 2) class ASeventsTester(unittest.TestCase): @@ -251,6 +242,7 @@ def setUp(self): self.conf = configurator.to_json(None) self.conf["pick"]["alternative_splicing"] = dict() + self.conf["pick"]["alternative_splicing"]["report"] = True self.conf["pick"]["alternative_splicing"]["max_utr_length"] = 10000 self.conf["pick"]["alternative_splicing"]["max_fiveutr_length"] = 10000 self.conf["pick"]["alternative_splicing"]["max_threeutr_length"] = 10000 @@ -563,10 +555,14 @@ def test_intron_not_contained_in_exon(self): t2.end = 3000 t2.add_exons([(1400, 1560), (2800, 3000)]) t2.finalize() - for simple_overlap in (True, False): - with self.subTest(simple_overlap=simple_overlap): + + logger = create_default_logger("test_intron_not_contained_in_exon") + + for min_cdna_overlap in (0.01, 1): + with self.subTest(min_cdna_overlap=min_cdna_overlap): self.assertIs(MonosublocusHolder.is_intersecting( - self.t1, t2, logger=None, simple_overlap=simple_overlap), simple_overlap) + self.t1, t2, logger=None, min_cdna_overlap=min_cdna_overlap, + min_cds_overlap=min_cdna_overlap), (min_cdna_overlap != 1)) def test_noCDSOverlap(self): @@ -616,7 +612,16 @@ def test_only_CDS_overlap(self): t2.add_exons([(1250, 1560), (1801, 2000)]) t2.add_exons([(1401, 1560), (1801, 1850)], "CDS") t2.finalize() - self.assertTrue(MonosublocusHolder.is_intersecting(self.t1, t2, cds_only=True)) + logger = create_default_logger(inspect.getframeinfo(inspect.currentframe())[2]) + + for min_cds_overlap in [0.05, 0.1, 0.15, 0.2, 0.5]: + with self.subTest(min_cds_overlap=min_cds_overlap): + self.assertIs(MonosublocusHolder.is_intersecting(self.t1, t2, + cds_only=True, + logger=logger, + min_cds_overlap=min_cds_overlap, + min_cdna_overlap=0.01), + (min_cds_overlap <= 0.19)) t2.strip_cds() t2.finalized = False @@ -624,8 +629,13 @@ def test_only_CDS_overlap(self): t2.finalize() self.assertGreater(len(t2.introns), 0) self.assertGreater(len(t2.combined_cds_introns), 0) - # No CDS overlap this time - 
self.assertTrue(MonosublocusHolder.is_intersecting(self.t1, t2, cds_only=True)) + # No CDS overlap this time, but cDNA overlap. + for cds_only in (True, False): + with self.subTest(cds_only=cds_only): + self.assertIs(MonosublocusHolder.is_intersecting(self.t1, + t2, + cds_only=cds_only, + logger=logger), not cds_only) t2 = Transcript() t2.chrom = "Chr1" @@ -638,7 +648,13 @@ def test_only_CDS_overlap(self): t2.add_exons([(1350, 1560), (1801, 2000)]) t2.add_exons([(1401, 1560), (2801, 3850)], "CDS") t2.finalize() - self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, t2, cds_only=True)) + for min_overlap in [0.01, 0.05, 0.1, 0.2]: + with self.subTest(min_overlap=min_overlap): + self.assertIs(MonosublocusHolder.is_intersecting(self.t1, t2, + cds_only=True, + min_cds_overlap=min_overlap, + min_cdna_overlap=min_overlap, + logger=logger), (min_overlap <= 0.07)) def test_no_overlap(self): @@ -688,6 +704,7 @@ def setUp(self): self.json_conf = configurator.to_json(None) # self.json_conf["pick"] = dict() self.json_conf["pick"]["alternative_splicing"] = dict() + self.json_conf["pick"]["alternative_splicing"]["report"] = True self.json_conf["pick"]["alternative_splicing"]["max_utr_length"] = 2000 self.json_conf["pick"]["alternative_splicing"]["max_fiveutr_length"] = 1000 self.json_conf["pick"]["alternative_splicing"]["max_threeutr_length"] = 1000 @@ -696,10 +713,12 @@ def setUp(self): self.json_conf["pick"]["alternative_splicing"]["min_cds_overlap"] = 0 self.json_conf["pick"]["alternative_splicing"]["min_cdna_overlap"] = 0 self.json_conf["pick"]["alternative_splicing"]["min_score_perc"] = 0.1 - self.json_conf["pick"]["alternative_splicing"]["valid_ccodes"] = ["j", "n", "O", "mo"] - self.json_conf["pick"]["alternative_splicing"]["redundant_ccodes"] = ["c", "=", "_", "m"] + self.json_conf["pick"]["alternative_splicing"]["valid_ccodes"] = ["j", "G", "g"] + self.json_conf["pick"]["alternative_splicing"]["redundant_ccodes"] = ["c", "=", "_", "m", "n"] self.json_conf["pick"]["alternative_splicing"]["only_confirmed_introns"] = False + self.json_conf = configurator.check_json(self.json_conf) + t1 = """Chr1\tfoo\ttranscript\t1001\t3000\t.\t+\t.\tgene_id "Chr1.1"; transcript_id "Chr1.1.1"; Chr1\tfoo\texon\t1001\t1300\t.\t+\t.\tgene_id "Chr1.1"; transcript_id "Chr1.1.1"; Chr1\tfoo\tCDS\t1101\t1300\t.\t+\t.\tgene_id "Chr1.1"; transcript_id "Chr1.1.1"; @@ -1415,6 +1434,58 @@ def test_mixed_strands(self): self.assertEqual(sup.transcripts["t2"].retained_intron_num, 0) +class PicklingTest(unittest.TestCase): + + def setUp(self): + t1 = Transcript() + t1.chrom, t1.strand, t1.id = 1, "+", "t1" + t1.add_exons([(101, 500), (801, 1000), (1201, 1300), (1501, 1800)]) + t1.add_exons([(201, 500), # 300 + (801, 1000), # 200 + (1201, 1300), # 100 + (1501, 1530) # 30 + ], features="CDS") + t1.finalize() + + t2 = Transcript() + t2.chrom, t2.strand, t2.id = 1, "+", "t2" + t2.add_exons([(101, 500), (801, 1000), (1201, 1600)]) + t2.add_exons([(201, 500), # 300 + (801, 1000), # 200 + (1201, 1420), # 220 + ], features="CDS") + t2.finalize() + + t3 = Transcript() + t3.chrom, t3.strand, t3.id = 1, "+", "t3" + t3.add_exons([(101, 500), (801, 970), (1100, 1180)]) + t3.add_exons([(101, 500), (801, 970), (1100, 1130)], features="CDS") + t3.finalize() + + self.t1, self.t2, self.t3 = t1, t2, t3 + self.json_conf = configurator.to_json(None) + + def test_transcript_pickling(self): + + for transcript in [self.t1, self.t2, self.t3]: + with self.subTest(transcript=transcript): + pickled = pickle.dumps(transcript) + unpickled = 
pickle.loads(pickled)
+                self.assertEqual(transcript, unpickled)
+
+    def test_locus_unpickling(self):
+
+        for transcript in [self.t1, self.t2, self.t3]:
+            for (loc_type, loc_name) in [(_, _.__name__) for _ in (Superlocus, Sublocus, Monosublocus, Locus)]:
+                with self.subTest(transcript=transcript, loc_type=loc_type, loc_name=loc_name):
+                    try:
+                        loc = loc_type(transcript, json_conf=self.json_conf)
+                    except TypeError as exc:
+                        raise TypeError("{}\n{}".format(loc_name, exc))
+                    pickled = pickle.dumps(loc)
+                    unpickled = pickle.loads(pickled)
+                    self.assertEqual(loc, unpickled)
+
 if __name__ == '__main__':
     unittest.main(verbosity=2)
diff --git a/Mikado/tests/test_abstractlocus.py b/Mikado/tests/test_abstractlocus.py
deleted file mode 100644
index 38b0787df..000000000
--- a/Mikado/tests/test_abstractlocus.py
+++ /dev/null
@@ -1,6 +0,0 @@
-import Mikado
-import logging
-import unittest
-
-__author__ = 'Luca Venturini'
-
diff --git a/Mikado/tests/test_clique_methods.py b/Mikado/tests/test_clique_methods.py
index 7a71df089..c1fe755ed 100644
--- a/Mikado/tests/test_clique_methods.py
+++ b/Mikado/tests/test_clique_methods.py
@@ -1,9 +1,10 @@
-from Mikado.loci.clique_methods import find_cliques, find_communities
-from Mikado.loci.clique_methods import _get_unvisited_neighbours, reid_daid_hurley
-from Mikado.loci.clique_methods import define_graph
-import networkx
 import unittest
 
+import networkx
+
+from Mikado.transcripts.clique_methods import find_cliques, find_communities
+from Mikado.transcripts.clique_methods import reid_daid_hurley
+
 
 class TestCliques(unittest.TestCase):
 
diff --git a/Mikado/tests/test_excluded.py b/Mikado/tests/test_excluded.py
deleted file mode 100644
index 1b6d5ae7d..000000000
--- a/Mikado/tests/test_excluded.py
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/env python3
-
-import Mikado
-import unittest
-import logging
-
-__author__ = 'Luca Venturini'
-
-
-class TestExcluded(unittest.TestCase):
-    pass
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/Mikado/tests/test_external_scores.py b/Mikado/tests/test_external_scores.py
index 1aded404d..6961df817 100644
--- a/Mikado/tests/test_external_scores.py
+++ b/Mikado/tests/test_external_scores.py
@@ -1,10 +1,6 @@
 import unittest
+
 from Mikado.loci import Transcript
-from Mikado.loci.transcript_methods import retrieval
-from Mikado.parsers.bed12 import BED12
-from Mikado.configuration.configurator import to_json
-import os
-from sqlalchemy.engine import reflection
 
 
 class ExternalTester(unittest.TestCase):
diff --git a/Mikado/tests/test_invalid_orfs.py b/Mikado/tests/test_invalid_orfs.py
index 3955ca78c..c63702294 100644
--- a/Mikado/tests/test_invalid_orfs.py
+++ b/Mikado/tests/test_invalid_orfs.py
@@ -1,10 +1,11 @@
-import Mikado.loci.transcript
-import Mikado.utilities.log_utils
-import Mikado.exceptions
-import Mikado.parsers
 import re
 import unittest
 
+import Mikado.exceptions
+import Mikado.parsers
+import Mikado.transcripts.transcript
+import Mikado.utilities.log_utils
+
 __author__ = 'Luca Venturini'
 
 
@@ -49,7 +50,7 @@ class MultOrfTester(unittest.TestCase):
     def setUp(self):
         """Basic creation test."""
 
-        self.tr = Mikado.loci.transcript.Transcript(self.tr_gff_lines[0], logger=self.logger)
+        self.tr = Mikado.transcripts.transcript.Transcript(self.tr_gff_lines[0], logger=self.logger)
         for line in self.tr_gff_lines[1:]:
             self.tr.add_exon(line)
 
diff --git a/Mikado/tests/test_splitting.py b/Mikado/tests/test_splitting.py
index 6db83f759..a5fecbfcc 100644
--- a/Mikado/tests/test_splitting.py
+++ 
b/Mikado/tests/test_splitting.py @@ -1,11 +1,13 @@ #!/usr/bin/env python3 +import logging import operator -import Mikado -from Mikado.loci.transcript_methods import splitting import unittest -import logging from sys import version_info + +import Mikado +from Mikado.transcripts.transcript_methods import splitting + if version_info.minor < 5: from sortedcontainers import SortedDict else: diff --git a/Mikado/tests/test_system_calls.py b/Mikado/tests/test_system_calls.py index 4fe051efa..57027db77 100644 --- a/Mikado/tests/test_system_calls.py +++ b/Mikado/tests/test_system_calls.py @@ -1,27 +1,29 @@ -import unittest -from Mikado.preparation import prepare -from Mikado.configuration import configurator, daijin_configurator -import pkg_resources -import tempfile -from Mikado.loci.transcript import Namespace -from Mikado.utilities.log_utils import create_null_logger -from Mikado.scales.compare import compare, load_index -import logging +import csv +import glob import gzip -import pyfaidx -import os import itertools -import csv -from Mikado.parsers import to_gff -from Mikado.subprograms.util.stats import Calculator -from Mikado.picking import picker -import Mikado.subprograms.configure -import Mikado.daijin +import logging +import os import random import sys -import glob +import tempfile +import unittest + +import pkg_resources +import pyfaidx import yaml +import Mikado.daijin +import Mikado.subprograms.configure +from Mikado.configuration import configurator, daijin_configurator +from Mikado.parsers import to_gff +from Mikado.picking import picker +from Mikado.preparation import prepare +from Mikado.scales.compare import compare, load_index +from Mikado.subprograms.util.stats import Calculator +from Mikado.transcripts.transcript import Namespace +from Mikado.utilities.log_utils import create_null_logger + class PrepareCheck(unittest.TestCase): diff --git a/Mikado/tests/test_transcript_checker.py b/Mikado/tests/test_transcript_checker.py index cf8949b87..e52f00546 100644 --- a/Mikado/tests/test_transcript_checker.py +++ b/Mikado/tests/test_transcript_checker.py @@ -1,15 +1,13 @@ -from Mikado.loci.transcriptchecker import TranscriptChecker -from Mikado.loci.transcript import Transcript -from Mikado.parsers.GFF import GffLine -from Mikado.parsers.GTF import GtfLine -import unittest -import pyfaidx -import pkg_resources -import tempfile import gzip import os - -# TODO: write this test! 
+import tempfile +import unittest +import pkg_resources +import pyfaidx +from Mikado.transcripts.transcriptchecker import TranscriptChecker +from Mikado.parsers.GFF import GffLine +from Mikado.parsers.GTF import GtfLine +from Mikado.transcripts.transcript import Transcript class TChekerTester(unittest.TestCase): diff --git a/Mikado/tests/test_transcript_methods.py b/Mikado/tests/test_transcript_methods.py index a54c89fd5..a2ec6e592 100644 --- a/Mikado/tests/test_transcript_methods.py +++ b/Mikado/tests/test_transcript_methods.py @@ -1,10 +1,12 @@ +import os import unittest + +from sqlalchemy.engine import reflection + +from Mikado.configuration.configurator import to_json from Mikado.loci import Transcript -from Mikado.loci.transcript_methods import retrieval from Mikado.parsers.bed12 import BED12 -from Mikado.configuration.configurator import to_json -import os -from sqlalchemy.engine import reflection +from Mikado.transcripts.transcript_methods import retrieval class WrongLoadedOrf(unittest.TestCase): diff --git a/Mikado/tests/transcript_tester_single.py b/Mikado/tests/transcript_tester_single.py index 44de3a77b..62db80a4f 100644 --- a/Mikado/tests/transcript_tester_single.py +++ b/Mikado/tests/transcript_tester_single.py @@ -4,16 +4,14 @@ Unit test for monoexonic transcripts. """ -import unittest +import operator import re -from Mikado.loci.transcript_methods.finalizing import _check_cdna_vs_utr -import intervaltree -import Mikado.parsers +import unittest + +import Mikado.exceptions import Mikado.loci +import Mikado.parsers from Mikado.loci import Transcript -import Mikado.exceptions -import operator -# from Mikado.py.serializers.orf import Orf from Mikado.utilities.log_utils import create_null_logger, create_default_logger @@ -159,7 +157,7 @@ def test_invalid_transcript(self): transcript.add_exons(gff_lines[1:]) with self.assertRaises(Mikado.exceptions.InvalidCDS): - Mikado.loci.transcript_methods.finalizing._check_cdna_vs_utr(transcript) + Mikado.transcripts.transcript_methods.finalizing._check_cdna_vs_utr(transcript) def test_utr(self): @@ -573,7 +571,7 @@ def test_orf_sorter(self): after_sorting = sorted([bed, bed2], reverse=True, - key=Mikado.loci.transcript_methods.retrieval.orf_sorter + key=Mikado.transcripts.transcript_methods.retrieval.orf_sorter ) self.assertEqual(after_sorting[0], bed) diff --git a/Mikado/transcripts/__init__.py b/Mikado/transcripts/__init__.py new file mode 100644 index 000000000..8b3610477 --- /dev/null +++ b/Mikado/transcripts/__init__.py @@ -0,0 +1,8 @@ +# coding: utf-8 + +""" + This module defines the transcript-like objects. 
+""" + +from .transcript import Transcript +from .transcriptchecker import TranscriptChecker \ No newline at end of file diff --git a/Mikado/loci/clique_methods.py b/Mikado/transcripts/clique_methods.py similarity index 100% rename from Mikado/loci/clique_methods.py rename to Mikado/transcripts/clique_methods.py diff --git a/Mikado/loci/transcript.py b/Mikado/transcripts/transcript.py similarity index 99% rename from Mikado/loci/transcript.py rename to Mikado/transcripts/transcript.py index 6841588d5..66fe3d054 100644 --- a/Mikado/loci/transcript.py +++ b/Mikado/transcripts/transcript.py @@ -6,33 +6,34 @@ # pylint: disable=too-many-lines -import logging +import builtins import copy -from sys import intern, maxsize -import re +import functools import inspect +import logging +import re +from ast import literal_eval +from sys import intern, maxsize + import intervaltree -from ..utilities.log_utils import create_null_logger -from ..utilities.intervaltree import Interval, IntervalTree -from sqlalchemy.sql.expression import desc, asc # SQLAlchemy imports from sqlalchemy import and_ -from sqlalchemy.ext import baked from sqlalchemy import bindparam +from sqlalchemy.ext import baked +from sqlalchemy.sql.expression import desc, asc # SQLAlchemy imports from ..exceptions import ModificationError, InvalidTranscript, CorruptIndex +from ..parsers.GFF import GffLine +from ..parsers.GTF import GtfLine +from ..parsers.bed12 import BED12 from ..serializers.blast_serializer import Query, Hit from ..serializers.external import External from ..serializers.orf import Orf -from .clique_methods import find_communities, define_graph -from ..parsers.GTF import GtfLine -from ..parsers.GFF import GffLine -from ..parsers.bed12 import BED12 +from ..transcripts.clique_methods import find_communities, define_graph +from ..utilities.log_utils import create_null_logger from .transcript_methods import splitting, retrieval +from .transcript_methods.finalizing import finalize from .transcript_methods.printing import create_lines_cds from .transcript_methods.printing import create_lines_no_cds, create_lines_bed, as_bed12 -from .transcript_methods.finalizing import finalize -import functools -import builtins -from ast import literal_eval +from ..utilities.intervaltree import Interval, IntervalTree class Namespace: diff --git a/Mikado/loci/transcript_methods/__init__.py b/Mikado/transcripts/transcript_methods/__init__.py similarity index 100% rename from Mikado/loci/transcript_methods/__init__.py rename to Mikado/transcripts/transcript_methods/__init__.py diff --git a/Mikado/loci/transcript_methods/finalizing.py b/Mikado/transcripts/transcript_methods/finalizing.py similarity index 99% rename from Mikado/loci/transcript_methods/finalizing.py rename to Mikado/transcripts/transcript_methods/finalizing.py index bb053a948..921540a85 100644 --- a/Mikado/loci/transcript_methods/finalizing.py +++ b/Mikado/transcripts/transcript_methods/finalizing.py @@ -7,7 +7,7 @@ from Mikado.utilities.intervaltree import IntervalTree import intervaltree import operator -from ...exceptions import InvalidCDS, InvalidTranscript +from Mikado.exceptions import InvalidCDS, InvalidTranscript __author__ = 'Luca Venturini' diff --git a/Mikado/loci/transcript_methods/printing.py b/Mikado/transcripts/transcript_methods/printing.py similarity index 99% rename from Mikado/loci/transcript_methods/printing.py rename to Mikado/transcripts/transcript_methods/printing.py index 1ddfdde1c..2585cc325 100644 --- a/Mikado/loci/transcript_methods/printing.py +++ 
b/Mikado/transcripts/transcript_methods/printing.py @@ -7,9 +7,9 @@ from collections import Counter from itertools import zip_longest import functools -from ...parsers.GTF import GtfLine -from ...parsers.GFF import GffLine -from ...parsers.bed12 import BED12 +from Mikado.parsers.GTF import GtfLine +from Mikado.parsers.GFF import GffLine +from Mikado.parsers.bed12 import BED12 __author__ = 'Luca Venturini' diff --git a/Mikado/loci/transcript_methods/retrieval.py b/Mikado/transcripts/transcript_methods/retrieval.py similarity index 99% rename from Mikado/loci/transcript_methods/retrieval.py rename to Mikado/transcripts/transcript_methods/retrieval.py index 55e01f5ee..dd3c0fbf6 100644 --- a/Mikado/loci/transcript_methods/retrieval.py +++ b/Mikado/transcripts/transcript_methods/retrieval.py @@ -3,14 +3,15 @@ from the database/dictionary provided during the pick operation. """ - +import operator from itertools import groupby -from sqlalchemy.orm.session import sessionmaker -from ...utilities import dbutils -from ..clique_methods import define_graph, find_cliques, find_communities -from ...serializers.junction import Junction + from sqlalchemy import and_ -import operator +from sqlalchemy.orm.session import sessionmaker + +from Mikado.serializers.junction import Junction +from Mikado.transcripts.clique_methods import define_graph, find_cliques, find_communities +from Mikado.utilities import dbutils __author__ = 'Luca Venturini' diff --git a/Mikado/loci/transcript_methods/splitting.py b/Mikado/transcripts/transcript_methods/splitting.py similarity index 99% rename from Mikado/loci/transcript_methods/splitting.py rename to Mikado/transcripts/transcript_methods/splitting.py index e6d9a667b..532ab99bb 100644 --- a/Mikado/loci/transcript_methods/splitting.py +++ b/Mikado/transcripts/transcript_methods/splitting.py @@ -12,9 +12,9 @@ import collections import operator from ...utilities.intervaltree import IntervalTree, Interval -from ...utilities import overlap -from ...exceptions import InvalidTranscript -from ...parsers.blast_utils import merge +from Mikado.utilities import overlap +from Mikado.exceptions import InvalidTranscript +from Mikado.parsers.blast_utils import merge __author__ = 'Luca Venturini' diff --git a/Mikado/loci/transcriptchecker.py b/Mikado/transcripts/transcriptchecker.py similarity index 100% rename from Mikado/loci/transcriptchecker.py rename to Mikado/transcripts/transcriptchecker.py diff --git a/Mikado/utilities/__init__.py b/Mikado/utilities/__init__.py index 357a88348..1dd3b18a0 100644 --- a/Mikado/utilities/__init__.py +++ b/Mikado/utilities/__init__.py @@ -8,10 +8,10 @@ from . import dbutils from . 
import log_utils
 import collections
-from ..parsers import to_gff
+import gzip
 from itertools import zip_longest
 from .overlap import overlap
-import gzip
+# from ..parsers import to_gff

 __author__ = 'Luca Venturini'

diff --git a/util/bam2gtf.py b/util/bam2gtf.py
index cdeed179f..ec33e83d3 100644
--- a/util/bam2gtf.py
+++ b/util/bam2gtf.py
@@ -1,10 +1,11 @@
 #!/usr/bin/env python3

+import argparse
 import re
 import sys
-import argparse
+
 import pysam
-from Mikado.loci.transcript import Transcript
+from Mikado.transcripts.transcript import Transcript


 def to_bam(string):
diff --git a/util/gffjunc_to_bed12.py b/util/gffjunc_to_bed12.py
index b2e92724f..7b0086b24 100755
--- a/util/gffjunc_to_bed12.py
+++ b/util/gffjunc_to_bed12.py
@@ -3,10 +3,11 @@

 """GFF -> BED12 converter for junction files."""

-import sys
 import argparse
-import Mikado.parsers
+import sys
+
 import Mikado.loci
+import Mikado.parsers


 def main():
@@ -45,7 +46,7 @@ def main():
             bed12.block_starts = [0, bed12.block_sizes[0] + introns[0][1] - introns[0][0]]
             print(bed12, file=args.out)

-            transcript = Mikado.loci.transcript.Transcript(row)
+            transcript = Mikado.transcripts.transcript.Transcript(row)
             if transcript is not None:
                 transcript.finalize()

From 4149da6cba79a9dd5b10a0d42db3a7790cdc005b Mon Sep 17 00:00:00 2001
From: Luca Venturini
Date: Fri, 3 Feb 2017 18:15:58 +0000
Subject: [PATCH 12/47] Implemented the new clustering algorithm, modified the configuration to make it tidier.

---
 CHANGELOG.md | 11 +-
 .../configuration_blueprint.json | 100 ++++---
 Mikado/configuration/daijin_configurator.py | 2 +-
 Mikado/configuration/daijin_schema.json | 3 +-
 Mikado/loci/abstractlocus.py | 7 +
 Mikado/loci/locus.py | 4 +-
 Mikado/loci/monosublocusholder.py | 279 +++++-------------
 Mikado/loci/superlocus.py | 31 +-
 Mikado/picking/picker.py | 8 +-
 Mikado/subprograms/pick.py | 8 +-
 Mikado/tests/locus_tester.py | 29 +-
 11 files changed, 193 insertions(+), 289 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6780ab34f..553dcf5a4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,12 +2,19 @@

 Changes in this release:

+- **MAJOR**: re-written the clustering algorithm for the MonosublocusHolder stage. Now a holder will accept another monosublocus if:
+    - the cDNA and CDS overlap is over a user-specified threshold
+    OR
+    - there is some intronic overlap
+    OR
+    - one intron of either transcript is completely contained within an exon of the other.
+- **MAJOR**: changed slightly the anatomy of the configuration files. Now "pick" has a new subsection, "clustering", dedicated to how to cluster the transcripts in the different steps. Currently it contains the keys "flank", "min_cdna_overlap" and "min_cds_overlap" (for the second clustering during the monosublocusHolder phase) and "cds_only" (to indicate whether we should only consider the CDS for clustering after the initial merging in the Superlocus).
 - Deprecated the "discard_definition" flag in Mikado serialise. Now Mikado will infer on its own whether to use the definition or the ID for serialising BLAST results.
-- Now AbstractLocus implementation will check at runtime that the configuration is correct.
+- Now AbstractLocus implementations have a private method to check the correctness of the json_conf. As a corollary, Transcript and children have been moved to their own subpackage ("transcripts") in order to break the circular dependency Mikado.loci.Abstractlocus <- Mikado.configurator <- Mikado.loci.Transcript. 
*Technical note*: checking the consistency of the configuration is an expensive operation, so it will be executed on demand rather than automatically. The latter scenario, in tests, led to an increase in runtime of the pick stage of over 300%.
 - Re-written the "find_retained_introns" method of AbstractLocus, to solve some bugs found during the utilisation of last beta. As a corollary, expanded the intervaltree module to allow searches for "tagged" intervals.
 - Now the "monoloci_out" files contain the Monosublocus**Holder** step, not the Monosublocus step. This should help during fine-tuning.
 - Mikado now supports also Python3.6.
-
+
 #Version 1.0.0beta9 - "External scores"
diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json
index d062d4c62..7ebc3f31d 100644
--- a/Mikado/configuration/configuration_blueprint.json
+++ b/Mikado/configuration/configuration_blueprint.json
@@ -587,32 +587,22 @@
       " deleted at the end of the run (see shm_db).",
       " for faster access. Default: false",
       "- exclude_cds: boolean flag. If set, the CDS information will not be printed in Mikado output. Default: false",
-      "- purge: boolean flag. If set, all loci where all transcripts have a score of 0 will be excluded",
-      " from the output. Default: false",
-      "- remove_overlapping_fragments: boolean flag. If set, fragments (defined as monoexonic loci",
-      " classified as P,x,i or p compared to another locus, will be removed from the output.",
-      "- fragments_maximal_cds: a locus will never be considered a fragment if its longest CDS is over",
-      " this length. Default: 100 bps.",
-      "- fragments_maximal_exons: a locus will never be considered a fragment if its representative transcript",
-      " has more than this number of exons. Default: 2",
       "- procs: number of processes to use. Default: 1",
       "- preload: boolean flag. If set, the whole database will be preloaded into memory for faster access. Useful when",
       " using SQLite databases.",
-      "- single_thread: boolean flag. If set, multithreading will be disabled - useful for profiling and debugging."
+      "- single_thread: boolean flag. If set, multithreading will be disabled - useful for profiling and debugging.",
+      "- consider_truncated_for_retained: boolean. Normally, Mikado considers only exons which span a whole intron as possible retained intron events. If this flag is set to true, also terminal exons will be considered.",
+      "- remove_overlapping_fragments: boolean, it specifies whether to remove putative fragments.",
+      "- purge: boolean, it specifies whether to remove loci where all transcripts fail the minimum checks, or whether to print them out in the subloci file instead."
     ],
     "SimpleComment": [
       "Generic run options.",
-      "- purge: boolean flag. If set, all loci where all transcripts have a score of 0 will be excluded",
-      " from the output. Default: false",
-      "- remove_overlapping_fragments: boolean flag. If set, fragments (defined as monoexonic loci",
-      " classified as P,x,i or p compared to another locus, will be removed from the output.",
      "- procs: number of processes to use. Default: 1",
      "- intron_range: A range where most of the introns (99%) should fall into. Transcripts with too many",
      " introns larger or smaller than what is defined in this range will be penalised",
      " in the scoring. Default: [60, 900]",
      "- preload: boolean flag. If set, the whole database will be preloaded into memory for potentially faster access.",
-      "- single_thread: boolean flag. 
If set, multithreading will be disabled - useful for profiling and debugging.", - "- flank: integer, maximum flank to group transcripts together for analysis. Default: 0." + "- single_thread: boolean flag. If set, multithreading will be disabled - useful for profiling and debugging." ], "properties": { "shm": { @@ -627,31 +617,6 @@ "type": "boolean", "default": false }, - "purge": { - "type": "boolean", - "default": true - }, - "remove_overlapping_fragments": { - "type": "boolean", - "default": true - }, - "flank":{ - "type": "integer", - "minimum": 0, - "default": 200 - }, - "fragments_maximal_cds": { - "type": "integer", - "default": 200 - }, - "fragments_maximal_exons": { - "type": "integer", - "default": 2 - }, - "fragments_maximal_cdna": { - "type": "integer", - "default": 400 - }, "intron_range": { "type": "array", "items": { @@ -663,14 +628,6 @@ "minItems": 2, "default": [60, 900] }, - "subloci_from_cds_only": { - "type": "boolean", - "default": false - }, - "monoloci_from_simple_overlap": { - "type": "boolean", - "default": false - }, "consider_truncated_for_retained": { "type": "boolean", "default": false @@ -687,6 +644,53 @@ "preload": { "type": "boolean", "default": false + }, + "remove_overlapping_fragments": { + "type": "boolean", + "default": true + }, + "purge": { + "type": "boolean", + "default": true + } + + } + }, + "clustering": { + "type": "object", + "Comment": [ + "Parameters related to the clustering of transcripts into loci.", + "- cds_only: boolean, it specifies whether to cluster transcripts only according to their CDS (if present).", + "- min_cds_overlap: minimal CDS overlap for the second clustering.", + "- min_cdna_overlap: minimal cDNA overlap for the second clustering.", + "- flank: maximum distance for transcripts to be clustered within the same superlocus." + ], + "SimpleComment": [ + "Parameters related to the clustering of transcripts into loci.", + "- flank: maximum distance for transcripts to be clustered within the same superlocus." 
+ ], + "required": ["flank"], + "properties":{ + "cds_only": { + "type": "boolean", + "default": false + }, + "min_cds_overlap":{ + "type": "number", + "minimum": 0.000001, + "maximum": 1, + "default": 0.2 + }, + "min_cdna_overlap":{ + "type": "number", + "minimum": 0.000001, + "maximum": 1, + "default": 0.2 + }, + "flank":{ + "type": "integer", + "minimum": 0, + "default": 200 } } }, diff --git a/Mikado/configuration/daijin_configurator.py b/Mikado/configuration/daijin_configurator.py index 3d01c995c..a8287ea0c 100644 --- a/Mikado/configuration/daijin_configurator.py +++ b/Mikado/configuration/daijin_configurator.py @@ -166,7 +166,7 @@ def create_daijin_config(args, level="ERROR"): config["mikado"]["pick"]["scoring_file"] = args.scoring if args.flank is not None: - config["mikado"]["pick"]["run_options"]["flank"] = args.flank + config["mikado"]["pick"]["clustering"]["flank"] = args.flank config["blastx"]["prot_db"] = args.prot_db assert "prot_db" in config["blastx"] diff --git a/Mikado/configuration/daijin_schema.json b/Mikado/configuration/daijin_schema.json index 850bae584..4006641dc 100644 --- a/Mikado/configuration/daijin_schema.json +++ b/Mikado/configuration/daijin_schema.json @@ -280,7 +280,8 @@ "run_options": { "type": "object", "properties": { - "flank": {"$ref": "configuration_blueprint.json#properties/pick/properties/run_options/properties/flank"} + "flank": { + "$ref": "configuration_blueprint.json#properties/pick/properties/clustering/properties/flank"} } } } diff --git a/Mikado/loci/abstractlocus.py b/Mikado/loci/abstractlocus.py index 10a4d5676..20c1c934c 100644 --- a/Mikado/loci/abstractlocus.py +++ b/Mikado/loci/abstractlocus.py @@ -869,6 +869,13 @@ def json_conf(self): @json_conf.setter def json_conf(self, conf): + self.__json_conf = conf + + def _check_json(self): + """Private method to be invoked to verify that the configuration is correct. 
+ Quite expensive to run, especially if done multiple times."""
+
+        conf = self.__json_conf
         if conf is None or isinstance(conf, (str, bytes)):
             conf = to_json(conf)
         elif isinstance(conf, dict):
diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py
index 57e11b02b..59fdf1c8e 100644
--- a/Mikado/loci/locus.py
+++ b/Mikado/loci/locus.py
@@ -584,7 +584,7 @@ def is_alternative_splicing(self, other):

         valid_ccodes = self.json_conf["pick"]["alternative_splicing"]["valid_ccodes"]
         redundant_ccodes = self.json_conf["pick"]["alternative_splicing"]["redundant_ccodes"]

-        if self.json_conf["pick"]["run_options"]["subloci_from_cds_only"] is True:
+        if self.json_conf["pick"]["clustering"]["cds_only"] is True:
             main_without_utr = self.primary_transcript.deepcopy()
             main_without_utr.remove_utrs()
             other_without_utr = other.deepcopy()
@@ -606,7 +606,7 @@ def is_alternative_splicing(self, other):
         for tid in iter(tid for tid in self.transcripts if
                         tid not in (self.primary_transcript_id, other.id)):
             candidate = self.transcripts[tid]
-            if self.json_conf["pick"]["run_options"]["subloci_from_cds_only"] is True:
+            if self.json_conf["pick"]["clustering"]["cds_only"] is True:
                 candidate = candidate.deepcopy()
                 candidate.remove_utrs()
             Assigner.compare(other_without_utr, candidate)
diff --git a/Mikado/loci/monosublocusholder.py b/Mikado/loci/monosublocusholder.py
index 3bdb6009f..a0bea0b0a 100644
--- a/Mikado/loci/monosublocusholder.py
+++ b/Mikado/loci/monosublocusholder.py
@@ -144,14 +144,9 @@ def __str__(self, print_cds=False, source_in_name=True):

         return "\n".join(lines)

-    def define_monosubloci(self, purge=False, excluded=None):
-        """Overriden and set to NotImplemented to avoid cross-calling it when inappropriate.
-
-        :param purge: flag. Ignored.
-        :param excluded: flag. Ignored.
-        """
-        raise NotImplementedError(
-            "Monosubloci are the input of this object, not the output.")
+    def define_monosubloci(self, **kwargs):
+        """Overridden and set to NotImplemented to avoid cross-calling it when inappropriate."""
+        raise NotImplementedError("Monosubloci are the input of this object, not the output.")

     def define_loci(self, purge=False, excluded=None):
         """This is the main function of the class. It is analogous
@@ -176,9 +171,13 @@ def define_loci(self, purge=False, excluded=None):

         self.calculate_scores()

-        graph = self.define_graph(self.transcripts, inters=self.is_intersecting,
-                                  cds_only=self.json_conf["pick"][
-                                      "run_options"]["subloci_from_cds_only"], logger=self.logger)
+        graph = self.define_graph(
+            self.transcripts,
+            inters=self.is_intersecting,
+            logger=self.logger,
+            cds_only=self.json_conf["pick"]["clustering"]["cds_only"],
+            min_cdna_overlap=self.json_conf["pick"]["clustering"]["min_cdna_overlap"],
+            min_cds_overlap=self.json_conf["pick"]["clustering"]["min_cds_overlap"])

         loci = []
         while len(graph) > 0:
@@ -220,10 +219,11 @@ def is_intersecting(cls,
         by definition span multiple subloci, we have to be less strict in our definition of what
         counts as an intersection.
         Criteria:
-        - 1 splice site in common (splice, not junction)
-        - If one or both of the transcript is monoexonic OR
-          one or both lack an ORF, check for any exonic overlap
-        - Otherwise, check for any CDS overlap.
+        - the cDNA and CDS overlap is over a user-specified threshold
+        OR
+        - there is some intronic overlap
+        OR
+        - one intron of either transcript is completely contained within an exon of the other. 
:param transcript :type transcript; Transcript @@ -252,21 +252,20 @@ def is_intersecting(cls, if logger is None or not isinstance(logger, logging.Logger): logger = create_null_logger("MSH") - if transcript.id == other.id or transcript.strand != other.strand: - logger.debug("Cannot intersect with itself (%s vs %s)", - transcript.id, other.id) + if transcript == other or transcript.id == other.id or transcript.strand != other.strand: + logger.debug("Cannot intersect with itself (%s vs %s) or a transcript on the other strand (%s and %s)", + transcript.id, other.id, transcript.strand, other.strand) return False - if cds_only is True and (transcript.is_coding and other.is_coding): - logger.debug("Considering only the CDS of %s and %s, as they are both coding; stripping the UTR", - transcript.id, other.id) - transcript = transcript.deepcopy() - transcript.remove_utrs() - other = other.deepcopy() - other.remove_utrs() - logger.debug("New coordinates: %s (%d-%d), %s (%d-%d)", - transcript.id, transcript.start, transcript.end, - other.id, other.start, other.end) + logger.debug("Consider only the CDS: %s", cds_only) + if cds_only is True: + logger.debug("%s %s, %s %s, so %s", + transcript.id, transcript.is_coding, + other.id, other.is_coding, + "removing the UTRs" if other.is_coding and transcript.is_coding else "not removing the UTRs" + ) + if transcript.is_coding and other.is_coding: + transcript, other = cls.__strip_utr(transcript, other, logger) # Calculate the relationship between the transcripts comparison, _ = c_compare(other, transcript) @@ -277,64 +276,61 @@ def is_intersecting(cls, logger.debug("No genomic overlap between %s and %s. Comparison: %s", transcript.id, other.id, comparison) return False # We do not want intersection with oneself - elif min(transcript.exon_num, other.exon_num) == 1: - logger.debug("%s and %s intersect (%s %s monoexonic); class code: %s", - transcript.id, other.id, - transcript.id if ( - transcript.exon_num == 1 and other.exon_num > 1) else - other.id if transcript.exon_num > 1 else "both", - "is" if max(transcript.exon_num, other.exon_num) > 1 else "are", - comparison.ccode) - return True elif comparison.j_f1[0] > 0 or comparison.ccode[0] == "h": # Simple case: they do intersect! logger.debug("%s and %s intersect; class code: %s", transcript.id, other.id, comparison.ccode) return True - elif comparison.ccode[0] == "o": + else: # Is at least one intron completely contained? 
- if cls._intron_contained_in_exon(transcript, other): - logger.debug("At least 1 intron of %s is completely contained within an exon of %s\n%s\n%s", - transcript.id, - other.id, - transcript.combined_cds_introns, other.coding_exons) - return True - elif cls._intron_contained_in_exon(other, transcript): - logger.debug("At least 1 intron of %s is completely contained within an exon of %s", - other.id, transcript.id) + if cls._intron_contained_in_exon(transcript, other) or cls._intron_contained_in_exon(other, transcript): + logger.debug("Intronic containment within an exon for the comparison %s and %s; intersecting", + transcript.id, other.id) return True + cdna_overlap, cds_overlap = cls.__calculate_overlap(transcript, other, comparison, cds_only=cds_only) + if cdna_overlap >= min_cdna_overlap and cds_overlap >= min_cds_overlap: + logger.debug("%s and %s have enough of CDS (%s) and cDNA (%s) overlap, intersecting.", + transcript.id, other.id, cds_overlap, cdna_overlap) + return True + else: + logger.debug("%s and %s do not have enough of CDS (%s) and cDNA (%s) overlap, not intersecting.", + transcript.id, other.id, cds_overlap, cdna_overlap) + return False + + @staticmethod + def __strip_utr(transcript, other, logger): + """Private method to remove the UTRs from both transcripts. Creates deep copies of the original objects, + to avoid bugs down the line.""" + logger.debug("Considering only the CDS of %s and %s, as they are both coding; stripping the UTR", + transcript.id, other.id) + transcript = transcript.deepcopy() + transcript.remove_utrs() + other = other.deepcopy() + other.remove_utrs() + logger.debug("New coordinates: %s (%d-%d), %s (%d-%d)", + transcript.id, transcript.start, transcript.end, + other.id, other.start, other.end) + return transcript, other + + @staticmethod + def __calculate_overlap(transcript, other, comparison, cds_only=False) -> (float, float): + """Private method to return the cDNA overlap and the CDS overlap of two transcripts.""" + cdna_overlap = max(comparison.n_recall[0], comparison.n_prec[0]) / 100 if cds_only is True and (transcript.is_coding and other.is_coding): - if cdna_overlap >= max(min_cds_overlap, min_cdna_overlap): - logger.debug("%s and %s have %s of CDS overlap (considering only the coding portion), they intersect.", - transcript.id, other.id, cdna_overlap) - return True - else: - logger.debug( - """Considering only the CDS, %s and %s do not have enough overlap (tot %s), they do not intersect. 
- Comparison: %s""", - transcript.id, other.id, cdna_overlap, comparison) - return False - elif not (cds_only is True and (transcript.is_coding and other.is_coding)): - logger.debug("Considering also the CDS of %s and %s, for the overlap.", - transcript.id, other.id) + return cdna_overlap, cdna_overlap + elif cds_only is False and (transcript.is_coding and other.is_coding): cds_transcript = transcript.deepcopy() cds_transcript.remove_utrs() cds_other = other.deepcopy() cds_other.remove_utrs() cds_comparison, _ = c_compare(cds_other, cds_transcript) cds_overlap = max(cds_comparison.n_recall[0], cds_comparison.n_prec[0]) / 100 - if cdna_overlap >= min_cdna_overlap and cds_overlap >= min_cds_overlap: - logger.debug("%s and %s have enough of CDS (%s) and cDNA (%s) overlap, intersecting.", - transcript.id, other.id, cds_overlap, cdna_overlap) - return True - else: - logger.debug("%s and %s do not have enough of CDS (%s) and cDNA (%s) overlap, not intersecting.", - transcript.id, other.id, cds_overlap, cdna_overlap) - return False + return cdna_overlap, cds_overlap + elif not (transcript.is_coding and other.is_coding): + return cdna_overlap, cdna_overlap else: - logger.debug("%s and %s fail to meet the requirements to intersect.") - return False + raise SyntaxError("Unhandled behaviour!") @staticmethod def _intron_contained_in_exon(transcript: Transcript, other: Transcript) -> bool: @@ -344,147 +340,14 @@ def _intron_contained_in_exon(transcript: Transcript, other: Transcript) -> bool return any((overlap(*_) == (_[0][1] - _[0][0])) for _ in itertools.product(transcript.introns, other.exons)) - @classmethod - def is_intersecting_old(cls, transcript, other, cds_only=False, logger=None, simple_overlap=False): - """ - Implementation of the is_intersecting method. Now that we are comparing transcripts that - by definition span multiple subloci, we have to be less strict in our definition of what - counts as an intersection. - Criteria: - - 1 splice site in common (splice, not junction) - - If one or both of the transcript is monoexonic OR - one or both lack an ORF, check for any exonic overlap - - Otherwise, check for any CDS overlap. - - :param transcript - :type transcript; Transcript - - :param other: - :type other: Transcript - - :param cds_only: boolean flag. If set to True, only - the CDS component of the transcripts will be considered to determine - whether they are intersecting or not. - :type cds_only: bool - - :param simple_overlap: boolean flag. If set to True, the intersection will be determined by a simple - overlapping check. Default: False - :type simple_overlap: bool - - :param logger: either None or a logger instance. If None, a null logger will be created. 
- - :rtype : bool - """ - - if logger is None or not isinstance(logger, logging.Logger): - logger = create_null_logger("MSH") - - if transcript.id == other.id: - logger.debug("Cannot intersect with itself (%s vs %s)", - transcript.id, other.id) - return False - elif cls.overlap( - (transcript.start, transcript.end), - (other.start, other.end)) <= 0: - logger.debug("No genomic overlap between %s and %s", transcript.id, other.id) - return False # We do not want intersection with oneself - - if not any((overlap(*_) > 0) for _ in itertools.product(transcript.exons, other.exons)): - logger.debug("No exonic overlap between %s and %s", - transcript.id, other.id) - return False - - # At this point we pretty much know that we have an interval - if simple_overlap is True: - if (cds_only is True and all((_.is_coding is True for _ in (transcript, other)))): - if any((overlap(*_) > 0) for _ in itertools.product(transcript.combined_cds, other.combined_cds)): - logger.debug("%s and %s are both coding and they overlap on their CDS - they intersect", - transcript.id, other.id) - return True - else: - logger.debug("%s and %s are both coding but they do not intersect.", transcript.id, other.id) - return False - else: - logger.debug("%s and %s are not both coding, but their exons overlap. Returning True.") - return True - else: - # Both transcripts are multiexonic - if not any([other.monoexonic, transcript.monoexonic]): - if cds_only is True and all((_.is_coding is True for _ in (transcript, other))): - # First check for splice site interaction - if any(((overlap(*_) > 0) for _ in itertools.product( - transcript.combined_cds_introns, - other.combined_cds_introns))): - logger.debug("At least one combined CDS intron of %s intersects a combined CDS intron of %s; %s %s", - transcript.id, other.id, transcript.combined_cds_introns, other.combined_cds_introns) - return True - elif any((overlap(*_) == (_[0][1] - _[0][0])) - for _ in itertools.product(transcript.combined_cds_introns, other.coding_exons)): - logger.debug("At least 1 intron of %s is completely contained within an exon of %s\n%s\n%s", - transcript.id, - other.id, - transcript.combined_cds_introns, other.coding_exons) - return True - elif any((overlap(*_) == (_[0][1] - _[0][0])) - for _ in itertools.product(other.combined_cds_introns, transcript.coding_exons)): - logger.debug("At least 1 intron of %s is completely contained within an exon of %s", - other.id, transcript.id) - return True - else: - logger.debug("No combined CDS intron of %s intersects a combined CDS intron of %s\n%s\n%s", - transcript.id, other.id, - transcript.combined_cds_introns, - other.combined_cds_introns) - else: - if any(((overlap(*_) > 0) for _ in itertools.product(transcript.introns, other.introns))): - logger.debug("At least 1 intron of %s intersects another intron in %s", - transcript.id, other.id) - return True - # elif any((overlap(*_) == (_[0][1] - _[0][0] + 1 )) - elif any((overlap(*_) == (_[0][1] - _[0][0])) - for _ in itertools.product(transcript.introns, other.exons)): - logger.debug("At least 1 intron of %s is completely contained within an exon of %s\n%s\n%s", - transcript.id, other.id, transcript.introns, other.exons) - return True - elif any((overlap(*_) == (_[0][1] - _[0][0])) - for _ in itertools.product(other.introns, transcript.exons)): - logger.debug("At least 1 intron of %s is completely contained within an exon of %s", - other.id, transcript.id) - return True - else: - logger.debug("No intron in %s intersects introns in %s", - transcript.id, other.id) - else: - 
if cds_only is True and all((_.is_coding is True for _ in (transcript, other))): - if any(True for comb in itertools.product(transcript.combined_cds, other.combined_cds) if - cls.overlap(*comb) >= 0): - logger.debug("CDS overlap between %s and %s", - transcript.id, other.id) - return True - else: - logger.debug("No CDS overlap between %s and %s", - transcript.id, other.id) - # return False - else: - if any(True for comb in itertools.product(transcript.exons, other.exons) if - cls.overlap(*comb) >= 0): - logger.debug("Genomic overlap between %s and %s", - transcript.id, other.id) - return True - else: - logger.debug("No genomic overlap between %s and %s", - transcript.id, other.id) - - return False - - @classmethod def in_locus(cls, monosublocus: Abstractlocus, transcript: Transcript, flank=0, logger=None, cds_only=False, - simple_overlap=False) -> bool: + min_cdna_overlap=0.2, + min_cds_overlap=0.2) -> bool: """This method checks whether a transcript / monosbulocus falls inside the Locus coordinates. @@ -500,6 +363,7 @@ def in_locus(cls, monosublocus: Abstractlocus, :param flank: optional flank argument :type flank: int """ + if hasattr(transcript, "transcripts"): assert len(transcript.transcripts) == 1 transcript = transcript.transcripts[list(transcript.transcripts.keys())[0]] @@ -512,8 +376,9 @@ def in_locus(cls, monosublocus: Abstractlocus, is_in_locus = cls.is_intersecting(tran, transcript, logger=logger, - cds_only=cds_only) - # simple_overlap=simple_overlap) + cds_only=cds_only, + min_cds_overlap=min_cds_overlap, + min_cdna_overlap=min_cdna_overlap) if is_in_locus is True: break return is_in_locus diff --git a/Mikado/loci/superlocus.py b/Mikado/loci/superlocus.py index 72f22d928..b36375cbd 100644 --- a/Mikado/loci/superlocus.py +++ b/Mikado/loci/superlocus.py @@ -28,7 +28,6 @@ from ..serializers.junction import Junction, Chrom from ..serializers.orf import Orf from ..utilities import dbutils, grouper - if version_info.minor < 5: from sortedcontainers import SortedDict else: @@ -553,13 +552,20 @@ def load_all_transcript_data(self, engine=None, data_dict=None): self.session.query(Query).filter( Query.query_name.in_(tid_group))) # Retrieve the external scores - external = self.session.query(External).filter(External.query_id.in_(query_ids.keys())) + if query_ids: + external = self.session.query(External).filter(External.query_id.in_(query_ids.keys())) + else: + external = [] for ext in external: data_dict["external"][ext.query][ext.source] = ext.score # Load the ORFs from the table - orfs = self.session.query(Orf).filter(Orf.query_id.in_(query_ids.keys())) + if query_ids: + orfs = self.session.query(Orf).filter(Orf.query_id.in_(query_ids.keys())) + else: + orfs = [] + for orf in orfs: data_dict["orfs"][orf.query].append(orf.as_bed12()) @@ -885,7 +891,7 @@ def define_subloci(self): self.subloci_defined = True return - cds_only = self.json_conf["pick"]["run_options"]["subloci_from_cds_only"] + cds_only = self.json_conf["pick"]["clustering"]["cds_only"] self.logger.debug("Calculating the transcript graph for %d transcripts", len(self.transcripts)) transcript_graph = self.define_graph(self.transcripts, inters=self.is_intersecting, @@ -1134,14 +1140,18 @@ def define_alternative_splicing(self): candidates = collections.defaultdict(set) primary_transcripts = set(locus.primary_transcript_id for locus in self.loci.values()) - cds_only = self.json_conf["pick"]["run_options"]["subloci_from_cds_only"] - simple_overlap = self.json_conf["pick"]["run_options"]["monoloci_from_simple_overlap"] + 
cds_only = self.json_conf["pick"]["clustering"]["cds_only"] + # simple_overlap = self.json_conf["pick"]["run_options"]["monoloci_from_simple_overlap"] + cds_overlap = self.json_conf["pick"]["clustering"]["min_cds_overlap"] + cdna_overlap = self.json_conf["pick"]["clustering"]["min_cdna_overlap"] + t_graph = self.define_graph(self.transcripts, inters=MonosublocusHolder.is_intersecting, cds_only=cds_only, - logger=self.logger) - # simple_overlap=simple_overlap) - + logger=self.logger, + min_cdna_overlap=cdna_overlap, + min_cds_overlap=cds_overlap) + cliques = self.find_cliques(t_graph) loci_cliques = dict() @@ -1173,8 +1183,7 @@ def calculate_mono_metrics(self): """Wrapper to calculate the metrics for the monosubloci.""" self.monoholders = [] - cds_only = self.json_conf["pick"]["run_options"]["subloci_from_cds_only"] - simple_overlap = self.json_conf["pick"]["run_options"]["monoloci_from_simple_overlap"] + cds_only = self.json_conf["pick"]["clustering"]["cds_only"] for monosublocus_instance in sorted(self.monosubloci): found_holder = False for holder in self.monoholders: diff --git a/Mikado/picking/picker.py b/Mikado/picking/picker.py index 51b72efb9..f04d80469 100644 --- a/Mikado/picking/picker.py +++ b/Mikado/picking/picker.py @@ -807,7 +807,7 @@ def __submit_multi_threading(self, data_dict): self.__test_sortedness(row, current_transcript) if Superlocus.in_locus( current_locus, current_transcript, - flank=self.json_conf["pick"]["run_options"]["flank"]) is True: + flank=self.json_conf["pick"]["clustering"]["flank"]) is True: current_locus.add_transcript_to_locus(current_transcript, check_in_locus=False) else: @@ -836,7 +836,7 @@ def __submit_multi_threading(self, data_dict): if current_transcript is not None and invalid is False: if Superlocus.in_locus( current_locus, current_transcript, - flank=self.json_conf["pick"]["run_options"]["flank"]) is True: + flank=self.json_conf["pick"]["clustering"]["flank"]) is True: current_locus.add_transcript_to_locus( current_transcript, check_in_locus=False) else: @@ -971,7 +971,7 @@ def __submit_single_threaded(self, data_dict): self.__test_sortedness(row, current_transcript) if Superlocus.in_locus( current_locus, current_transcript, - flank=self.json_conf["pick"]["run_options"]["flank"]) is True: + flank=self.json_conf["pick"]["clustering"]["flank"]) is True: current_locus.add_transcript_to_locus(current_transcript, check_in_locus=False) else: @@ -1012,7 +1012,7 @@ def __submit_single_threaded(self, data_dict): if current_transcript is not None and invalid is False: if Superlocus.in_locus( current_locus, current_transcript, - flank=self.json_conf["pick"]["run_options"]["flank"]) is True: + flank=self.json_conf["pick"]["clustering"]["flank"]) is True: current_locus.add_transcript_to_locus( current_transcript, check_in_locus=False) else: diff --git a/Mikado/subprograms/pick.py b/Mikado/subprograms/pick.py index d039cd179..ea1b51f61 100644 --- a/Mikado/subprograms/pick.py +++ b/Mikado/subprograms/pick.py @@ -57,7 +57,7 @@ def check_run_options(args): args.json_conf["pick"]["run_options"]["purge"] = True if args.flank is not None: - args.json_conf["pick"]["run_options"]["flank"] = args.flank + args.json_conf["pick"]["clustering"]["flank"] = args.flank if args.output_dir is not None: args.json_conf["pick"]["files"]["output_dir"] = args.output_dir @@ -91,8 +91,8 @@ def check_run_options(args): if args.monoloci_from_simple_overlap is True: args.json_conf["pick"]["run_options"]["monoloci_from_simple_overlap"] = True - if args.subloci_from_cds_only is True: - 
args.json_conf["pick"]["run_options"]["subloci_from_cds_only"] = True + if args.cds_only is True: + args.json_conf["pick"]["clustering"]["cds_only"] = True if args.consider_truncated_for_retained is True: args.json_conf["pick"]["run_options"]["consider_truncated_for_retained"] = True @@ -188,7 +188,7 @@ def pick_parser(): parser.add_argument('--purge', action='store_true', default=False, help='''Flag. If set, the pipeline will suppress any loci whose transcripts do not pass the requirements set in the JSON file.''') - parser.add_argument("--subloci-from-cds-only", dest="subloci_from_cds_only", + parser.add_argument("--cds-only", dest="cds_only", default=False, action="store_true", help=""""Flag. If set, Mikado will only look for overlap in the coding features when clustering transcripts (unless one transcript is non-coding, in which case diff --git a/Mikado/tests/locus_tester.py b/Mikado/tests/locus_tester.py index ce1cd063d..d038b12ff 100644 --- a/Mikado/tests/locus_tester.py +++ b/Mikado/tests/locus_tester.py @@ -379,7 +379,7 @@ def test_non_redundant_as(self): # self.locus.add_transcript_to_locus(t2) self.assertEqual(self.locus.is_alternative_splicing(t2)[:2], (True, "J")) - self.locus.json_conf["pick"]["run_options"]["subloci_from_cds_only"] = True + self.locus.json_conf["pick"]["clustering"]["cds_only"] = True self.assertEqual(self.locus.is_alternative_splicing(t2)[:2], (False, "=")) @@ -644,13 +644,27 @@ def test_only_CDS_overlap(self): t2.id = "G2.1" t2.parent = "G2" t2.start = 1350 - t2.end = 3000 - t2.add_exons([(1350, 1560), (1801, 2000)]) + t2.end = 3850 + t2.add_exons([(1350, 1560), (2801, 3850)]) t2.add_exons([(1401, 1560), (2801, 3850)], "CDS") + # logger.setLevel("DEBUG") + t2.logger = logger t2.finalize() - for min_overlap in [0.01, 0.05, 0.1, 0.2]: + self.assertTrue(t2.is_coding) + for min_overlap in [0.1, 0.2, 0.3, 0.5]: with self.subTest(min_overlap=min_overlap): self.assertIs(MonosublocusHolder.is_intersecting(self.t1, t2, + cds_only=False, + min_cds_overlap=0.07, + min_cdna_overlap=min_overlap, + logger=logger), (min_overlap <= 0.12)) + + self.assertTrue(t2.is_coding) + + for min_overlap in [0.01, 0.05, 0.1, 0.2]: + with self.subTest(min_overlap=min_overlap): + self.assertIs(MonosublocusHolder.is_intersecting(self.t1, + t2, cds_only=True, min_cds_overlap=min_overlap, min_cdna_overlap=min_overlap, @@ -671,7 +685,7 @@ def test_no_overlap(self): t2.finalize() self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, t2)) - def test_same_id(self): + def test_sameness(self): t2 = Transcript() t2.chrom = "Chr1" @@ -1478,10 +1492,7 @@ def test_locus_unpickling(self): for transcript in [self.t1, self.t2, self.t3]: for (loc_type, loc_name) in [(_, _.__name__) for _ in (Superlocus, Sublocus, Monosublocus, Locus)]: with self.subTest(transcript=transcript, loc_type=loc_type, loc_name=loc_name): - try: - loc = loc_type(transcript, json_conf=self.json_conf) - except TypeError as exc: - raise TypeError("{}\n{}".format(loc_name, exc)) + loc = loc_type(transcript, json_conf=self.json_conf) pickled = pickle.dumps(transcript) unpickled = pickle.loads(pickled) self.assertEqual(transcript, unpickled) From 5ff713341f2606e39f3c67e7ccc9c7308af3a246 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Sat, 4 Feb 2017 22:25:04 +0000 Subject: [PATCH 13/47] Moved "purge" and "remove_overlapping_fragments" to "clustering". They are under a PendingDeprecationWarning. 
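
In short, "purge", "remove_overlapping_fragments" and "flank" are now read from
pick/clustering, while legacy configurations that still define them under
pick/run_options keep working. A minimal sketch of the shim, assuming a
stand-alone helper (the real code lives inline in Picker.__init__, and the
function name _migrate_legacy_keys is illustrative, not part of Mikado):

    import warnings

    _MOVED_KEYS = ("remove_overlapping_fragments", "flank", "purge")

    def _migrate_legacy_keys(json_conf):
        """Copy legacy pick/run_options keys into pick/clustering,
        warning once per key that the old location is deprecated."""
        for key in _MOVED_KEYS:
            if key in json_conf["pick"]["run_options"]:
                warnings.warn(PendingDeprecationWarning(
                    "The \"{}\" property has been moved to pick/clustering. "
                    "Please update your configuration files.".format(key)))
                json_conf["pick"]["clustering"][key] = json_conf["pick"]["run_options"][key]
        return json_conf
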
--- .../configuration_blueprint.json | 29 ++++++++++--------- Mikado/loci/monosublocusholder.py | 2 +- Mikado/loci/sublocus.py | 2 +- Mikado/loci/superlocus.py | 2 +- Mikado/picking/loci_processer.py | 4 +-- Mikado/picking/picker.py | 14 ++++++++- Mikado/subprograms/pick.py | 2 +- Mikado/tests/test_system_calls.py | 6 ++-- 8 files changed, 37 insertions(+), 24 deletions(-) diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json index 7ebc3f31d..b2a813c77 100644 --- a/Mikado/configuration/configuration_blueprint.json +++ b/Mikado/configuration/configuration_blueprint.json @@ -592,8 +592,8 @@ " using SQLite databases.", "- single_thread: boolean flag. If set, multithreading will be disabled - useful for profiling and debugging.", "- consider_truncated_for_retained: boolean. Normally, Mikado considers only exons which span a whole intron as possible retained intron events. If this flag is set to true, also terminal exons will be considered.", - "- remove_overlapping_fragments: boolean, it specifies whether to remove putative fragments.", - "- purge: boolean, it specifies whether to remove loci where all transcripts fail the minimum checks, or whether to print them out in the subloci file instead." + "- remove_overlapping_fragments: DEPRECATED, see clustering.", + "- purge: DEPRECATED, see clustering." ], "SimpleComment": [ "Generic run options.", @@ -602,7 +602,8 @@ " introns larger or smaller than what is defined in this range will be penalised", " in the scoring. Default: [60, 900]", "- preload: boolean flag. If set, the whole database will be preloaded into memory for potentially faster access.", - "- single_thread: boolean flag. If set, multithreading will be disabled - useful for profiling and debugging." + "- single_thread: boolean flag. If set, multithreading will be disabled - useful for profiling and debugging.", + "- flank: DEPRECATED. Now set in the clustering section." ], "properties": { "shm": { @@ -644,16 +645,7 @@ "preload": { "type": "boolean", "default": false - }, - "remove_overlapping_fragments": { - "type": "boolean", - "default": true - }, - "purge": { - "type": "boolean", - "default": true } - } }, "clustering": { @@ -663,13 +655,14 @@ "- cds_only: boolean, it specifies whether to cluster transcripts only according to their CDS (if present).", "- min_cds_overlap: minimal CDS overlap for the second clustering.", "- min_cdna_overlap: minimal cDNA overlap for the second clustering.", - "- flank: maximum distance for transcripts to be clustered within the same superlocus." + "- flank: maximum distance for transcripts to be clustered within the same superlocus.", + "- remove_overlapping_fragments: boolean, it specifies whether to remove putative fragments.", + "- purge: boolean, it specifies whether to remove loci where all transcripts fail the minimum checks, or whether to print them out in the subloci file instead." ], "SimpleComment": [ "Parameters related to the clustering of transcripts into loci.", "- flank: maximum distance for transcripts to be clustered within the same superlocus." 
], - "required": ["flank"], "properties":{ "cds_only": { "type": "boolean", @@ -691,6 +684,14 @@ "type": "integer", "minimum": 0, "default": 200 + }, + "remove_overlapping_fragments": { + "type": "boolean", + "default": true + }, + "purge": { + "type": "boolean", + "default": true } } }, diff --git a/Mikado/loci/monosublocusholder.py b/Mikado/loci/monosublocusholder.py index a0bea0b0a..09197765d 100644 --- a/Mikado/loci/monosublocusholder.py +++ b/Mikado/loci/monosublocusholder.py @@ -51,7 +51,7 @@ def __init__(self, monosublocus_instance: Monosublocus, json_conf=None, logger=N self.metrics_calculated = False self.json_conf = json_conf self.excluded = None - self.purge = self.json_conf["pick"]["run_options"]["purge"] + self.purge = self.json_conf["pick"]["clustering"]["purge"] self.feature = "MonosublocusHolder" self.score = monosublocus_instance.score self.scores_calculated = False diff --git a/Mikado/loci/sublocus.py b/Mikado/loci/sublocus.py index bdaf91d99..052115f90 100644 --- a/Mikado/loci/sublocus.py +++ b/Mikado/loci/sublocus.py @@ -62,7 +62,7 @@ def __init__(self, span, json_conf=None, logger=None, verified_introns=None): self.fixed_size = True if span.feature == "sublocus" else False if span.__name__ == "transcript": span.finalize() - self.purge = self.json_conf["pick"]["run_options"]["purge"] + self.purge = self.json_conf["pick"]["clustering"]["purge"] self.source = self.json_conf["pick"]["output_format"]["source"] self.excluded = None diff --git a/Mikado/loci/superlocus.py b/Mikado/loci/superlocus.py index b36375cbd..ea7f8ec9f 100644 --- a/Mikado/loci/superlocus.py +++ b/Mikado/loci/superlocus.py @@ -118,7 +118,7 @@ def __init__(self, raise NoJsonConfigError("I am missing the configuration for prioritizing transcripts!") self.__regressor = None self.json_conf = json_conf - self.purge = self.json_conf["pick"]["run_options"]["purge"] + self.purge = self.json_conf["pick"]["clustering"]["purge"] self.splices = set(self.splices) self.introns = set(self.introns) diff --git a/Mikado/picking/loci_processer.py b/Mikado/picking/loci_processer.py index 51afa8fe9..6369d39f0 100644 --- a/Mikado/picking/loci_processer.py +++ b/Mikado/picking/loci_processer.py @@ -279,7 +279,7 @@ def print_locus(stranded_locus, for locus in stranded_locus.loci: gene_counter += 1 fragment_test = ( - json_conf["pick"]["run_options"]["remove_overlapping_fragments"] + json_conf["pick"]["clustering"]["remove_overlapping_fragments"] is True and stranded_locus.loci[locus].is_fragment is True) if fragment_test is True: @@ -421,7 +421,7 @@ def remove_fragments(stranded_loci, json_conf, logger): loci_to_check[False] = loci_to_check.pop(True) loci_to_check[True] = set() - bool_remove_fragments = json_conf["pick"]["run_options"]["remove_overlapping_fragments"] + bool_remove_fragments = json_conf["pick"]["clustering"]["remove_overlapping_fragments"] for stranded_locus in stranded_loci: to_remove = set() for locus_id, locus_instance in stranded_locus.loci.items(): diff --git a/Mikado/picking/picker.py b/Mikado/picking/picker.py index f04d80469..62eb6aef8 100644 --- a/Mikado/picking/picker.py +++ b/Mikado/picking/picker.py @@ -37,7 +37,9 @@ import multiprocessing.managers from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier import pickle - +import warnings +logging.captureWarnings(True) +warnings.simplefilter("always") # pylint: disable=too-many-instance-attributes class Picker: @@ -117,6 +119,16 @@ def __init__(self, json_conf, commandline=""): self.setup_logger() self.logger.info("Multiprocessing 
method: %s", self.json_conf["multiprocessing_method"]) + + for key in ("remove_overlapping_fragments", "flank", "purge"): + if key in self.json_conf["pick"]["run_options"]: + # Put warnings in place for the deprecation of some options. + warns = PendingDeprecationWarning( + """The \"{}\" property has now been moved to the pick/clustering section. +Please update your configuration files in the future.""".format(key)) + self.logger.warn(warns) + self.json_conf["pick"]["clustering"][key] = self.json_conf["pick"]["run_options"][key] + self.context = multiprocessing.get_context() if self.json_conf["pick"]["scoring_file"].endswith((".pickle", ".model")): with open(self.json_conf["pick"]["scoring_file"], "rb") as forest: diff --git a/Mikado/subprograms/pick.py b/Mikado/subprograms/pick.py index ea1b51f61..20907d735 100644 --- a/Mikado/subprograms/pick.py +++ b/Mikado/subprograms/pick.py @@ -54,7 +54,7 @@ def check_run_options(args): if args.no_cds is not False: args.json_conf["pick"]["run_options"]["exclude_cds"] = True if args.purge is not False: - args.json_conf["pick"]["run_options"]["purge"] = True + args.json_conf["pick"]["clustering"]["purge"] = True if args.flank is not None: args.json_conf["pick"]["clustering"]["flank"] = args.flank diff --git a/Mikado/tests/test_system_calls.py b/Mikado/tests/test_system_calls.py index 57027db77..d3530b62a 100644 --- a/Mikado/tests/test_system_calls.py +++ b/Mikado/tests/test_system_calls.py @@ -552,7 +552,7 @@ def test_purging(self): with self.subTest(purging=purging): json_conf["pick"]["files"]["loci_out"] = "mikado.purging_{}.loci.gff3".format(purging) json_conf["pick"]["files"]["log"] = "mikado.purging_{}.log".format(purging) - json_conf["pick"]["run_options"]["purge"] = purging + json_conf["pick"]["clustering"]["purge"] = purging json_conf["pick"]["scoring_file"] = scoring_file.name json_conf = configurator.check_json(json_conf) self.assertEqual(len(json_conf["scoring"].keys()), 1, json_conf["scoring"].keys()) @@ -601,7 +601,7 @@ def test_purging(self): json_conf["pick"]["files"]["loci_out"] = "mikado.purging_{}.loci.gff3".format(purging) json_conf["pick"]["files"]["subloci_out"] = "mikado.purging_{}.subloci.gff3".format(purging) json_conf["pick"]["files"]["log"] = "mikado.purging_{}.log".format(purging) - json_conf["pick"]["run_options"]["purge"] = purging + json_conf["pick"]["clustering"]["purge"] = purging json_conf["pick"]["scoring_file"] = scoring_file.name json_conf = configurator.check_json(json_conf) self.assertEqual(len(json_conf["scoring"].keys()), 2, json_conf["scoring"].keys()) @@ -641,7 +641,7 @@ def test_purging(self): json_conf["pick"]["files"]["loci_out"] = "mikado.purging_{}.loci.gff3".format(purging) json_conf["pick"]["files"]["subloci_out"] = "mikado.purging_{}.subloci.gff3".format(purging) json_conf["pick"]["files"]["log"] = "mikado.purging_{}.log".format(purging) - json_conf["pick"]["run_options"]["purge"] = purging + json_conf["pick"]["clustering"]["purge"] = purging json_conf["pick"]["scoring_file"] = scoring_file.name json_conf = configurator.check_json(json_conf) self.assertEqual(len(json_conf["scoring"].keys()), 2, json_conf["scoring"].keys()) From 151d4ef6463e17100083ffc4ae182b91161437a7 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Sat, 4 Feb 2017 22:26:59 +0000 Subject: [PATCH 14/47] Also removed the max_utr keys, as they are redundant. 
--- Mikado/configuration/configuration_blueprint.json | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json index b2a813c77..af84a0f19 100644 --- a/Mikado/configuration/configuration_blueprint.json +++ b/Mikado/configuration/configuration_blueprint.json @@ -331,21 +331,6 @@ "minimum": 1, "default": 5 }, - "max_utr_length": { - "type": "integer", - "default": 2500, - "minimum": 1 - }, - "max_fiveutr_length": { - "type": "integer", - "default": 2500, - "minimum": 1 - }, - "max_threeutr_length": { - "type": "integer", - "default": 2500, - "minimum": 1 - }, "valid_ccodes": { "type": "array", "items": { From c4ac7143ab67148ec446a24e374b2bf331eab35d Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Mon, 6 Feb 2017 15:34:23 +0000 Subject: [PATCH 15/47] Clustering redone. Time to write a battery of tests. --- CHANGELOG.md | 12 +- .../configuration_blueprint.json | 9 +- Mikado/loci/abstractlocus.py | 10 +- Mikado/loci/monosublocusholder.py | 216 +++++++++++------- Mikado/loci/superlocus.py | 41 ++-- Mikado/parsers/bed12.py | 1 + Mikado/tests/locus_tester.py | 7 +- Mikado/transcripts/transcript.py | 57 ++++- 8 files changed, 239 insertions(+), 114 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 553dcf5a4..7b88783a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,9 +8,17 @@ Changes in this release: - there is some intronic overlap OR - one intron of either transcript is completely contained within an exon of the other. -- **MAJOR**: changed slightly the anatomy of the configuration files. Now "pick" has a new subsection, "clustering", dedicated to how to cluster the transcripts in the different steps. Currently it contains the keys "flank", "min_cdna_overlap" and "min_cds_overlap" (for the second clustering during the monosublocusHolder phase) and "cds_only" (to indicate whether we should only consider the CDS for clustering after the initial merging in the Superlocus). + OR + - at least one of the transcripts is monoexonic and there is some overlap of any kind. This behaviour (which was the default until this release) can be switched off through pick/clustering/simple_overlap_for_monoexonic (default true). +- **MAJOR**: changed slightly the anatomy of the configuration files. Now "pick" has a new subsection, "clustering", dedicated to how to cluster the transcripts in the different steps. Currently it contains the keys: + - "flank" + - "min_cdna_overlap" and "min_cds_overlap" (for the second clustering during the monosublocusHolder phase) + - "cds_only": to indicate whether we should only consider the CDS for clustering after the initial merging in the Superlocus. + - "simple_overlap_for_monoexonic": to switch on/off the old default behaviour with monoexonic transcripts + - "purge": whether to completely exclude failed loci, previously under "run_options" + - "remove_overlapping_fragments": whether to exclude fragments, previously under "run_options" - Deprecated the "discard_definition" flag in Mikado serialise. Now Mikado will infer on its own whether to use the definition or the ID for serialising BLAST results. -- Now AbstractLocus implementations have a private method to check the correctness of the json_conf. As a corollary, Transcript and children have been moved to their own subpackage ("transcripts") in order to break the circular dependency Mikado.loci.Abstractlocus <- Mikado.configurator <- Mikado.loci.Transcript. 
*Technical note*: checking the consistency of the configuration is an expensive operation, so it will be executed on demand rather than automatically.
 - Re-written the "find_retained_introns" method of AbstractLocus, to solve some bugs found during the utilisation of last beta. As a corollary, expanded the intervaltree module to allow searches for "tagged" intervals.
 - Now the "monoloci_out" files contain the Monosublocus**Holder** step, not the Monosublocus step. This should help during fine-tuning.
 - Mikado now supports also Python3.6.
diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json
index af84a0f19..1b8ebb6c9 100644
--- a/Mikado/configuration/configuration_blueprint.json
+++ b/Mikado/configuration/configuration_blueprint.json
@@ -642,7 +642,10 @@
       "- min_cdna_overlap: minimal cDNA overlap for the second clustering.",
       "- flank: maximum distance for transcripts to be clustered within the same superlocus.",
       "- remove_overlapping_fragments: boolean, it specifies whether to remove putative fragments.",
-      "- purge: boolean, it specifies whether to remove loci where all transcripts fail the minimum checks, or whether to print them out in the subloci file instead."
+      "- purge: boolean, it specifies whether to remove loci where all transcripts fail the minimum checks, or whether to print them out in the subloci file instead.",
+      "- simple_overlap_for_monoexonic: boolean. If set to true (default), then any overlap with a monoexonic",
+      "transcript will mean inclusion in a locus. If set to false, normal controls for the percentage",
+      "of overlap will apply."
     ],
     "SimpleComment": [
       "Parameters related to the clustering of transcripts into loci.",
       "- flank: maximum distance for transcripts to be clustered within the same superlocus." 
""" - return overlap(first_interval, second_interval, flank) + return overlap(first_interval, second_interval, flank, positive=positive) @staticmethod def evaluate(param: str, conf: dict) -> bool: @@ -343,7 +345,7 @@ def choose_best(cls, transcripts: dict) -> str: # ###### Class instance methods ####### - def add_transcript_to_locus(self, transcript, check_in_locus=True): + def add_transcript_to_locus(self, transcript, check_in_locus=True, logger=None, **kwargs): """ :param transcript :type transcript: Mikado.loci_objects.transcript.Transcript @@ -367,7 +369,7 @@ def add_transcript_to_locus(self, transcript, check_in_locus=True): if self.initialized is True: if check_in_locus is False: pass - elif not self.in_locus(self, transcript): + elif not self.in_locus(self, transcript, **kwargs): raise NotInLocusError("""Trying to merge a Locus with an incompatible transcript! Locus: {lchrom}:{lstart}-{lend} {lstrand} [{stids}] Transcript: {tchrom}:{tstart}-{tend} {tstrand} {tid} diff --git a/Mikado/loci/monosublocusholder.py b/Mikado/loci/monosublocusholder.py index 09197765d..16f63ecbd 100644 --- a/Mikado/loci/monosublocusholder.py +++ b/Mikado/loci/monosublocusholder.py @@ -79,10 +79,37 @@ def add_transcript_to_locus(self, transcript, check_in_locus=True): :type check_in_locus: bool """ - Abstractlocus.add_transcript_to_locus(self, transcript, - check_in_locus=check_in_locus) + # + # monosublocus: Abstractlocus, + # transcript: Transcript, + # flank = 0, + # logger = None, + # cds_only = False, + # min_cdna_overlap = 0.2, + # min_cds_overlap = 0.2, + # classic_method = False + + self.logger.warning("Using the MSH add_transcript_to_locus method for %s and %s", + self.id, transcript.id) + + if check_in_locus is True and self.in_locus( + self, + transcript, + flank=self.json_conf["pick"]["clustering"]["flank"], + logger=self.logger, + cds_only=self.json_conf["pick"]["clustering"]["cds_only"], + min_cdna_overlap=self.json_conf["pick"]["clustering"]["min_cdna_overlap"], + min_cds_overlap=self.json_conf["pick"]["clustering"]["min_cds_overlap"], + simple_overlap_for_monoexonic=self.json_conf["pick"]["clustering"]["simple_overlap_for_monoexonic"] + ) is False: + + self.logger.debug("%s is not a valid intersection for %s", transcript.id, self.id) + return False + + Abstractlocus.add_transcript_to_locus(self, transcript, check_in_locus=False) self.locus_verified_introns = set.union(self.locus_verified_introns, transcript.verified_introns) + # pylint: enable=arguments-differ def add_monosublocus(self, monosublocus_instance: Monosublocus): @@ -177,7 +204,9 @@ def define_loci(self, purge=False, excluded=None): logger=self.logger, cds_only=self.json_conf["pick"]["clustering"]["cds_only"], min_cdna_overlap=self.json_conf["pick"]["clustering"]["min_cdna_overlap"], - min_cds_overlap=self.json_conf["pick"]["clustering"]["min_cds_overlap"]) + min_cds_overlap=self.json_conf["pick"]["clustering"]["min_cds_overlap"], + simple_overlap_for_monoexonic=self.json_conf["pick"]["clustering"]["simple_overlap_for_monoexonic"] + ) loci = [] while len(graph) > 0: @@ -213,7 +242,8 @@ def is_intersecting(cls, cds_only=False, logger=None, min_cdna_overlap=0.2, - min_cds_overlap=0.2) -> bool: + min_cds_overlap=0.2, + simple_overlap_for_monoexonic=True) -> bool: """ Implementation of the is_intersecting method. 
Now that we are comparing transcripts that
        by definition span multiple subloci, we have to be less strict in our definition of what
        counts as an intersection.
        Criteria:
        - the cDNA and CDS overlap is over a user-specified threshold
        OR
-        - there is some intronic overlap
+        - either transcript is monoexonic, there is some exonic overlap, and simple_overlap_for_monoexonic is True
        OR
        - one intron of either transcript is completely contained within an exon of the other.
+
+        The user can specify whether she prefers to consider the whole transcript (default) or whether to consider
+        instead the **selected ORF** of the transcripts for the comparison. Please note that intersection in secondary
+        ORFs will not be valid under this scenario.

        :param transcript
        :type transcript; Transcript

        :param other:
        :type other: Transcript

-        :param cds_only: boolean flag. If set to True, only
-        the CDS component of the transcripts will be considered to determine
-        whether they are intersecting or not.
+        :param cds_only: boolean flag. If set to True, only the CDS component of the transcripts will be
+        considered to determine whether they are intersecting or not.
        :type cds_only: bool

        :param min_cdna_overlap: float. This is the minimum cDNA overlap for two transcripts to be considered as intersecting,
        even when all other conditions fail.
        :type min_cdna_overlap: float

        :param min_cds_overlap: float. This is the minimum CDS overlap for two transcripts to be considered as intersecting,
        even when all other conditions fail. 
-            if cls._intron_contained_in_exon(transcript, other) or cls._intron_contained_in_exon(other, transcript):
-                logger.debug("Intronic containment within an exon for the comparison %s and %s; intersecting",
-                             transcript.id, other.id)
-                return True
-
-            cdna_overlap, cds_overlap = cls.__calculate_overlap(transcript, other, comparison, cds_only=cds_only)
-            if cdna_overlap >= min_cdna_overlap and cds_overlap >= min_cds_overlap:
-                logger.debug("%s and %s have enough of CDS (%s) and cDNA (%s) overlap, intersecting.",
-                             transcript.id, other.id, cds_overlap, cdna_overlap)
-                return True
-            else:
-                logger.debug("%s and %s do not have enough of CDS (%s) and cDNA (%s) overlap, not intersecting.",
-                             transcript.id, other.id, cds_overlap, cdna_overlap)
-                return False
+            intersecting, reason = cls._transcripts_are_intersecting(
+                transcript,
+                other,
+                min_cdna_overlap=min_cdna_overlap,
+                min_cds_overlap=min_cds_overlap,
+                simple_overlap_for_monoexonic=simple_overlap_for_monoexonic,
+                is_internal_orf=False)
 
-    @staticmethod
-    def __strip_utr(transcript, other, logger):
-        """Private method to remove the UTRs from both transcripts. Creates deep copies of the original objects,
-        to avoid bugs down the line."""
-        logger.debug("Considering only the CDS of %s and %s, as they are both coding; stripping the UTR",
-                     transcript.id, other.id)
-        transcript = transcript.deepcopy()
-        transcript.remove_utrs()
-        other = other.deepcopy()
-        other.remove_utrs()
-        logger.debug("New coordinates: %s (%d-%d), %s (%d-%d)",
-                     transcript.id, transcript.start, transcript.end,
-                     other.id, other.start, other.end)
-        return transcript, other
+        logger.debug(reason)
+        return intersecting
 
-    @staticmethod
-    def __calculate_overlap(transcript, other, comparison, cds_only=False) -> (float, float):
-        """Private method to return the cDNA overlap and the CDS overlap of two transcripts."""
-
-        cdna_overlap = max(comparison.n_recall[0], comparison.n_prec[0]) / 100
-        if cds_only is True and (transcript.is_coding and other.is_coding):
-            return cdna_overlap, cdna_overlap
-        elif cds_only is False and (transcript.is_coding and other.is_coding):
-            cds_transcript = transcript.deepcopy()
-            cds_transcript.remove_utrs()
-            cds_other = other.deepcopy()
-            cds_other.remove_utrs()
-            cds_comparison, _ = c_compare(cds_other, cds_transcript)
-            cds_overlap = max(cds_comparison.n_recall[0], cds_comparison.n_prec[0]) / 100
-            return cdna_overlap, cds_overlap
-        elif not (transcript.is_coding and other.is_coding):
-            return cdna_overlap, cdna_overlap
+    @classmethod
+    def _transcripts_are_intersecting(cls,
+                                      transcript: Transcript,
+                                      other: Transcript,
+                                      min_cdna_overlap=0.2,
+                                      min_cds_overlap=0.2,
+                                      simple_overlap_for_monoexonic=True,
+                                      is_internal_orf=False):
+        """Private method which is called by is_intersecting. It decouples the determination of whether two
+        transcripts intersect from the public interface of the method.
+        :param transcript
+        :type transcript: Transcript
+
+        :param other:
+        :type other: Transcript
+
+        :param is_internal_orf: boolean flag. If set to True, the two transcripts are presumed to be the
+        selected-ORF copies of the originals, so the cDNA overlap doubles as the CDS overlap.
+        :type is_internal_orf: bool
+
+        :param min_cdna_overlap: float. This is the minimum cDNA overlap for two transcripts to be considered as intersecting,
+        even when all other conditions fail.
+        :type min_cdna_overlap: float
+
+        :param min_cds_overlap: float. This is the minimum CDS overlap for two transcripts to be considered as intersecting,
+        even when all other conditions fail.
+        :type min_cds_overlap: float
+
+        :param simple_overlap_for_monoexonic: boolean flag. If set to true, any overlap for monoexonic transcripts
+        will be enough to trigger incorporation in the locus.
+        :type simple_overlap_for_monoexonic: bool
+        """
+
+        comparison, _ = c_compare(other, transcript)
+        if comparison.n_f1[0] == 0:
+            reason = "No genomic overlap between {} and {}".format(transcript.id, other.id)
+            intersecting = False
+            return intersecting, reason
+
+        if comparison.j_f1[0] > 0 or comparison.ccode[0] == "h":
+            reason = "{} and {} intersect; class code: {}".format(transcript.id, other.id, comparison.ccode[0])
+            intersecting = True
+        elif simple_overlap_for_monoexonic is True and any(_.monoexonic is True for _ in (transcript, other)):
+            reason = "Simple overlap for monoexonic transcripts, for {} and {}".format(transcript.id, other.id)
+            intersecting = True
+        elif cls._intron_contained_in_exon(transcript, other) or cls._intron_contained_in_exon(other, transcript):
+            reason = "Intronic containment within an exon for the comparison {} and {}; intersecting".format(
+                transcript.id, other.id)
+            intersecting = True
         else:
-            raise SyntaxError("Unhandled behaviour!")
+            cdna_overlap = max(comparison.n_prec[0], comparison.n_recall[0]) / 100
+            if is_internal_orf is True or not (transcript.is_coding and other.is_coding):
+                cds_overlap = cdna_overlap
+            else:
+                cds_overlap = 0
+                for segment in transcript.selected_cds:
+                    for o_segment in other.selected_cds:
+                        cds_overlap += cls.overlap(segment, o_segment, positive=True, flank=0)
+                cds_overlap /= min(transcript.selected_cds_length, other.selected_cds_length)
+                assert cds_overlap <= 1
+            intersecting = (cdna_overlap >= min_cdna_overlap and cds_overlap >= min_cds_overlap)
+            reason = "{} and {} {}share enough cDNA ({}%, min. {}%) and CDS ({}%, min. {}%), {}intersecting".format(
+                transcript.id, other.id,
+                "do not " if not intersecting else "",
+                cdna_overlap, min_cdna_overlap,
+                cds_overlap, min_cds_overlap, "not " if not intersecting else "")
+
+        return intersecting, reason
 
     @staticmethod
     def _intron_contained_in_exon(transcript: Transcript, other: Transcript) -> bool:
@@ -347,7 +400,8 @@ def in_locus(cls, monosublocus: Abstractlocus,
                  logger=None,
                  cds_only=False,
                  min_cdna_overlap=0.2,
-                 min_cds_overlap=0.2) -> bool:
+                 min_cds_overlap=0.2,
+                 simple_overlap_for_monoexonic=False) -> bool:
 
         """This method checks whether a transcript / monosublocus
         falls inside the Locus coordinates.
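The fraction test above reduces to summing the pairwise overlaps between CDS segments and dividing by the length of the shorter CDS. A minimal standalone sketch of that arithmetic, assuming half-open (start, end) segments; the helper names are illustrative, not part of the Mikado API:

    def segment_overlap(first, second):
        # Bases shared by two half-open (start, end) segments; 0 if disjoint.
        return max(0, min(first[1], second[1]) - max(first[0], second[0]))

    def cds_overlap_fraction(cds_one, cds_two):
        # Shared coding bases over the length of the shorter CDS, mirroring
        # the cds_overlap computation in _transcripts_are_intersecting.
        shared = sum(segment_overlap(seg, o_seg)
                     for seg in cds_one for o_seg in cds_two)
        shortest = min(sum(end - start for start, end in cds)
                       for cds in (cds_one, cds_two))
        return shared / shortest

    # Two toy CDS structures sharing 100 bp of a 150 bp shorter CDS:
    print(cds_overlap_fraction([(100, 200), (300, 400)], [(150, 250), (300, 350)]))  # ~0.67

A pair is deemed intersecting when this fraction reaches min_cds_overlap and the analogous cDNA fraction reaches min_cdna_overlap.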
@@ -378,7 +432,9 @@ def in_locus(cls, monosublocus: Abstractlocus, logger=logger, cds_only=cds_only, min_cds_overlap=min_cds_overlap, - min_cdna_overlap=min_cdna_overlap) + min_cdna_overlap=min_cdna_overlap, + simple_overlap_for_monoexonic=simple_overlap_for_monoexonic + ) if is_in_locus is True: break return is_in_locus diff --git a/Mikado/loci/superlocus.py b/Mikado/loci/superlocus.py index ea7f8ec9f..f83a23341 100644 --- a/Mikado/loci/superlocus.py +++ b/Mikado/loci/superlocus.py @@ -28,6 +28,7 @@ from ..serializers.junction import Junction, Chrom from ..serializers.orf import Orf from ..utilities import dbutils, grouper +import itertools if version_info.minor < 5: from sortedcontainers import SortedDict else: @@ -1183,15 +1184,19 @@ def calculate_mono_metrics(self): """Wrapper to calculate the metrics for the monosubloci.""" self.monoholders = [] - cds_only = self.json_conf["pick"]["clustering"]["cds_only"] for monosublocus_instance in sorted(self.monosubloci): found_holder = False for holder in self.monoholders: - if MonosublocusHolder.in_locus(holder, - monosublocus_instance, - logger=self.logger, - cds_only=cds_only): - # simple_overlap=simple_overlap): + if MonosublocusHolder.in_locus( + holder, + monosublocus_instance, + logger=self.logger, + cds_only=self.json_conf["pick"]["clustering"]["cds_only"], + min_cdna_overlap=self.json_conf["pick"]["clustering"]["min_cdna_overlap"], + min_cds_overlap=self.json_conf["pick"]["clustering"]["min_cds_overlap"], + simple_overlap_for_monoexonic=self.json_conf["pick"]["clustering"][ + "simple_overlap_for_monoexonic"] + ): holder.add_monosublocus(monosublocus_instance) found_holder = True break @@ -1258,25 +1263,25 @@ def is_intersecting(cls, transcript, other, cds_only=False): return False # We do not want intersection with oneself if transcript.monoexonic is False and other.monoexonic is False: - if cds_only is False: + if cds_only is False or transcript.is_coding is False or other.is_coding is False: intersection = set.intersection(transcript.introns, other.introns) else: intersection = set.intersection(transcript.combined_cds_introns, other.combined_cds_introns) - if len(intersection) > 0: - intersecting = True - else: - intersecting = False + intersecting = (len(intersection) > 0) elif transcript.monoexonic is True and other.monoexonic is True: - if transcript.start == other.start or transcript.end == other.end: - intersecting = True - else: - test_result = cls.overlap( + + if cds_only is False or transcript.is_coding is False or other.is_coding is False: + intersecting = (cls.overlap( (transcript.start, transcript.end), - (other.start, other.end) - ) - intersecting = test_result > 0 + (other.start, other.end), positive=False) > 0) + else: + intersecting = any([cls.overlap(cds_comb[0], + cds_comb[1], + positive=False) > 0] for cds_comb in itertools.product( + transcript.internal_orf_boundaries, + other.internal_orf_boundaries)) else: intersecting = False diff --git a/Mikado/parsers/bed12.py b/Mikado/parsers/bed12.py index bdda7594d..8b931a2cc 100644 --- a/Mikado/parsers/bed12.py +++ b/Mikado/parsers/bed12.py @@ -454,6 +454,7 @@ def invalid(self): ) return True + self.invalid_reason = '' return False @property diff --git a/Mikado/tests/locus_tester.py b/Mikado/tests/locus_tester.py index d038b12ff..2745b4649 100644 --- a/Mikado/tests/locus_tester.py +++ b/Mikado/tests/locus_tester.py @@ -561,8 +561,11 @@ def test_intron_not_contained_in_exon(self): for min_cdna_overlap in (0.01, 1): with self.subTest(min_cdna_overlap=min_cdna_overlap): 
                self.assertIs(MonosublocusHolder.is_intersecting(
-                    self.t1, t2, logger=None, min_cdna_overlap=min_cdna_overlap,
-                    min_cds_overlap=min_cdna_overlap), (min_cdna_overlap != 1))
+                    self.t1, t2,
+                    logger=logger,
+                    cds_only=False,
+                    min_cdna_overlap=min_cdna_overlap,
+                    min_cds_overlap=min_cdna_overlap), (min_cdna_overlap < 0.28))
 
     def test_noCDSOverlap(self):
 
diff --git a/Mikado/transcripts/transcript.py b/Mikado/transcripts/transcript.py
index 66fe3d054..80ea3e6f0 100644
--- a/Mikado/transcripts/transcript.py
+++ b/Mikado/transcripts/transcript.py
@@ -281,6 +281,7 @@ def __init__(self, *args,
         self.__blast_score = 0  # Homology score
         self.__derived_children = set()
         self.__external_scores = Namespace(default=0)
+        self.__internal_orf_transcripts = []
 
         # Starting settings for everything else
         self.chrom = None
@@ -319,7 +320,6 @@ def __init__(self, *args,
         self.__segmenttree = IntervalTree()
         self.__cds_introntree = IntervalTree()
         self._possibly_without_exons = False
-        # self.query_id = None
 
         if len(args) == 0:
             return
@@ -509,14 +509,13 @@ def __getattribute__(self, item):
 
     # ######## Class instance methods ####################
 
-    def add_exon(self, gffline, feature=None):
+    def add_exon(self, gffline, feature=None, phase=None):
         """This function will append an exon/CDS feature to the object.
         :param gffline: an annotation line
         :type gffline: (Mikado.parsers.GFF.GffLine | Mikado.parsers.GTF.GtfLine | tuple | list)
         :type feature: flag to indicate what kind of feature we are adding
         """
 
-        phase = None
         if isinstance(gffline, (tuple, list)):
             assert len(gffline) == 2
             start, end = sorted(gffline)
@@ -587,7 +586,7 @@ def add_exon(self, gffline, feature=None):
         store.append(segment)
         return
 
-    def add_exons(self, exons, features=None):
+    def add_exons(self, exons, features=None, phases=None):
         """
         Wrapper of the add_exon method for multiple lines.
@@ -604,9 +603,15 @@ def add_exons(self, exons, features=None):
                 raise InvalidTranscript("Mismatch between exons and features! %s,\t%s",
                                         exons,
                                         features)
-
-        for exon, feature in zip(exons, features):
-            self.add_exon(exon, feature)
+        if phases is None:
+            phases = [None] * len(exons)
+        elif len(phases) != len(exons):
+            raise InvalidTranscript("Mismatch between exons and phases! %s,\t%s",
+                                    exons,
+                                    phases)
+
+        for exon, feature, phase in zip(exons, features, phases):
+            self.add_exon(exon, feature=feature, phase=phase)
         return
 
     def format(self, format_name,
@@ -726,6 +731,43 @@ def get_internal_orf_beds(self):
 
             yield new_row
 
+    @property
+    def _selected_orf_transcript(self):
+
+        """This property will return the selected internal ORF as a transcript object."""
+
+        self.finalize()
+        if not self.is_coding:
+            return []
+        return self._internal_orfs_transcripts[self.selected_internal_orf_index]
+
+    @property
+    def _internal_orfs_transcripts(self):
+        """This property will return all internal ORFs as transcript objects.
+        Note: this will exclude the UTR part, even when the transcript only has one ORF."""
+
+        self.finalize()
+        if not self.is_coding:
+            return []
+        elif len(self.__internal_orf_transcripts) == len(self.internal_orfs):
+            return self.__internal_orf_transcripts
+        else:
+            for num, orf in enumerate(self.internal_orfs, start=1):
+                torf = Transcript()
+                torf.chrom, torf.strand = self.chrom, self.strand
+                torf.derives_from = self.id
+                torf.id = "{}.orf{}".format(self.id, num)
+                __exons, __phases = [], []
+                for segment in [_ for _ in orf if _[0] == "CDS"]:
+                    __exons.append(segment[1])
+                    __phases.append(segment[2])
+                torf.add_exons(__exons, features="exon", phases=None)
+                torf.add_exons(__exons, features="CDS", phases=__phases)
+                torf.finalize()
+                self.__internal_orf_transcripts.append(torf)
+
+        return self.__internal_orf_transcripts
+
     def split_by_cds(self):
         """This method is used for transcripts that have multiple ORFs.
 
         It will split them according to the CDS information into multiple transcripts.
@@ -896,6 +938,7 @@ def unfinalize(self):
             return
 
         self.internal_orfs = []
+        self.__internal_orf_transcripts = []
         self.combined_utr = []
         self.finalized = False
 

From f1691c0b0a8095827e8c38d2527c06b83bfa8648 Mon Sep 17 00:00:00 2001
From: Luca Venturini
Date: Mon, 6 Feb 2017 17:29:14 +0000
Subject: [PATCH 16/47] Removed a log message.

---
 Mikado/loci/monosublocusholder.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/Mikado/loci/monosublocusholder.py b/Mikado/loci/monosublocusholder.py
index 16f63ecbd..057b40c34 100644
--- a/Mikado/loci/monosublocusholder.py
+++ b/Mikado/loci/monosublocusholder.py
@@ -89,9 +89,6 @@ def add_transcript_to_locus(self, transcript, check_in_locus=True):
         #     min_cds_overlap = 0.2,
         #     classic_method = False
 
-        self.logger.warning("Using the MSH add_transcript_to_locus method for %s and %s",
-                            self.id, transcript.id)
-
         if check_in_locus is True and self.in_locus(
                 self,
                 transcript,

From 3e196828ca92d716771db709a8794d0c3182c860 Mon Sep 17 00:00:00 2001
From: Luca Venturini
Date: Tue, 7 Feb 2017 12:30:57 +0000
Subject: [PATCH 17/47] When printing out fragments, now Mikado will find the
 best match and specify what the target is a fragment of, with class code and
 distance.

---
 CHANGELOG.md                                |  4 +-
 MANIFEST.in                                 |  1 -
 .../configuration_blueprint.json            |  2 +-
 Mikado/configuration/scoring_blueprint.json | 44 +++++++++++++++--
 Mikado/loci/abstractlocus.py                |  4 ++
 Mikado/loci/locus.py                        | 16 +++----
 Mikado/picking/loci_processer.py            | 47 +++++++++++--------
 Mikado/scales/assigner.py                   | 10 ++--
 Mikado/tests/configuration.yaml             |  2 +-
 Mikado/transcripts/transcript.py            |  1 -
 10 files changed, 87 insertions(+), 44 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7b88783a1..c59299c0c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,9 +16,11 @@ Changes in this release:
   - "cds_only": to indicate whether we should only consider the CDS for clustering after the initial merging in the Superlocus.
   - "simple_overlap_for_monoexonic": to switch on/off the old default behaviour with monoexonic transcripts
   - "purge": whether to completely exclude failed loci, previously under "run_options"
-  - "remove_overlapping_fragments": whether to exclude fragments, previously under "run_options"
+  - "remove_overlapping_fragments": whether to exclude fragments, previously under "run_options"
+- When printing out putative fragments, now Mikado will indicate the class code of the fragment, the match it was deemed a fragment of, and the distance to that match (if they are not overlapping).
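+  For example, with "remove_overlapping_fragments" set to false, a retained fragment could be annotated along these lines in the loci GFF3 (the identifiers, coordinates and score below are purely illustrative; only the attribute names come from this patch):
+
+      Chr5  Mikado_loci  gene  26575364  26577371  10.50  -  .  ID=mikado.Chr5G487;fragment_of=mikado.Chr5G486.1;fragment_class_code=p;distance=1474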
- Deprecated the "discard_definition" flag in Mikado serialise. Now Mikado will infer on its own whether to use the definition or the ID for serialising BLAST results.
- Now AbstractLocus implementations have a private method to check the correctness of the json_conf. As a corollary, Transcript and children have been moved to their own subpackage ("transcripts") in order to break the circular dependency Mikado.loci.Abstractlocus <- Mikado.configurator <- Mikado.loci.Transcript. *Technical note*: checking the consistency of the configuration is an expensive operation, so it will be executed on demand rather than automatically.
+- Made the checks for the scoring files more robust.
- Re-written the "find_retained_introns" method of AbstractLocus, to solve some bugs found during the utilisation of the last beta. As a corollary, expanded the intervaltree module to allow searches for "tagged" intervals.
- Now the "monoloci_out" files contain the Monosublocus**Holder** step, not the Monosublocus step. This should help during fine-tuning.
- Mikado now also supports Python3.6.
diff --git a/MANIFEST.in b/MANIFEST.in
index 47e5aceb3..3de2927b5 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -6,7 +6,6 @@ include requirements.txt
 recursive-include . *py
 recursive-include Mikado *pyx
 recursive-include Mikado *pxd
-recursive-include bin *py
 recursive-include util *py
 recursive-include Mikado *snakefile *json *yaml
 recursive-include Mikado/tests *
\ No newline at end of file
diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json
index 1b8ebb6c9..e5be28546 100644
--- a/Mikado/configuration/configuration_blueprint.json
+++ b/Mikado/configuration/configuration_blueprint.json
@@ -562,7 +562,7 @@
     },
     "run_options": {
       "type": "object",
-      "required": ["procs", "preload", "single_thread"],
+      "required": ["procs", "single_thread"],
       "Comment": [
         "Generic run options.",
         "- shm: boolean flag.
If set and the DB is sqlite, it will be copied onto the /dev/shm faux partition", diff --git a/Mikado/configuration/scoring_blueprint.json b/Mikado/configuration/scoring_blueprint.json index d8b0b0705..5f5ab7832 100644 --- a/Mikado/configuration/scoring_blueprint.json +++ b/Mikado/configuration/scoring_blueprint.json @@ -9,10 +9,46 @@ "filter": { "type": "object", "properties":{ - "operator": {"type": "string", "enum": ["gt", "ge", "eq", "lt", "le", "ne", "in", "not in"], - "optional": false}, - "value": {"oneOf": [{"type": "array"}, {"type": "number"}, {"type": "boolean"}], - "optional": false} + "oneOf": [ + { + "operator": { "oneOf": ["gt", "ge", "lt", "le"], "optional": false}, + "value": {"type": "number", "optional": false} + }, + { + "operator": { "oneOf": ["ne", "eq"], "optional": false}, + "value": { + "oneOf": [{"type": "number"}, {"type": "boolean"}], + "optional": false} + }, + { + "operator": { + "oneOf": [ + "in", + "not in" + ], + "optional": false + }, + "value": {"type": "array"} + }, + { + "operator": { + "oneOf": [ + "within", + "not within" + ], + "optional": false + }, + "value": { + "type": "array", + "items": { + "type": "number" + }, + "uniqueItems": true, + "maxItems": 2, + "minItems": 2 + } + } + ] } } } diff --git a/Mikado/loci/abstractlocus.py b/Mikado/loci/abstractlocus.py index d26fa8789..66fc23b7d 100644 --- a/Mikado/loci/abstractlocus.py +++ b/Mikado/loci/abstractlocus.py @@ -232,6 +232,10 @@ def evaluate(param: str, conf: dict) -> bool: comparison = (param in conf["value"]) elif conf["operator"] == "not in": comparison = (param not in conf["value"]) + elif conf["operator"] == "within": + comparison = (param in range(conf["value"][0], conf["value"][1])) + elif conf["operator"] == "not within": + comparison = (param not in range(conf["value"][0], conf["value"][1])) else: raise ValueError("Unknown operator: {0}".format(conf["operator"])) return comparison diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py index 59fdf1c8e..eb2f90ef7 100644 --- a/Mikado/loci/locus.py +++ b/Mikado/loci/locus.py @@ -355,17 +355,15 @@ def other_is_fragment(self, This function checks whether another *monoexonic* Locus *on the opposite strand* is a fragment,by checking its classification according to Assigner.compare. - Briefly, a transcript is classified as fragment - if it follows the following criteria: - - - it is monoexonic - - it has a combined_cds_length inferior to maximal_cds - - it is classified as x,i,P """ if not isinstance(self, type(other)): raise TypeError("I can compare only loci.") + if other.primary_transcript_id == self.primary_transcript_id: + self.logger.debug("Self-comparisons are not allowed!") + return False, None + self.logger.debug("Comparing %s with %s", self.primary_transcript_id, other.primary_transcript_id) @@ -381,14 +379,14 @@ def other_is_fragment(self, if result.ccode[0] in ("i", "P", "p", "x", "X", "m", "_"): self.logger.debug("{0} is a fragment (ccode {1})".format( other.primary_transcript.id, result.ccode[0])) - return True + return True, result # Adding c's because fragments might very well be contained! 
elif other.strand is None and (result.n_f1[0] > 0 or result.ccode in ("rI", "ri")): self.logger.debug("Unstranded {0} is a fragment (ccode {1})".format( other.primary_transcript.id, result.ccode[0])) - return True + return True, result - return False + return False, None def set_json_conf(self, jconf: dict): """ diff --git a/Mikado/picking/loci_processer.py b/Mikado/picking/loci_processer.py index 6369d39f0..ee8cf020a 100644 --- a/Mikado/picking/loci_processer.py +++ b/Mikado/picking/loci_processer.py @@ -1,9 +1,11 @@ from multiprocessing import Process from multiprocessing.managers import AutoProxy import logging +from itertools import product import logging.handlers as logging_handlers import functools from ..utilities import dbutils +from ..scales.assigner import Assigner from ..loci.superlocus import Superlocus from ..parsers.GFF import GffLine from ..serializers.external import ExternalSource @@ -409,8 +411,13 @@ def remove_fragments(stranded_loci, json_conf, logger): total = 0 + stranded_loci_dict = dict() + loci_to_superloci = dict() + for stranded_locus in stranded_loci: + stranded_loci_dict[stranded_locus.id] = stranded_locus for _, locus_instance in stranded_locus.loci.items(): + loci_to_superloci[locus_instance.id] = stranded_locus.id logger.debug("Assessing whether %s could be a fragment", _) total += 1 is_fragment = locus_instance.is_putative_fragment() @@ -421,27 +428,27 @@ def remove_fragments(stranded_loci, json_conf, logger): loci_to_check[False] = loci_to_check.pop(True) loci_to_check[True] = set() - bool_remove_fragments = json_conf["pick"]["clustering"]["remove_overlapping_fragments"] + comparisons = collections.defaultdict(list) + # Produce a list of duples + + for locus_to_check, gene in product(loci_to_check[True], loci_to_check[False]): + is_to_be_filtered, comparison = gene.other_is_fragment(locus_to_check) + if is_to_be_filtered is True: + comparisons[locus_to_check.id].append(comparison) + + for locus in comparisons: + if json_conf["pick"]["clustering"]["remove_overlapping_fragments"] is True: + # A bit convoluted: use the locus ID to find the correct superlocus, then delete the ID inside the SL. 
+ del stranded_loci_dict[loci_to_superloci[locus]].loci[locus] + else: + best_comparison = sorted(comparisons[locus], reverse=True, key=Assigner.get_f1)[0] + stranded_loci_dict[loci_to_superloci[locus]].loci[locus].is_fragment = True + stranded_loci_dict[loci_to_superloci[locus]].loci[locus].attributes["fragment_of"] = best_comparison.ref_id[0] + stranded_loci_dict[loci_to_superloci[locus]].loci[locus].attributes["fragment_class_code"] = best_comparison.ccode[0] + if best_comparison.distance[0] > 0: + stranded_loci_dict[loci_to_superloci[locus]].loci[locus].attributes["distance"] = best_comparison.distance[0] + for stranded_locus in stranded_loci: - to_remove = set() - for locus_id, locus_instance in stranded_locus.loci.items(): - if locus_instance in loci_to_check[True]: - logger.debug("Checking if %s is a fragment", locus_instance.id) - - for other_locus in iter( - olocus for olocus in loci_to_check[False] - if olocus.primary_transcript_id != locus_instance.primary_transcript_id): - if other_locus.other_is_fragment( - locus_instance) is True: - if bool_remove_fragments is False: - # Just mark it as a fragment - stranded_locus.loci[locus_id].is_fragment = True - else: - to_remove.add(locus_id) - # del stranded_locus.loci[locus_id] - break - for locus_id in to_remove: - del stranded_locus.loci[locus_id] yield stranded_locus diff --git a/Mikado/scales/assigner.py b/Mikado/scales/assigner.py index 446a92a6f..c222843e2 100644 --- a/Mikado/scales/assigner.py +++ b/Mikado/scales/assigner.py @@ -194,14 +194,14 @@ def dubious_getter(dubious_result): @staticmethod def get_f1(curr_result): """ - Simple getter for F1 statistics (N_F1 and J_F1) + Simple getter for F1 statistics (N_F1, J_F1 and distance). :param curr_result: a result storer :type curr_result: ResultStorer - :return: (J_F1, N_F1) - :rtype (float, float) + :return: (J_F1, N_F1, distance) + :rtype (float, float, int) """ - return curr_result.j_f1[0], curr_result.n_f1[0] + return curr_result.j_f1[0], curr_result.n_f1[0], curr_result.distance[0] def __prepare_transcript(self, prediction: Transcript): """ @@ -479,8 +479,6 @@ def self_analyse_prediction(self, prediction: Transcript, distances): results = [] best_results = [] for gene in result_dict: - - if result_dict[gene][0].n_recall[0] > 10 or result_dict[gene][0].j_f1[0] > 0: results.extend(result_dict[gene]) best_results.append(result_dict[gene][0]) diff --git a/Mikado/tests/configuration.yaml b/Mikado/tests/configuration.yaml index 140ea58c4..49d0f5b9d 100644 --- a/Mikado/tests/configuration.yaml +++ b/Mikado/tests/configuration.yaml @@ -77,7 +77,7 @@ pick: # for profiling and debugging. # - flank: integer, maximum flank to group transcripts together for analysis. # Default: 0. - preload: false + intron_range: [60, 90] procs: 1 single_thread: false scoring_file: plants.yaml diff --git a/Mikado/transcripts/transcript.py b/Mikado/transcripts/transcript.py index 80ea3e6f0..9bd718a67 100644 --- a/Mikado/transcripts/transcript.py +++ b/Mikado/transcripts/transcript.py @@ -2820,7 +2820,6 @@ def max_exon_length(self): max_exon_length.category = "cDNA" max_exon_length.rtype = "int" - @Metric def min_exon_length(self): """This metric will return the length of the biggest exon in the transcript.""" From 68bde819c168c1cc82229c492a44eef597550f29 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Tue, 7 Feb 2017 16:11:02 +0000 Subject: [PATCH 18/47] CDS_only now considers only the selected CDS, at all stages. 
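
In sketch form, for monoexonic pairs the comparison now reduces to the raw
overlap of the two selected-ORF genomic spans, rather than of every ORF
combination. A minimal illustration, assuming (start, end) spans such as
those exposed by Transcript.selected_cds_start/end; the helper name and the
values are illustrative only:

    def selected_cds_intersect(first_span, second_span):
        # Raw overlap of the two selected-ORF spans; > 0 means intersection.
        return min(first_span[1], second_span[1]) - max(first_span[0], second_span[0]) > 0

    print(selected_cds_intersect((150, 450), (400, 900)))  # True
    print(selected_cds_intersect((150, 450), (500, 900)))  # False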
--- Mikado/loci/superlocus.py | 17 +++++++----- build.sh | 8 ++++++ meta.yaml | 54 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 7 deletions(-) create mode 100644 build.sh create mode 100644 meta.yaml diff --git a/Mikado/loci/superlocus.py b/Mikado/loci/superlocus.py index f83a23341..88e2eef19 100644 --- a/Mikado/loci/superlocus.py +++ b/Mikado/loci/superlocus.py @@ -1266,8 +1266,8 @@ def is_intersecting(cls, transcript, other, cds_only=False): if cds_only is False or transcript.is_coding is False or other.is_coding is False: intersection = set.intersection(transcript.introns, other.introns) else: - intersection = set.intersection(transcript.combined_cds_introns, - other.combined_cds_introns) + intersection = set.intersection(transcript.selected_cds_introns, + other.selected_cds_introns) intersecting = (len(intersection) > 0) elif transcript.monoexonic is True and other.monoexonic is True: @@ -1277,11 +1277,14 @@ def is_intersecting(cls, transcript, other, cds_only=False): (transcript.start, transcript.end), (other.start, other.end), positive=False) > 0) else: - intersecting = any([cls.overlap(cds_comb[0], - cds_comb[1], - positive=False) > 0] for cds_comb in itertools.product( - transcript.internal_orf_boundaries, - other.internal_orf_boundaries)) + # intersecting = any([cls.overlap(cds_comb[0], + # cds_comb[1], + # positive=False) > 0] for cds_comb in itertools.product( + # transcript.internal_orf_boundaries, + # other.internal_orf_boundaries)) + intersecting = (cls.overlap( + (transcript.selected_cds_start, transcript.selected_cds_end), + (other.selected_cds_start, other.selected_cds_end), positive=False) > 0) else: intersecting = False diff --git a/build.sh b/build.sh new file mode 100644 index 000000000..cf53f9849 --- /dev/null +++ b/build.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +pip install intervaltree sqlalchemy_utils pyfaidx python-magic drmaa snakemake + +minor=$(python -c "import sys; print(sys.version_info.minor)") + +python setup.py bdist_wheel +pip install dist/Mikado-1.0.0b10-cp3${minor}-cp3${minor}-*whl; \ No newline at end of file diff --git a/meta.yaml b/meta.yaml new file mode 100644 index 000000000..95ec518e6 --- /dev/null +++ b/meta.yaml @@ -0,0 +1,54 @@ +package: + name: mikado + version: "1.0.0beta10" + +source: + git_rev: "1.0.0beta10" + git_url: "https://github.com/lucventurini/mikado.git" + +requirements: + build: + - python + - setuptools + - wheel + - pyyaml + - jsonschema + - cython + - numpy + - networkx + - sqlalchemy + - biopython + - nose + - scikit-learn + - scipy + # - drmaa + - snakemake + - docutils + - tabulate + - ujson + run: + - python + - wheel + - pyyaml + - jsonschema + - cython + - numpy + - networkx + - sqlalchemy + - biopython + - nose + - scikit-learn + - scipy + # - drmaa + - snakemake + - docutils + - tabulate + - ujson +test: + imports: + - + +about: + home: https://mikado.readthedocs.io/ + license: GPL3 + license_file: LICENSE.txt \ No newline at end of file From 183f05f7a76fe7cf7499e34be2ac32360652c2d8 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Tue, 7 Feb 2017 18:20:00 +0000 Subject: [PATCH 19/47] Added a numeric max distance for fragments. Debugged add_transcript_feature_to_gtf. 
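
The fragment gate this adds is compact: a candidate is flagged only if its
class code belongs to the fragment set and it lies within
max_distance_for_fragments of the valid locus (2000 bp by default, per the
blueprint change below). A sketch, with the class-code set copied from
locus.py and the helper name purely illustrative:

    FRAGMENT_CCODES = {"i", "P", "p", "x", "X", "m", "_"}

    def qualifies_as_fragment(ccode, distance, max_distance=2000):
        # A match is a putative fragment if its class code qualifies
        # and it lies close enough to the valid locus.
        return ccode in FRAGMENT_CCODES and distance <= max_distance

    print(qualifies_as_fragment("p", 1474))  # True
    print(qualifies_as_fragment("p", 5000))  # False: beyond the maximum distance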
--- MANIFEST.in | 3 +- .../configuration_blueprint.json | 8 +++- Mikado/loci/locus.py | 7 +--- build.sh | 8 ++-- meta.yaml | 10 +++-- util/add_transcript_feature_to_gtf.py | 41 +++++++------------ 6 files changed, 38 insertions(+), 39 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 3de2927b5..e06f37f7e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -8,4 +8,5 @@ recursive-include Mikado *pyx recursive-include Mikado *pxd recursive-include util *py recursive-include Mikado *snakefile *json *yaml -recursive-include Mikado/tests * \ No newline at end of file +recursive-include Mikado/tests * +recursive-exclude docs/ * \ No newline at end of file diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json index e5be28546..60cd6bc19 100644 --- a/Mikado/configuration/configuration_blueprint.json +++ b/Mikado/configuration/configuration_blueprint.json @@ -645,7 +645,8 @@ "- purge: boolean, it specifies whether to remove loci where all transcripts fail the minimum checks, or whether to print them out in the subloci file instead.", "- simple_overlap_for_monoexonic: boolean. If set to true (default), then any overlap mean inclusion", "in a locus for or against a monoexonic transcript. If set to false, normal controls for the percentage", - "of overlap will apply." + "of overlap will apply.", + "- max_distance_for_fragments: maximum distance from a valid locus for another to be considered a fragment." ], "SimpleComment": [ "Parameters related to the clustering of transcripts into loci.", @@ -684,6 +685,11 @@ "simple_overlap_for_monoexonic": { "type": "boolean", "default": true + }, + "max_distance_for_fragments": { + "type": "number", + "minimum": 0, + "default": 2000 } } }, diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py index eb2f90ef7..d9bc5cbb7 100644 --- a/Mikado/loci/locus.py +++ b/Mikado/loci/locus.py @@ -369,6 +369,7 @@ def other_is_fragment(self, other.primary_transcript_id) result, _ = Assigner.compare(other.primary_transcript, self.primary_transcript) + max_distance = self.json_conf["pick"]["clustering"]["max_distance_for_fragments"] # Exclude anything which is completely contained within an intron, # or is a monoexonic fragment overlapping/in the neighborhood self.logger.debug("Comparison between {0} (strand {3}) and {1}: class code \"{2}\"".format( @@ -376,15 +377,11 @@ def other_is_fragment(self, other.primary_transcript.id, result.ccode[0], other.strand)) - if result.ccode[0] in ("i", "P", "p", "x", "X", "m", "_"): + if result.ccode[0] in ("i", "P", "p", "x", "X", "m", "_") and result.distance[0] <= max_distance: self.logger.debug("{0} is a fragment (ccode {1})".format( other.primary_transcript.id, result.ccode[0])) return True, result # Adding c's because fragments might very well be contained! 
- elif other.strand is None and (result.n_f1[0] > 0 or result.ccode in ("rI", "ri")): - self.logger.debug("Unstranded {0} is a fragment (ccode {1})".format( - other.primary_transcript.id, result.ccode[0])) - return True, result return False, None diff --git a/build.sh b/build.sh index cf53f9849..e987a7591 100644 --- a/build.sh +++ b/build.sh @@ -1,8 +1,10 @@ #!/usr/bin/env bash -pip install intervaltree sqlalchemy_utils pyfaidx python-magic drmaa snakemake +pip install intervaltree sqlalchemy_utils pyfaidx python-magic drmaa snakemake; minor=$(python -c "import sys; print(sys.version_info.minor)") -python setup.py bdist_wheel -pip install dist/Mikado-1.0.0b10-cp3${minor}-cp3${minor}-*whl; \ No newline at end of file +${PYTHON} setup.py bdist_wheel || exit 1; + +wheel=$(ls dist/*whl); +pip install --no-deps ${wheel} \ No newline at end of file diff --git a/meta.yaml b/meta.yaml index 95ec518e6..5eef452a6 100644 --- a/meta.yaml +++ b/meta.yaml @@ -3,7 +3,7 @@ package: version: "1.0.0beta10" source: - git_rev: "1.0.0beta10" + git_rev: "1.0.0b9" git_url: "https://github.com/lucventurini/mikado.git" requirements: @@ -28,6 +28,7 @@ requirements: - ujson run: - python + - setuptools - wheel - pyyaml - jsonschema @@ -44,9 +45,12 @@ requirements: - docutils - tabulate - ujson + test: - imports: - - + requires: + - nose + commands: + - python setup.py nosetests about: home: https://mikado.readthedocs.io/ diff --git a/util/add_transcript_feature_to_gtf.py b/util/add_transcript_feature_to_gtf.py index fcc888a36..db2c44039 100644 --- a/util/add_transcript_feature_to_gtf.py +++ b/util/add_transcript_feature_to_gtf.py @@ -7,9 +7,11 @@ import sys from Mikado.parsers.GTF import GTF +from Mikado.transcripts import Transcript from copy import deepcopy import operator import argparse +from collections import defaultdict class Obj(object): @@ -36,34 +38,21 @@ def main(): args.gtf.close() - for record in GTF(args.gtf.name): - if current.transcript != record.transcript: - if current.transcript is not None: - print(current, file=args.out) - exon_no = 0 - for row in filter(lambda x: x.feature == "exon", - sorted(rows, key=operator.attrgetter("start"))): - exon_no += 1 - row.attributes["exon_number"] = exon_no - print(row, file=args.out) - exon_no = 0 - for row in filter(lambda x: x.feature == "CDS", - sorted(rows, key=operator.attrgetter("start"))): - exon_no += 1 - row.attributes["exon_number"] = exon_no - print(row, file=args.out) - rows = [record] - current = deepcopy(record) - current.feature = "transcript" + transcript_lines = defaultdict(list) - else: - current.end = max(current.end, record.end) - current.start = min(current.start, record.start) - rows.append(record) + [transcript_lines[_.transcript].append(_) for _ in GTF(args.gtf.name) if _.header is False] + args.gtf.close() + transcripts = list() + + for tid in transcript_lines: + transcript = Transcript(transcript_lines[tid][0]) + transcript.add_exons(transcript_lines[tid]) + transcripts.append(transcript) + + for transcript in sorted(transcripts): + print(transcript.format("gtf"), file=args.out) - print(current, file=args.out) - for row in rows: - print(row, file=args.out) + args.out.close() if __name__ == '__main__': main() From ac4fdd48634ef6d42739127e2230795afbf7c413 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Tue, 7 Feb 2017 18:26:30 +0000 Subject: [PATCH 20/47] BF for add_transcript_feature_to_gtf --- util/add_transcript_feature_to_gtf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/util/add_transcript_feature_to_gtf.py b/util/add_transcript_feature_to_gtf.py index db2c44039..34caebb53 100644 --- a/util/add_transcript_feature_to_gtf.py +++ b/util/add_transcript_feature_to_gtf.py @@ -40,7 +40,7 @@ def main(): transcript_lines = defaultdict(list) - [transcript_lines[_.transcript].append(_) for _ in GTF(args.gtf.name) if _.header is False] + [transcript_lines[_.transcript].append(_) for _ in GTF(args.gtf.name) if _.header is False and _.is_exon is True] args.gtf.close() transcripts = list() From 9eff559acb7ef33abf698699b65189112c0403fb Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Tue, 7 Feb 2017 19:07:00 +0000 Subject: [PATCH 21/47] Now add_transcript_feature_to_gtf.py functions with sys.stdout --- util/add_transcript_feature_to_gtf.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/util/add_transcript_feature_to_gtf.py b/util/add_transcript_feature_to_gtf.py index 34caebb53..384a32dd3 100644 --- a/util/add_transcript_feature_to_gtf.py +++ b/util/add_transcript_feature_to_gtf.py @@ -23,10 +23,6 @@ def main(): """ Main script function. """ - current = Obj() - current.transcript = None - - rows = [] parser = argparse.ArgumentParser("Script to add a transcript feature to e.g. Cufflinks GTFs") parser.add_argument("gtf", type=argparse.FileType(), @@ -52,7 +48,8 @@ def main(): for transcript in sorted(transcripts): print(transcript.format("gtf"), file=args.out) - args.out.close() + if args.out is not sys.stdout: + args.out.close() if __name__ == '__main__': main() From ed3db137c4d4ae9a4a782ed86f1969a588c7d3c8 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 8 Feb 2017 11:47:54 +0000 Subject: [PATCH 22/47] Fixed a small bug in checking expressions --- Mikado/configuration/configurator.py | 29 +++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/Mikado/configuration/configurator.py b/Mikado/configuration/configurator.py index 387f2432d..6af59e4fc 100644 --- a/Mikado/configuration/configurator.py +++ b/Mikado/configuration/configurator.py @@ -224,7 +224,7 @@ def check_all_requirements(json_conf): json_conf = check_requirements(json_conf, require_schema, "not_fragmentary") - except InvalidJson as exc: + except (InvalidJson, SyntaxError) as exc: print(json_conf["not_fragmentary"]["expression"]) print(type(json_conf["not_fragmentary"]["expression"])) raise exc @@ -236,13 +236,23 @@ def check_all_requirements(json_conf): # Check requirements will MODIFY IN PLACE the expression, so the copying # must happend before, not after. 
-    json_conf = check_requirements(json_conf,
-                                   require_schema,
-                                   "as_requirements")
+    try:
+        json_conf = check_requirements(json_conf,
+                                       require_schema,
+                                       "as_requirements")
+    except (InvalidJson, SyntaxError) as exc:
+        print(json_conf["as_requirements"]["expression"])
+        print(type(json_conf["as_requirements"]["expression"]))
+        raise exc
 
-    json_conf = check_requirements(json_conf,
-                                   require_schema,
-                                   "requirements")
+    try:
+        json_conf = check_requirements(json_conf,
+                                       require_schema,
+                                       "requirements")
+    except (InvalidJson, SyntaxError) as exc:
+        print(json_conf["requirements"]["expression"])
+        print(type(json_conf["requirements"]["expression"]))
+        raise exc
 
     return json_conf
 
@@ -282,6 +292,7 @@ def check_requirements(json_conf, require_schema, index):
                 key_name = ".".join(dots)
             else:
                 key_name = ".".join(dots[:-1])
+                print(key_name)
                 key_value = dots[1]
         else:
             key_name = dots[0]
@@ -330,8 +341,8 @@ def check_requirements(json_conf, require_schema, index):
         expr = " ".join(json_conf[index]["expression"])
         newexpr = expr[:]
 
-        keys = list(key for key in re.findall(
-            "([^ ()]+)", expr) if key not in ("and", "or", "not", "xor"))
+        keys = set([key for key in re.findall(
+            "([^ ()]+)", expr) if key not in ("and", "or", "not", "xor")])
 
         diff_params = set.difference(
             set(keys), set(json_conf[index]["parameters"].keys()))

From a79c83b5ef6c546182ab5f307cd90846ff6303d7 Mon Sep 17 00:00:00 2001
From: Luca Venturini
Date: Wed, 8 Feb 2017 13:25:23 +0000
Subject: [PATCH 23/47] Fixed the assignment of scoring file from the command
 line for pick.

---
 .../configuration_blueprint.json     |  3 ++
 Mikado/configuration/configurator.py | 33 +++++++++++++++---
 Mikado/picking/picker.py             | 34 ++++++++++++++-----
 Mikado/subprograms/pick.py           | 32 ++++++++++++-----
 Mikado/utilities/log_utils.py        | 16 +++++++++
 build.sh                             |  4 +--
 6 files changed, 99 insertions(+), 23 deletions(-)

diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json
index 60cd6bc19..3f4ca22a2 100644
--- a/Mikado/configuration/configuration_blueprint.json
+++ b/Mikado/configuration/configuration_blueprint.json
@@ -56,6 +56,9 @@
         "CRITICAL"
       ],
       "default": "WARNING"
+    },
+    "log": {
+      "oneOf": [null, {"type": "string", "default": null}]
     }
   }
 },
diff --git a/Mikado/configuration/configurator.py b/Mikado/configuration/configurator.py
index 6af59e4fc..af811d24b 100644
--- a/Mikado/configuration/configurator.py
+++ b/Mikado/configuration/configurator.py
@@ -13,6 +13,7 @@
 import pickle
 import re
 from multiprocessing import get_start_method
+from logging import Logger
 import jsonschema
 import pkg_resources
 import yaml
@@ -437,7 +438,7 @@ def create_validator(simple=False):
     return validator
 
 
-def check_json(json_conf, simple=False, external_dict=None):
+def check_json(json_conf, simple=False, external_dict=None, logger=None):
     """
     Wrapper for the various checks performed on the configuration file.
@@ -452,6 +453,9 @@ def check_json(json_conf, simple=False, external_dict=None):
     :param external_dict: optional external dictionary with values to pass
     to the configuration.
:type external_dict: (dict|None) + :param logger: external logger instance + :type logger: Logger + :return json_conf :rtype: dict """ @@ -466,7 +470,9 @@ def check_json(json_conf, simple=False, external_dict=None): # with open(blue_print) as blue: # blue_print = json.load(blue) - logger = create_default_logger("check_json") + if not isinstance(logger, Logger): + logger = create_default_logger("check_json") + try: validator = create_validator(simple=simple) @@ -477,7 +483,14 @@ def check_json(json_conf, simple=False, external_dict=None): validator.validate(json_conf) assert "files" in json_conf["pick"] + overwritten = False if "scoring_file" in json_conf["pick"]: + if "scoring" in json_conf or "requirements" in json_conf: + logger.info("Overwriting the scoring configuration using '%s' as scoring file", + json_conf["pick"]["scoring_file"]) + overwritten = True + [json_conf.pop(_, None) for _ in ("scoring", "requirements", "as_requirements", "not_fragmentary")] + if os.path.exists(os.path.abspath(json_conf["pick"]["scoring_file"])): json_conf["pick"]["scoring_file"] = os.path.abspath( json_conf["pick"]["scoring_file"]) @@ -543,10 +556,14 @@ def check_json(json_conf, simple=False, external_dict=None): logger.exception(exc) raise + if overwritten is True: + logger.debug("Scoring parameters: {}".format("\n".join(["\n"] + [ + "{}: {}".format(_, json_conf["scoring"][_]) for _ in json_conf["scoring"].keys()]))) + return json_conf -def to_json(string, simple=False): +def to_json(string, simple=False, logger=None): """ Function to serialise the JSON for configuration and check its consistency. @@ -556,9 +573,15 @@ def to_json(string, simple=False): :param simple: boolean flag indicating whether we desire the simplified version of the configuration, or not. :type simple: bool + + :param logger: optional logger to be used. + :type logger: Logger + + :rtype: dict """ - logger = create_default_logger("to_json") + if not isinstance(logger, Logger): + logger = create_default_logger("to_json") try: if string is None or string == '' or string == dict(): @@ -575,7 +598,7 @@ def to_json(string, simple=False): json_dict = json.load(json_file) json_dict["filename"] = string # json_dict = frozendict(check_json(json_dict, simple=simple)) - json_dict = check_json(json_dict, simple=simple) + json_dict = check_json(json_dict, simple=simple, logger=logger) except Exception as exc: logger.exception(exc) raise diff --git a/Mikado/picking/picker.py b/Mikado/picking/picker.py index 62eb6aef8..bd13653f9 100644 --- a/Mikado/picking/picker.py +++ b/Mikado/picking/picker.py @@ -57,7 +57,7 @@ def __init__(self, json_conf, commandline=""): prepared by the json_utils functions. :param json_conf: Either a configuration dictionary or the configuration file. 
- :type json_conf: str,dict + :type json_conf: (str|dict) :param commandline: optional, the commandline used to start the program :type commandline: str @@ -73,18 +73,36 @@ def __init__(self, json_conf, commandline=""): "log_handler", "log_writer", "logger", "engine"] # Now we start the real work - if isinstance(json_conf, str): - assert os.path.exists(json_conf) - json_conf = to_json(json_conf) - else: - json_conf = check_json(json_conf) - self.commandline = commandline self.json_conf = json_conf + if isinstance(self.json_conf, str): + assert os.path.exists(self.json_conf) + self.json_conf = to_json(self.json_conf, logger=self.logger) + # pylint: disable=no-member + multiprocessing.set_start_method(self.json_conf["multiprocessing_method"], + force=True) + self.input_file = self.json_conf["pick"]["files"]["input"] + self.logging_queue = multiprocessing.Queue(-1) + self.printer_queue = multiprocessing.Queue(-1) + self.setup_logger() + elif isinstance(self.json_conf, dict): + # pylint: disable=no-member + self.input_file = self.json_conf["pick"]["files"]["input"] + multiprocessing.set_start_method(self.json_conf["multiprocessing_method"], + force=True) + self.logging_queue = multiprocessing.Queue(-1) + self.printer_queue = multiprocessing.Queue(-1) + self.setup_logger() + self.json_conf = check_json(self.json_conf, logger=self.logger) + else: + raise TypeError(type(self.json_conf)) + + assert isinstance(self.json_conf, dict) + self.regressor = None self.procs = self.json_conf["pick"]["run_options"]["procs"] - self.input_file = self.json_conf["pick"]["files"]["input"] + # Check the input file with self.define_input() as _: pass diff --git a/Mikado/subprograms/pick.py b/Mikado/subprograms/pick.py index 20907d735..00f522605 100644 --- a/Mikado/subprograms/pick.py +++ b/Mikado/subprograms/pick.py @@ -9,6 +9,7 @@ from ..picking import Picker from ..configuration.configurator import to_json, check_json from ..exceptions import UnsortedInput # , InvalidJson +from ..utilities.log_utils import create_default_logger, create_logger_from_conf def check_log_settings(args): @@ -35,7 +36,7 @@ def check_log_settings(args): return args -def check_run_options(args): +def check_run_options(args, logger=None): """ Quick method to check the consistency of run option settings from the namespace. @@ -117,7 +118,7 @@ def check_run_options(args): raise ValueError("Invalid/inexistent scoring file: {}".format(args.scoring_file)) args.json_conf["pick"]["scoring_file"] = args.scoring_file - args.json_conf = check_json(args.json_conf) + args.json_conf = check_json(args.json_conf, logger=logger) return args @@ -130,15 +131,30 @@ def pick(args): """ - args = check_log_settings(args) - args = check_run_options(args) + logger = create_default_logger("pick_init") + + args.json_conf.close() + args.json_conf = to_json(args.json_conf.name, logger=logger) + + try: + args = check_log_settings(args) + except Exception as exc: + logger.error(exc) + raise exc + + try: + args = check_run_options(args, logger=logger) + except Exception as exc: + logger.error(exc) + raise exc creator = Picker(args.json_conf, commandline=" ".join(sys.argv)) try: creator() # Run - except UnsortedInput as err: - print(err, file=sys.stderr) - sys.exit(1) + except Exception as exc: + logger.error(exc) + + sys.exit(1) def pick_parser(): @@ -154,7 +170,7 @@ def pick_parser(): help="""Number of processors to use. 
Default: look in the configuration file (1 if undefined)""") parser.add_argument("--json-conf", dest="json_conf", - type=to_json, required=True, + type=argparse.FileType("r"), required=True, help="JSON/YAML configuration file for Mikado.") parser.add_argument("--scoring-file", dest="scoring_file", type=str, default=None, diff --git a/Mikado/utilities/log_utils.py b/Mikado/utilities/log_utils.py index 52389ba93..a69e5603d 100644 --- a/Mikado/utilities/log_utils.py +++ b/Mikado/utilities/log_utils.py @@ -100,3 +100,19 @@ def create_queue_logger(instance, prefix=""): instance.logger.setLevel(instance._log_handler.level) instance.logger.propagate = False return + + +def create_logger_from_conf(conf, name="mikado", mode="a"): + + logger = logging.getLogger(name) + handle = conf["log_settings"].get("log", None) + if handle is None: + handler = logging.StreamHandler() + else: + handler = logging.FileHandler(conf["log_settings"]['log'], mode=mode) + + handler.setFormatter(formatter) + logger.setLevel(conf["log_settings"]['log_level']) + logger.addHandler(handler) + logger.propagate = False + return logger diff --git a/build.sh b/build.sh index e987a7591..98ffcedee 100644 --- a/build.sh +++ b/build.sh @@ -6,5 +6,5 @@ minor=$(python -c "import sys; print(sys.version_info.minor)") ${PYTHON} setup.py bdist_wheel || exit 1; -wheel=$(ls dist/*whl); -pip install --no-deps ${wheel} \ No newline at end of file +wheel=$(ls dist/*3${minor}*whl); +pip install --no-deps ${wheel}; \ No newline at end of file From e96848e481206e216d1cc5ecb62ac2fb2eb0395d Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 8 Feb 2017 13:44:26 +0000 Subject: [PATCH 24/47] Now reloading of the scoring file will happen only if necessary --- Mikado/configuration/configurator.py | 108 ++++++++++++++------------- Mikado/tests/locus_tester.py | 3 + 2 files changed, 61 insertions(+), 50 deletions(-) diff --git a/Mikado/configuration/configurator.py b/Mikado/configuration/configurator.py index af811d24b..daf9c2ab0 100644 --- a/Mikado/configuration/configurator.py +++ b/Mikado/configuration/configurator.py @@ -438,6 +438,39 @@ def create_validator(simple=False): return validator +def _check_scoring_file(json_conf, logger): + + overwritten = False + + if json_conf.get("__loaded_scoring", json_conf["pick"]["scoring_file"]) != json_conf["pick"]["scoring_file"]: + logger.info("Overwriting the scoring configuration using '%s' as scoring file", + json_conf["pick"]["scoring_file"]) + overwritten = True + [json_conf.pop(_, None) for _ in ("scoring", "requirements", "as_requirements", "not_fragmentary")] + + if os.path.exists(os.path.abspath(json_conf["pick"]["scoring_file"])): + json_conf["pick"]["scoring_file"] = os.path.abspath( + json_conf["pick"]["scoring_file"]) + elif os.path.exists(os.path.join( + os.path.dirname(json_conf["filename"]), + json_conf["pick"]["scoring_file"])): + json_conf["pick"]["scoring_file"] = os.path.join( + os.path.dirname(json_conf["filename"]), + json_conf["pick"]["scoring_file"]) + elif os.path.exists( + resource_filename(__name__, os.path.join("scoring_files", + json_conf["pick"]["scoring_file"]))): + json_conf["pick"]["scoring_file"] = resource_filename( + __name__, + os.path.join("scoring_files", + json_conf["pick"]["scoring_file"])) + else: + raise InvalidJson( + "Scoring file not found: {0}".format( + json_conf["pick"]["scoring_file"])) + return json_conf, overwritten + + def check_json(json_conf, simple=False, external_dict=None, logger=None): """ @@ -484,60 +517,35 @@ def check_json(json_conf, 
simple=False, external_dict=None, logger=None): assert "files" in json_conf["pick"] overwritten = False - if "scoring_file" in json_conf["pick"]: - if "scoring" in json_conf or "requirements" in json_conf: - logger.info("Overwriting the scoring configuration using '%s' as scoring file", - json_conf["pick"]["scoring_file"]) - overwritten = True - [json_conf.pop(_, None) for _ in ("scoring", "requirements", "as_requirements", "not_fragmentary")] - - if os.path.exists(os.path.abspath(json_conf["pick"]["scoring_file"])): - json_conf["pick"]["scoring_file"] = os.path.abspath( - json_conf["pick"]["scoring_file"]) - elif os.path.exists(os.path.join( - os.path.dirname(json_conf["filename"]), - json_conf["pick"]["scoring_file"])): - json_conf["pick"]["scoring_file"] = os.path.join( - os.path.dirname(json_conf["filename"]), - json_conf["pick"]["scoring_file"]) - elif os.path.exists( - resource_filename(__name__, os.path.join("scoring_files", - json_conf["pick"]["scoring_file"]))): - json_conf["pick"]["scoring_file"] = resource_filename( - __name__, - os.path.join("scoring_files", - json_conf["pick"]["scoring_file"])) - else: - raise InvalidJson( - "Scoring file not found: {0}".format( - json_conf["pick"]["scoring_file"])) - - if json_conf["pick"]["scoring_file"].endswith(("yaml", "json")): - with open(json_conf["pick"]["scoring_file"]) as scoring_file: - if json_conf["pick"]["scoring_file"].endswith("yaml"): - scoring = yaml.load(scoring_file) - else: - scoring = json.load(scoring_file) - assert isinstance(json_conf, dict) and isinstance(scoring, dict),\ - (type(json_conf), type(scoring)) + json_conf, overwritten = _check_scoring_file(json_conf, logger) + + if json_conf["pick"]["scoring_file"].endswith(("yaml", "json")): + with open(json_conf["pick"]["scoring_file"]) as scoring_file: + if json_conf["pick"]["scoring_file"].endswith("yaml"): + scoring = yaml.load(scoring_file) + else: + scoring = json.load(scoring_file) + assert isinstance(json_conf, dict) and isinstance(scoring, dict),\ + (type(json_conf), type(scoring)) + json_conf = merge_dictionaries(json_conf, scoring) + json_conf = check_all_requirements(json_conf) + json_conf = check_scoring(json_conf) + + elif json_conf["pick"]["scoring_file"].endswith(("model", "pickle")): + with open(json_conf["pick"]["scoring_file"], "rb") as forest: + scoring = pickle.load(forest) + assert isinstance(scoring, dict) + assert "scoring" in scoring and isinstance(scoring["scoring"], (RandomForestRegressor, RandomForestClassifier)) + del scoring["scoring"] json_conf = merge_dictionaries(json_conf, scoring) json_conf = check_all_requirements(json_conf) - json_conf = check_scoring(json_conf) - - elif json_conf["pick"]["scoring_file"].endswith(("model", "pickle")): + else: + raise InvalidJson( + "Invalid scoring file: {0}".format( + json_conf["pick"]["scoring_file"])) - with open(json_conf["pick"]["scoring_file"], "rb") as forest: - scoring = pickle.load(forest) - assert isinstance(scoring, dict) - assert "scoring" in scoring and isinstance(scoring["scoring"], (RandomForestRegressor, RandomForestClassifier)) - del scoring["scoring"] - json_conf = merge_dictionaries(json_conf, scoring) - json_conf = check_all_requirements(json_conf) - else: - raise InvalidJson( - "Invalid scoring file: {0}".format( - json_conf["pick"]["scoring_file"])) + json_conf["__loaded_scoring"] = json_conf["pick"]["scoring_file"] if external_dict is not None: if not isinstance(external_dict, dict): diff --git a/Mikado/tests/locus_tester.py b/Mikado/tests/locus_tester.py index 
2745b4649..6e0151c7f 100644 --- a/Mikado/tests/locus_tester.py +++ b/Mikado/tests/locus_tester.py @@ -202,6 +202,8 @@ def test_boolean_requirement(self): del jconf["requirements"] + # del jconf["scoring_file"] + jconf["requirements"] = dict() jconf["requirements"]["parameters"] = dict() jconf["requirements"]["expression"] = ["suspicious_splicing"] @@ -218,6 +220,7 @@ def test_boolean_requirement(self): "evaluated[\"suspicious_splicing\"]") jconf = configurator.check_json(jconf) + self.assertEqual( jconf["requirements"]["expression"], "evaluated[\"suspicious_splicing\"]") From 792535a4319439660ca642589e1bee932049e507 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 8 Feb 2017 14:06:53 +0000 Subject: [PATCH 25/47] Implemented unit test to verify that the external scoring file is set appropriately. --- Mikado/tests/test_system_calls.py | 42 +++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/Mikado/tests/test_system_calls.py b/Mikado/tests/test_system_calls.py index d3530b62a..34191bd2e 100644 --- a/Mikado/tests/test_system_calls.py +++ b/Mikado/tests/test_system_calls.py @@ -494,6 +494,48 @@ def test_subprocess(self): [os.remove(_) for _ in glob.glob(os.path.join(tempfile.gettempdir(), "mikado.subproc.") + "*")] + def test_different_scoring(self): + + json_conf = configurator.to_json(None) + + json_conf["pick"]["files"]["input"] = pkg_resources.resource_filename("Mikado.tests", + "mikado_prepared.gtf") + json_conf["pick"]["files"]["output_dir"] = tempfile.gettempdir() + json_conf["pick"]["files"]["loci_out"] = "mikado.test_diff.loci.gff3" + json_conf["pick"]["files"]["subloci_out"] = "mikado.test_diff.subloci.gff3" + json_conf["pick"]["files"]["monoloci_out"] = "mikado.test_diff.monoloci.gff3" + json_conf["pick"]["files"]["log"] = "mikado.test_diff.log" + json_conf["db_settings"]["db"] = pkg_resources.resource_filename("Mikado.tests", "mikado.db") + json_conf["log_settings"]["log_level"] = "WARNING" + self.assertEqual(os.path.basename(json_conf["pick"]["scoring_file"]), + "plants.yaml") + json_file = os.path.join(tempfile.gettempdir(), "mikado.yaml") + with open(json_file, "wt") as json_handle: + Mikado.subprograms.configure.print_config(yaml.dump(json_conf, default_flow_style=False), + json_handle) + + sys.argv = ["mikado", "pick", "--json-conf", json_file] + with self.assertRaises(SystemExit): + pkg_resources.load_entry_point("Mikado", "console_scripts", "mikado")() + + import csv + with open(os.path.join(json_conf["pick"]["files"]["output_dir"], "mikado.test_diff.loci.scores.tsv")) as tsv: + reader = csv.DictReader(tsv, delimiter="\t") + score_names = [_ for _ in json_conf["scoring"]] + score_header = [_ for _ in reader.fieldnames if _ not in ("tid", "parent", "score", "source_score")] + self.assertEqual(sorted(score_names), sorted(score_header)) + + scoring_file = pkg_resources.resource_filename("Mikado.tests", "scoring_only_cds.yaml") + sys.argv = ["mikado", "pick", "--json-conf", json_file, "--scoring-file", scoring_file] + with self.assertRaises(SystemExit): + pkg_resources.load_entry_point("Mikado", "console_scripts", "mikado")() + with open(os.path.join(json_conf["pick"]["files"]["output_dir"], "mikado.test_diff.loci.scores.tsv")) as tsv: + reader = csv.DictReader(tsv, delimiter="\t") + score_header = [_ for _ in reader.fieldnames if _ not in ("tid", "parent", "score", "source_score")] + self.assertEqual(score_header, ["selected_cds_length"]) + + [os.remove(_) for _ in glob.glob(os.path.join(tempfile.gettempdir(), "mikado.test_diff.") + "*")] + def 
test_purging(self): gtf = """Chr1 foo transcript 100 1000 . + . gene_id "foo1"; transcript_id "foo1.1" From 40e2b45a0d85778066040735001ef6c5ff6a4d9b Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 8 Feb 2017 14:21:08 +0000 Subject: [PATCH 26/47] BF for get_f1, now the distance is negative - ie lower distance means better --- Mikado/scales/assigner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Mikado/scales/assigner.py b/Mikado/scales/assigner.py index c222843e2..1e61b019c 100644 --- a/Mikado/scales/assigner.py +++ b/Mikado/scales/assigner.py @@ -201,7 +201,7 @@ def get_f1(curr_result): :return: (J_F1, N_F1, distance) :rtype (float, float, int) """ - return curr_result.j_f1[0], curr_result.n_f1[0], curr_result.distance[0] + return curr_result.j_f1[0], curr_result.n_f1[0], -curr_result.distance[0] def __prepare_transcript(self, prediction: Transcript): """ From cf22dac1e62770dd17356a4719109a4cf0961ad1 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 8 Feb 2017 14:25:37 +0000 Subject: [PATCH 27/47] Added the file needed for the new unit test --- Mikado/tests/scoring_only_cds.yaml | 64 ++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 Mikado/tests/scoring_only_cds.yaml diff --git a/Mikado/tests/scoring_only_cds.yaml b/Mikado/tests/scoring_only_cds.yaml new file mode 100644 index 000000000..5d1670a4b --- /dev/null +++ b/Mikado/tests/scoring_only_cds.yaml @@ -0,0 +1,64 @@ +requirements: + expression: [(combined_cds_fraction.ncrna or combined_cds_fraction.coding) and ((exon_num.multi and (cdna_length.multi or combined_cds_length.multi) and max_intron_length and min_intron_length and proportion_verified_introns_inlocus), or, + (exon_num.mono and (combined_cds_length.mono or cdna_length.mono)))] + parameters: + combined_cds_fraction.ncrna: {operator: eq, value: 0} + combined_cds_fraction.coding: {operator: gt, value: 0.35} + cdna_length.mono: {operator: gt, value: 800} + cdna_length.multi: {operator: ge, value: 400} + combined_cds_length.mono: {operator: gt, value: 300} + combined_cds_length.multi: {operator: gt, value: 100} + exon_num.mono: {operator: eq, value: 1} + exon_num.multi: {operator: gt, value: 1} + max_intron_length: {operator: le, value: 4500000} + min_intron_length: {operator: ge, value: 5} + proportion_verified_introns_inlocus: {operator: gt, value: 0} +not_fragmentary: +# expression: [((exon_num.multi and (cdna_length.multi or combined_cds_length.multi)), or, (exon_num.mono and combined_cds_length.mono))] +# parameters: +# exon_num.multi: {operator: gt, value: 1} +# cdna_length.multi: {operator: ge, value: 200} +# combined_cds_length.multi: {operator: gt, value: 150} +# exon_num.mono: {operator: eq, value: 1} +# combined_cds_length.mono: {operator: gt, value: 600} + expression: [combined_cds_length] + parameters: + combined_cds_length: {operator: gt, value: 300} +scoring: + # blast_score: {rescaling: max} + # cdna_length: {rescaling: max} + # cds_not_maximal: {rescaling: min} + # cds_not_maximal_fraction: {rescaling: min} + # exon_fraction: {rescaling: max} + # exon_num: {rescaling: max} + # five_utr_length: + # filter: {operator: le, value: 2500} + # rescaling: target + # value: 100 + # five_utr_num: + # filter: {operator: lt, value: 4} + # rescaling: target + # value: 2 + # end_distance_from_junction: + # filter: {operator: lt, value: 55} + # rescaling: min + # highest_cds_exon_number: {rescaling: max} + # intron_fraction: {rescaling: max} + # is_complete: {rescaling: target, value: true} + # 
number_internal_orfs: {rescaling: target, value: 1} + # proportion_verified_introns: {rescaling: max} + # proportion_verified_introns_inlocus: {rescaling: max} + # retained_fraction: {rescaling: min} + # retained_intron_num: {rescaling: min} + # selected_cds_fraction: {rescaling: target, value: 0.6} + # selected_cds_intron_fraction: {rescaling: max} + selected_cds_length: {rescaling: max} + # selected_cds_num: {rescaling: max} + # three_utr_length: + # filter: {operator: le, value: 2500} + # rescaling: target + # value: 300 + # three_utr_num: + # filter: {operator: lt, value: 3} + # rescaling: target + # value: 1 From a3259036ef920b58ac58fc23faadf412a87ecd90 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 8 Feb 2017 16:10:20 +0000 Subject: [PATCH 28/47] BF for ordering properly perfect matches on both strands. --- Mikado/scales/assigner.py | 5 +- Mikado/tests/assigner_tester.py | 97 +++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 1 deletion(-) diff --git a/Mikado/scales/assigner.py b/Mikado/scales/assigner.py index 1e61b019c..6e3acd405 100644 --- a/Mikado/scales/assigner.py +++ b/Mikado/scales/assigner.py @@ -201,7 +201,10 @@ def get_f1(curr_result): :return: (J_F1, N_F1, distance) :rtype (float, float, int) """ - return curr_result.j_f1[0], curr_result.n_f1[0], -curr_result.distance[0] + return (curr_result.j_f1[0], + curr_result.n_f1[0], + -curr_result.distance[0], + (curr_result.ccode[0] not in ("x", "X"))) def __prepare_transcript(self, prediction: Transcript): """ diff --git a/Mikado/tests/assigner_tester.py b/Mikado/tests/assigner_tester.py index 8c9fe92ea..129e5f3bc 100644 --- a/Mikado/tests/assigner_tester.py +++ b/Mikado/tests/assigner_tester.py @@ -19,6 +19,103 @@ class AssignerTester(unittest.TestCase): This unit test has the purpose of testing the scales module of Mikado.py. 
""" + def test_get_f1(self): + + # __slots__ = ["ref_id", "ref_gene", "ccode", + # "tid", "gid", + # "tid_num_exons", "ref_num_exons", + # "n_prec", "n_recall", "n_f1", + # "j_prec", "j_recall", "j_f1", + # "e_prec", "e_recall", "e_f1", + # "distance"] + + result_perfect = Mikado.scales.resultstorer.ResultStorer( + "t1", "g1", "=", + "p1", "pg1", "2", "2", + 100, 100, 100, + 100, 100, 100, + 100, 100, 100, + 0) + + result_perfect_j = Mikado.scales.resultstorer.ResultStorer( + "t1", "g1", "=", + "p1", "pg1", "2", "2", + 80, 80, 80, + 100, 100, 100, + 0, 0, 0, + 0) + + # This does not make any sense, but it's only for the tests + result_perfect_n = Mikado.scales.resultstorer.ResultStorer( + "t1", "g1", "j", + "p1", "pg1", "2", "2", + 100, 100, 100, + 80, 80, 80, + 0, 0, 0, + 0) + + result_imperfect = Mikado.scales.resultstorer.ResultStorer( + "t1", "g1", "j", + "p1", "pg1", "2", "2", + 80, 80, 80, + 80, 80, 80, + 0, 0, 0, + 0) + + result_near = Mikado.scales.resultstorer.ResultStorer( + "t1", "g1", "p", + "p1", "pg1", "2", "2", + 0, 0, 0, + 0, 0, 0, + 0, 0, 0, + 10) + + result_middle = Mikado.scales.resultstorer.ResultStorer( + "t1", "g1", "p", + "p1", "pg1", "2", "2", + 0, 0, 0, + 0, 0, 0, + 0, 0, 0, + 500) + + result_far = Mikado.scales.resultstorer.ResultStorer( + "t1", "g1", "p", + "p1", "pg1", "2", "2", + 0, 0, 0, + 0, 0, 0, + 0, 0, 0, + 1000) + + result_x = Mikado.scales.resultstorer.ResultStorer( + "t1", "g1", "x", + "p1", "pg1", "2", "2", + 100, 100, 100, + 100, 100, 100, + 100, 100, 100, + 1000) + + self.assertEqual(sorted( + [result_perfect, result_imperfect, result_perfect_j, result_perfect_n], + key=Mikado.scales.assigner.Assigner.get_f1, reverse=True), + [result_perfect, result_perfect_j, result_perfect_n, result_imperfect] + ) + + self.assertEqual(sorted([result_perfect, result_far], key=Mikado.scales.assigner.Assigner.get_f1, reverse=True), + [result_perfect, result_far]) + + self.assertEqual(sorted( + [result_far, result_near, result_middle, result_imperfect, result_perfect], + key=Mikado.scales.assigner.Assigner.get_f1, reverse=True), + [result_perfect, result_imperfect, result_near, result_middle, result_far] + ) + + self.assertEqual(sorted( + [result_perfect, result_x], + key=Mikado.scales.assigner.Assigner.get_f1, reverse=True), + [result_perfect, result_x] + ) + + def test_self(self): reference = Mikado.loci.Transcript() reference.start = 100 From 24131e587211c08f585db537220bbbb7447f5327 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 8 Feb 2017 16:50:54 +0000 Subject: [PATCH 29/47] Now Mikado compare will also output the location of the matches, in WebApollo-like format. 
---
 Mikado/picking/loci_processer.py |  5 ++--
 Mikado/scales/assigner.py        | 40 +++++++++++++++++++++++---------
 Mikado/scales/contrast.pyx       |  7 +++++-
 Mikado/scales/resultstorer.py    |  8 ++++---
 Mikado/tests/assigner_tester.py  | 16 ++++++-------
 Mikado/transcripts/transcript.py |  8 +++++++
 6 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/Mikado/picking/loci_processer.py b/Mikado/picking/loci_processer.py
index ee8cf020a..b1a5eb89a 100644
--- a/Mikado/picking/loci_processer.py
+++ b/Mikado/picking/loci_processer.py
@@ -262,8 +262,9 @@ def print_locus(stranded_locus,
                 level="monosubloci",
                 print_cds=not json_conf["pick"]["run_options"]["exclude_cds"])
         if mono_lines != '':
-            mono_lines = "\n".join(
-                ["{0}/{1}".format(counter, line) for line in mono_lines.split("\n")])
+            if counter is not None:
+                mono_lines = "\n".join(
+                    ["{0}/{1}".format(counter, line) for line in mono_lines.split("\n")])
             print(mono_lines, file=mono_out)
         mono_metrics_rows = [_ for _ in stranded_locus.print_monoholder_metrics()
                              if _ != {} and "tid" in _]
diff --git a/Mikado/scales/assigner.py b/Mikado/scales/assigner.py
index 6e3acd405..0aeeed2af 100644
--- a/Mikado/scales/assigner.py
+++ b/Mikado/scales/assigner.py
@@ -15,7 +15,7 @@
 from collections import namedtuple
 from functools import partial
 from logging import handlers as log_handlers
-
+from re import search as re_search
 from Mikado.transcripts.transcript import Transcript
 from .accountant import Accountant
 from .contrast import compare as c_compare
@@ -317,6 +317,13 @@ for key in ResultStorer.__slots__:
                 if key in ["gid", "tid", "distance", "tid_num_exons"]:
                     values.append(getattr(best[0], key))
+                elif key == "location":
+                    positions = [(group[0], int(group[1]), int(group[2])) for group in
+                                 [re_search(r"(.*):(\d+)\.\.(\d+)", _.location[0]).groups() for _ in best]]
+                    chrom = set(_[0] for _ in positions).pop()
+                    start = min(_[1] for _ in positions)
+                    end = max(_[2] for _ in positions)
+                    values.append("{}:{}..{}".format(chrom, start, end))
                 elif key == "ccode":
                     values.append(tuple(["f"] + [_.ccode[0] for _ in best]))
                 else:
                     values.append(getattr(best[0], key))
@@ -426,7 +433,7 @@ def self_analyse_prediction(self, prediction: Transcript, distances):
                                        prediction.parent[0],
                                        prediction.exon_num,
                                        "-",
-                                       *[0] * 9 + ["-"])
+                                       *[0] * 9 + ["-"] + [prediction.location])
             self.stat_calculator.store(prediction, best_result, None)
             results = [best_result]
         elif genes[0][1] > 0:
@@ -574,13 +581,13 @@ def get_best(self, prediction: Transcript):
         if len(distances) == 0 or distances[0][1] > self.args.distance:
             ccode = "u"
             # noinspection PyTypeChecker,PyUnresolvedReferences
             best_result = ResultStorer("-", "-",
                                        ccode,
                                        prediction.id,
                                        prediction.parent[0],
                                        prediction.exon_num,
                                        "-",
-                                       *[0] * 9 + ["-"])
+                                       *[0] * 9 + ["-"] + [prediction.location])
             self.stat_calculator.store(prediction, best_result, None)
             results = [best_result]
         elif distances[0][1] > 0:
@@ -767,13 +775,15 @@ def print_refmap(self) -> None:
                       "best_ccode", "best_tid", "best_gid",
                       "best_nRecall", "best_nPrecision", "best_nF1",
                       "best_jRecall", "best_jPrecision", "best_jF1",
-                      "best_eRecall", "best_ePrecision", "best_eF1"]
+                      "best_eRecall", "best_ePrecision", "best_eF1",
+                      "location"]
         else:
             fields = ["ref_id", "ccode", "tid", "gid",
                       "nF1", "jF1", "eF1",
                       "ref_gene",
                       "best_ccode", "best_tid", "best_gid",
-                      "best_nF1", "best_jF1", "best_eF1"]
+                      "best_nF1", "best_jF1", "best_eF1",
+                      "location"]

         out_tuple = namedtuple("refmap", fields)
         rower = csv.DictWriter(out, fields, delimiter="\t")
@@ 
-805,11 +815,13 @@ def print_refmap(self) -> None: best.tid, best.gid, best.n_recall[0], best.n_prec[0], best.n_f1[0], best.j_recall[0], best.j_prec[0], best.j_f1[0], - best.e_recall[0], best.e_prec[0], best.e_f1[0]]) + best.e_recall[0], best.e_prec[0], best.e_f1[0], + best.location[0]]) else: row = tuple([tid, gid, ",".join(best.ccode), best.tid, best.gid, - best.n_f1[0], best.j_f1[0], best.e_f1[0]]) + best.n_f1[0], best.j_f1[0], best.e_f1[0], + best.location[0]]) rows.append(row) @@ -838,7 +850,9 @@ def print_refmap(self) -> None: best_pick.gid, best_pick.n_recall[0], best_pick.n_prec[0], best_pick.n_f1[0], best_pick.j_recall[0], best_pick.j_prec[0], best_pick.j_f1[0], - best_pick.e_recall[0], best_pick.e_prec[0], best_pick.e_f1[0]) + best_pick.e_recall[0], best_pick.e_prec[0], best_pick.e_f1[0], + row[14] #Location + ) else: row = out_tuple(row[0], # Ref TID row[2], # class code @@ -850,12 +864,16 @@ def print_refmap(self) -> None: best_pick.tid, best_pick.gid, best_pick.n_f1[0], best_pick.j_f1[0], - best_pick.e_f1[0]) + best_pick.e_f1[0], + row[8] # Location + ) else: if self.args.extended_refmap is True: - row = out_tuple(*[row[0]] + ["NA"] * 12 + [row[1]] + ["NA"] * 12) + row = out_tuple(*[row[0]] + ["NA"] * 12 + [row[1]] + ["NA"] * 12 + [ + self.genes[gid][row[0]].location]) else: - row = out_tuple(*[row[0]] + ["NA"] * 6 + [row[1]] + ["NA"] * 6) + row = out_tuple(*[row[0]] + ["NA"] * 6 + [row[1]] + ["NA"] * 6 + [ + self.genes[gid][row[0]].location]) # noinspection PyProtectedMember,PyProtectedMember rower.writerow(row._asdict()) self.logger.info("Finished printing RefMap") diff --git a/Mikado/scales/contrast.pyx b/Mikado/scales/contrast.pyx index 6d475a02c..3b945e0ba 100644 --- a/Mikado/scales/contrast.pyx +++ b/Mikado/scales/contrast.pyx @@ -437,6 +437,10 @@ cpdef tuple compare(prediction, reference, bint lenient=False): ccode = "X" # "X{0}".format(ccode) reference_exon = None + location = "{}:{}..{}".format(reference.chrom, + min(reference.start, prediction.start), + max(reference.end, prediction.end)) + result = ResultStorer(reference.id, ",".join(reference.parent), ccode, prediction.id, @@ -455,7 +459,8 @@ cpdef tuple compare(prediction, reference, bint lenient=False): round(exon_precision * 100, 2), round(100 * exon_recall, 2), round(100 * exon_f1, 2), - distance) + distance, + location) if ccode == "": raise ValueError("Ccode is null;\n{0}".format(repr(result))) diff --git a/Mikado/scales/resultstorer.py b/Mikado/scales/resultstorer.py index e87218abf..ce1689b9f 100644 --- a/Mikado/scales/resultstorer.py +++ b/Mikado/scales/resultstorer.py @@ -14,7 +14,8 @@ class ResultStorer: "n_prec", "n_recall", "n_f1", "j_prec", "j_recall", "j_f1", "e_prec", "e_recall", "e_f1", - "distance"] + "distance", + "location"] def __init__(self, *args): @@ -34,7 +35,7 @@ def __init__(self, *args): self.n_prec, self.n_recall, self.n_f1,\ self.j_prec, self.j_recall, self.j_f1, \ self.e_prec, self.e_recall, self.e_f1, \ - self.distance = args + self.distance, self.location = args for index, key in enumerate(self.__slots__): if index < 3: @@ -69,9 +70,10 @@ def _asdict(self): result_dict[attr] = getattr(self, attr) for attr in (self.__slots__[6],): # prediction exons result_dict[attr] = ",".join("{0}".format(x) for x in getattr(self, attr)) - for attr in self.__slots__[7:-1]: + for attr in self.__slots__[7:-2]: result_dict[attr] = ",".join("{0:,.2f}".format(x) for x in getattr(self, attr)) result_dict["distance"] = self.distance[0] # Last attribute + result_dict["location"] = self.location[0] return 
result_dict def as_dict(self): diff --git a/Mikado/tests/assigner_tester.py b/Mikado/tests/assigner_tester.py index 129e5f3bc..a4910480a 100644 --- a/Mikado/tests/assigner_tester.py +++ b/Mikado/tests/assigner_tester.py @@ -35,7 +35,7 @@ def test_get_f1(self): 100, 100, 100, 100, 100, 100, 100, 100, 100, - 0) + 0, "chr1:100..10000") result_perfect_j = Mikado.scales.resultstorer.ResultStorer( "t1", "g1", "=", @@ -43,7 +43,7 @@ def test_get_f1(self): 80, 80, 80, 100, 100, 100, 0, 0, 0, - 0) + 0, "chr1:100..10000") # This does not make any sense, but it's only for the tests result_perfect_n = Mikado.scales.resultstorer.ResultStorer( @@ -52,7 +52,7 @@ def test_get_f1(self): 100, 100, 100, 80, 80, 80, 0, 0, 0, - 0) + 0, "chr1:100..10000") result_imperfect = Mikado.scales.resultstorer.ResultStorer( "t1", "g1", "j", @@ -60,7 +60,7 @@ def test_get_f1(self): 80, 80, 80, 80, 80, 80, 0, 0, 0, - 0) + 0, "chr1:100..10000") result_near = Mikado.scales.resultstorer.ResultStorer( "t1", "g1", "p", @@ -68,7 +68,7 @@ def test_get_f1(self): 0, 0, 0, 0, 0, 0, 0, 0, 0, - 10) + 10, "chr1:100..10000") result_middle = Mikado.scales.resultstorer.ResultStorer( "t1", "g1", "p", @@ -76,7 +76,7 @@ def test_get_f1(self): 0, 0, 0, 0, 0, 0, 0, 0, 0, - 500) + 500, "chr1:100..10000") result_far = Mikado.scales.resultstorer.ResultStorer( "t1", "g1", "p", @@ -84,7 +84,7 @@ def test_get_f1(self): 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1000) + 1000, "chr1:100..10000") result_x = Mikado.scales.resultstorer.ResultStorer( "t1", "g1", "x", @@ -92,7 +92,7 @@ def test_get_f1(self): 100, 100, 100, 100, 100, 100, 100, 100, 100, - 1000) + 1000, "chr1:100..10000") self.assertEqual(sorted( [result_perfect, result_imperfect, result_perfect_j, result_perfect_n], diff --git a/Mikado/transcripts/transcript.py b/Mikado/transcripts/transcript.py index 9bd718a67..8158605d8 100644 --- a/Mikado/transcripts/transcript.py +++ b/Mikado/transcripts/transcript.py @@ -1322,6 +1322,14 @@ def gene(self): return self.attributes["gene_id"] + @property + def location(self): + """Web-apollo compatible string for the location of the transcript.""" + + return "{}:{}..{}".format(self.chrom, + self.start, + self.end) + @property def score(self): """Numerical value which summarizes the reliability of the transcript.""" From b3bd2ee5f006407f6f47c2c1752e8a6b1c8365db Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 8 Feb 2017 17:59:58 +0000 Subject: [PATCH 30/47] Started to work on moving the fragment class codes into the configuration. Started on a util to print out the class code table, for convenience. --- CHANGELOG.md | 1 + .../configuration_blueprint.json | 27 +++ Mikado/subprograms/util/class_codes.py | 165 ++++++++++++++++++ 3 files changed, 193 insertions(+) create mode 100644 Mikado/subprograms/util/class_codes.py diff --git a/CHANGELOG.md b/CHANGELOG.md index c59299c0c..c96548608 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ Changes in this release: - "simple_overlap_for_monoexonic": to switch on/off the old default behaviour with monoexonic transcripts - "purge": whether to completely exclude failed loci, previously under "run_options" - "remove_overlapping_fragments": whether to exclude fragments, previously under "run_options" +- Mikado compare now also provides the location of the matches in TMAP and REFMAP files. - When printing out putative fragments, now Mikado will indicate the class code of the fragment, the match against which it was deemed a fragment of, and the distance of said fragment (if they are not overlapping). 
- Deprecated the "discard_definition" flag in Mikado serialise. Now Mikado will infer on its own whether to use the definition or the ID for serialising BLAST results. - Now AbstractLocus implementations have a private method to check the correctness of the json_conf. As a corollary, Transcript and children have been moved to their own subpackage ("transcripts") in order to break the circular dependency Mikado.loci.Abstractlocus <- Mikado.configurator <- Mikado.loci.Transcript. *Technical note*: checking the consinstency of the configuration is an expensive operation, so it will be executed on demand rather than automatically. diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json index 3f4ca22a2..6f4d74223 100644 --- a/Mikado/configuration/configuration_blueprint.json +++ b/Mikado/configuration/configuration_blueprint.json @@ -693,6 +693,33 @@ "type": "number", "minimum": 0, "default": 2000 + }, + "fragment_class_codes": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "p", + "P", + "i", + "I", + "ri", + "rI", + "x", + "X", + "m", + "_" + ] + }, + "default": [ + "p", + "P", + "x", + "X", + "i", + "m", + "_" + ] } } }, diff --git a/Mikado/subprograms/util/class_codes.py b/Mikado/subprograms/util/class_codes.py new file mode 100644 index 000000000..50a599cc5 --- /dev/null +++ b/Mikado/subprograms/util/class_codes.py @@ -0,0 +1,165 @@ +from collections import OrderedDict as odict + +codes = odict() + +codes["="] = {"Definition": "Complete intron chain match.", + "ref_multi": True, + "pred_multi": True, + "nucl": "NA", + "junc": "100%, 100%, 100%", + "reverse": "=", + "category": "Match"} +codes["_"] = {"Definition": "Complete match between two monoexonic transcripts.", + "ref_multi": False, + "pred_multi": False, + "nucl": "NA", + "junc": "NA, NA, >=80%", + "reverse": "_", + "category": "Match"} +codes["n"] = {"Definition": """Intron chain extension, ie. both transcripts are multiexonic and +the prediction has novel splice sites outside of the reference transcript boundaries.""", + "ref_multi": True, + "pred_multi": True, + "nucl": "100%, < 100%, < 100%", + "junc": "100%, < 100%, < 100%", + "reverse": "c", + "category": "Extension"} +codes["J"] ={"Definition": """Intron chain extension, ie. 
both transcripts are multiexonic and +the prediction has novel splice sites inside of the reference transcript boundaries.""", + "ref_multi": True, + "pred_multi": True, + "nucl": "100%, <= 100%, < 100%", + "junc": "100%, < 100%, < 100%", + "reverse": "C", + "category": "Extension"} +codes["c"] = {"Definition": """The prediction is either multiexonic and with its intron chain completely contained +within that of the reference, or monoexonic and contained within one of the reference exons.""", + "ref_multi": "NA", + "pred_multi": "NA", + "nucl": "< 100%, 100%, NA", + "junc": "< 100%, 100%, NA", + "reverse": "n", + "category": "Extension"}, +codes["C"] = {"Definition": """The prediction intron chain is completely contained within that of the reference +transcript, but it partially debords either into its introns or outside of the reference boundaries.""", + "ref_multi": True, + "pred_multi": True, + "nucl": "<= 100%, < 100%, < 100%", + "junc": "< 100%, 100%, < 100%", + "reverse": "J or j", + "category": "Extension"} +codes["j"] = {"Definition": """Alternative splicing event.""", + "ref_multi": True, + "pred_multi": True, + "nucl": "NA", + "junc": "<= 100%, 100%, < 100%", + "reverse": "j", + "category": "Alternative splicing"} + + + + +# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ +# | **j** | Alternative splicing event. | True | True | NA | <= 100%, < 100%, | **j** | **Alternative | +# | | | | | | < 100% | | splicing** | +# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ +# | **h** | Structural match between two | True | True | > 0%, > 0%, > 0% | 0%, 0%, 0% | **h** | **Alternative | +# | | models where no splice site | | | | | | splicing** | +# | | is conserved but **at least**| | | | | | | +# | | one intron of the reference | | | | | | | +# | | and one intron of the | | | | | | | +# | | prediction partially overlap.| | | | | | | +# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ +# | **g** | The monoexonic prediction | True | False | > 0%, > 0%, | 0% | **G** | **Alternative | +# | ("mo" before | overlaps one or more exons of| | | between 0 and 100%| | | splicing** | +# | release 1) | the reference transcript; the| | | | | | | +# | | borders of the prediction | | | | | | | +# | | cannot fall inside the | | | | | | | +# | | introns of the reference. | | | | | | | +# | | The prediction transcript | | | | | | | +# | | can bridge multiple exons | | | | | | | +# | | of the reference model. | | | | | | | +# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ +# | **G** | Generic match of a | False | True | > 0%, > 0%, > 0% | 0% | **g** | **Alternative | +# | ("O" before | multiexonic prediction | | | | | | splicing** | +# | release 1) | transcript versus a | | | | | | | +# | | monoexonic reference. 
| | | | | | | +# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ +# | **o** | Generic overlap between two | True | True | > 0%, > 0%, > 0% | 0%, 0%, 0% | **o** | **Overlap** | +# | | multiexonic transcripts, | | | | | | | +# | | which do not share **any** | | | | | | | +# | | overlap among their introns. | | | | | | | +# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ +# | **e** | Single exon transcript | True | False | > 0%, > 0%, | 0% | **G** | **Overlap** | +# | | overlapping *one* reference | | | between 0 and 100%| | | | +# | | exon and at least 10 bps of a| | | | | | | +# | | reference intron, indicating | | | | | | | +# | | a possible pre-mRNA fragment.| | | | | | | +# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ +# | **m** | Generic match between two | False | False | NA, NA, **< 80%** | NA | **m** | **Overlap** | +# | | monoexonic transcripts. | | | | | | | +# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ +# | **i** | Monoexonic prediction | True | False | 0% | 0% | **ri** | **Intronic** | +# | | completely contained within | | | | | | | +# | | one intron of the reference | | | | | | | +# | | transcript. | | | | | | | +# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ +# | **I** | Prediction completely | True | True | 0% | 0% | **rI** | **Intronic** | +# | | contained within the introns | | | | | | | +# | | of the reference transcript. | | | | | | | +# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ +# | **rI** | Reference completely | True | True | 0% | 0% | **I** | **Intronic** | +# | | contained within the introns | | | | | | | +# | | of the prediction transcript.| | | | | | | +# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ +# | **ri** | Reverse intron transcript - | False | True | 0% | 0% | **i** | **Intronic** | +# | | the monoexonic reference is | | | | | | | +# | | completely contained within | | | | | | | +# | | one intron of the prediction | | | | | | | +# | | transcript. | | | | | | | +# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ +# | **f** | Fusion - this special code | NA | NA | **> 10%**, NA, NA | **> 0%**, NA, NA | NA | **Fusion** | +# | | is applied when a prediction | | | | | | | +# | | intersects more than one | | | | | | | +# | | reference transcript. To be | | | | | | | +# | | considered for fusions, | | | | | | | +# | | candidate references must | | | | | | | +# | | **either** share at least one| | | | | | | +# | | splice junction with the | | | | | | | +# | | prediction, **or** have at | | | | | | | +# | | least 10% of its bases | | | | | | | +# | | recalled. 
If two or more | | | | | | | +# | | reference transcripts fit | | | | | | | +# | | these constraints, then the | | | | | | | +# | | prediction model is | | | | | | | +# | | classified as a **fusion**. | | | | | | | +# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ +# | **x** | Monoexonic match on the | NA | False | >= 0% | 0% | **x** or **X** | **Fragment** | +# | | *opposite* strand. | | | | | | | +# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ +# | **X** | Multiexonic match on the | NA | True | >= 0% | 0% | **x** or **X** | **Fragment** | +# | | *opposite* strand. | | | | | | | +# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ +# | **p** | The prediction is on the same| NA | NA | 0% | 0% | **p** | **No overlap** | +# | | strand of a neighbouring but | | | | | | | +# | | non-overlapping transcript. | | | | | | | +# | | Probable polymerase run-on. | | | | | | | +# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ +# | **P** | The prediction is on the | NA | NA | 0% | 0% | **P** | **No overlap** | +# | | *opposite* strand of a | | | | | | | +# | | neighbouring but | | | | | | | +# | | non-overlapping transcript. | | | | | | | +# | | Probable polymerase run-on. | | | | | | | +# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ +# | **u** | Unknown - no suitable model | NA | NA | 0% | 0% | NA | **No overlap** | +# | | has been found near enough | | | | | | | +# | | the prediction to perform a | | | | | | | +# | | comparison. | | | | | | | +# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ + + + + + + + From d15cc3450a697f01778d60790dd0b3f038ee7b9b Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Thu, 9 Feb 2017 13:28:05 +0000 Subject: [PATCH 31/47] Moved the options regarding fragments to a new subsection. Added the class_codes utility. This utility and metrics now support interactive querying. --- CHANGELOG.md | 21 +- .../configuration_blueprint.json | 51 +- Mikado/loci/locus.py | 8 +- Mikado/picking/loci_processer.py | 4 +- Mikado/picking/picker.py | 13 +- Mikado/scales/class_codes.py | 483 ++++++++++++++++++ Mikado/subprograms/util/__init__.py | 7 + Mikado/subprograms/util/class_codes.py | 248 ++++----- Mikado/subprograms/util/metrics.py | 24 +- 9 files changed, 663 insertions(+), 196 deletions(-) create mode 100644 Mikado/scales/class_codes.py diff --git a/CHANGELOG.md b/CHANGELOG.md index c96548608..46beec374 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,14 +10,21 @@ Changes in this release: - one intron of either transcript is completely contained within an exon of the other. OR - at least one of the transcripts is monoexonic and there is some overlap of any kind. This behaviour (which was the default until this release) can be switched off through pick/clustering/simple_overlap_for_monoexonic (default true). -- **MAJOR**: changed slightly the anatomy of the configuration files. 
Now "pick" has a new subsection, "clustering", dedicated to how to cluster the transcripts in the different steps. Currently it contains the keys: - - "flank" - - "min_cdna_overlap" and "min_cds_overlap" (for the second clustering during the monosublocusHolder phase) - - "cds_only": to indicate whether we should only consider the CDS for clustering after the initial merging in the Superlocus. - - "simple_overlap_for_monoexonic": to switch on/off the old default behaviour with monoexonic transcripts - - "purge": whether to completely exclude failed loci, previously under "run_options" - - "remove_overlapping_fragments": whether to exclude fragments, previously under "run_options" +- **MAJOR**: changed slightly the anatomy of the configuration files. Now "pick" has two new subsections, "clustering" and "fragments". + - Clustering: dedicated to how to cluster the transcripts in the different steps. Currently it contains the keys: + - "flank" + - "min_cdna_overlap" and "min_cds_overlap" (for the second clustering during the monosublocusHolder phase) + - "cds_only": to indicate whether we should only consider the CDS for clustering after the initial merging in the Superlocus. + - "simple_overlap_for_monoexonic": to switch on/off the old default behaviour with monoexonic transcripts + - "purge": whether to completely exclude failed loci, previously under "run_options" + - Fragments: dedicated to how to identify and treat putative fragments. Currently it contains the keys: + - "remove": whether to exclude fragments, previously under "run_options" + - "valid_class_codes": which class codes constitute a fragment match. Only class codes in the "Intronic", "Overlap" (inclusive of _) and "Fragment" categories are allowed. + - max_distance: for non-overlapping fragments (ie p and P), maximum distance from the gene. - Mikado compare now also provides the location of the matches in TMAP and REFMAP files. +- Introduced a new utility, "class_codes", to print out the information of the class codes. The definition of class codes is now contained in a subpackage of "scales". +- The "metrics" utility now allows for interactive querying based on category or metric name. +- The class code repertoire for putative fragments has been expanded, and made configurable through the "fragments" section. - When printing out putative fragments, now Mikado will indicate the class code of the fragment, the match against which it was deemed a fragment of, and the distance of said fragment (if they are not overlapping). - Deprecated the "discard_definition" flag in Mikado serialise. Now Mikado will infer on its own whether to use the definition or the ID for serialising BLAST results. - Now AbstractLocus implementations have a private method to check the correctness of the json_conf. As a corollary, Transcript and children have been moved to their own subpackage ("transcripts") in order to break the circular dependency Mikado.loci.Abstractlocus <- Mikado.configurator <- Mikado.loci.Transcript. *Technical note*: checking the consinstency of the configuration is an expensive operation, so it will be executed on demand rather than automatically. 
diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json index 6f4d74223..859cfd423 100644 --- a/Mikado/configuration/configuration_blueprint.json +++ b/Mikado/configuration/configuration_blueprint.json @@ -654,47 +654,60 @@ "SimpleComment": [ "Parameters related to the clustering of transcripts into loci.", "- flank: maximum distance for transcripts to be clustered within the same superlocus." - ], - "properties":{ + ], + "properties": { "cds_only": { "type": "boolean", "default": false }, - "min_cds_overlap":{ + "min_cds_overlap": { "type": "number", "minimum": 0.000001, "maximum": 1, "default": 0.2 }, - "min_cdna_overlap":{ + "min_cdna_overlap": { "type": "number", "minimum": 0.000001, "maximum": 1, "default": 0.2 }, - "flank":{ + "purge": { + "type": "boolean", + "default": true + }, + "flank": { "type": "integer", "minimum": 0, "default": 200 }, - "remove_overlapping_fragments": { - "type": "boolean", - "default": true - }, - "purge": { + "simple_overlap_for_monoexonic": { "type": "boolean", "default": true - }, - "simple_overlap_for_monoexonic": { + } + } + }, + "fragments":{ + "type": "object", + "Comment": ["Parameters related to the handling of fragments.", + "- remove: boolean. Whether to remove fragments or leave them, properly tagged.", + "- max_distance: maximum distance of a putative fragment from a valid gene.", + "- valid_class_codes: which class codes will be considered as fragments. Default:", + "(p, P, x, X, i, m, _). Choices: '_' plus any class code with category", + "'Intronic', 'Fragment', or 'Overlap'."], + "SimpleComment": ["Parameters related to the handling of fragments.", + "- remove: boolean. Whether to remove fragments or leave them, properly tagged."], + "properties": { + "remove": { "type": "boolean", "default": true }, - "max_distance_for_fragments": { + "max_distance": { "type": "number", "minimum": 0, "default": 2000 }, - "fragment_class_codes": { + "valid_class_codes": { "type": "array", "items": { "type": "string", @@ -708,7 +721,9 @@ "x", "X", "m", - "_" + "_", + "e", + "o" ] }, "default": [ @@ -721,9 +736,9 @@ "_" ] } - } - }, - "files": { + } + }, + "files": { "type": "object", "Comment": [ "Input and output files for Mikado pick.", diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py index d9bc5cbb7..cfae342b3 100644 --- a/Mikado/loci/locus.py +++ b/Mikado/loci/locus.py @@ -369,19 +369,17 @@ def other_is_fragment(self, other.primary_transcript_id) result, _ = Assigner.compare(other.primary_transcript, self.primary_transcript) - max_distance = self.json_conf["pick"]["clustering"]["max_distance_for_fragments"] - # Exclude anything which is completely contained within an intron, - # or is a monoexonic fragment overlapping/in the neighborhood + max_distance = self.json_conf["pick"]["fragments"]["max_distance"] self.logger.debug("Comparison between {0} (strand {3}) and {1}: class code \"{2}\"".format( self.primary_transcript.id, other.primary_transcript.id, result.ccode[0], other.strand)) - if result.ccode[0] in ("i", "P", "p", "x", "X", "m", "_") and result.distance[0] <= max_distance: + if (result.ccode[0] in self.json_conf["pick"]["fragments"]["valid_class_codes"] and + result.distance[0] <= max_distance): self.logger.debug("{0} is a fragment (ccode {1})".format( other.primary_transcript.id, result.ccode[0])) return True, result - # Adding c's because fragments might very well be contained! 
return False, None diff --git a/Mikado/picking/loci_processer.py b/Mikado/picking/loci_processer.py index b1a5eb89a..f4e53ca08 100644 --- a/Mikado/picking/loci_processer.py +++ b/Mikado/picking/loci_processer.py @@ -282,7 +282,7 @@ def print_locus(stranded_locus, for locus in stranded_locus.loci: gene_counter += 1 fragment_test = ( - json_conf["pick"]["clustering"]["remove_overlapping_fragments"] + json_conf["pick"]["fragments"]["remove"] is True and stranded_locus.loci[locus].is_fragment is True) if fragment_test is True: @@ -438,7 +438,7 @@ def remove_fragments(stranded_loci, json_conf, logger): comparisons[locus_to_check.id].append(comparison) for locus in comparisons: - if json_conf["pick"]["clustering"]["remove_overlapping_fragments"] is True: + if json_conf["pick"]["fragments"]["remove"] is True: # A bit convoluted: use the locus ID to find the correct superlocus, then delete the ID inside the SL. del stranded_loci_dict[loci_to_superloci[locus]].loci[locus] else: diff --git a/Mikado/picking/picker.py b/Mikado/picking/picker.py index bd13653f9..7c946ff39 100644 --- a/Mikado/picking/picker.py +++ b/Mikado/picking/picker.py @@ -134,18 +134,23 @@ def __init__(self, json_conf, commandline=""): force=True) self.logging_queue = multiprocessing.Queue(-1) self.printer_queue = multiprocessing.Queue(-1) - self.setup_logger() + # self.setup_logger() self.logger.info("Multiprocessing method: %s", self.json_conf["multiprocessing_method"]) for key in ("remove_overlapping_fragments", "flank", "purge"): if key in self.json_conf["pick"]["run_options"]: # Put warnings in place for the deprecation of some options. + + if key == "remove_overlapping_fragments": + self.json_conf["pick"]["fragments"]["remove"] = self.json_conf["pick"]["run_options"].pop(key) + new_home = "fragments/remove" + else: + self.json_conf["pick"]["clustering"][key] = self.json_conf["pick"]["run_options"].pop(key) + new_home = "clustering/{}".format(key) warns = PendingDeprecationWarning( - """The \"{}\" property has now been moved to the pick/clustering section. -Please update your configuration files in the future.""".format(key)) + """The \"{}\" property has now been moved to pick/{}. Please update your configuration files in the future.""".format(key, new_home)) self.logger.warn(warns) - self.json_conf["pick"]["clustering"][key] = self.json_conf["pick"]["run_options"][key] self.context = multiprocessing.get_context() if self.json_conf["pick"]["scoring_file"].endswith((".pickle", ".model")): diff --git a/Mikado/scales/class_codes.py b/Mikado/scales/class_codes.py new file mode 100644 index 000000000..d427c0a38 --- /dev/null +++ b/Mikado/scales/class_codes.py @@ -0,0 +1,483 @@ +""" +This module contains the definitions of the class codes, using a custom class. 
+""" + + +from collections import OrderedDict as odict + +def _is_digit(value): + if not ((value is None) or (isinstance(value, (float, int)) and 0 <= value <= 100)): + raise ValueError("Invalid numeric value: {}, type: {}".format(value, type(value))) + return True + + +def _is_boolean(value): + if value not in (True, False, None): + raise ValueError("Invalid boolean value: {}, type: {}".format(value, type(value))) + return True + + +class ClassCode: + + """Container for the class codes .""" + + def __init__(self, code): + + self.__code = code + self.__definition = None + self.__ref_multi = None + self.__pred_multi = None + self.__nucl_f1, self.__nucl_prec, self.__nucl_rec = None, None, None + self.__junc_f1, self.__junc_prec, self.__junc_rec = None, None, None + self.__reverse = None + self.__category = None + + def __eq__(self, other): + if hasattr(other, "code") and self.code == other.code: + return True + else: + return False + + def __hash__(self): + return hash(self.code) + + @property + def code(self): + return self.__code + + @property + def _nucl_prec(self): + return self.__nucl_prec + + @_nucl_prec.setter + def _nucl_prec(self, value): + self.__nucl_prec = value + + @property + def _nucl_rec(self): + return self.__nucl_rec + + @_nucl_rec.setter + def _nucl_rec(self, value): + self.__nucl_rec = value + + @property + def _nucl_f1(self): + return self.__nucl_f1 + + @_nucl_f1.setter + def _nucl_f1(self, value): + self.__nucl_f1 = value + + @property + def nucl(self): + if self._nucl_f1 is None and self._nucl_rec is None and self._nucl_prec is None: + return "NA" + else: + line = [] + for val in (self._nucl_rec, self._nucl_prec, self._nucl_f1): + if val is not None: + line.append("{}%".format(val)) + else: + line.append("NA") + return ", ".join(line) + + @property + def junc(self): + if self._junc_f1 is None and self._junc_rec is None and self._junc_prec is None: + return "NA" + else: + line = [] + for val in (self._junc_rec, self._junc_prec, self._junc_f1): + if val is not None: + line.append("{}%".format(val)) + else: + line.append("NA") + return ", ".join(line) + + @property + def _junc_prec(self): + return self.__junc_prec + + @_junc_prec.setter + def _junc_prec(self, value): + self.__junc_prec = value + + @property + def _junc_rec(self): + return self.__junc_rec + + @_junc_rec.setter + def _junc_rec(self, value): + self.__junc_rec = value + + @property + def _junc_f1(self): + return self.__junc_f1 + + @_junc_f1.setter + def _junc_f1(self, value): + self.__junc_f1 = value + + @property + def pred_multi(self): + if self.__pred_multi is None: + return "NA" + else: + return self.__pred_multi + + @pred_multi.setter + def pred_multi(self, value): + if _is_boolean(value): + self.__pred_multi = value + + @property + def ref_multi(self): + if self.__ref_multi is None: + return "NA" + else: + return self.__ref_multi + + @ref_multi.setter + def ref_multi(self, value): + if _is_boolean(value): + self.__ref_multi = value + + @property + def definition(self): + if self.__definition is None: + return "NA" + else: + return self.__definition + + @definition.setter + def definition(self, value): + if isinstance(value, bytes): + value = value.decode() + elif not (isinstance(value, str) or value is None): + raise ValueError("Invalid value for definition: {}, type {}".format(value, type(value))) + self.__definition = value + + @property + def category(self): + if self.__category is None: + return "NA" + else: + return self.__category + + @category.setter + def category(self, value): + if 
isinstance(value, bytes): + value = value.decode() + elif not (isinstance(value, str) or value is None): + raise ValueError("Invalid value for category: {}, type {}".format(value, type(value))) + self.__category = value + + @property + def reverse(self): + if self.__reverse is None: + return "NA" + else: + return self.__reverse + + @reverse.setter + def reverse(self, value): + if isinstance(value, bytes): + value = value.decode() + elif not (isinstance(value, str) or value is None): + raise ValueError("Invalid value for reverse: {}, type {}".format(value, type(value))) + + self.__reverse = value + + +def code_equal(): + equal = ClassCode("=") + equal.definition = "Complete intron chain match." + equal.pred_multi, equal.ref_multi = True, True + equal._junc_f1, equal._junc_prec, equal._junc_rec = [100] * 3 + equal.reverse = "=" + equal.category = "Match" + return equal + + +def code_underscore(): + underscore = ClassCode("_") + underscore.definition = "Complete match between two monoexonic transcripts." + underscore.ref_multi, underscore.pred_multi = False, False + underscore._nucl_f1 = ">=80" + underscore.reverse = "_" + underscore.category = "Match" + return underscore + + +def code_n(): + code = ClassCode("n") + code.definition = """Intron chain extension, ie. both transcripts are multiexonic and + the prediction has novel splice sites outside of the reference transcript boundaries.""" + code.ref_multi, code.pred_multi = True, True + code._nucl_rec, code._nucl_prec, code._nucl_f1 = (100, "< 100", "<100") + code._junc_rec, code._junc_prec, code._junc_f1 = (100, "< 100", "<100") + code.reverse = "c" + code.category = "Extension" + return code + + +def code_capital_j(): + code = ClassCode("J") + code.definition = """Intron chain extension, ie. both transcripts are multiexonic and + the prediction has novel splice sites inside of the reference transcript boundaries.""" + code.ref_multi, code.pred_multi = True, True + code._nucl_rec, code._nucl_prec, code._nucl_f1 = (100, "<= 100", "<100") + code._junc_rec, code._junc_prec, code._junc_f1 = (100, "< 100", "<100") + code.reverse = "C" + code.category = "Extension" + return code + + +def code_c(): + code = ClassCode("c") + code.definition = """The prediction is either multiexonic and with its intron chain completely contained + within that of the reference, or monoexonic and contained within one of the reference exons.""" + code.pred_multi, code.ref_multi = None, None + code._nucl_rec, code._nucl_prec, code._nucl_f1 = "< 100", "100", None + code._junc_rec, code._junc_prec, code._junc_f1 = "< 100", "100", None + code.reverse = "n" + code.category = "Extension" + return code + + +def code_capital_c(): + code = ClassCode("C") + code.definition = """The prediction intron chain is completely contained within that of the reference + transcript, but it partially debords either into its introns or outside of the reference boundaries.""" + code.pred_multi, code.ref_multi = True, True + code._nucl_rec, code._nucl_prec, code._nucl_f1 = "<= 100", "< 100", "< 100" + code._junc_rec, code._junc_prec, code._junc_f1 = "< 100", "100", "< 100" + code.reverse = "J or j" + code.category = "Extension" + return code + + +def code_j(): + code = ClassCode("j") + code.definition = """Alternative splicing event.""" + code.ref_multi, code.pred_multi = True, True + code._junc_rec, code._junc_prec, code._junc_f1 = "<= 100", "100", "< 100" + code.reverse = "j or C" + code.category = "Alternative splicing" + return code + + +def code_h(): + code = ClassCode("h") + 
code.definition = """Structural match between two models where where no splice site is conserved but at least + one intron of the reference and one intron of the prediction partially overlap.""" + code.ref_multi, code_n.pred_multi = True, True + code._nucl_rec, code._nucl_prec, code._nucl_f1 = "> 0", "> 0", "> 0" + code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 + code.reverse = "h" + code.category = "Alternative splicing" + return code + + +def code_g(): + code = ClassCode("g") + code.definition = """The monoexonic prediction overlaps one or more exons of the reference transcript; + the borders of the prediction cannot fall inside the introns of the reference. + The prediction transcript can bridge multiple exons of the reference model.""" + code.ref_multi, code.pred_multi = True, False + code._nucl_rec, code._nucl_prec, code._nucl_f1 = "> 0", "> 0", "0% < F1 < 100" + code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 + code.reverse = "G" + code.category = "Alternative splicing" + return code + + +def code_capital_g(): + code = ClassCode("G") + code.definition = """Generic match of a multiexonic prediction transcript versus a monoexonic reference.""" + code.ref_multi, code.pred_multi = False, True + code._nucl_rec, code._nucl_prec, code._nucl_f1 = "> 0", "> 0", "0% < F1 < 100" + code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 + code.reverse = "g" + code.category = "Alternative splicing" + return code + + +def code_o(): + code = ClassCode("o") + code.definition = """Generic overlap between two multiexonic transcripts, + which do not share any overlap among their introns.""" + code.ref_multi, code.pred_multi = True, True + code.ref_multi, code.pred_multi = True, True + code._nucl_rec, code._nucl_prec, code._nucl_f1 = "> 0", "> 0", "0% < F1 < 100" + code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 + code.reverse = "o" + code.category = "Overlap" + return code + + +def code_e(): + code = ClassCode("e") + code.definition = """Single exon transcript overlapping one reference exon and at least 10 bps of a + reference intron, indicating a possible pre-mRNA fragment.""" + code.ref_multi, code.pred_multi = True, False + code._nucl_rec, code._nucl_prec, code._nucl_f1 = "> 0", "> 0", "0% < F1 < 100" + code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 + code.reverse = "G" + code.category = "Overlap" + return code + + +def code_m(): + code = ClassCode("m") + code.definition = """Generic match between two monoexonic transcripts.""" + code.ref_multi, code.pred_multi = False, False + code._nucl_rec, code._nucl_prec, code._nucl_f1 = None, None, "< 80" + code._junc_rec, code._junc_prec, code._junc_f1 = None, None, None + code.reverse = "m" + code.category = "Overlap" + return code + + +def code_i(): + code = ClassCode("i") + code.definition = "Monoexonic prediction completely contained within one intron of the reference transcript." + code.ref_multi, code.pred_multi = True, False + code._nucl_rec, code._nucl_prec, code._nucl_f1 = 0, 0, 0 + code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 + code.reverse = "ri" + code.category = "Intronic" + return code + + +def code_capital_i(): + code = ClassCode("I") + code.definition = "Prediction completely contained within the introns of the reference transcript." 
+ code.ref_multi, code.pred_multi = True, True + code._nucl_rec, code._nucl_prec, code._nucl_f1 = 0, 0, 0 + code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 + code.reverse = "rI" + code.category = "Intronic" + return code + + +def code_r_i(): + code = ClassCode("ri") + code.definition = """Reverse intron transcript - the monoexonic reference is completely contained + within one intron of the prediction transcript.""" + code.ref_multi, code.pred_multi = False, True + code._nucl_rec, code._nucl_prec, code._nucl_f1 = 0, 0, 0 + code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 + code.reverse = "i" + code.category = "Intronic" + return code + + +def code_r_capital_i(): + code = ClassCode("rI") + code.definition = """Multiexonic reference completely contained within the introns of the prediction transcript.""" + code.ref_multi, code.pred_multi = True, True + code._nucl_rec, code._nucl_prec, code._nucl_f1 = 0, 0, 0 + code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 + code.reverse = "I" + code.category = "Intronic" + return code + + +def code_f(): + code = ClassCode("f") + code.definition = """Fusion - this special code is applied when a prediction intersects more than one + reference transcript. To be considered for fusions, candidate references must **either** share at least one + splice junction with the prediction, **or** have at least 10% of its bases recalled. + If two or more reference transcripts fit these constraints, then the prediction model is classified as a fusion.""" + code.ref_multi, code.pred_multi = None, None + code._nucl_rec, code._nucl_prec, code._nucl_f1 = "> 10", 0, 0 + code._junc_rec, code._junc_prec, code._junc_f1 = "> 0", 0, 0 + code.reverse = None + code.category = "Fusion" + return code + + +def code_x(): + code = ClassCode("x") + code.definition = "Monoexonic match on the **opposite** strand." + code.ref_multi, code.pred_multi = None, False + code._nucl_rec, code._nucl_prec, code._nucl_f1 = ">0", ">0", ">0" + code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 + code.reverse = "x or X" + code.category = "Fragment" + return code + + +def code_capital_x(): + code = ClassCode("X") + code.definition = "Multiexonic match on the **opposite** strand." + code.ref_multi, code.pred_multi = None, True + code._nucl_rec, code._nucl_prec, code._nucl_f1 = ">0", ">0", ">0" + code._junc_rec, code._junc_prec, code._junc_f1 = None, None, None + code.reverse = "x or X" + code.category = "Fragment" + return code + + +def code_p(): + code = ClassCode("p") + code.definition = """The prediction is on the same strand of a neighbouring but non-overlapping transcript. + Probable polymerase run-on""" + code.ref_multi, code.pred_multi = None, None + code._nucl_rec, code._nucl_prec, code._nucl_f1 = 0, 0, 0 + code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 + code.reverse = "p" + code.category = "Fragment" + return code + + +def code_capital_p(): + code = ClassCode("P") + code.definition = """The prediction is on the opposite strand of a neighbouring but non-overlapping transcript. 
+ Probable polymerase run-on.""" + code.ref_multi, code.pred_multi = None, None + code._nucl_rec, code._nucl_prec, code._nucl_f1 = 0, 0, 0 + code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 + code.reverse = "P" + code.category = "Fragment" + return code + + +def code_u(): + code = ClassCode("u") + code.definition = """Unknown - no suitable model has been found near enough the prediction to + perform a comparison.""" + code.ref_multi, code.pred_multi = None, None + code._nucl_rec, code._nucl_prec, code._nucl_f1 = 0, 0, 0 + code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 + code.reverse = None + code.category = "Unknown" + return code + + +codes = odict() +codes["="] = code_equal() +codes["_"] = code_underscore() +codes["n"] = code_n() +codes["J"] = code_capital_j() +codes["c"] = code_c() +codes["C"] = code_capital_c() +codes["j"] = code_j() +codes["h"] = code_h() +codes["g"] = code_g() +codes["G"] = code_capital_g() +codes["o"] = code_o() +codes["e"] = code_e() +codes["m"] = code_m() +codes["i"] = code_i() +codes["I"] = code_capital_i() +codes["ri"] = code_r_i() +codes["rI"] = code_r_capital_i() +codes["f"] = code_f() +codes["x"], codes["X"] = code_x(), code_capital_x() +codes["p"], codes["P"], codes["u"] = code_p(), code_capital_p(), code_u() + +assert len(set(codes.values())) == len(codes), set.difference(set(codes.keys()), set([_.code for _ in codes.values()])) +assert all(_ == codes[_].code for _ in codes) \ No newline at end of file diff --git a/Mikado/subprograms/util/__init__.py b/Mikado/subprograms/util/__init__.py index 39d997c17..1e9d967e4 100644 --- a/Mikado/subprograms/util/__init__.py +++ b/Mikado/subprograms/util/__init__.py @@ -14,6 +14,7 @@ from . import grep from . import merge_blast from . import convert +from . import class_codes import argparse __author__ = 'Luca Venturini' @@ -36,6 +37,12 @@ def util_parser(): utils.choices["awk_gtf"].prog = "mikado util awk_gtf" utils.choices["awk_gtf"].description = "Script to retrieve specific feature slices from a GTF file." + utils.add_parser("class_codes", + description="Script to print out the class codes.") + utils.choices["class_codes"] = class_codes.code_parser() + utils.choices["class_codes"].prog = "mikado util class_codes" + utils.choices["class_codes"].description = "Script to print out the class codes." + utils.add_parser("convert", description="Script to do GTF <-> GFF3 > BED12 conversions.") utils.choices["convert"] = convert.convert_parser() diff --git a/Mikado/subprograms/util/class_codes.py b/Mikado/subprograms/util/class_codes.py index 50a599cc5..eb824e4cf 100644 --- a/Mikado/subprograms/util/class_codes.py +++ b/Mikado/subprograms/util/class_codes.py @@ -1,163 +1,95 @@ -from collections import OrderedDict as odict - -codes = odict() - -codes["="] = {"Definition": "Complete intron chain match.", - "ref_multi": True, - "pred_multi": True, - "nucl": "NA", - "junc": "100%, 100%, 100%", - "reverse": "=", - "category": "Match"} -codes["_"] = {"Definition": "Complete match between two monoexonic transcripts.", - "ref_multi": False, - "pred_multi": False, - "nucl": "NA", - "junc": "NA, NA, >=80%", - "reverse": "_", - "category": "Match"} -codes["n"] = {"Definition": """Intron chain extension, ie. 
both transcripts are multiexonic and -the prediction has novel splice sites outside of the reference transcript boundaries.""", - "ref_multi": True, - "pred_multi": True, - "nucl": "100%, < 100%, < 100%", - "junc": "100%, < 100%, < 100%", - "reverse": "c", - "category": "Extension"} -codes["J"] ={"Definition": """Intron chain extension, ie. both transcripts are multiexonic and -the prediction has novel splice sites inside of the reference transcript boundaries.""", - "ref_multi": True, - "pred_multi": True, - "nucl": "100%, <= 100%, < 100%", - "junc": "100%, < 100%, < 100%", - "reverse": "C", - "category": "Extension"} -codes["c"] = {"Definition": """The prediction is either multiexonic and with its intron chain completely contained -within that of the reference, or monoexonic and contained within one of the reference exons.""", - "ref_multi": "NA", - "pred_multi": "NA", - "nucl": "< 100%, 100%, NA", - "junc": "< 100%, 100%, NA", - "reverse": "n", - "category": "Extension"}, -codes["C"] = {"Definition": """The prediction intron chain is completely contained within that of the reference -transcript, but it partially debords either into its introns or outside of the reference boundaries.""", - "ref_multi": True, - "pred_multi": True, - "nucl": "<= 100%, < 100%, < 100%", - "junc": "< 100%, 100%, < 100%", - "reverse": "J or j", - "category": "Extension"} -codes["j"] = {"Definition": """Alternative splicing event.""", - "ref_multi": True, - "pred_multi": True, - "nucl": "NA", - "junc": "<= 100%, 100%, < 100%", - "reverse": "j", - "category": "Alternative splicing"} - - - - -# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ -# | **j** | Alternative splicing event. | True | True | NA | <= 100%, < 100%, | **j** | **Alternative | -# | | | | | | < 100% | | splicing** | -# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ -# | **h** | Structural match between two | True | True | > 0%, > 0%, > 0% | 0%, 0%, 0% | **h** | **Alternative | -# | | models where no splice site | | | | | | splicing** | -# | | is conserved but **at least**| | | | | | | -# | | one intron of the reference | | | | | | | -# | | and one intron of the | | | | | | | -# | | prediction partially overlap.| | | | | | | -# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ -# | **g** | The monoexonic prediction | True | False | > 0%, > 0%, | 0% | **G** | **Alternative | -# | ("mo" before | overlaps one or more exons of| | | between 0 and 100%| | | splicing** | -# | release 1) | the reference transcript; the| | | | | | | -# | | borders of the prediction | | | | | | | -# | | cannot fall inside the | | | | | | | -# | | introns of the reference. | | | | | | | -# | | The prediction transcript | | | | | | | -# | | can bridge multiple exons | | | | | | | -# | | of the reference model. | | | | | | | -# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ -# | **G** | Generic match of a | False | True | > 0%, > 0%, > 0% | 0% | **g** | **Alternative | -# | ("O" before | multiexonic prediction | | | | | | splicing** | -# | release 1) | transcript versus a | | | | | | | -# | | monoexonic reference. 
| | | | | | | -# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ -# | **o** | Generic overlap between two | True | True | > 0%, > 0%, > 0% | 0%, 0%, 0% | **o** | **Overlap** | -# | | multiexonic transcripts, | | | | | | | -# | | which do not share **any** | | | | | | | -# | | overlap among their introns. | | | | | | | -# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ -# | **e** | Single exon transcript | True | False | > 0%, > 0%, | 0% | **G** | **Overlap** | -# | | overlapping *one* reference | | | between 0 and 100%| | | | -# | | exon and at least 10 bps of a| | | | | | | -# | | reference intron, indicating | | | | | | | -# | | a possible pre-mRNA fragment.| | | | | | | -# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ -# | **m** | Generic match between two | False | False | NA, NA, **< 80%** | NA | **m** | **Overlap** | -# | | monoexonic transcripts. | | | | | | | -# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ -# | **i** | Monoexonic prediction | True | False | 0% | 0% | **ri** | **Intronic** | -# | | completely contained within | | | | | | | -# | | one intron of the reference | | | | | | | -# | | transcript. | | | | | | | -# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ -# | **I** | Prediction completely | True | True | 0% | 0% | **rI** | **Intronic** | -# | | contained within the introns | | | | | | | -# | | of the reference transcript. | | | | | | | -# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ -# | **rI** | Reference completely | True | True | 0% | 0% | **I** | **Intronic** | -# | | contained within the introns | | | | | | | -# | | of the prediction transcript.| | | | | | | -# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ -# | **ri** | Reverse intron transcript - | False | True | 0% | 0% | **i** | **Intronic** | -# | | the monoexonic reference is | | | | | | | -# | | completely contained within | | | | | | | -# | | one intron of the prediction | | | | | | | -# | | transcript. | | | | | | | -# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ -# | **f** | Fusion - this special code | NA | NA | **> 10%**, NA, NA | **> 0%**, NA, NA | NA | **Fusion** | -# | | is applied when a prediction | | | | | | | -# | | intersects more than one | | | | | | | -# | | reference transcript. To be | | | | | | | -# | | considered for fusions, | | | | | | | -# | | candidate references must | | | | | | | -# | | **either** share at least one| | | | | | | -# | | splice junction with the | | | | | | | -# | | prediction, **or** have at | | | | | | | -# | | least 10% of its bases | | | | | | | -# | | recalled. 
If two or more | | | | | | | -# | | reference transcripts fit | | | | | | | -# | | these constraints, then the | | | | | | | -# | | prediction model is | | | | | | | -# | | classified as a **fusion**. | | | | | | | -# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ -# | **x** | Monoexonic match on the | NA | False | >= 0% | 0% | **x** or **X** | **Fragment** | -# | | *opposite* strand. | | | | | | | -# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ -# | **X** | Multiexonic match on the | NA | True | >= 0% | 0% | **x** or **X** | **Fragment** | -# | | *opposite* strand. | | | | | | | -# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ -# | **p** | The prediction is on the same| NA | NA | 0% | 0% | **p** | **No overlap** | -# | | strand of a neighbouring but | | | | | | | -# | | non-overlapping transcript. | | | | | | | -# | | Probable polymerase run-on. | | | | | | | -# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ -# | **P** | The prediction is on the | NA | NA | 0% | 0% | **P** | **No overlap** | -# | | *opposite* strand of a | | | | | | | -# | | neighbouring but | | | | | | | -# | | non-overlapping transcript. | | | | | | | -# | | Probable polymerase run-on. | | | | | | | -# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ -# | **u** | Unknown - no suitable model | NA | NA | 0% | 0% | NA | **No overlap** | -# | | has been found near enough | | | | | | | -# | | the prediction to perform a | | | | | | | -# | | comparison. 
| | | | | | | -# +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ +import sys +import argparse +import tabulate +from ...scales import class_codes +import textwrap +from itertools import zip_longest +def launch(args): + + rows = [] + + if len(args.code) > 0: + codes = [class_codes.codes[_] for _ in args.metric] + elif len(args.category) > 0: + codes = [class_codes.codes[_] for _ in class_codes.codes + if class_codes.codes[_].category in args.category] + else: + codes = [class_codes.codes[_] for _ in class_codes.codes] + + for code in codes: + definition = textwrap.wrap(code.definition, 30) + code_rows = zip_longest([code.code], + definition, + [code.ref_multi], + [code.pred_multi], + [code.nucl], + [code.junc], + [code.reverse], + [code.category]) + rows.extend(code_rows) + + __table_format = tabulate._table_formats[args.format] + + if args.format not in ("grid", "fancy_grid"): + + print(tabulate.tabulate(rows, + headers=["Class code", + "Definition", + "Reference multiexonic?", + "Prediction multiexonic?", + "Nucleotide: RC, PC, F1", + "Junction: RC, PC, F1", + "Reverse", + "Category"], + tablefmt=args.format)) + + else: + out_of_header = False + separator = None + + for row in tabulate.tabulate(rows, + headers=["Metric name", "Description", "Category", "Data type", "Usable raw"], + tablefmt=args.format).split("\n"): + if row[:2] == __table_format.lineabove.begin + __table_format.lineabove.hline: + separator = row + if not out_of_header: + print(row) + continue + if row[:2] == __table_format.linebelowheader.begin + __table_format.linebelowheader.hline: + out_of_header = True + print(row, file=args.out) + continue + elif out_of_header is False: + print(row, file=args.out) + elif row[:2] == __table_format.linebetweenrows[0] + __table_format.linebetweenrows[1]: + continue + elif row[0] == __table_format.datarow.begin: + if row.strip().split(__table_format.datarow.sep)[1].strip() != "": + print(separator, file=args.out) + print(row, file=args.out) + print(separator, file=args.out) + print(file=args.out) + return + + +def code_parser(): + + """ + Command line parser for the utility. 
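+
+    Example invocation (an illustrative sketch based on the arguments defined
+    below; the "mikado util class_codes" name is the one registered in
+    Mikado/subprograms/util/__init__.py):
+
+        mikado util class_codes --format grid j J h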
+ """ + + parser = argparse.ArgumentParser("Script to generate the available class codes.") + parser.add_argument("-f", "--format", choices=tabulate.tabulate_formats, default="rst") + parser.add_argument("-c", "--category", nargs="+", + choices=list(set(_.category for _ in class_codes.codes.values()))) + parser.add_argument("-o", "--out", type=argparse.FileType("w"), default=sys.stdout) + parser.add_argument("code", nargs="*", help="Codes to query.", + default=[], + choices=[[]] + list(class_codes.codes.keys())) + parser.set_defaults(func=launch) + return parser diff --git a/Mikado/subprograms/util/metrics.py b/Mikado/subprograms/util/metrics.py index 85e445e49..ad7956a22 100644 --- a/Mikado/subprograms/util/metrics.py +++ b/Mikado/subprograms/util/metrics.py @@ -30,6 +30,15 @@ def launch(args): sorted(metric for metric in metric_names if metric not in ("tid", "parent", "score"))) + if len(args.metric) > 0: + if not all(metric in metrics for metric in args.metric): + print("Invalid metrics selected: {}".format( + ", ".join(sorted(metric for metric in args.metric if metric not in metrics)))) + metrics = args.metric + elif len(args.category) > 0: + metrics = [metric for metric in Transcript.get_available_metrics() if + getattr(getattr(Transcript, metric), "category", "Descriptive") in args.category] + rows = [] for metric in metrics: @@ -105,7 +114,18 @@ def metric_parser(): """ parser = argparse.ArgumentParser("Script to generate the available metrics") - parser.add_argument("-f", "--format", choices=tabulate.tabulate_formats, default="rst") - parser.add_argument("-o", "--out", type=argparse.FileType("w"), default=sys.stdout) + parser.add_argument("-f", "--format", + help="Format of the table to be printed out.", + choices=tabulate.tabulate_formats, default="rst") + parser.add_argument("-o", "--out", + help="Optional output file", + type=argparse.FileType("w"), default=sys.stdout) + parser.add_argument("-c", "--category", + help="Available categories to select from.", + default=[], nargs="+", + choices=sorted(set( + [_ for _ in [getattr(getattr(Transcript, metric), "category", "Descriptive") for metric in + Transcript.get_available_metrics()] if _ is not None] + ["Descriptive"]))) + parser.add_argument("metric", nargs="*") parser.set_defaults(func=launch) return parser From 33702de548135ec6435acf1048b759ac3a782f96 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Thu, 9 Feb 2017 14:52:31 +0000 Subject: [PATCH 32/47] BugFixes for serialise --- Mikado/serializers/junction.py | 2 ++ Mikado/utilities/dbutils.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Mikado/serializers/junction.py b/Mikado/serializers/junction.py index 66ab77160..7257a08f8 100644 --- a/Mikado/serializers/junction.py +++ b/Mikado/serializers/junction.py @@ -199,6 +199,8 @@ def serialize(self): self.session.begin(subtransactions=True) for line in self.fai: name, length = line.rstrip().split()[:2] + if self.session.query(Chrom).filter(Chrom.name == name).all(): + continue try: current_chrom = Chrom(name, length=int(length)) except ValueError: diff --git a/Mikado/utilities/dbutils.py b/Mikado/utilities/dbutils.py index 7427f0369..421a35236 100644 --- a/Mikado/utilities/dbutils.py +++ b/Mikado/utilities/dbutils.py @@ -50,9 +50,9 @@ def create_connector(json_conf, logger=None): func = None if db_settings["dbtype"] == "sqlite": - if database_exists("sqlite://{}".format(db_settings["db"])): + if not database_exists("sqlite:///{}".format(db_settings["db"])): logger.warning("No database found, creating a 
mock one!") - create_database("sqlite://{}".format(db_settings["db"])) + create_database("sqlite:///{}".format(db_settings["db"])) if json_conf["pick"]["run_options"]['shm'] is False: logger.debug("Connecting to %s", db_settings["db"]) func = sqlite3.connect(database=db_settings["db"], check_same_thread=False) From 2e7b7166d2b936645e09a6a6ab8705e079835e38 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Thu, 9 Feb 2017 15:26:06 +0000 Subject: [PATCH 33/47] Now fusions **have to be found on the same strand**. --- Mikado/scales/assigner.py | 28 ++++++++++++++------------ Mikado/scales/class_codes.py | 3 ++- Mikado/subprograms/util/class_codes.py | 13 +++++++++--- 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/Mikado/scales/assigner.py b/Mikado/scales/assigner.py index 0aeeed2af..ac85b8ade 100644 --- a/Mikado/scales/assigner.py +++ b/Mikado/scales/assigner.py @@ -369,18 +369,20 @@ def __prepare_result(self, prediction, distances): prediction.id, matches) + same_strand = False if len(matches) > 1 and prediction.strand is not None: - correct = list() - for match in matches: - for gid in self.positions[prediction.chrom][match[0]]: - if any([self.genes[gid].strand in (None, prediction.strand)]): - correct.append(match) - break - if len(correct) > 0: - matches = correct[:] - del correct - - if len(matches) > 1: + correct = list() + for match in matches: + for gid in self.positions[prediction.chrom][match[0]]: + if any([self.genes[gid].strand in (None, prediction.strand)]): + correct.append(match) + break + if len(correct) > 0: + matches = correct[:] + same_strand = True + del correct + + if len(matches) > 1 and same_strand is True: self.logger.debug("More than one match for %s: %s", prediction.id, matches) @@ -402,6 +404,8 @@ def __prepare_result(self, prediction, distances): def self_analyse_prediction(self, prediction: Transcript, distances): + """This method will be invoked during a self analysis run.""" + assert len(distances) >= 1 and distances[0][1] == 0 genes = [] @@ -482,8 +486,6 @@ def self_analyse_prediction(self, prediction: Transcript, distances): gene.id, prediction.id, ", ".join(list(gene.transcripts.keys())) )) - # best.append(result_dict[gene.id][0]) - if same_strand is True: # This is a fusion, period results = [] diff --git a/Mikado/scales/class_codes.py b/Mikado/scales/class_codes.py index d427c0a38..3383037a3 100644 --- a/Mikado/scales/class_codes.py +++ b/Mikado/scales/class_codes.py @@ -5,6 +5,7 @@ from collections import OrderedDict as odict + def _is_digit(value): if not ((value is None) or (isinstance(value, (float, int)) and 0 <= value <= 100)): raise ValueError("Invalid numeric value: {}, type: {}".format(value, type(value))) @@ -272,7 +273,7 @@ def code_h(): code = ClassCode("h") code.definition = """Structural match between two models where where no splice site is conserved but at least one intron of the reference and one intron of the prediction partially overlap.""" - code.ref_multi, code_n.pred_multi = True, True + code.ref_multi, code.pred_multi = True, True code._nucl_rec, code._nucl_prec, code._nucl_f1 = "> 0", "> 0", "> 0" code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 code.reverse = "h" diff --git a/Mikado/subprograms/util/class_codes.py b/Mikado/subprograms/util/class_codes.py index eb824e4cf..2987d67cc 100644 --- a/Mikado/subprograms/util/class_codes.py +++ b/Mikado/subprograms/util/class_codes.py @@ -11,7 +11,7 @@ def launch(args): rows = [] if len(args.code) > 0: - codes = [class_codes.codes[_] for _ in args.metric] + codes = 
[class_codes.codes[_] for _ in args.code] elif len(args.category) > 0: codes = [class_codes.codes[_] for _ in class_codes.codes if class_codes.codes[_].category in args.category] @@ -50,7 +50,14 @@ def launch(args): separator = None for row in tabulate.tabulate(rows, - headers=["Metric name", "Description", "Category", "Data type", "Usable raw"], + headers=["Class code", + "Definition", + "Reference multiexonic?", + "Prediction multiexonic?", + "Nucleotide: RC, PC, F1", + "Junction: RC, PC, F1", + "Reverse", + "Category"], tablefmt=args.format).split("\n"): if row[:2] == __table_format.lineabove.begin + __table_format.lineabove.hline: separator = row @@ -82,7 +89,7 @@ def code_parser(): parser = argparse.ArgumentParser("Script to generate the available class codes.") parser.add_argument("-f", "--format", choices=tabulate.tabulate_formats, default="rst") - parser.add_argument("-c", "--category", nargs="+", + parser.add_argument("-c", "--category", nargs="+", default=[], choices=list(set(_.category for _ in class_codes.codes.values()))) parser.add_argument("-o", "--out", type=argparse.FileType("w"), default=sys.stdout) parser.add_argument("code", nargs="*", help="Codes to query.", From 9f566e8ab78d5ed305a5281b0798b37ee0b432af Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Thu, 9 Feb 2017 15:34:28 +0000 Subject: [PATCH 34/47] Corrected the location for fusion events --- Mikado/scales/assigner.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Mikado/scales/assigner.py b/Mikado/scales/assigner.py index ac85b8ade..57e3852b1 100644 --- a/Mikado/scales/assigner.py +++ b/Mikado/scales/assigner.py @@ -262,7 +262,7 @@ def __check_for_fusions(self, prediction, matches): strands = collections.defaultdict(set) - # Get all the results for the single genea + # Get all the results for the single gene # We *want* to do the calculation for all hits for match in matches: match_to_gene[match[0]] = self.positions[prediction.chrom][match[0]] @@ -314,16 +314,16 @@ def __check_for_fusions(self, prediction, matches): # Now retrieve the results according to their order on the genome # Keep only the result, not their position best = [_[1] for _ in sorted(best, key=lambda res: (res[0][0], res[0][1]))] + chrom = prediction.chrom + start = min([prediction.start] + [self.genes[_.ref_gene[0]][_.ref_id[0]].start for _ in best]) + end = max([prediction.end] + [self.genes[_.ref_gene[0]][_.ref_id[0]].end for _ in best]) + location = "{}:{}..{}".format(chrom, start, end) + for key in ResultStorer.__slots__: if key in ["gid", "tid", "distance", "tid_num_exons"]: values.append(getattr(best[0], key)) elif key == "location": - positions = [(group[0], int(group[1]), int(group[2])) for group in - [re_search("(.*):(\d+)\.\.(\d+)", _.location[0]).groups() for _ in best]] - chrom = set(_[0] for _ in positions).pop() - start = min(_[1] for _ in positions) - end = max(_[1] for _ in positions) - values.append("{}:{}..{}".format(chrom, start, end)) + values.append(location) elif key == "ccode": values.append(tuple(["f"] + [_.ccode[0] for _ in best])) else: From 4a663dec2430d6adeb1d965971811f7157cc1ce1 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Thu, 9 Feb 2017 16:00:05 +0000 Subject: [PATCH 35/47] Added the mods to compare to the changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 46beec374..e1c1f4c84 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ Changes in this release: - "remove": whether to exclude fragments, 
previously under "run_options" - "valid_class_codes": which class codes constitute a fragment match. Only class codes in the "Intronic", "Overlap" (inclusive of _) and "Fragment" categories are allowed. - max_distance: for non-overlapping fragments (ie p and P), maximum distance from the gene. +- Solved a long-standing bug which caused Mikado compare to consider as fusion also hits. - Mikado compare now also provides the location of the matches in TMAP and REFMAP files. - Introduced a new utility, "class_codes", to print out the information of the class codes. The definition of class codes is now contained in a subpackage of "scales". - The "metrics" utility now allows for interactive querying based on category or metric name. From d3ed210362116321febd4246584c71f0356194c2 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Thu, 9 Feb 2017 16:03:32 +0000 Subject: [PATCH 36/47] Unstranded transcripts now continue to be regarded as fusions. --- Mikado/scales/assigner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Mikado/scales/assigner.py b/Mikado/scales/assigner.py index 57e3852b1..d85079d3a 100644 --- a/Mikado/scales/assigner.py +++ b/Mikado/scales/assigner.py @@ -381,6 +381,8 @@ def __prepare_result(self, prediction, distances): matches = correct[:] same_strand = True del correct + elif len(matches) > 1 and prediction.strand is None: + same_strand = True if len(matches) > 1 and same_strand is True: self.logger.debug("More than one match for %s: %s", From 889d61a0aafb9f72712cc7b71eb8da2116f09ee1 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Fri, 10 Feb 2017 13:12:54 +0000 Subject: [PATCH 37/47] Improved the docs for class_codes. Picking now should crash informatively. --- CHANGELOG.md | 10 +- .../configuration_blueprint.json | 6 +- Mikado/scales/class_codes.py | 694 +++++++++++------- Mikado/subprograms/pick.py | 11 +- Mikado/subprograms/util/class_codes.py | 2 +- 5 files changed, 440 insertions(+), 283 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e1c1f4c84..ad35e0fe2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,11 +12,11 @@ Changes in this release: - at least one of the transcripts is monoexonic and there is some overlap of any kind. This behaviour (which was the default until this release) can be switched off through pick/clustering/simple_overlap_for_monoexonic (default true). - **MAJOR**: changed slightly the anatomy of the configuration files. Now "pick" has two new subsections, "clustering" and "fragments". - Clustering: dedicated to how to cluster the transcripts in the different steps. Currently it contains the keys: - - "flank" - - "min_cdna_overlap" and "min_cds_overlap" (for the second clustering during the monosublocusHolder phase) - - "cds_only": to indicate whether we should only consider the CDS for clustering after the initial merging in the Superlocus. - - "simple_overlap_for_monoexonic": to switch on/off the old default behaviour with monoexonic transcripts - - "purge": whether to completely exclude failed loci, previously under "run_options" + - "flank" + - "min_cdna_overlap" and "min_cds_overlap" (for the second clustering during the monosublocusHolder phase) + - "cds_only": to indicate whether we should only consider the CDS for clustering after the initial merging in the Superlocus. 
+ - "simple_overlap_for_monoexonic": to switch on/off the old default behaviour with monoexonic transcripts + - "purge": whether to completely exclude failed loci, previously under "run_options" - Fragments: dedicated to how to identify and treat putative fragments. Currently it contains the keys: - "remove": whether to exclude fragments, previously under "run_options" - "valid_class_codes": which class codes constitute a fragment match. Only class codes in the "Intronic", "Overlap" (inclusive of _) and "Fragment" categories are allowed. diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json index 859cfd423..532ca29df 100644 --- a/Mikado/configuration/configuration_blueprint.json +++ b/Mikado/configuration/configuration_blueprint.json @@ -284,8 +284,8 @@ "- keep_retained_introns: Whether to consider as valid AS events where one intron", "is retained compared to the primary or any other valid AS. Default: false.", "- max_isoforms: Maximum number of isoforms per locus. 1 implies no AS reported. Default: 3", - "- valid_ccodes: Valid class codes for AS events. See documentation for details. Choices:", - "j, n, O, e, o, h, J, C, mo. Default: j, J, O, mo", + "- valid_ccodes: Valid class codes for AS events. Valid codes are in categories", + "'Alternative splicing', 'Extension' (with junction F1 lower than 100%), and Overlap (exluding m). Default: j, J, g, G, C, h", "- max_utr_length: Maximum length of the UTR for AS events. Default: 10e6 (i.e. no limit)", "- max_fiveutr_length: Maximum length of the 5'UTR for AS events. Default: 10e6 (i.e. no limit)", "- max_threeutr_length: Maximum length of the 5'UTR for AS events. Default: 10e6 (i.e. no limit)", @@ -340,13 +340,11 @@ "type": "string", "enum": [ "j", - "O", "e", "o", "h", "J", "C", - "mo", "g", "G" ] diff --git a/Mikado/scales/class_codes.py b/Mikado/scales/class_codes.py index 3383037a3..7045ecd6f 100644 --- a/Mikado/scales/class_codes.py +++ b/Mikado/scales/class_codes.py @@ -3,7 +3,7 @@ """ -from collections import OrderedDict as odict +from collections import OrderedDict def _is_digit(value): @@ -18,7 +18,7 @@ def _is_boolean(value): return True -class ClassCode: +class _ClassCode: """Container for the class codes .""" @@ -42,6 +42,18 @@ def __eq__(self, other): def __hash__(self): return hash(self.code) + def __str__(self): + lines = list() + lines.append("- Code: {}".format(self.code)) + lines.append("- Definition: {}".format(self.definition)) + lines.append("- Reference multiexonic: {}".format(self.ref_multi)) + lines.append("- Prediction multiexonic: {}".format(self.pred_multi)) + lines.append("- Nucleotide recall, precision, F1: {}".format(self.nucl)) + lines.append("- Junction recall, precision, F1: {}".format(self.junc)) + lines.append("- Reverse class code: {}".format(self.reverse)) + lines.append("- Category: {}".format(self.category)) + return "\n".join(lines) + @property def code(self): return self.__code @@ -191,294 +203,440 @@ def reverse(self, value): self.__reverse = value -def code_equal(): - equal = ClassCode("=") - equal.definition = "Complete intron chain match." - equal.pred_multi, equal.ref_multi = True, True - equal._junc_f1, equal._junc_prec, equal._junc_rec = [100] * 3 - equal.reverse = "=" - equal.category = "Match" - return equal +class Equal: + + _code = _ClassCode("=") + _code.definition = "Complete intron chain match." 
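+    # Each class below freezes a single _ClassCode instance into plain class
+    # attributes, so that the rendered table row doubles as the class
+    # docstring (see the "__doc__ = str(_code)" line closing each class).
+    # A minimal query sketch (illustrative only):
+    #
+    #     from Mikado.scales.class_codes import codes
+    #     codes["="].definition   # -> "Complete intron chain match."
+    #     codes["="].category     # -> "Match"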
+ _code.pred_multi, _code.ref_multi = True, True + _code._junc_f1, _code._junc_prec, _code._junc_rec = [100] * 3 + _code.reverse = "=" + _code.category = "Match" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + __doc__ = str(_code) -def code_underscore(): - underscore = ClassCode("_") - underscore.definition = "Complete match between two monoexonic transcripts." - underscore.ref_multi, underscore.pred_multi = False, False - underscore._nucl_f1 = ">=80" - underscore.reverse = "_" - underscore.category = "Match" - return underscore +class UnderScore: + + _code = _ClassCode("_") + _code.definition = "Complete match between two monoexonic transcripts." + _code.ref_multi, _code.pred_multi = False, False + _code._nucl_f1 = ">=80" + _code.reverse = "_" + _code.category = "Match" -def code_n(): - code = ClassCode("n") - code.definition = """Intron chain extension, ie. both transcripts are multiexonic and + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) + + +class CodeN: + + _code = _ClassCode("n") + _code.definition = """Intron chain extension, ie. both transcripts are multiexonic and the prediction has novel splice sites outside of the reference transcript boundaries.""" - code.ref_multi, code.pred_multi = True, True - code._nucl_rec, code._nucl_prec, code._nucl_f1 = (100, "< 100", "<100") - code._junc_rec, code._junc_prec, code._junc_f1 = (100, "< 100", "<100") - code.reverse = "c" - code.category = "Extension" - return code + _code.ref_multi, _code.pred_multi = True, True + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = (100, "< 100", "<100") + _code._junc_rec, _code._junc_prec, _code._junc_f1 = (100, "< 100", "<100") + _code.reverse = "c" + _code.category = "Extension" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + __doc__ = str(_code) -def code_capital_j(): - code = ClassCode("J") - code.definition = """Intron chain extension, ie. both transcripts are multiexonic and + +class CodeCapitalJ: + + _code = _ClassCode("J") + _code.definition = """Intron chain extension, ie. 
both transcripts are multiexonic and the prediction has novel splice sites inside of the reference transcript boundaries.""" - code.ref_multi, code.pred_multi = True, True - code._nucl_rec, code._nucl_prec, code._nucl_f1 = (100, "<= 100", "<100") - code._junc_rec, code._junc_prec, code._junc_f1 = (100, "< 100", "<100") - code.reverse = "C" - code.category = "Extension" - return code + _code.ref_multi, _code.pred_multi = True, True + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = (100, "<= 100", "<100") + _code._junc_rec, _code._junc_prec, _code._junc_f1 = (100, "< 100", "<100") + _code.reverse = "C" + _code.category = "Extension" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) -def code_c(): - code = ClassCode("c") - code.definition = """The prediction is either multiexonic and with its intron chain completely contained +class CodeC: + _code = _ClassCode("c") + _code.definition = """The prediction is either multiexonic and with its intron chain completely contained within that of the reference, or monoexonic and contained within one of the reference exons.""" - code.pred_multi, code.ref_multi = None, None - code._nucl_rec, code._nucl_prec, code._nucl_f1 = "< 100", "100", None - code._junc_rec, code._junc_prec, code._junc_f1 = "< 100", "100", None - code.reverse = "n" - code.category = "Extension" - return code + _code.pred_multi, _code.ref_multi = None, None + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = "< 100", "100", None + _code._junc_rec, _code._junc_prec, _code._junc_f1 = "< 100", "100", None + _code.reverse = "n" + _code.category = "Extension" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) -def code_capital_c(): - code = ClassCode("C") - code.definition = """The prediction intron chain is completely contained within that of the reference +class CodeCapitalC: + + _code = _ClassCode("C") + _code.definition = """The prediction intron chain is completely contained within that of the reference transcript, but it partially debords either into its introns or outside of the reference boundaries.""" - code.pred_multi, code.ref_multi = True, True - code._nucl_rec, code._nucl_prec, code._nucl_f1 = "<= 100", "< 100", "< 100" - code._junc_rec, code._junc_prec, code._junc_f1 = "< 100", "100", "< 100" - code.reverse = "J or j" - code.category = "Extension" - return code - - -def code_j(): - code = ClassCode("j") - code.definition = """Alternative splicing event.""" - code.ref_multi, code.pred_multi = True, True - code._junc_rec, code._junc_prec, code._junc_f1 = "<= 100", "100", "< 100" - code.reverse = "j or C" - code.category = "Alternative splicing" - return code - - -def code_h(): - code = ClassCode("h") - code.definition = """Structural match between two models where where no splice site is conserved but at least + _code.pred_multi, _code.ref_multi = True, True + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = "<= 100", "< 100", "< 100" + _code._junc_rec, _code._junc_prec, _code._junc_f1 = "< 100", "100", "< 100" + _code.reverse = "J or j" + _code.category = "Extension" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, 
_code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) + + +class CodeJ: + _code = _ClassCode("j") + _code.definition = """Alternative splicing event.""" + _code.ref_multi, _code.pred_multi = True, True + _code._junc_rec, _code._junc_prec, _code._junc_f1 = "<= 100", "100", "< 100" + _code.reverse = "j or C" + _code.category = "Alternative splicing" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) + + +class CodeH: + _code = _ClassCode("h") + _code.definition = """Structural match between two models where where no splice site is conserved but at least one intron of the reference and one intron of the prediction partially overlap.""" - code.ref_multi, code.pred_multi = True, True - code._nucl_rec, code._nucl_prec, code._nucl_f1 = "> 0", "> 0", "> 0" - code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 - code.reverse = "h" - code.category = "Alternative splicing" - return code - - -def code_g(): - code = ClassCode("g") - code.definition = """The monoexonic prediction overlaps one or more exons of the reference transcript; - the borders of the prediction cannot fall inside the introns of the reference. - The prediction transcript can bridge multiple exons of the reference model.""" - code.ref_multi, code.pred_multi = True, False - code._nucl_rec, code._nucl_prec, code._nucl_f1 = "> 0", "> 0", "0% < F1 < 100" - code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 - code.reverse = "G" - code.category = "Alternative splicing" - return code - - -def code_capital_g(): - code = ClassCode("G") - code.definition = """Generic match of a multiexonic prediction transcript versus a monoexonic reference.""" - code.ref_multi, code.pred_multi = False, True - code._nucl_rec, code._nucl_prec, code._nucl_f1 = "> 0", "> 0", "0% < F1 < 100" - code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 - code.reverse = "g" - code.category = "Alternative splicing" - return code - - -def code_o(): - code = ClassCode("o") - code.definition = """Generic overlap between two multiexonic transcripts, + _code.ref_multi, _code.pred_multi = True, True + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = "> 0", "> 0", "> 0" + _code._junc_rec, _code._junc_prec, _code._junc_f1 = 0, 0, 0 + _code.reverse = "h" + _code.category = "Alternative splicing" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) + + +class CodeG: + + _code = _ClassCode("g") + _code.definition = """The monoexonic prediction overlaps one or more exons of the reference + transcript; the borders of the prediction cannot fall inside the introns of the reference. 
+ The prediction transcript can bridge multiple exons of the reference model.""" + _code.ref_multi, _code.pred_multi = True, False + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = "> 0", "> 0", "0% < F1 < 100" + _code._junc_rec, _code._junc_prec, _code._junc_f1 = 0, 0, 0 + _code.reverse = "G" + _code.category = "Alternative splicing" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) + + +class CodeCapitalG: + _code = _ClassCode("G") + _code.definition = """Generic match of a multiexonic prediction transcript versus a monoexonic reference.""" + _code.ref_multi, _code.pred_multi = False, True + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = "> 0", "> 0", "0% < F1 < 100" + _code._junc_rec, _code._junc_prec, _code._junc_f1 = 0, 0, 0 + _code.reverse = "g" + _code.category = "Alternative splicing" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) + + +class CodeO: + _code = _ClassCode("o") + _code.definition = """Generic overlap between two multiexonic transcripts, which do not share any overlap among their introns.""" - code.ref_multi, code.pred_multi = True, True - code.ref_multi, code.pred_multi = True, True - code._nucl_rec, code._nucl_prec, code._nucl_f1 = "> 0", "> 0", "0% < F1 < 100" - code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 - code.reverse = "o" - code.category = "Overlap" - return code - - -def code_e(): - code = ClassCode("e") - code.definition = """Single exon transcript overlapping one reference exon and at least 10 bps of a + _code.ref_multi, _code.pred_multi = True, True + _code.ref_multi, _code.pred_multi = True, True + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = "> 0", "> 0", "0% < F1 < 100" + _code._junc_rec, _code._junc_prec, _code._junc_f1 = 0, 0, 0 + _code.reverse = "o" + _code.category = "Overlap" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) + + +class CodeE: + _code = _ClassCode("e") + _code.definition = """Single exon transcript overlapping one reference exon and at least 10 bps of a reference intron, indicating a possible pre-mRNA fragment.""" - code.ref_multi, code.pred_multi = True, False - code._nucl_rec, code._nucl_prec, code._nucl_f1 = "> 0", "> 0", "0% < F1 < 100" - code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 - code.reverse = "G" - code.category = "Overlap" - return code - - -def code_m(): - code = ClassCode("m") - code.definition = """Generic match between two monoexonic transcripts.""" - code.ref_multi, code.pred_multi = False, False - code._nucl_rec, code._nucl_prec, code._nucl_f1 = None, None, "< 80" - code._junc_rec, code._junc_prec, code._junc_f1 = None, None, None - code.reverse = "m" - code.category = "Overlap" - return code - - -def code_i(): - code = ClassCode("i") - code.definition = "Monoexonic prediction completely contained within one intron of the reference transcript." 
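+    # The intronic codes ("i", "I", "ri", "rI") all have zero nucleotide and
+    # junction overlap, as prediction and reference share no sequence at all.
+    # Their "reverse" attributes pair them up; an illustrative check:
+    #
+    #     assert codes["i"].reverse == "ri" and codes["ri"].reverse == "i"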
- code.ref_multi, code.pred_multi = True, False - code._nucl_rec, code._nucl_prec, code._nucl_f1 = 0, 0, 0 - code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 - code.reverse = "ri" - code.category = "Intronic" - return code - - -def code_capital_i(): - code = ClassCode("I") - code.definition = "Prediction completely contained within the introns of the reference transcript." - code.ref_multi, code.pred_multi = True, True - code._nucl_rec, code._nucl_prec, code._nucl_f1 = 0, 0, 0 - code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 - code.reverse = "rI" - code.category = "Intronic" - return code - - -def code_r_i(): - code = ClassCode("ri") - code.definition = """Reverse intron transcript - the monoexonic reference is completely contained + _code.ref_multi, _code.pred_multi = True, False + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = "> 0", "> 0", "0% < F1 < 100" + _code._junc_rec, _code._junc_prec, _code._junc_f1 = 0, 0, 0 + _code.reverse = "G" + _code.category = "Overlap" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) + + +class CodeM: + _code = _ClassCode("m") + _code.definition = """Generic match between two monoexonic transcripts.""" + _code.ref_multi, _code.pred_multi = False, False + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = None, None, "< 80" + _code._junc_rec, _code._junc_prec, _code._junc_f1 = None, None, None + _code.reverse = "m" + _code.category = "Overlap" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) + + +class CodeI: + _code = _ClassCode("i") + _code.definition = "Monoexonic prediction completely contained within one intron of the reference transcript." + _code.ref_multi, _code.pred_multi = True, False + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = 0, 0, 0 + _code._junc_rec, _code._junc_prec, _code._junc_f1 = 0, 0, 0 + _code.reverse = "ri" + _code.category = "Intronic" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) + + +class CodeCapitalI: + _code = _ClassCode("I") + _code.definition = "Prediction completely contained within the introns of the reference transcript." 
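+    # Multiexonic counterpart of "i": the prediction may bridge several
+    # reference introns, but it must remain fully intronic. Illustrative check:
+    #
+    #     assert codes["I"].reverse == "rI" and codes["rI"].reverse == "I"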
+ _code.ref_multi, _code.pred_multi = True, True + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = 0, 0, 0 + _code._junc_rec, _code._junc_prec, _code._junc_f1 = 0, 0, 0 + _code.reverse = "rI" + _code.category = "Intronic" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) + + +class CodeRI: + _code = _ClassCode("ri") + _code.definition = """Reverse intron transcript - the monoexonic reference is completely contained within one intron of the prediction transcript.""" - code.ref_multi, code.pred_multi = False, True - code._nucl_rec, code._nucl_prec, code._nucl_f1 = 0, 0, 0 - code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 - code.reverse = "i" - code.category = "Intronic" - return code - - -def code_r_capital_i(): - code = ClassCode("rI") - code.definition = """Multiexonic reference completely contained within the introns of the prediction transcript.""" - code.ref_multi, code.pred_multi = True, True - code._nucl_rec, code._nucl_prec, code._nucl_f1 = 0, 0, 0 - code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 - code.reverse = "I" - code.category = "Intronic" - return code - - -def code_f(): - code = ClassCode("f") - code.definition = """Fusion - this special code is applied when a prediction intersects more than one - reference transcript. To be considered for fusions, candidate references must **either** share at least one - splice junction with the prediction, **or** have at least 10% of its bases recalled. - If two or more reference transcripts fit these constraints, then the prediction model is classified as a fusion.""" - code.ref_multi, code.pred_multi = None, None - code._nucl_rec, code._nucl_prec, code._nucl_f1 = "> 10", 0, 0 - code._junc_rec, code._junc_prec, code._junc_f1 = "> 0", 0, 0 - code.reverse = None - code.category = "Fusion" - return code - - -def code_x(): - code = ClassCode("x") - code.definition = "Monoexonic match on the **opposite** strand." - code.ref_multi, code.pred_multi = None, False - code._nucl_rec, code._nucl_prec, code._nucl_f1 = ">0", ">0", ">0" - code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 - code.reverse = "x or X" - code.category = "Fragment" - return code - - -def code_capital_x(): - code = ClassCode("X") - code.definition = "Multiexonic match on the **opposite** strand." - code.ref_multi, code.pred_multi = None, True - code._nucl_rec, code._nucl_prec, code._nucl_f1 = ">0", ">0", ">0" - code._junc_rec, code._junc_prec, code._junc_f1 = None, None, None - code.reverse = "x or X" - code.category = "Fragment" - return code - - -def code_p(): - code = ClassCode("p") - code.definition = """The prediction is on the same strand of a neighbouring but non-overlapping transcript. 
+ _code.ref_multi, _code.pred_multi = False, True + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = 0, 0, 0 + _code._junc_rec, _code._junc_prec, _code._junc_f1 = 0, 0, 0 + _code.reverse = "i" + _code.category = "Intronic" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) + + +class CodeRCapitalI: + _code = _ClassCode("rI") + _code.definition = """Multiexonic reference completely contained within the introns of the prediction transcript.""" + _code.ref_multi, _code.pred_multi = True, True + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = 0, 0, 0 + _code._junc_rec, _code._junc_prec, _code._junc_f1 = 0, 0, 0 + _code.reverse = "I" + _code.category = "Intronic" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) + + +class CodeF: + _code = _ClassCode("f") + _code.definition = """Fusion - this special code is applied when a prediction intersects more + than one reference transcript. To be considered for fusions, candidate references must + **either** share at least one splice junction with the prediction, **or** have at least 10% of + its bases recalled. If two or more reference transcripts fit these constraints, then the + prediction model is classified as a fusion.""" + _code.ref_multi, _code.pred_multi = None, None + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = "> 10", 0, 0 + _code._junc_rec, _code._junc_prec, _code._junc_f1 = "> 0", 0, 0 + _code.reverse = None + _code.category = "Fusion" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) + + +class CodeX: + _code = _ClassCode("x") + _code.definition = "Monoexonic match on the **opposite** strand." + _code.ref_multi, _code.pred_multi = None, False + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = ">0", ">0", ">0" + _code._junc_rec, _code._junc_prec, _code._junc_f1 = 0, 0, 0 + _code.reverse = "x or X" + _code.category = "Fragment" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) + + +class CodeCapitalX: + _code = _ClassCode("X") + _code.definition = "Multiexonic match on the **opposite** strand." + _code.ref_multi, _code.pred_multi = None, True + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = ">0", ">0", ">0" + _code._junc_rec, _code._junc_prec, _code._junc_f1 = None, None, None + _code.reverse = "x or X" + _code.category = "Fragment" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) + + +class CodeP: + _code = _ClassCode("p") + _code.definition = """The prediction is on the same strand of a neighbouring but non-overlapping transcript. 
Probable polymerase run-on""" - code.ref_multi, code.pred_multi = None, None - code._nucl_rec, code._nucl_prec, code._nucl_f1 = 0, 0, 0 - code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 - code.reverse = "p" - code.category = "Fragment" - return code + _code.ref_multi, _code.pred_multi = None, None + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = 0, 0, 0 + _code._junc_rec, _code._junc_prec, _code._junc_f1 = 0, 0, 0 + _code.reverse = "p" + _code.category = "Fragment" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) -def code_capital_p(): - code = ClassCode("P") - code.definition = """The prediction is on the opposite strand of a neighbouring but non-overlapping transcript. +class CodeCapitalP: + _code = _ClassCode("P") + _code.definition = """The prediction is on the opposite strand of a neighbouring but non-overlapping transcript. Probable polymerase run-on.""" - code.ref_multi, code.pred_multi = None, None - code._nucl_rec, code._nucl_prec, code._nucl_f1 = 0, 0, 0 - code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 - code.reverse = "P" - code.category = "Fragment" - return code + _code.ref_multi, _code.pred_multi = None, None + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = 0, 0, 0 + _code._junc_rec, _code._junc_prec, _code._junc_f1 = 0, 0, 0 + _code.reverse = "P" + _code.category = "Fragment" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) -def code_u(): - code = ClassCode("u") - code.definition = """Unknown - no suitable model has been found near enough the prediction to +class CodeU: + _code = _ClassCode("u") + _code.definition = """Unknown - no suitable model has been found near enough the prediction to perform a comparison.""" - code.ref_multi, code.pred_multi = None, None - code._nucl_rec, code._nucl_prec, code._nucl_f1 = 0, 0, 0 - code._junc_rec, code._junc_prec, code._junc_f1 = 0, 0, 0 - code.reverse = None - code.category = "Unknown" - return code - - -codes = odict() -codes["="] = code_equal() -codes["_"] = code_underscore() -codes["n"] = code_n() -codes["J"] = code_capital_j() -codes["c"] = code_c() -codes["C"] = code_capital_c() -codes["j"] = code_j() -codes["h"] = code_h() -codes["g"] = code_g() -codes["G"] = code_capital_g() -codes["o"] = code_o() -codes["e"] = code_e() -codes["m"] = code_m() -codes["i"] = code_i() -codes["I"] = code_capital_i() -codes["ri"] = code_r_i() -codes["rI"] = code_r_capital_i() -codes["f"] = code_f() -codes["x"], codes["X"] = code_x(), code_capital_x() -codes["p"], codes["P"], codes["u"] = code_p(), code_capital_p(), code_u() - -assert len(set(codes.values())) == len(codes), set.difference(set(codes.keys()), set([_.code for _ in codes.values()])) -assert all(_ == codes[_].code for _ in codes) \ No newline at end of file + _code.ref_multi, _code.pred_multi = None, None + _code._nucl_rec, _code._nucl_prec, _code._nucl_f1 = 0, 0, 0 + _code._junc_rec, _code._junc_prec, _code._junc_f1 = 0, 0, 0 + _code.reverse = None + _code.category = "Unknown" + + code, definition = _code.code, _code.definition + pred_multi, ref_multi = _code.pred_multi, _code.ref_multi + category, reverse = _code.category, _code.reverse + nucl, junc = _code.nucl, _code.junc + + __doc__ = str(_code) + + +codes 
= OrderedDict() +codes["="] = Equal +codes["_"] = UnderScore +codes["n"] = CodeN +codes["J"] = CodeCapitalJ +codes["c"] = CodeC +codes["C"] = CodeCapitalC +codes["j"] = CodeJ +codes["h"] = CodeH +codes["g"] = CodeG +codes["G"] = CodeCapitalG +codes["o"] = CodeO +codes["e"] = CodeE +codes["m"] = CodeM +codes["i"] = CodeI +codes["I"] = CodeCapitalI +codes["ri"] = CodeRI +codes["rI"] = CodeRCapitalI +codes["f"] = CodeF +codes["x"], codes["X"] = CodeX, CodeCapitalX +codes["p"], codes["P"], codes["u"] = CodeP, CodeCapitalP, CodeU + +assert len(set(codes.values())) == len(codes), set.difference(set(codes.keys()), + set([_.code for _ in codes.values()])) +assert all(_ == codes[_].code for _ in codes) diff --git a/Mikado/subprograms/pick.py b/Mikado/subprograms/pick.py index 00f522605..eac3d3d0c 100644 --- a/Mikado/subprograms/pick.py +++ b/Mikado/subprograms/pick.py @@ -149,12 +149,13 @@ def pick(args): raise exc creator = Picker(args.json_conf, commandline=" ".join(sys.argv)) - try: - creator() # Run - except Exception as exc: - logger.error(exc) + creator() + # try: + # creator() # Run + # except Exception as exc: + # logger.error(exc) - sys.exit(1) + sys.exit(0) def pick_parser(): diff --git a/Mikado/subprograms/util/class_codes.py b/Mikado/subprograms/util/class_codes.py index 2987d67cc..16d42c956 100644 --- a/Mikado/subprograms/util/class_codes.py +++ b/Mikado/subprograms/util/class_codes.py @@ -27,7 +27,7 @@ def launch(args): [code.nucl], [code.junc], [code.reverse], - [code.category]) + code.category.split()) rows.extend(code_rows) __table_format = tabulate._table_formats[args.format] From bca39ffb1736a6e2805f79d650561fb0646b636f Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Mon, 13 Feb 2017 17:20:38 +0000 Subject: [PATCH 38/47] Solved a bug which caused Mikado to miscalculate retained introns during multiprocessing. Fixed logging for multiprocessing + DEBUG. --- .../configuration_blueprint.json | 2 +- Mikado/loci/abstractlocus.py | 45 ++-- Mikado/loci/sublocus.py | 25 ++- Mikado/loci/superlocus.py | 18 +- Mikado/picking/loci_processer.py | 42 +++- Mikado/picking/picker.py | 199 +++++++++--------- Mikado/tests/locus_tester.py | 43 +++- Mikado/transcripts/transcript.py | 58 +++-- .../transcript_methods/finalizing.py | 7 +- 9 files changed, 260 insertions(+), 179 deletions(-) diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json index 532ca29df..45634f54d 100644 --- a/Mikado/configuration/configuration_blueprint.json +++ b/Mikado/configuration/configuration_blueprint.json @@ -643,7 +643,7 @@ "- min_cdna_overlap: minimal cDNA overlap for the second clustering.", "- flank: maximum distance for transcripts to be clustered within the same superlocus.", "- remove_overlapping_fragments: boolean, it specifies whether to remove putative fragments.", - "- purge: boolean, it specifies whether to remove loci where all transcripts fail the minimum checks, or whether to print them out in the subloci file instead.", + "- purge: boolean, it specifies whether to remove transcripts which fail the minimum requirements check - or whether to ignore those requirements altogether.", "- simple_overlap_for_monoexonic: boolean. If set to true (default), then any overlap mean inclusion", "in a locus for or against a monoexonic transcript. 
If set to false, normal controls for the percentage", "of overlap will apply.", diff --git a/Mikado/loci/abstractlocus.py b/Mikado/loci/abstractlocus.py index 66fc23b7d..98a83a428 100644 --- a/Mikado/loci/abstractlocus.py +++ b/Mikado/loci/abstractlocus.py @@ -546,7 +546,8 @@ def _is_exon_retained_in_transcript(exon: tuple, frags: list, candidate: Transcript, consider_truncated=False, - terminal=False): + terminal=False, + logger=create_null_logger()): """Private static method to verify whether a given exon is a retained intron of the candidate Transcript. :param exon: the exon to be considered. @@ -565,9 +566,14 @@ def _is_exon_retained_in_transcript(exon: tuple, :param terminal: whether the exon is at the 3' end. :type terminal: bool + :param logger: a logger for the static function. Default: null logger. + :type logger: logging.Logger + :rtype: bool """ + logger.debug("Considering exon %s against candidate %s", exon, candidate.id) + found_exons = sorted( candidate.segmenttree.find(exon[0], exon[1], strict=False, value="exon"), reverse=(candidate.strand == "-")) @@ -575,20 +581,19 @@ def _is_exon_retained_in_transcript(exon: tuple, candidate.segmenttree.find(exon[0], exon[1], strict=not consider_truncated, value="intron"), reverse=(candidate.strand == "-")) + is_retained = False + if len(found_exons) == 0 or len(found_introns) == 0: - return False + is_retained = False elif len(found_exons) == 1 and len(found_introns) == 1: found_exons = found_exons.pop() found_introns = found_introns.pop() if candidate.strand != "-" and found_exons[1] + 1 == found_introns[0]: - return consider_truncated and terminal + is_retained = (consider_truncated and terminal) elif candidate.strand == "-" and found_exons[0] - 1 == found_introns[1]: - return consider_truncated and terminal - else: - return False - else: + is_retained = (consider_truncated and terminal) + elif len(found_exons) >= 2: # Now we have to check whether the matched introns contain both coding and non-coding parts - assert len(found_exons) >= 2, (found_exons, found_introns) for index, exon in enumerate(found_exons[:-1]): intron = found_introns[index] if candidate.strand == "-": @@ -596,15 +601,18 @@ def _is_exon_retained_in_transcript(exon: tuple, else: assert exon[1] == intron[0] - 1 for frag in frags: + if is_retained: + break # The fragment is just a sub-section of the exon if (overlap(frag, exon) < exon[1] - exon[0] and overlap(frag, exon, positive=True) == 0 and overlap(frag, intron, positive=True)): - return True + is_retained = True elif overlap(frag, exon) == exon[1] - exon[0]: - return True + is_retained = True - return False + logger.debug("%s in %s %s a retained intron", exon, candidate.id, "is" if is_retained is True else "is not") + return is_retained def find_retained_introns(self, transcript: Transcript): @@ -644,10 +652,12 @@ def find_retained_introns(self, transcript: Transcript): retained_introns = [] consider_truncated = self.json_conf["pick"]["run_options"]["consider_truncated_for_retained"] for exon in transcript.exons: - is_retained = False + self.logger.debug("Checking exon %s of %s", exon, transcript.id) + # is_retained = False to_consider, frags = self._exon_to_be_considered( exon, transcript, consider_truncated=consider_truncated) if not to_consider: + self.logger.debug("Exon %s of %s is not to be considered", exon, transcript.id) continue if exon[0] == transcript.start and transcript.strand == "-": @@ -663,18 +673,25 @@ def find_retained_introns(self, transcript: Transcript): elif candidate.strand != 
transcript.strand and None not in (transcript.strand, candidate.strand): continue + self.logger.debug("Checking %s in %s against %s", exon, transcript.id, candidate.id) is_retained = self._is_exon_retained_in_transcript(exon, frags, # transcript, candidate, terminal=terminal, - consider_truncated=consider_truncated) + consider_truncated=consider_truncated, + logger=self.logger) if is_retained: self.logger.debug("Exon %s of %s is a retained intron of %s", exon, transcript.id, candidate.id) retained_introns.append(exon) break + self.logger.debug("%s has %d retained introns%s", + transcript.id, + len(retained_introns), + " ({})".format(retained_introns) if retained_introns else "") + transcript.retained_introns = tuple(sorted(retained_introns)) return @@ -961,6 +978,8 @@ def logger(self, logger): # logger.propagate = False self.__logger = logger + # self.__logger.setLevel("DEBUG") + @logger.deleter def logger(self): """ diff --git a/Mikado/loci/sublocus.py b/Mikado/loci/sublocus.py index 052115f90..fbd444b96 100644 --- a/Mikado/loci/sublocus.py +++ b/Mikado/loci/sublocus.py @@ -261,9 +261,6 @@ def __check_requirements(self): """ self.get_metrics() - if self.purge is False: - self.logger.debug("No purging for %s, returning", self.id) - return previous_not_passing = set() while True: @@ -273,15 +270,21 @@ def __check_requirements(self): return for tid in not_passing: self.transcripts[tid].score = 0 - - self.metrics_calculated = False - if self.excluded is None: - excluded = Monosublocus(self.transcripts[tid], logger=self.logger) - excluded.json_conf = self.json_conf - self.excluded = Excluded(excluded) + if self.purge is True: + self.metrics_calculated = False + self.logger.debug("Excluding %s from %s because of failed requirements", + tid, self.id) + if self.excluded is None: + excluded = Monosublocus(self.transcripts[tid], logger=self.logger) + excluded.json_conf = self.json_conf + self.excluded = Excluded(excluded) + else: + self.excluded.add_transcript_to_locus(self.transcripts[tid]) + self.remove_transcript_from_locus(tid) else: - self.excluded.add_transcript_to_locus(self.transcripts[tid]) - self.remove_transcript_from_locus(tid) + self.logger.debug("%s has been assigned a score of 0 because it fails basic requirements", + self.id) + return if len(self.transcripts) == 0: return diff --git a/Mikado/loci/superlocus.py b/Mikado/loci/superlocus.py index 88e2eef19..e47407d85 100644 --- a/Mikado/loci/superlocus.py +++ b/Mikado/loci/superlocus.py @@ -366,7 +366,8 @@ def split_strands(self): stranded=True, json_conf=self.json_conf, source=self.source, - logger=self.logger) + logger=self.logger + ) assert len(new_locus.introns) > 0 or new_locus.monoexonic is True for cdna in strand[1:]: if new_locus.in_locus(new_locus, cdna): @@ -378,7 +379,8 @@ def split_strands(self): stranded=True, json_conf=self.json_conf, source=self.source, - logger=self.logger) + logger=self.logger + ) assert len(new_locus.introns) > 0 or new_locus.monoexonic is True new_loci.append(new_locus) @@ -913,10 +915,6 @@ def define_subloci(self): self.subloci_defined = True return - # Reset the source with the correct value - # for tid in self.transcripts: - # self.transcripts[tid].source = self.source - self.logger.debug("Calculated the transcript graph for %d transcripts: %s", len(self.transcripts), str(transcript_graph)) @@ -932,8 +930,9 @@ def define_subloci(self): subl = sorted(subl) new_sublocus = Sublocus(subl[0], json_conf=self.json_conf, - logger=self.logger) - # verified_introns=self.locus_verified_introns) + 
logger=self.logger + ) + new_sublocus.logger = self.logger if self.regressor is not None: new_sublocus.regressor = self.regressor for ttt in subl[1:]: @@ -1119,6 +1118,9 @@ def define_loci(self): self.loci_defined = True + self.logger.debug("Looking for AS events in %s: %s", + self.id, + self.json_conf["pick"]["alternative_splicing"]["report"]) if self.json_conf["pick"]["alternative_splicing"]["report"] is True: self.define_alternative_splicing() diff --git a/Mikado/picking/loci_processer.py b/Mikado/picking/loci_processer.py index f4e53ca08..27e7ceb70 100644 --- a/Mikado/picking/loci_processer.py +++ b/Mikado/picking/loci_processer.py @@ -555,6 +555,7 @@ def analyse_locus(slocus: Superlocus, logger.debug("Divided into %d loci", len(stranded_loci)) for stranded_locus in stranded_loci: + stranded_locus.logger = logger try: stranded_locus.define_loci() except KeyboardInterrupt: @@ -678,13 +679,26 @@ def identifier(self): def __getstate__(self): state = self.__dict__.copy() - for h in state["_handles"]: - h.close() + to_delete = [] + for num in range(len(state["_handles"])): + h = state["_handles"][num] + if isinstance(h, list): + [_.close() for _ in h] + state["_handles"][num] = h + to_delete.append(num) + elif hasattr(h, "close"): + state["_handles"][num].close() + else: + raise TypeError("Erroneous type in _handles: {}".format(h)) + + for index in sorted(to_delete, reverse=True): + del state["_handles"][index] for name in ["locus_metrics", "locus_scores", "locus_out", "sub_metrics", "sub_scores", "sub_out", "mono_metrics", "mono_scores", "mono_out"]: state[name] = None + state["engine"] = None state["analyse_locus"] = None del state["handler"] @@ -698,6 +712,12 @@ def terminate(self): def __close_handles(self): """Private method to flush and close all handles.""" + try: + self.handler.release() + except RuntimeError: + pass + self.handler.close() + for group in self._handles: [_.flush() for _ in group if hasattr(_, "flush") and _.closed is False] [_.close() for _ in group if hasattr(_, "close") and _.closed is False] @@ -796,20 +816,26 @@ def _create_handles(self, handles): metrics.extend(["external.{}".format(_.source) for _ in session.query(ExternalSource.source).all()]) metrics = Superlocus.available_metrics[:3] + sorted(metrics) - self._handles.append(self.__create_step_handles(handles[0], - metrics, score_keys)) + self.locus_metrics, self.locus_scores, self.locus_out = self.__create_step_handles( + handles[0], metrics, score_keys) + + self._handles.append([self.locus_metrics, + self.locus_scores, + self.locus_out]) # Subloci if handles[1][0]: - self._handles.append(self.__create_step_handles(handles[1], - metrics, score_keys)) + self.sub_metrics, self.sub_scores, self.sub_out = self.__create_step_handles( + handles[1], metrics, score_keys) + self._handles.append([self.sub_metrics, self.sub_scores, self.sub_out]) else: self._handles.append([None, None, None]) # Monoloci if handles[2][0]: - self._handles.append(self.__create_step_handles(handles[2], - metrics, score_keys)) + self.mono_metrics, self.mono_scores, self.mono_out = self.__create_step_handles( + handles[2], metrics, score_keys) + self._handles.append([self.mono_metrics, self.mono_scores, self.mono_out]) else: self._handles.append([None, None, None]) diff --git a/Mikado/picking/picker.py b/Mikado/picking/picker.py index 7c946ff39..05676f7c3 100644 --- a/Mikado/picking/picker.py +++ b/Mikado/picking/picker.py @@ -57,7 +57,6 @@ def __init__(self, json_conf, commandline=""): prepared by the json_utils functions. 
:param json_conf: Either a configuration dictionary or the configuration file. - :type json_conf: (str|dict) :param commandline: optional, the commandline used to start the program :type commandline: str @@ -75,30 +74,8 @@ def __init__(self, json_conf, commandline=""): # Now we start the real work self.commandline = commandline self.json_conf = json_conf - if isinstance(self.json_conf, str): - assert os.path.exists(self.json_conf) - self.json_conf = to_json(self.json_conf, logger=self.logger) - # pylint: disable=no-member - multiprocessing.set_start_method(self.json_conf["multiprocessing_method"], - force=True) - self.input_file = self.json_conf["pick"]["files"]["input"] - self.logging_queue = multiprocessing.Queue(-1) - self.printer_queue = multiprocessing.Queue(-1) - self.setup_logger() - elif isinstance(self.json_conf, dict): - # pylint: disable=no-member - self.input_file = self.json_conf["pick"]["files"]["input"] - multiprocessing.set_start_method(self.json_conf["multiprocessing_method"], - force=True) - self.logging_queue = multiprocessing.Queue(-1) - self.printer_queue = multiprocessing.Queue(-1) - self.setup_logger() - self.json_conf = check_json(self.json_conf, logger=self.logger) - else: - raise TypeError(type(self.json_conf)) - - assert isinstance(self.json_conf, dict) + self.__load_configuration() self.regressor = None self.procs = self.json_conf["pick"]["run_options"]["procs"] @@ -107,61 +84,15 @@ def __init__(self, json_conf, commandline=""): with self.define_input() as _: pass - if self.json_conf["pick"]["files"]["subloci_out"]: - self.sub_out = path_join( - self.json_conf["pick"]["files"]["output_dir"], - self.json_conf["pick"]["files"]["subloci_out"] - ) - else: - self.sub_out = "" - if self.json_conf["pick"]["files"]["monoloci_out"]: - self.monolocus_out = path_join( - self.json_conf["pick"]["files"]["output_dir"], - self.json_conf["pick"]["files"]["monoloci_out"] - ) - else: - self.monolocus_out = "" - self.locus_out = path_join( - self.json_conf["pick"]["files"]["output_dir"], - self.json_conf["pick"]["files"]["loci_out"]) - - assert self.locus_out != '' - assert self.locus_out != self.sub_out and self.locus_out != self.monolocus_out - assert (not self.sub_out and not self.monolocus_out) or (self.sub_out != self.monolocus_out) - + self.__create_output_handles() # pylint: disable=no-member multiprocessing.set_start_method(self.json_conf["multiprocessing_method"], force=True) - self.logging_queue = multiprocessing.Queue(-1) - self.printer_queue = multiprocessing.Queue(-1) + # self.setup_logger() self.logger.info("Multiprocessing method: %s", self.json_conf["multiprocessing_method"]) - for key in ("remove_overlapping_fragments", "flank", "purge"): - if key in self.json_conf["pick"]["run_options"]: - # Put warnings in place for the deprecation of some options. - - if key == "remove_overlapping_fragments": - self.json_conf["pick"]["fragments"]["remove"] = self.json_conf["pick"]["run_options"].pop(key) - new_home = "fragments/remove" - else: - self.json_conf["pick"]["clustering"][key] = self.json_conf["pick"]["run_options"].pop(key) - new_home = "clustering/{}".format(key) - warns = PendingDeprecationWarning( - """The \"{}\" property has now been moved to pick/{}. 
Please update your configuration files in the future.""".format(key, new_home)) - self.logger.warn(warns) - - self.context = multiprocessing.get_context() - if self.json_conf["pick"]["scoring_file"].endswith((".pickle", ".model")): - with open(self.json_conf["pick"]["scoring_file"], "rb") as forest: - self.regressor = pickle.load(forest) - if not isinstance(self.regressor["scoring"], (RandomForestRegressor, RandomForestClassifier)): - exc = TypeError("Invalid regressor provided, type: %s", type(self.regressor["scoring"])) - self.logger.critical(exc) - return - else: - self.regressor = None # pylint: enable=no-member self.manager = self.context.Manager() @@ -175,22 +106,6 @@ def __init__(self, json_conf, commandline=""): dbutils.DBBASE.metadata.create_all(engine) engine.dispose() - self.logger_queue_handler = logging_handlers.QueueHandler(self.logging_queue) - self.queue_logger = logging.getLogger("parser") - self.queue_logger.addHandler(self.logger_queue_handler) - - # Configure SQL logging - sqllogger = logging.getLogger("sqlalchemy.engine") - # if json_conf["log_settings"]["log_level"] == "DEBUG": - # sqllogger.setLevel("DEBUG") - # else: - sqllogger.setLevel(json_conf["log_settings"]["sql_level"]) - sqllogger.addHandler(self.logger_queue_handler) - - # We need to set this to the lowest possible level, - # otherwise we overwrite the global configuration - self.queue_logger.setLevel(self.json_conf["log_settings"]["log_level"]) - self.queue_logger.propagate = False if self.json_conf["pick"]["run_options"]["single_thread"] is True: # Reset threads to 1 if self.json_conf["pick"]["run_options"]["procs"] > 1: @@ -231,6 +146,81 @@ def define_input(self): return parser(self.input_file) + def __load_configuration(self): + + """Private method to load the configuration""" + + if isinstance(self.json_conf, str): + assert os.path.exists(self.json_conf) + self.json_conf = to_json(self.json_conf, logger=self.logger) + # pylint: disable=no-member + multiprocessing.set_start_method(self.json_conf["multiprocessing_method"], + force=True) + self.input_file = self.json_conf["pick"]["files"]["input"] + self.setup_logger() + elif isinstance(self.json_conf, dict): + # pylint: disable=no-member + self.input_file = self.json_conf["pick"]["files"]["input"] + multiprocessing.set_start_method(self.json_conf["multiprocessing_method"], + force=True) + self.setup_logger() + self.json_conf = check_json(self.json_conf, logger=self.logger) + else: + raise TypeError(type(self.json_conf)) + assert isinstance(self.json_conf, dict) + + for key in ("remove_overlapping_fragments", "flank", "purge"): + if key in self.json_conf["pick"]["run_options"]: + # Put warnings in place for the deprecation of some options. + + if key == "remove_overlapping_fragments": + self.json_conf["pick"]["fragments"]["remove"] = self.json_conf["pick"]["run_options"].pop(key) + new_home = "fragments/remove" + else: + self.json_conf["pick"]["clustering"][key] = self.json_conf["pick"]["run_options"].pop(key) + new_home = "clustering/{}".format(key) + warns = PendingDeprecationWarning( + """The \"{}\" property has now been moved to pick/{}. 
Please update your configuration files in the future.""".format( + key, new_home)) + self.logger.warn(warns) + + self.context = multiprocessing.get_context() + if self.json_conf["pick"]["scoring_file"].endswith((".pickle", ".model")): + with open(self.json_conf["pick"]["scoring_file"], "rb") as forest: + self.regressor = pickle.load(forest) + if not isinstance(self.regressor["scoring"], (RandomForestRegressor, RandomForestClassifier)): + exc = TypeError("Invalid regressor provided, type: %s", type(self.regressor["scoring"])) + self.logger.critical(exc) + return + else: + self.regressor = None + + def __create_output_handles(self): + + """Create all the output-related variables.""" + + if self.json_conf["pick"]["files"]["subloci_out"]: + self.sub_out = path_join( + self.json_conf["pick"]["files"]["output_dir"], + self.json_conf["pick"]["files"]["subloci_out"] + ) + else: + self.sub_out = "" + if self.json_conf["pick"]["files"]["monoloci_out"]: + self.monolocus_out = path_join( + self.json_conf["pick"]["files"]["output_dir"], + self.json_conf["pick"]["files"]["monoloci_out"] + ) + else: + self.monolocus_out = "" + self.locus_out = path_join( + self.json_conf["pick"]["files"]["output_dir"], + self.json_conf["pick"]["files"]["loci_out"]) + + assert self.locus_out != '' + assert self.locus_out != self.sub_out and self.locus_out != self.monolocus_out + assert (not self.sub_out and not self.monolocus_out) or (self.sub_out != self.monolocus_out) + def setup_shm_db(self): """ This method will copy the SQLite input DB into memory. @@ -280,6 +270,8 @@ def setup_logger(self): logging.handlers.QueueListener instance listening on the logging_queue instance attribute (which is a normal mp.Manager.Queue instance).""" + self.logging_queue = multiprocessing.Queue(-1) + self.printer_queue = multiprocessing.Queue(-1) self.formatter = formatter self.main_logger = logging.getLogger("main_logger") if not os.path.exists(self.json_conf["pick"]["files"]["output_dir"]): @@ -353,6 +345,20 @@ def setup_logger(self): self.logging_queue, self.logger) self.log_writer.start() + self.logger_queue_handler = logging_handlers.QueueHandler(self.logging_queue) + self.queue_logger = logging.getLogger("parser") + self.queue_logger.addHandler(self.logger_queue_handler) + + self.queue_logger.setLevel(logging.getLevelName(self.json_conf["log_settings"]["log_level"])) + self.logger.warn("Current level for queue: %s", logging.getLevelName(self.queue_logger.level)) + + self.queue_logger.propagate = False + + # Configure SQL logging + sqllogger = logging.getLogger("sqlalchemy.engine") + sqllogger.setLevel(self.json_conf["log_settings"]["sql_level"]) + sqllogger.addHandler(self.logger_queue_handler) + return def __print_gff_headers(self, locus_out, score_keys): @@ -848,8 +854,10 @@ def __submit_multi_threading(self, data_dict): else: if current_locus is not None: counter += 1 - self.logger.debug("Submitting locus # %d (%s)", counter, - None if not current_locus else current_locus.id) + self.logger.debug("Submitting locus # %d (%s), with transcripts:\n%s", + counter, + None if not current_locus else current_locus.id, + ",".join(list(current_locus.transcripts.keys()))) locus_queue.put((current_locus, counter)) current_locus = Superlocus( current_transcript, @@ -898,8 +906,9 @@ def __submit_multi_threading(self, data_dict): counter += 1 locus_queue.put((current_locus, counter)) - self.logger.debug("Submitting locus %s, counter %d", - current_locus.id, counter) + self.logger.debug("Submitting locus %s, counter %d, with transcripts:\n%s", + 
current_locus.id, counter, + ", ".join(list(current_locus.transcripts.keys()))) locus_queue.put(("EXIT", float("inf"))) self.logger.info("Joining children processes") [_.join() for _ in working_processes] @@ -976,12 +985,6 @@ def __submit_single_threaded(self, data_dict): gene_counter = 0 if self.json_conf["pick"]["run_options"]["preload"] is False: - # db_connection = functools.partial(dbutils.create_connector, - # self.json_conf, - # self.logger) - # self.connection_pool = sqlalchemy.pool.QueuePool(db_connection, - # pool_size=1, - # max_overflow=2) self.engine = dbutils.connect(json_conf=self.json_conf, logger=self.logger) else: self.engine = None diff --git a/Mikado/tests/locus_tester.py b/Mikado/tests/locus_tester.py index 6e0151c7f..4b785181c 100644 --- a/Mikado/tests/locus_tester.py +++ b/Mikado/tests/locus_tester.py @@ -976,7 +976,6 @@ def test_real_retained_pos(self): self.assertEqual((len(sup.transcripts[pred.id].retained_introns) > 0), retained) - def test_retained_pos_truncated(self): """Here we verify that a real retained intron is called as such, even when the transcript is truncated.""" @@ -1015,6 +1014,15 @@ def test_retained_pos_truncated(self): sup.find_retained_introns(pred) self.assertEqual((len(sup.transcripts[pred.id].retained_introns) > 0), retained) + # Now check that things function also after unpickling + unpickled_t1 = pickle.loads(pickle.dumps(t1)) + unpickled_other = pickle.loads(pickle.dumps(pred)) + sup = Superlocus(unpickled_t1, json_conf=self.my_json) + sup.add_transcript_to_locus(unpickled_other) + sup.json_conf["pick"]["run_options"]["consider_truncated_for_retained"] = True + sup.find_retained_introns(pred) + self.assertEqual((len(sup.transcripts[pred.id].retained_introns) > 0), + retained) def test_real_retained_pos_truncated_skip(self): """Here we verify that a real retained intron is *NOT* called as such when @@ -1184,6 +1192,12 @@ def test_not_retained_pos(self): sup.add_transcript_to_locus(pred) sup.find_retained_introns(pred) self.assertEqual(sup.transcripts[pred.id].retained_intron_num, 0) + unpickled_t1 = pickle.loads(pickle.dumps(t1)) + unpickled_other = pickle.loads(pickle.dumps(pred)) + sup = Superlocus(unpickled_t1, json_conf=self.my_json) + sup.add_transcript_to_locus(unpickled_other) + sup.find_retained_introns(unpickled_other) + self.assertEqual(sup.transcripts[unpickled_other.id].retained_intron_num, 0) def test_real_retained_neg(self): """Here we verify that a real retained intron is called as such""" @@ -1207,12 +1221,20 @@ def test_real_retained_neg(self): ], features="CDS") t2.finalize() - sup = Superlocus(t1, json_conf=self.my_json) - sup.add_transcript_to_locus(t2) + with self.subTest(): + sup = Superlocus(t1, json_conf=self.my_json) + sup.add_transcript_to_locus(t2) - sup.find_retained_introns(t2) + sup.find_retained_introns(t2) + self.assertEqual(sup.transcripts["t2"].retained_introns, ((401, 1000),)) - self.assertEqual(sup.transcripts["t2"].retained_introns, ((401, 1000),)) + with self.subTest(): + unpickled_t1 = pickle.loads(pickle.dumps(t1)) + unpickled_other = pickle.loads(pickle.dumps(t2)) + sup = Superlocus(unpickled_t1, json_conf=self.my_json) + sup.add_transcript_to_locus(unpickled_other) + sup.find_retained_introns(unpickled_other) + self.assertEqual(sup.transcripts["t2"].retained_introns, ((401, 1000),)) def test_not_real_retained_neg(self): """Here we verify that a real retained intron is called as such""" @@ -1249,12 +1271,18 @@ def test_not_real_retained_neg(self): Abstractlocus._is_exon_retained_in_transcript((401, 
1000), [Interval(401, 830)], t1)) for alt in [t2, t3]: + unpickled_t1 = pickle.loads(pickle.dumps(t1)) + unpickled_alt = pickle.loads(pickle.dumps(alt)) with self.subTest(alt=alt): sup = Superlocus(t1, json_conf=self.my_json) sup.find_retained_introns(alt) - self.assertEqual(alt.retained_intron_num, 0, alt.retained_introns) + with self.subTest(alt=alt): + sup = Superlocus(unpickled_t1, json_conf=self.my_json) + sup.find_retained_introns(unpickled_alt) + self.assertEqual(unpickled_alt.retained_intron_num, 0, + unpickled_alt.retained_introns) def test_not_retained_neg(self): """Here we verify that a false retained intron is not called as such""" @@ -1492,6 +1520,9 @@ def test_transcript_pickling(self): pickled = pickle.dumps(transcript) unpickled = pickle.loads(pickled) self.assertEqual(transcript, unpickled) + self.assertEqual(len(transcript.combined_cds), len(unpickled.cds_tree)) + self.assertEqual(len(transcript.cds_introntree), len(unpickled.cds_introntree)) + self.assertEqual(len(transcript.segmenttree), len(unpickled.segmenttree)) def test_locus_unpickling(self): diff --git a/Mikado/transcripts/transcript.py b/Mikado/transcripts/transcript.py index 8158605d8..4da132cd5 100644 --- a/Mikado/transcripts/transcript.py +++ b/Mikado/transcripts/transcript.py @@ -496,6 +496,9 @@ def __setstate__(self, state): self.__cds_tree = IntervalTree() self.__segmenttree = IntervalTree() self.__cds_introntree = IntervalTree() + _ = self.segmenttree + _ = self.cds_tree + _ = self.__cds_introntree # Set the logger to NullHandler self.logger = None @@ -1736,18 +1739,6 @@ def combined_cds_end(self): else: return self.combined_cds[-1][1] - @property - def _cds_introntree(self): - - """ - :rtype: intervaltree.IntervalTree - """ - - if len(self.__cds_introntree) != len(self.combined_cds_introns): - self.__cds_introntree = IntervalTree.from_tuples( - [(_[0], _[1] + 1) for _ in self.combined_cds_introns]) - return self.__cds_introntree - @property def selected_cds(self): """This property return the CDS exons of the ORF selected as best @@ -1827,37 +1818,42 @@ def cds_tree(self): :rtype: intervaltree.Intervaltree """ - return self.__cds_tree + if len(self.__cds_tree) != len(self.combined_cds): + self.__calculate_cds_tree() - @cds_tree.setter - def cds_tree(self, segments): - """ - Setter for CDS tree. It checks that the calculated tree is actually valid. - :param segments: the interval tree to be set. 
- :type segments: intervaltree.Intervaltree - :return: - """ + return self.__cds_tree - if segments is None: - self.cds_tree = IntervalTree() - elif isinstance(segments, IntervalTree): - assert len(segments) == len(self.combined_cds) - else: - raise TypeError("Invalid cds segments: %s, type %s", - segments, type(segments)) + def __calculate_cds_tree(self): - self.__cds_tree = segments + self.__cds_tree = IntervalTree.from_tuples( + [(cds[0], max(cds[1], cds[0] + 1)) for cds in self.combined_cds]) @property def segmenttree(self): if len(self.__segmenttree) != self.exon_num + len(self.introns): + self.__calculate_segment_tree() + + return self.__segmenttree + + @property + def cds_introntree(self): + + """ + :rtype: intervaltree.IntervalTree + """ + + if len(self.__cds_introntree) != len(self.combined_cds_introns): + self.__cds_introntree = IntervalTree.from_tuples( + [(_[0], _[1] + 1) for _ in self.combined_cds_introns]) + return self.__cds_introntree - self.__segmenttree = IntervalTree.from_intervals( + def __calculate_segment_tree(self): + + self.__segmenttree = IntervalTree.from_intervals( [Interval(*_, value="exon") for _ in self.exons] + [Interval(*_, value="intron") for _ in self.introns] ) - return self.__segmenttree @property def derived_children(self): diff --git a/Mikado/transcripts/transcript_methods/finalizing.py b/Mikado/transcripts/transcript_methods/finalizing.py index 921540a85..c6489a6af 100644 --- a/Mikado/transcripts/transcript_methods/finalizing.py +++ b/Mikado/transcripts/transcript_methods/finalizing.py @@ -608,9 +608,10 @@ def finalize(transcript): transcript.selected_internal_orf_index] if internal_cds[0] == "CDS") - # Create the interval tree - transcript.cds_tree = IntervalTree.from_tuples( - [(cds[0], max(cds[1], cds[0] + 1)) for cds in transcript.combined_cds]) + # Create the internal trees + _ = transcript.cds_tree + _ = transcript.cds_introntree + _ = transcript.segmenttree # BUG somewhere ... I am not sorting this properly before (why?) 
transcript.exons = sorted(transcript.exons)

From e6b6952c8e7ba7a387c5771947676a4e26cc0f74 Mon Sep 17 00:00:00 2001
From: Luca Venturini
Date: Mon, 13 Feb 2017 17:51:01 +0000
Subject: [PATCH 39/47] Fixed a bug which nullified the requirements when
 disabling purge

---
 Mikado/loci/monosublocusholder.py |  1 +
 Mikado/loci/sublocus.py           | 12 ++++++++----
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/Mikado/loci/monosublocusholder.py b/Mikado/loci/monosublocusholder.py
index 057b40c34..b7198c7f8 100644
--- a/Mikado/loci/monosublocusholder.py
+++ b/Mikado/loci/monosublocusholder.py
@@ -47,6 +47,7 @@ def __init__(self, monosublocus_instance: Monosublocus, json_conf=None, logger=N
 
         # Abstractlocus
         Abstractlocus.__init__(self, verified_introns=verified_introns)
         self.logger = logger
+        self._not_passing = set()
         self.splitted = False
         self.metrics_calculated = False
         self.json_conf = json_conf
diff --git a/Mikado/loci/sublocus.py b/Mikado/loci/sublocus.py
index fbd444b96..ff109b083 100644
--- a/Mikado/loci/sublocus.py
+++ b/Mikado/loci/sublocus.py
@@ -66,6 +66,7 @@ def __init__(self, span, json_conf=None, logger=None, verified_introns=None):
 
         self.source = self.json_conf["pick"]["output_format"]["source"]
         self.excluded = None
+        self._not_passing = set()
         self.splitted = False
         # Flag to indicate that we have not calculated the metrics for the transcripts
         # Flag to indicate that we have not calculated the scores for the transcripts
@@ -284,6 +285,7 @@ def __check_requirements(self):
             else:
                 self.logger.debug("%s has been assigned a score of 0 because it fails basic requirements",
                                   self.id)
+                self._not_passing.add(tid)
                 return
 
         if len(self.transcripts) == 0:
@@ -307,7 +309,7 @@ def calculate_scores(self):
             return
 
         self.get_metrics()
-        not_passing = set()
+        # not_passing = set()
         if not hasattr(self, "logger"):
             self.logger = None
             self.logger.setLevel("DEBUG")
@@ -334,7 +336,7 @@ def calculate_scores(self):
             self.transcripts[tid].scores = self.scores[tid].copy()
 
         for tid in self.transcripts:
-            if tid in not_passing:
+            if tid in self._not_passing:
                 self.logger.debug("Excluding %s as it does not pass minimum requirements",
                                   tid)
                 self.transcripts[tid].score = 0
@@ -343,7 +345,9 @@ def calculate_scores(self):
 
                 if self.transcripts[tid].score == 0:
                     self.logger.debug("Excluding %s as it has a score of 0", tid)
 
-                if tid not in not_passing:
+                if tid in self._not_passing:
+                    pass
+                else:
                     assert self.transcripts[tid].score == sum(self.scores[tid].values()), (
                         tid, self.transcripts[tid].score, sum(self.scores[tid].values())
                     )
@@ -503,7 +507,7 @@ def print_scores(self):
                 assert key in self.scores[tid] and self.scores[tid][key] != "NA" and self.scores[tid][key] is not None, (key, self.scores[tid].keys())
                 row[key] = round(self.scores[tid][key], 2)
 
-            if calculate_total is True:
+            if calculate_total is True and tid not in self._not_passing:
                 score_sum = sum(row[key] for key in score_keys)
 
                 if round(score_sum, 2) != round(self.scores[tid]["score"], 2):

From a4326654a161d114d3ef2db82f96a9661a0c6d60 Mon Sep 17 00:00:00 2001
From: Luca Venturini
Date: Mon, 13 Feb 2017 18:29:06 +0000
Subject: [PATCH 40/47] Updated the CHANGELOG

---
 CHANGELOG.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ad35e0fe2..6f7ba12b2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -32,6 +32,8 @@ Changes in this release:
 - Made the checks for the scoring files more robust.
 - Rewrote the "find_retained_introns" method of AbstractLocus, to solve some bugs found during the utilisation of the last beta. 
As a corollary, expanded the intervaltree module to allow searches for "tagged" intervals.
 - Now the "monoloci_out" files contain the Monosublocus**Holder** step, not the Monosublocus step. This should help during fine-tuning.
+- Minimal requirements for alternative splicing events are now specified with a syntax analogous to that of minimal requirements, and that for not considering a locus as a putative fragment, under the tag "as_requirements".
+- Fixed a bug which caused transcript requirements to be ignored if pick/clustering/purge was set to False.
 - Mikado now also supports Python 3.6.
 
 
@@ -51,8 +53,7 @@ Changes in this release:
 - "only_non_canonical_splicing" will allow the identification of transcripts whose splicing sites are all non-canonical.
 - It is now possible to give Mikado a tab-delimited file of pre-calculated metrics (which must be numeric) during serialise. The file should have the transcript ids in the first column and have a header as first line; this header must have "TID" as the first field, and no repeated fields afterwards. External metrics can be specified in the scoring configuration using the syntax "external.{name of the score}". If a nonexistent metric is requested, Mikado will assign it a default value of 0.
 - It is now possible to use metrics with values between 0 and 1 (inclusive) directly as scores, by specifying the parameter "use_raw: True". This is available only for metrics which have been tagged as being "usable raw", or with externally provided metrics. The option is valid only when looking for the maximum or minimum value for a metric, not when looking for a target. If an incorrect configuration is specified, Mikado will crash.
-- Minimal requirements for alternative splicing events are now specified with a syntax analogous to that of minimal requirements, and that for not considering a locus as a putative fragment, under the tag "as_requirements".
-- Mikado prepare in "lenient" mode will keep also transcripts with a mixture of strands for the splicing junctions.
+- Mikado prepare in "lenient" mode will also keep transcripts with a mixture of strands for the splicing junctions. Such transcripts are marked with the "suspicious_splicing" GTF attribute.
 - Mikado prepare can be asked to keep all transcripts, even if they are redundant. The new behaviour (disabled by default) is switched on by the boolean parameter "prepare/keep_redundant".
 - Mikado pick can consider transcripts with a CDS ending within a CDS intron as truncated due to a retained intron event. This potentially allows Mikado to detect retained introns even when only CDSs are provided. The behaviour is disabled by default, and can be switched on using the boolean configuration parameter "pick/run_options/consider_truncated_for_retained".
 - Some bugs have been detected and solved thanks to the collaboration with Hugo Darras.

From cca34277ea01beb5ddf8f30a12f6063d96274d66 Mon Sep 17 00:00:00 2001
From: Luca Venturini
Date: Tue, 14 Feb 2017 11:28:49 +0000
Subject: [PATCH 41/47] Fixed a bug which caused the strand to be reversed in
 prepare even if strand_specific was set to True.
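
The fault was an inverted boolean test: for models with a mixture of canonical
junctions, check_strand used "strand_specific is False" where it should have
used "is True", so strand-specific transcripts could still be
reverse-complemented. As a minimal sketch of the corrected decision (the
standalone function and its names are illustrative, not part of the codebase),
assuming a Counter of concordant ("+") and discordant ("-") canonical
junctions:

    from collections import Counter

    def should_reverse(junctions: Counter, strand_specific: bool) -> bool:
        """Decide whether a transcript ought to be moved to the opposite strand."""
        # Strand-specific models must never be flipped, whatever the junctions say.
        if strand_specific:
            return False
        # Otherwise, flip only when discordant junctions outnumber concordant ones.
        return junctions["-"] > junctions["+"]

    assert should_reverse(Counter({"+": 1, "-": 5}), strand_specific=True) is False
    assert should_reverse(Counter({"+": 1, "-": 5}), strand_specific=False) is True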
---
 Mikado/transcripts/transcript.py        | 2 ++
 Mikado/transcripts/transcriptchecker.py | 5 ++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/Mikado/transcripts/transcript.py b/Mikado/transcripts/transcript.py
index 4da132cd5..89d5bcf42 100644
--- a/Mikado/transcripts/transcript.py
+++ b/Mikado/transcripts/transcript.py
@@ -953,6 +953,8 @@ def reverse_strand(self):
             self.strand = "+"
         elif self.strand is None:
             pass
+        self.logger.warning("Transcript %s has been assigned to the wrong strand, reversing it.",
+                            self.id)
         return
 
     def load_information_from_db(self, json_conf, introns=None, session=None,
diff --git a/Mikado/transcripts/transcriptchecker.py b/Mikado/transcripts/transcriptchecker.py
index 2fa70cf94..29d028d3d 100644
--- a/Mikado/transcripts/transcriptchecker.py
+++ b/Mikado/transcripts/transcriptchecker.py
@@ -194,19 +194,18 @@ def check_strand(self):
                     canonical_counter["+"],
                     canonical_counter["-"])
 
-            if canonical_counter["+"] >= canonical_counter["-"] or self.strand_specific is False:
+            if canonical_counter["+"] >= canonical_counter["-"] or self.strand_specific is True:
                 self.mixed_attribute = "{0}concordant,{1}discordant".format(
                     canonical_counter["+"],
                     canonical_counter["-"])
             else:
                 self.reverse_strand()
+                self.reversed = True
                 self.mixed_attribute = "{0}concordant,{1}discordant".format(
                     canonical_counter["-"],
                     canonical_counter["+"])
 
         elif canonical_counter["-"] > 0 and self.strand_specific is False:
-            self.logger.warning("Transcript %s has been assigned to the wrong strand, reversing it.",
-                                self.id)
             self.reverse_strand()
             self.reversed = True
 
         elif canonical_counter["-"] > 0 and self.strand_specific is True:

From 8d0c193c6137eae1da5563563d2ad3879cba9322 Mon Sep 17 00:00:00 2001
From: Luca Venturini
Date: Tue, 14 Feb 2017 15:44:40 +0000
Subject: [PATCH 42/47] Moved the scoring functions to abstractlocus. This
 should simplify code maintenance.

---
 Mikado/loci/abstractlocus.py      | 258 ++++++++++++++++++++++++++++--
 Mikado/loci/locus.py              | 127 +++++----------
 Mikado/loci/monosublocus.py       |   1 +
 Mikado/loci/monosublocusholder.py |   3 +-
 Mikado/loci/sublocus.py           | 221 ++------------------------
 Mikado/tests/test_system_calls.py |   4 +-
 6 files changed, 300 insertions(+), 314 deletions(-)

diff --git a/Mikado/loci/abstractlocus.py b/Mikado/loci/abstractlocus.py
index 98a83a428..ea036c544 100644
--- a/Mikado/loci/abstractlocus.py
+++ b/Mikado/loci/abstractlocus.py
@@ -7,20 +7,24 @@
 import abc
 import itertools
 import logging
-import operator
 import random
 from sys import maxsize
-
 import networkx
 from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
-
+import numpy
 from ..transcripts.clique_methods import find_cliques, find_communities, define_graph
 from ..transcripts.transcript import Transcript
 from ..configuration.configurator import to_json, check_json
 from ..exceptions import NotInLocusError
 from ..utilities import overlap, merge_ranges
+import operator
 from ..utilities.intervaltree import Interval, IntervalTree
 from ..utilities.log_utils import create_null_logger
+from sys import version_info
+if version_info.minor < 5:
+    from sortedcontainers import SortedDict
+else:
+    from collections import OrderedDict as SortedDict
 
 
 # I do not care that there are too many attributes: this IS a massive class!
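
The heart of the refactored scoring is the per-metric rescaling performed by
_calculate_score once a value has been gathered for every transcript. In
essence it reduces to the sketch below; rescale() is a hypothetical helper for
illustration only, and it leaves out the "filter", "use_raw" and multiplier
handling of the real method:

    def rescale(metrics: dict, rescaling: str, target: float = None) -> dict:
        """Turn raw metric values into scores between 0 and 1, one per transcript."""
        values = list(metrics.values())
        if rescaling == "target":
            # Scores decay linearly with the distance from the desired value.
            denominator = max(abs(x - target) for x in values) or 1
            return {tid: 1 - abs(val - target) / denominator
                    for tid, val in metrics.items()}
        if max(values) == min(values):
            # No spread between transcripts: everybody gets the full score.
            return dict.fromkeys(metrics, 1)
        denominator = max(values) - min(values)
        if rescaling == "max":
            return {tid: (val - min(values)) / denominator
                    for tid, val in metrics.items()}
        # rescaling == "min": inverted, so the smallest value scores highest.
        return {tid: 1 - (val - min(values)) / denominator
                for tid, val in metrics.items()}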
@@ -46,7 +50,8 @@ def __init__(self, source="", verified_introns=None): self.__logger = None self.__stranded = False - + self._not_passing = set() + self._excluded_transcripts = set() self.transcripts = dict() self.introns, self.exons, self.splices = set(), set(), set() # Consider only the CDS part @@ -58,6 +63,13 @@ def __init__(self, source="", verified_introns=None): self.chrom = None self.cds_introns = set() self.__locus_verified_introns = set() + self.scores_calculated = False + self.scores = dict() + + self.purge = self.json_conf["pick"]["clustering"]["purge"] + + # self.purge = True + if verified_introns is not None: self.locus_verified_introns = verified_introns @@ -189,6 +201,9 @@ def overlap(first_interval: (int, int), :param flank: an optional extending parameter to check for neighbours :type flank: int + :param positive: if True, negative overlaps will return 0. Otherwise, the negative overlap is returned. + :type positive: bool + This static method returns the overlap between two intervals. Values<=0 indicate no overlap. @@ -359,6 +374,9 @@ def add_transcript_to_locus(self, transcript, check_in_locus=True, logger=None, or instead whether to trust the assignment to be correct :type check_in_locus: bool + :param logger: the logger to use for this function. + :type logger: logging.Logger + This method checks that a transcript is contained within the superlocus (using the "in_superlocus" class method) and upon a successful check extends the superlocus with the new transcript. @@ -621,14 +639,16 @@ def find_retained_introns(self, transcript: Transcript): A retained intron is defined as an exon which: - spans completely an intron of another model *between coding exons* - is not completely coding itself - - if the model is coding, the exon has *part* of the non-coding section lying inside the intron (ie the non-coding section must not be starting in the exonic part). + - if the model is coding, the exon has *part* of the non-coding section lying inside the intron + (ie the non-coding section must not be starting in the exonic part). If the "pick/run_options/consider_truncated_for_retained" flag in the configuration is set to true, an exon will be considered as a retained intron event also if: - it is the last exon of the transcript - it ends *within* an intron of another model *between coding exons* - is not completely coding itself - - if the model is coding, the exon has *part* of the non-coding section lying inside the intron (ie the non-coding section must not be starting in the exonic part). + - if the model is coding, the exon has *part* of the non-coding section lying inside the intron + (ie the non-coding section must not be starting in the exonic part). The results are stored inside the transcript instance, in the "retained_introns" tuple. :param transcript: a Transcript instance @@ -730,14 +750,9 @@ def get_metrics(self): """Quick wrapper to calculate the metrics for all the transcripts.""" - # TODO: Find an intelligent way ot restoring this check - # I disabled it because otherwise the values for surviving transcripts would be wrong - # But this effectively leads to a doubling of run time. A possibility would be to cache the results. 
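
How the purge flag now interacts with transcripts that fail the minimum
requirements (see __check_requirements below) can be summarised with this
sketch; the free function and its names are illustrative, not part of the
patch:

    def apply_requirements(transcripts: dict, not_passing: set, purge: bool) -> dict:
        """Apply the minimum-requirements verdict to the transcripts of a locus."""
        survivors = {}
        for tid, transcript in transcripts.items():
            if tid not in not_passing:
                survivors[tid] = transcript
            elif not purge:
                # With purge disabled, failing models are kept but zero-scored.
                transcript.score = 0
                survivors[tid] = transcript
            # With purge enabled, failing models are dropped from the locus.
        return survivors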
if self.metrics_calculated is True: return - assert len(self._cds_introntree) == len(self.combined_cds_introns) - for tid in sorted(self.transcripts): self.calculate_metrics(tid) @@ -797,8 +812,11 @@ def calculate_metrics(self, tid: str): self.transcripts[tid].combined_cds_locus_fraction = 0 self.transcripts[tid].selected_cds_locus_fraction = 0 else: - self.transcripts[tid].combined_cds_locus_fraction = self.transcripts[tid].combined_cds_length / cds_bases - self.transcripts[tid].selected_cds_locus_fraction = self.transcripts[tid].selected_cds_length / selected_bases + selected_length = self.transcripts[tid].selected_cds_length + combined_length = self.transcripts[tid].combined_cds_length + + self.transcripts[tid].combined_cds_locus_fraction = combined_length / cds_bases + self.transcripts[tid].selected_cds_locus_fraction = selected_length / selected_bases if len(self.introns) > 0: _ = len(set.intersection(self.transcripts[tid].introns, self.introns)) @@ -870,6 +888,216 @@ def _check_not_passing(self, previous_not_passing=set()): return not_passing + def calculate_scores(self): + """ + Function to calculate a score for each transcript, given the metrics derived + with the calculate_metrics method and the scoring scheme provided in the JSON configuration. + If any requirements have been specified, all transcripts which do not pass them + will be assigned a score of 0 and subsequently ignored. + Scores are rounded to the nearest integer. + """ + + if self.scores_calculated is True: + self.logger.debug("Scores calculation already effectuated for %s", + self.id) + return + + self.get_metrics() + # not_passing = set() + if not hasattr(self, "logger"): + self.logger = None + self.logger.setLevel("DEBUG") + self.logger.debug("Calculating scores for {0}".format(self.id)) + if "requirements" in self.json_conf: + self.__check_requirements() + + if len(self.transcripts) == 0: + self.logger.warning("No transcripts pass the muster for {0}".format(self.id)) + self.scores_calculated = True + return + self.scores = dict() + + for tid in self.transcripts: + self.scores[tid] = dict() + # Add the score for the transcript source + self.scores[tid]["source_score"] = self.transcripts[tid].source_score + + if self.regressor is None: + for param in self.json_conf["scoring"]: + self._calculate_score(param) + + for tid in self.scores: + self.transcripts[tid].scores = self.scores[tid].copy() + + for tid in self.transcripts: + if tid in self._not_passing: + self.logger.debug("Excluding %s as it does not pass minimum requirements", + tid) + self.transcripts[tid].score = 0 + else: + self.transcripts[tid].score = sum(self.scores[tid].values()) + if self.transcripts[tid].score <= 0: + self.logger.debug("Excluding %s as it has a score <= 0", tid) + self.transcripts[tid].score = 0 + self._not_passing.add(tid) + + if tid in self._not_passing: + pass + else: + assert self.transcripts[tid].score == sum(self.scores[tid].values()), ( + tid, self.transcripts[tid].score, sum(self.scores[tid].values()) + ) + # if self.json_conf["pick"]["external_scores"]: + # assert any("external" in _ for _ in self.scores[tid].keys()), self.scores[tid].keys() + + self.scores[tid]["score"] = self.transcripts[tid].score + + else: + valid_metrics = self.regressor.metrics + metric_rows = SortedDict() + for tid, transcript in sorted(self.transcripts.items(), key=operator.itemgetter(0)): + for param in valid_metrics: + self.scores[tid][param] = "NA" + row = [] + for attr in valid_metrics: + val = getattr(transcript, attr) + if isinstance(val, bool): + 
if val:
+                        val = 1
+                    else:
+                        val = 0
+                    row.append(val)
+                # Necessary for sklearn ..
+                row = numpy.array(row)
+                # row = row.reshape(1, -1)
+                metric_rows[tid] = row
+            # scores = SortedDict.fromkeys(metric_rows.keys())
+            if isinstance(self.regressor, RandomForestClassifier):
+                # We have to pick the second probability (correct)
+                for tid in metric_rows:
+                    score = self.regressor.predict_proba(metric_rows[tid])[0][1]
+                    self.scores[tid]["score"] = score
+                    self.transcripts[tid].score = score
+            else:
+                pred_scores = self.regressor.predict(list(metric_rows.values()))
+                for pos, score in enumerate(pred_scores):
+                    self.scores[list(metric_rows.keys())[pos]]["score"] = score
+                    self.transcripts[list(metric_rows.keys())[pos]].score = score
+
+        self.scores_calculated = True
+
+    def __check_requirements(self):
+        """
+        This private method will identify and delete all transcripts which do not pass
+        the minimum requirements specified in the configuration.
+        :return:
+        """
+
+        self.get_metrics()
+
+        previous_not_passing = set()
+        beginning = len(self.transcripts)
+        while True:
+            not_passing = self._check_not_passing(
+                previous_not_passing=previous_not_passing)
+            self.metrics_calculated = not (len(not_passing) > 0 and self.purge)
+            if len(not_passing) == 0:
+                return
+            self._not_passing.update(not_passing)
+            for tid in not_passing:
+                if self.purge is False:
+                    self.logger.debug("%s has been assigned a score of 0 because it fails basic requirements",
+                                      tid)
+                    self.transcripts[tid].score = 0
+                else:
+                    self.logger.debug("Excluding %s from %s because of failed requirements",
+                                      tid, self.id)
+                    self.remove_transcript_from_locus(tid)
+
+            if not self.purge:
+                assert len(self.transcripts) == beginning
+
+            if len(self.transcripts) == 0 or self.metrics_calculated is True:
+                return
+            elif self.purge and len(not_passing) > 0:
+                assert self._not_passing
+            else:
+                # Recalculate the metrics
+                self.get_metrics()
+
+    def _calculate_score(self, param):
+        """
+        Private method that calculates a score for each transcript,
+        given a target parameter.
+        :param param:
+        :return:
+        """
+
+        rescaling = self.json_conf["scoring"][param]["rescaling"]
+        use_raw = self.json_conf["scoring"][param]["use_raw"]
+
+        metrics = dict((tid, getattr(self.transcripts[tid], param)) for tid in self.transcripts)
+
+        if use_raw is True and not param.startswith("external") and getattr(Transcript, param).usable_raw is False:
+            self.logger.warning("The \"%s\" metric cannot be used as a raw score for %s, switching to False",
+                                param, self.id)
+            use_raw = False
+        if use_raw is True and rescaling == "target":
+            self.logger.warning("I cannot use a raw score for %s in %s when looking for a target. 
Switching to False", + param, self.id) + use_raw = False + + if rescaling == "target": + target = self.json_conf["scoring"][param]["value"] + denominator = max(abs(x - target) for x in metrics.values()) + else: + target = None + if use_raw is True and rescaling == "max": + denominator = 1 + elif use_raw is True and rescaling == "min": + denominator = -1 + else: + denominator = (max(metrics.values()) - min(metrics.values())) + if denominator == 0: + denominator = 1 + + scores = [] + for tid in metrics: + tid_metric = metrics[tid] + score = 0 + check = True + if ("filter" in self.json_conf["scoring"][param] and + self.json_conf["scoring"][param]["filter"] != {}): + check = self.evaluate(tid_metric, self.json_conf["scoring"][param]["filter"]) + + if check is True: + if use_raw is True: + if not isinstance(tid_metric, (float, int)) and 0 <= tid_metric <= 1: + error = ValueError( + "Only scores with values between 0 and 1 can be used raw. Please recheck your values.") + self.logger.exception(error) + raise error + score = tid_metric / denominator + elif rescaling == "target": + score = 1 - abs(tid_metric - target) / denominator + else: + if min(metrics.values()) == max(metrics.values()): + score = 1 + elif rescaling == "max": + score = abs((tid_metric - min(metrics.values())) / denominator) + elif rescaling == "min": + score = abs(1 - (tid_metric - min(metrics.values())) / denominator) + + score *= self.json_conf["scoring"][param]["multiplier"] + self.scores[tid][param] = round(score, 2) + scores.append(score) + + # This MUST be true + if "filter" not in self.json_conf["scoring"][param] and max(scores) <= 0: + self.logger.warning("All transcripts have a score of 0 for %s in %s", + param, self.id) + @classmethod @abc.abstractmethod def is_intersecting(cls, *args, **kwargs): @@ -892,6 +1120,10 @@ def json_conf(self): @json_conf.setter def json_conf(self, conf): + if conf is None or isinstance(conf, str): + conf = to_json(conf) + elif not isinstance(conf, dict): + raise TypeError("Invalid configuration!") self.__json_conf = conf def _check_json(self): diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py index cfae342b3..d7beb9023 100644 --- a/Mikado/loci/locus.py +++ b/Mikado/loci/locus.py @@ -9,7 +9,6 @@ import itertools import operator from collections import deque -from sys import version_info import pyfaidx from ..transcripts.transcript import Transcript from ..transcripts.transcriptchecker import TranscriptChecker @@ -18,10 +17,6 @@ from ..parsers.GFF import GffLine from ..scales.assigner import Assigner from ..utilities import overlap -if version_info.minor < 5: - from sortedcontainers import SortedDict -else: - from collections import OrderedDict as SortedDict class Locus(Sublocus, Abstractlocus): @@ -68,6 +63,7 @@ def __init__(self, transcript: Transcript, logger=None, json_conf=None): self.__id = None self.fai = None self.json_conf = json_conf + self.purge = self.json_conf["pick"]["clustering"]["purge"] # if verified_introns is not None: # self.locus_verified_introns = verified_introns @@ -393,25 +389,22 @@ def set_json_conf(self, jconf: dict): raise TypeError("Invalid configuration of type {0}".format(type(jconf))) self.json_conf = jconf - def get_metrics(self): - - """Quick wrapper to calculate the metrics for all the transcripts.""" - - # TODO: Find an intelligent way ot restoring this check - - if self.metrics_calculated is True: - return - - # self.logger.info("Calculating the intron tree for %s", self.id) - assert len(self._cds_introntree) == len(self.combined_cds_introns) - - 
for tid in sorted(self.transcripts): - self.calculate_metrics(tid) - - self.logger.debug("Finished to calculate the metrics for %s", self.id) - - self.metrics_calculated = True - return + # def get_metrics(self): + # + # """Quick wrapper to calculate the metrics for all the transcripts.""" + # + # if self.metrics_calculated is True: + # return + # + # assert len(self._cds_introntree) == len(self.combined_cds_introns) + # + # for tid in sorted(self.transcripts): + # self.calculate_metrics(tid) + # + # self.logger.debug("Finished to calculate the metrics for %s", self.id) + # + # self.metrics_calculated = True + # return def calculate_metrics(self, tid: str): """ @@ -463,59 +456,16 @@ def calculate_scores(self): if self.scores_calculated is True: return + self.scores = dict() self.get_metrics() + + super().calculate_scores() + if not hasattr(self, "logger"): self.logger = None self.logger.setLevel("DEBUG") self.logger.debug("Calculating scores for {0}".format(self.id)) - self.scores = dict() - for tid in self.transcripts: - self.scores[tid] = dict() - # Add the score for the transcript source - self.scores[tid]["source_score"] = self.transcripts[tid].source_score - - if self.regressor is None: - for param in self.json_conf["scoring"]: - self._calculate_score(param) - - for tid in self.scores: - self.transcripts[tid].scores = self.scores[tid].copy() - - for tid in self.transcripts: - - if tid in self.__orf_doubles: - del self.scores[tid] - continue - self.transcripts[tid].score = sum(self.scores[tid].values()) - self.scores[tid]["score"] = self.transcripts[tid].score - - else: - valid_metrics = self.regressor.metrics - metric_rows = SortedDict() - for tid, transcript in sorted(self.transcripts.items(), key=operator.itemgetter(0)): - for param in valid_metrics: - self.scores[tid][param] = "NA" - row = [] - for attr in valid_metrics: - val = getattr(transcript, attr) - if isinstance(val, bool): - if val: - val = 1 - else: - val = 0 - row.append(val) - metric_rows[tid] = row - # scores = SortedDict.fromkeys(metric_rows.keys()) - for pos, score in enumerate(self.regressor.predict(list(metric_rows.values()))): - tid = list(metric_rows.keys())[pos] - if tid in self.__orf_doubles: - del self.scores[tid] - continue - self.scores[tid]["score"] = score - self.transcripts[tid].score = score - - self.metric_lines_store = [] for row in self.prepare_metrics(): if row["tid"] in self.__orf_doubles: continue @@ -526,8 +476,8 @@ def calculate_scores(self): for partial in self.__orf_doubles[doubled]: if partial in self.transcripts: del self.transcripts[partial] - - self.scores_calculated = True + if partial in self.scores: + del self.scores[partial] def print_scores(self): """This method yields dictionary rows that are given to a csv.DictWriter class.""" @@ -542,19 +492,26 @@ def print_scores(self): row = dict().fromkeys(keys) row["tid"] = tid row["parent"] = self.id - row["score"] = round(self.scores[tid]["score"], 2) + if tid in self._not_passing: + row["score"] = 0 + else: + row["score"] = round(self.scores[tid]["score"], 2) calculate_total = (self.regressor is None) for key in score_keys: if calculate_total: assert self.scores[tid][key] != "NA" and self.scores[tid][key] is not None row[key] = round(self.scores[tid][key], 2) + if calculate_total is True: score_sum = sum(row[key] for key in score_keys) - # - assert round(score_sum, 2) == round(self.scores[tid]["score"], 2), ( - score_sum, - self.transcripts[tid].score, - tid) + if tid not in self._not_passing and self.scores[tid]["score"] > 0: + assert 
round(score_sum, 2) == round(self.scores[tid]["score"], 2), ( + score_sum, + self.transcripts[tid].score, + tid) + else: + assert self.scores[tid]["score"] == 0 + yield row def is_alternative_splicing(self, other): @@ -658,7 +615,7 @@ def _find_communities_boundaries(self, five_comm, three_comm): comm = five_comm.popleft() comm = deque(sorted(list(set.difference(set(comm), five_found)), - key=lambda tid: self[tid].start)) + key=lambda internal_tid: self[internal_tid].start)) if len(comm) == 1: continue first = comm.popleft() @@ -688,7 +645,7 @@ def _find_communities_boundaries(self, five_comm, three_comm): comm = three_comm.popleft() comm = deque(sorted(list(set.difference(set(comm), three_found)), - key=lambda tid: self[tid].end, reverse=True)) + key=lambda internal_tid: self[internal_tid].end, reverse=True)) if len(comm) == 1: continue first = comm.popleft() @@ -709,7 +666,7 @@ def _find_communities_boundaries(self, five_comm, three_comm): three_found.add(tid) else: continue - comm = deque([_ for _ in comm if _ not in three_found ]) + comm = deque([_ for _ in comm if _ not in three_found]) if comm: three_comm.appendleft(comm) @@ -889,12 +846,8 @@ def expand_transcript(transcript, new_start, new_end, fai, logger): raise AssertionError(err) logger.debug("New ORF: %s", str(orf)) new_orfs.append(orf) - # from ..utilities.log_utils import create_default_logger - # transcript.logger = create_default_logger("TEMP") - # transcript.logger.setLevel("DEBUG") transcript.load_orfs(new_orfs) - # transcript.logger.setLevel("WARNING") # Now finalize again transcript.finalize() - return transcript \ No newline at end of file + return transcript diff --git a/Mikado/loci/monosublocus.py b/Mikado/loci/monosublocus.py index d7645ff47..71c1511ec 100644 --- a/Mikado/loci/monosublocus.py +++ b/Mikado/loci/monosublocus.py @@ -32,6 +32,7 @@ def __init__(self, transcript_instance, json_conf=None, logger=None): self.score = transcript_instance.score self.tid = transcript_instance.id self.logger = logger + self.purge = self.json_conf["pick"]["clustering"]["purge"] self.attributes = dict() self.json_conf = json_conf diff --git a/Mikado/loci/monosublocusholder.py b/Mikado/loci/monosublocusholder.py index b7198c7f8..da0277a15 100644 --- a/Mikado/loci/monosublocusholder.py +++ b/Mikado/loci/monosublocusholder.py @@ -51,6 +51,7 @@ def __init__(self, monosublocus_instance: Monosublocus, json_conf=None, logger=N self.splitted = False self.metrics_calculated = False self.json_conf = json_conf + self.purge = self.json_conf["pick"]["clustering"]["purge"] self.excluded = None self.purge = self.json_conf["pick"]["clustering"]["purge"] self.feature = "MonosublocusHolder" @@ -221,7 +222,7 @@ def define_loci(self, purge=False, excluded=None): to_remove.update(clique) if purge is False or selected_transcript.score > 0: - new_locus = Locus(selected_transcript, logger=self.logger) + new_locus = Locus(selected_transcript, logger=self.logger, json_conf=self.json_conf) loci.append(new_locus) self.logger.debug("Removing {0} transcripts from {1}".format(len(to_remove), self.id)) graph.remove_nodes_from(to_remove) # Remove nodes from graph, iterate diff --git a/Mikado/loci/sublocus.py b/Mikado/loci/sublocus.py index ff109b083..b9655f378 100644 --- a/Mikado/loci/sublocus.py +++ b/Mikado/loci/sublocus.py @@ -7,19 +7,13 @@ """ import itertools -from sys import version_info -import numpy -from sklearn.ensemble import RandomForestClassifier + + from ..transcripts.transcript import Transcript from .abstractlocus import Abstractlocus from 
.excluded import Excluded from .monosublocus import Monosublocus from ..parsers.GFF import GffLine -if version_info.minor < 5: - from sortedcontainers import SortedDict -else: - from collections import OrderedDict as SortedDict -import operator # pylint: disable=too-many-instance-attributes @@ -196,6 +190,12 @@ def define_monosubloci(self, purge=False, excluded=None): self.logger.debug("Launching calculate scores for {0}".format(self.id)) self.calculate_scores() + if self._excluded_transcripts and self.purge: + self.excluded = Excluded(self._excluded_transcripts.pop()) + while self._excluded_transcripts: + self.excluded.add_transcript_to_locus(self._excluded_transcripts.pop(), + check_in_locus=False) + self.logger.debug("Defining monosubloci for {0}".format(self.id)) transcript_graph = self.define_graph(self.transcripts, @@ -254,215 +254,12 @@ def load_scores(self, scores): else: self.transcripts[tid].score = 0 - def __check_requirements(self): - """ - This private method will identify and delete all transcripts which do not pass - the minimum muster specified in the configuration. - :return: - """ - - self.get_metrics() - - previous_not_passing = set() - while True: - not_passing = self._check_not_passing( - previous_not_passing=previous_not_passing) - if len(not_passing) == 0: - return - for tid in not_passing: - self.transcripts[tid].score = 0 - if self.purge is True: - self.metrics_calculated = False - self.logger.debug("Excluding %s from %s because of failed requirements", - tid, self.id) - if self.excluded is None: - excluded = Monosublocus(self.transcripts[tid], logger=self.logger) - excluded.json_conf = self.json_conf - self.excluded = Excluded(excluded) - else: - self.excluded.add_transcript_to_locus(self.transcripts[tid]) - self.remove_transcript_from_locus(tid) - else: - self.logger.debug("%s has been assigned a score of 0 because it fails basic requirements", - self.id) - self._not_passing.add(tid) - return - - if len(self.transcripts) == 0: - return - else: - # Recalculate the metrics - self.get_metrics() - def calculate_scores(self): - """ - Function to calculate a score for each transcript, given the metrics derived - with the calculate_metrics method and the scoring scheme provided in the JSON configuration. - If any requirements have been specified, all transcripts which do not pass them - will be assigned a score of 0 and subsequently ignored. - Scores are rounded to the nearest integer. 
- """ - - if self.scores_calculated is True: - self.logger.debug("Scores calculation already effectuated for %s", - self.id) - return - - self.get_metrics() - # not_passing = set() - if not hasattr(self, "logger"): - self.logger = None - self.logger.setLevel("DEBUG") - self.logger.debug("Calculating scores for {0}".format(self.id)) - if "requirements" in self.json_conf: - self.__check_requirements() - - if len(self.transcripts) == 0: - self.logger.warning("No transcripts pass the muster for {0}".format(self.id)) - self.scores_calculated = True - return - self.scores = dict() - - for tid in self.transcripts: - self.scores[tid] = dict() - # Add the score for the transcript source - self.scores[tid]["source_score"] = self.transcripts[tid].source_score - - if self.regressor is None: - for param in self.json_conf["scoring"]: - self._calculate_score(param) - - for tid in self.scores: - self.transcripts[tid].scores = self.scores[tid].copy() - - for tid in self.transcripts: - if tid in self._not_passing: - self.logger.debug("Excluding %s as it does not pass minimum requirements", - tid) - self.transcripts[tid].score = 0 - else: - self.transcripts[tid].score = sum(self.scores[tid].values()) - if self.transcripts[tid].score == 0: - self.logger.debug("Excluding %s as it has a score of 0", tid) - - if tid in self._not_passing: - pass - else: - assert self.transcripts[tid].score == sum(self.scores[tid].values()), ( - tid, self.transcripts[tid].score, sum(self.scores[tid].values()) - ) - # if self.json_conf["pick"]["external_scores"]: - # assert any("external" in _ for _ in self.scores[tid].keys()), self.scores[tid].keys() - - self.scores[tid]["score"] = self.transcripts[tid].score - - else: - valid_metrics = self.regressor.metrics - metric_rows = SortedDict() - for tid, transcript in sorted(self.transcripts.items(), key=operator.itemgetter(0)): - for param in valid_metrics: - self.scores[tid][param] = "NA" - row = [] - for attr in valid_metrics: - val = getattr(transcript, attr) - if isinstance(val, bool): - if val: - val = 1 - else: - val = 0 - row.append(val) - # Necessary for sklearn .. - row = numpy.array(row) - # row = row.reshape(1, -1) - metric_rows[tid] = row - # scores = SortedDict.fromkeys(metric_rows.keys()) - if isinstance(self.regressor, RandomForestClassifier): - # We have to pick the second probability (correct) - for tid in metric_rows: - score = self.regressor.predict_proba(metric_rows[tid])[0][1] - self.scores[tid]["score"] = score - self.transcripts[tid].score = score - else: - pred_scores = self.regressor.predict(list(metric_rows.values())) - for pos, score in enumerate(pred_scores): - self.scores[list(metric_rows.keys())[pos]]["score"] = score - self.transcripts[list(metric_rows.keys())[pos]].score = score + super().calculate_scores() self.metric_lines_store = [_ for _ in self.prepare_metrics()] self.scores_calculated = True - def _calculate_score(self, param): - """ - Private method that calculates a score for each transcript, - given a target parameter. 
- :param param: - :return: - """ - - rescaling = self.json_conf["scoring"][param]["rescaling"] - use_raw = self.json_conf["scoring"][param]["use_raw"] - - metrics = dict((tid, getattr(self.transcripts[tid], param)) for tid in self.transcripts) - - if use_raw is True and not param.startswith("external") and getattr(Transcript, param).usable_raw is False: - self.logger.warning("The \"%s\" metric cannot be used as a raw score for %s, switching to False", - param, self.id) - use_raw = False - if use_raw is True and rescaling == "target": - self.logger.warning("I cannot use a raw score for %s in %s when looking for a target. Switching to False", - param, self.id) - use_raw = False - - if rescaling == "target": - target = self.json_conf["scoring"][param]["value"] - denominator = max(abs(x - target) for x in metrics.values()) - else: - target = None - if use_raw is True and rescaling == "max": - denominator = 1 - elif use_raw is True and rescaling == "min": - denominator = -1 - else: - denominator = (max(metrics.values()) - min(metrics.values())) - if denominator == 0: - denominator = 1 - - scores = [] - for tid in metrics: - tid_metric = metrics[tid] - score = 0 - check = True - if ("filter" in self.json_conf["scoring"][param] and - self.json_conf["scoring"][param]["filter"] != {}): - check = self.evaluate(tid_metric, self.json_conf["scoring"][param]["filter"]) - - if check is True: - if use_raw is True: - if not isinstance(tid_metric, (float, int)) and 0 <= tid_metric <= 1: - error = ValueError( - "Only scores with values between 0 and 1 can be used raw. Please recheck your values.") - self.logger.exception(error) - raise error - score = tid_metric / denominator - elif rescaling == "target": - score = 1 - abs(tid_metric - target) / denominator - else: - if min(metrics.values()) == max(metrics.values()): - score = 1 - elif rescaling == "max": - score = abs((tid_metric - min(metrics.values())) / denominator) - elif rescaling == "min": - score = abs(1 - (tid_metric - min(metrics.values())) / denominator) - - score *= self.json_conf["scoring"][param]["multiplier"] - self.scores[tid][param] = round(score, 2) - scores.append(score) - - # This MUST be true - if "filter" not in self.json_conf["scoring"][param] and max(scores) == 0: - self.logger.warning("All transcripts have a score of 0 for %s in %s", - param, self.id) - def prepare_metrics(self): """This method prepares the dictionary "rows" diff --git a/Mikado/tests/test_system_calls.py b/Mikado/tests/test_system_calls.py index 34191bd2e..6c4665c5b 100644 --- a/Mikado/tests/test_system_calls.py +++ b/Mikado/tests/test_system_calls.py @@ -642,7 +642,9 @@ def test_purging(self): with self.subTest(purging=purging): json_conf["pick"]["files"]["loci_out"] = "mikado.purging_{}.loci.gff3".format(purging) json_conf["pick"]["files"]["subloci_out"] = "mikado.purging_{}.subloci.gff3".format(purging) - json_conf["pick"]["files"]["log"] = "mikado.purging_{}.log".format(purging) + json_conf["pick"]["files"]["log"] = os.path.join( + tempfile.gettempdir(), + "mikado.purging_{}.log".format(purging)) json_conf["pick"]["clustering"]["purge"] = purging json_conf["pick"]["scoring_file"] = scoring_file.name json_conf = configurator.check_json(json_conf) From f5e68605ce7d846960cf81ac3207612293d42146 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Tue, 14 Feb 2017 17:25:53 +0000 Subject: [PATCH 43/47] Update the changelog with today's changes --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f7ba12b2..630f830a0 100644 
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,6 +29,7 @@ Changes in this release:
 - When printing out putative fragments, now Mikado will indicate the class code of the fragment, the match against which it was deemed a fragment of, and the distance of said fragment (if they are not overlapping).
 - Deprecated the "discard_definition" flag in Mikado serialise. Now Mikado will infer on its own whether to use the definition or the ID for serialising BLAST results.
 - Now AbstractLocus implementations have a private method to check the correctness of the json_conf. As a corollary, Transcript and children have been moved to their own subpackage ("transcripts") in order to break the circular dependency Mikado.loci.Abstractlocus <- Mikado.configurator <- Mikado.loci.Transcript. *Technical note*: checking the consistency of the configuration is an expensive operation, so it will be executed on demand rather than automatically.
+- The methods to calculate scores and metrics have been moved to the AbstractLocus class, so as to minimize the incidence of bugs due to code duplication and divergence.
 - Made the checks for the scoring files more robust.
 - Re-written the "find_retained_introns" method of AbstractLocus, to solve some bugs found during the utilisation of the last beta. As a corollary, expanded the intervaltree module to allow searches for "tagged" intervals.
 - Now the "monoloci_out" files contain the Monosublocus**Holder** step, not the Monosublocus step. This should help during fine-tuning.

From e2ef5808f6e5f52e3480f9c4c36e6104614cc506 Mon Sep 17 00:00:00 2001
From: Luca Venturini
Date: Wed, 15 Feb 2017 14:52:38 +0000
Subject: [PATCH 44/47] Unified the filtering of transcripts into the
 abstractlocus class. Solved a bug which caused double printing of metric
 lines.

---
 Mikado/loci/abstractlocus.py      | 60 +++++++++++++++----------------
 Mikado/loci/locus.py              | 11 +++---
 Mikado/loci/monosublocus.py       |  3 +-
 Mikado/loci/monosublocusholder.py |  2 --
 Mikado/loci/sublocus.py           |  5 +--
 Mikado/loci/superlocus.py         | 51 +------------------------
 Mikado/tests/locus_tester.py      | 47 ++++++++++++++++++++++++
 Mikado/transcripts/transcript.py  |  4 +--
 8 files changed, 89 insertions(+), 94 deletions(-)

diff --git a/Mikado/loci/abstractlocus.py b/Mikado/loci/abstractlocus.py
index ea036c544..20e96faac 100644
--- a/Mikado/loci/abstractlocus.py
+++ b/Mikado/loci/abstractlocus.py
@@ -66,10 +66,6 @@ def __init__(self, source="", verified_introns=None):
         self.scores_calculated = False
         self.scores = dict()
 
-        self.purge = self.json_conf["pick"]["clustering"]["purge"]
-
-        # self.purge = True
-
         if verified_introns is not None:
             self.locus_verified_introns = verified_introns
 
@@ -364,7 +360,7 @@ def choose_best(cls, transcripts: dict) -> str:
 
     # ###### Class instance methods #######
 
-    def add_transcript_to_locus(self, transcript, check_in_locus=True, logger=None, **kwargs):
+    def add_transcript_to_locus(self, transcript, check_in_locus=True, **kwargs):
         """
         :param transcript
         :type transcript: Mikado.loci_objects.transcript.Transcript
@@ -374,9 +370,6 @@ def add_transcript_to_locus(self, transcript, check_in_locus=True, logger=None,
         or instead whether to trust the assignment to be correct
         :type check_in_locus: bool
 
-        :param logger: the logger to use for this function.
-        :type logger: logging.Logger
-
         This method checks that a transcript is contained within the superlocus
         (using the "in_superlocus" class method) and
         upon a successful check extends the superlocus with the new transcript.
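The hunks that follow rework ``_is_exon_retained_in_transcript`` so that it queries the candidate transcript's segment tree for overlapping exons and introns. The self-contained sketch below illustrates that logic only; it is not Mikado's implementation, and it replaces the real ``segmenttree.find()`` with a plain list search over illustrative ``(start, end, kind)`` tuples::

    def overlap(first, second):
        """Length of the overlap between two (start, end) intervals."""
        return min(first[1], second[1]) - max(first[0], second[0])

    def find(features, start, end, kind, strict=False):
        """Stand-in for segmenttree.find(): return intervals of the given kind
        overlapping (strict=False) or fully contained in (strict=True) the query."""
        hits = []
        for f_start, f_end, f_kind in features:
            if f_kind != kind:
                continue
            if strict and start <= f_start and f_end <= end:
                hits.append((f_start, f_end))
            elif not strict and overlap((start, end), (f_start, f_end)) > 0:
                hits.append((f_start, f_end))
        return hits

    # A candidate with two exons separated by one intron ...
    candidate = [(101, 500, "exon"), (501, 799, "intron"), (800, 1000, "exon")]
    # ... and an exon of another transcript spanning the whole region.
    exon = (101, 1000)

    exons = find(candidate, exon[0], exon[1], "exon")
    introns = find(candidate, exon[0], exon[1], "intron", strict=True)
    # Two overlapped exons plus a fully contained intron: putative retained intron.
    print(len(exons) >= 2 and len(introns) >= 1)  # True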
@@ -562,10 +555,10 @@ def _exon_to_be_considered(exon, @staticmethod def _is_exon_retained_in_transcript(exon: tuple, frags: list, - candidate: Transcript, + segmenttree: IntervalTree, + strand: str, consider_truncated=False, - terminal=False, - logger=create_null_logger()): + terminal=False): """Private static method to verify whether a given exon is a retained intron of the candidate Transcript. :param exon: the exon to be considered. @@ -590,14 +583,12 @@ def _is_exon_retained_in_transcript(exon: tuple, :rtype: bool """ - logger.debug("Considering exon %s against candidate %s", exon, candidate.id) - found_exons = sorted( - candidate.segmenttree.find(exon[0], exon[1], strict=False, value="exon"), - reverse=(candidate.strand == "-")) + segmenttree.find(exon[0], exon[1], strict=False, value="exon"), + reverse=(strand == "-")) found_introns = sorted( - candidate.segmenttree.find(exon[0], exon[1], strict=not consider_truncated, value="intron"), - reverse=(candidate.strand == "-")) + segmenttree.find(exon[0], exon[1], strict=not consider_truncated, value="intron"), + reverse=(strand == "-")) is_retained = False @@ -606,15 +597,15 @@ def _is_exon_retained_in_transcript(exon: tuple, elif len(found_exons) == 1 and len(found_introns) == 1: found_exons = found_exons.pop() found_introns = found_introns.pop() - if candidate.strand != "-" and found_exons[1] + 1 == found_introns[0]: + if strand != "-" and found_exons[1] + 1 == found_introns[0]: is_retained = (consider_truncated and terminal) - elif candidate.strand == "-" and found_exons[0] - 1 == found_introns[1]: + elif strand == "-" and found_exons[0] - 1 == found_introns[1]: is_retained = (consider_truncated and terminal) elif len(found_exons) >= 2: # Now we have to check whether the matched introns contain both coding and non-coding parts for index, exon in enumerate(found_exons[:-1]): intron = found_introns[index] - if candidate.strand == "-": + if strand == "-": assert intron[1] == exon[0] - 1 else: assert exon[1] == intron[0] - 1 @@ -629,7 +620,7 @@ def _is_exon_retained_in_transcript(exon: tuple, elif overlap(frag, exon) == exon[1] - exon[0]: is_retained = True - logger.debug("%s in %s %s a retained intron", exon, candidate.id, "is" if is_retained is True else "is not") + # logger.debug("%s in %s %s a retained intron", exon, candidate.id, "is" if is_retained is True else "is not") return is_retained def find_retained_introns(self, transcript: Transcript): @@ -687,20 +678,21 @@ def find_retained_introns(self, transcript: Transcript): else: terminal = False - for candidate in (_ for _ in self.transcripts.values()): + for tid, candidate in (_ for _ in self.transcripts.items() if _[0] != transcript.id): + segmenttree = candidate.segmenttree + strand = candidate.strand if candidate == transcript: continue - elif candidate.strand != transcript.strand and None not in (transcript.strand, candidate.strand): + elif strand != transcript.strand and None not in (transcript.strand, strand): continue self.logger.debug("Checking %s in %s against %s", exon, transcript.id, candidate.id) is_retained = self._is_exon_retained_in_transcript(exon, frags, - # transcript, - candidate, + strand=strand, + segmenttree=segmenttree, terminal=terminal, - consider_truncated=consider_truncated, - logger=self.logger) + consider_truncated=consider_truncated) if is_retained: self.logger.debug("Exon %s of %s is a retained intron of %s", exon, transcript.id, candidate.id) @@ -909,7 +901,7 @@ def calculate_scores(self): self.logger.setLevel("DEBUG") self.logger.debug("Calculating 
scores for {0}".format(self.id)) if "requirements" in self.json_conf: - self.__check_requirements() + self._check_requirements() if len(self.transcripts) == 0: self.logger.warning("No transcripts pass the muster for {0}".format(self.id)) @@ -986,7 +978,7 @@ def calculate_scores(self): self.scores_calculated = True - def __check_requirements(self): + def _check_requirements(self): """ This private method will identify and delete all transcripts which do not pass the minimum muster specified in the configuration. @@ -1000,9 +992,11 @@ def __check_requirements(self): while True: not_passing = self._check_not_passing( previous_not_passing=previous_not_passing) - self.metrics_calculated = not (len(not_passing) > 0 and self.purge) + if len(not_passing) == 0: + self.metrics_calculated = True return + self.metrics_calculated = not ((len(not_passing) > 0) and self.purge) self._not_passing.update(not_passing) for tid in not_passing: if self.purge in (False,): @@ -1284,3 +1278,9 @@ def locus_verified_introns(self, *args): type(args[0])) self.__locus_verified_introns = args[0] + + @property + def purge(self): + """This property relates to pick/clustering/purge.""" + + return self.json_conf.get("pick", dict()).get("clustering", {}).get("purge", True) \ No newline at end of file diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py index d7beb9023..499fbfeb2 100644 --- a/Mikado/loci/locus.py +++ b/Mikado/loci/locus.py @@ -63,7 +63,6 @@ def __init__(self, transcript: Transcript, logger=None, json_conf=None): self.__id = None self.fai = None self.json_conf = json_conf - self.purge = self.json_conf["pick"]["clustering"]["purge"] # if verified_introns is not None: # self.locus_verified_introns = verified_introns @@ -457,8 +456,8 @@ def calculate_scores(self): return self.scores = dict() + self.metric_lines_store = [] self.get_metrics() - super().calculate_scores() if not hasattr(self, "logger"): @@ -466,11 +465,11 @@ def calculate_scores(self): self.logger.setLevel("DEBUG") self.logger.debug("Calculating scores for {0}".format(self.id)) - for row in self.prepare_metrics(): - if row["tid"] in self.__orf_doubles: - continue + for index, item in enumerate(reversed(self.metric_lines_store)): + if item["tid"] in self.__orf_doubles: + del self.metric_lines_store[index] else: - self.metric_lines_store.append(row) + continue for doubled in self.__orf_doubles: for partial in self.__orf_doubles[doubled]: diff --git a/Mikado/loci/monosublocus.py b/Mikado/loci/monosublocus.py index 71c1511ec..41c67ebe9 100644 --- a/Mikado/loci/monosublocus.py +++ b/Mikado/loci/monosublocus.py @@ -32,7 +32,6 @@ def __init__(self, transcript_instance, json_conf=None, logger=None): self.score = transcript_instance.score self.tid = transcript_instance.id self.logger = logger - self.purge = self.json_conf["pick"]["clustering"]["purge"] self.attributes = dict() self.json_conf = json_conf @@ -47,7 +46,7 @@ def __str__(self, print_cds=True, source_in_name=True): # ########## Class instance methods ############## - def add_transcript_to_locus(self, transcript, check_in_locus=False): + def add_transcript_to_locus(self, transcript, check_in_locus=False, **kwargs): """For this basic class, this method raises a NotImplementedError - as this container should hold only one transcript. 
diff --git a/Mikado/loci/monosublocusholder.py b/Mikado/loci/monosublocusholder.py index da0277a15..773f6b0e7 100644 --- a/Mikado/loci/monosublocusholder.py +++ b/Mikado/loci/monosublocusholder.py @@ -51,9 +51,7 @@ def __init__(self, monosublocus_instance: Monosublocus, json_conf=None, logger=N self.splitted = False self.metrics_calculated = False self.json_conf = json_conf - self.purge = self.json_conf["pick"]["clustering"]["purge"] self.excluded = None - self.purge = self.json_conf["pick"]["clustering"]["purge"] self.feature = "MonosublocusHolder" self.score = monosublocus_instance.score self.scores_calculated = False diff --git a/Mikado/loci/sublocus.py b/Mikado/loci/sublocus.py index b9655f378..92ea0ea42 100644 --- a/Mikado/loci/sublocus.py +++ b/Mikado/loci/sublocus.py @@ -56,7 +56,6 @@ def __init__(self, span, json_conf=None, logger=None, verified_introns=None): self.fixed_size = True if span.feature == "sublocus" else False if span.__name__ == "transcript": span.finalize() - self.purge = self.json_conf["pick"]["clustering"]["purge"] self.source = self.json_conf["pick"]["output_format"]["source"] self.excluded = None @@ -223,7 +222,9 @@ def define_monosubloci(self, purge=False, excluded=None): )) to_remove.update(clique) if purge is False or selected_transcript.score > 0: - new_locus = Monosublocus(selected_transcript, logger=self.logger) + new_locus = Monosublocus(selected_transcript, + logger=self.logger, + json_conf=self.json_conf) new_locus.json_conf = self.json_conf self.monosubloci.append(new_locus) if len(to_remove) < 1: diff --git a/Mikado/loci/superlocus.py b/Mikado/loci/superlocus.py index e47407d85..b08efdca1 100644 --- a/Mikado/loci/superlocus.py +++ b/Mikado/loci/superlocus.py @@ -119,7 +119,6 @@ def __init__(self, raise NoJsonConfigError("I am missing the configuration for prioritizing transcripts!") self.__regressor = None self.json_conf = json_conf - self.purge = self.json_conf["pick"]["clustering"]["purge"] self.splices = set(self.splices) self.introns = set(self.introns) @@ -673,54 +672,6 @@ def load_all_transcript_data(self, engine=None, data_dict=None): # ##### Sublocus-related steps ###### - def __prefilter_transcripts(self): - - """Private method that will check whether there are any transcripts - not meeting the minimum requirements specified in the configuration. 
- :return: - """ - - self.excluded_transcripts = None - - not_passing = self._check_not_passing() - - if not not_passing: - self.logger.debug("No transcripts to be excluded for %s", self.id) - return - else: - self.logger.debug("""%d transcript%s do not pass the requirements for %s; -expression: %s""", - len(not_passing), - "" if len(not_passing) == 1 else "s", - self.id, - self.json_conf["requirements"]["expression"]) - - if self.purge is True: - self.logger.debug("Purging %d transcript%s from %s", - len(not_passing), - "" if len(not_passing) == 1 else "s", - self.id) - tid = not_passing.pop() - self.transcripts[tid].score = 0 - monosub = Monosublocus(self.transcripts[tid], logger=self.logger) - self.excluded_transcripts = Excluded(monosub, - json_conf=self.json_conf, - logger=self.logger) - self.excluded_transcripts.__name__ = "Excluded" - self.remove_transcript_from_locus(tid) - for tid in not_passing: - self.transcripts[tid].score = 0 - self.excluded_transcripts.add_transcript_to_locus( - self.transcripts[tid]) - self.remove_transcript_from_locus(tid) - else: - self.logger.debug("Keeping %d transcript%s in excluded loci from %s", - len(not_passing), - "" if len(not_passing) == 1 else "s", - self.id) - - return - def __reduce_complex_loci(self, transcript_graph): """ @@ -887,7 +838,7 @@ def define_subloci(self): self.subloci = [] # Check whether there is something to remove - self.__prefilter_transcripts() + self._check_requirements() if len(self.transcripts) == 0: # we have removed all transcripts from the Locus. Set the flag to True and exit. diff --git a/Mikado/tests/locus_tester.py b/Mikado/tests/locus_tester.py index 4b785181c..1d8f178a1 100644 --- a/Mikado/tests/locus_tester.py +++ b/Mikado/tests/locus_tester.py @@ -18,6 +18,7 @@ import Mikado.loci import pickle import inspect +from Mikado.parsers.bed12 import BED12 class OverlapTester(unittest.TestCase): @@ -931,6 +932,52 @@ def test_serialisation(self): locus = obj(candidate, json_conf=json_conf) pickle.dumps(locus) + def test_double_orf(self): + + t = Transcript() + t.add_exons([(101, 1000), (1101, 1200), (2001, 2900)]) + t.id = "t1" + t.strand = "+" + + orf1 = BED12() + orf1.transcriptomic = True + orf1.chrom = t.id + orf1.start = 1 + orf1.end = sum([_[1] - _[0] + 1 for _ in t.exons]) + orf1.strand = "+" + orf1.name = "t1.orf1" + orf1.block_sizes = (900,) + orf1.thick_start = 1 + orf1.thick_end = 900 + orf1.block_starts = (1,) + orf1.block_count = 1 + + orf2 = BED12() + orf2.transcriptomic = True + orf2.strand = "+" + orf2.chrom = t.id + orf2.start = 1 + orf2.end = sum([_[1] - _[0] + 1 for _ in t.exons]) + orf2.name = "t1.orf2" + orf2.block_sizes = (900,) + orf2.thick_start = 1001 + orf2.thick_end = 1900 + orf2.block_starts = (1,) + orf2.block_count = 1 + + self.assertFalse(orf1.invalid) + self.assertFalse(orf2.invalid) + + t.load_orfs([orf1, orf2]) + self.assertEqual(t.number_internal_orfs, 2) + + locus = Locus(t) + locus.calculate_scores() + self.assertTrue(list(locus.scores.keys()), [t.id]) + rows = list(locus.print_scores()) + self.assertEqual(len(rows), 1, rows) + self.assertEqual(rows[0]["tid"], t.id, rows[0]) + class RetainedIntronTester(unittest.TestCase): diff --git a/Mikado/transcripts/transcript.py b/Mikado/transcripts/transcript.py index 89d5bcf42..ae7c3f2b6 100644 --- a/Mikado/transcripts/transcript.py +++ b/Mikado/transcripts/transcript.py @@ -411,8 +411,8 @@ def __eq__(self, other) -> bool: if not isinstance(self, type(other)): return False - self.finalize() - other.finalize() + # self.finalize() + # 
other.finalize() if self.strand == other.strand and self.chrom == other.chrom: if other.start == self.start: From 4266e77e736d3368bfd617cb14d2769e0cd225d4 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 15 Feb 2017 14:59:51 +0000 Subject: [PATCH 45/47] Fixed a bug in _is_retained_exon --- Mikado/loci/abstractlocus.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/Mikado/loci/abstractlocus.py b/Mikado/loci/abstractlocus.py index 20e96faac..637c6908d 100644 --- a/Mikado/loci/abstractlocus.py +++ b/Mikado/loci/abstractlocus.py @@ -555,8 +555,7 @@ def _exon_to_be_considered(exon, @staticmethod def _is_exon_retained_in_transcript(exon: tuple, frags: list, - segmenttree: IntervalTree, - strand: str, + candidate: Transcript, consider_truncated=False, terminal=False): @@ -583,11 +582,12 @@ def _is_exon_retained_in_transcript(exon: tuple, :rtype: bool """ + strand = candidate.strand found_exons = sorted( - segmenttree.find(exon[0], exon[1], strict=False, value="exon"), + candidate.segmenttree.find(exon[0], exon[1], strict=False, value="exon"), reverse=(strand == "-")) found_introns = sorted( - segmenttree.find(exon[0], exon[1], strict=not consider_truncated, value="intron"), + candidate.segmenttree.find(exon[0], exon[1], strict=not consider_truncated, value="intron"), reverse=(strand == "-")) is_retained = False @@ -679,18 +679,13 @@ def find_retained_introns(self, transcript: Transcript): terminal = False for tid, candidate in (_ for _ in self.transcripts.items() if _[0] != transcript.id): - segmenttree = candidate.segmenttree - strand = candidate.strand - if candidate == transcript: - continue - elif strand != transcript.strand and None not in (transcript.strand, strand): + if candidate.strand != transcript.strand and None not in (transcript.strand, candidate.strand): continue self.logger.debug("Checking %s in %s against %s", exon, transcript.id, candidate.id) is_retained = self._is_exon_retained_in_transcript(exon, frags, - strand=strand, - segmenttree=segmenttree, + candidate, terminal=terminal, consider_truncated=consider_truncated) if is_retained: From c52357db8b9533ee6d68a2eca255e93ef3575a1c Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 15 Feb 2017 15:39:11 +0000 Subject: [PATCH 46/47] Now score is a read-only property for loci classes. --- Mikado/loci/abstractlocus.py | 8 ++++++++ Mikado/loci/locus.py | 12 ------------ Mikado/loci/monosublocus.py | 2 -- Mikado/loci/monosublocusholder.py | 11 ++--------- Mikado/loci/sublocus.py | 2 +- 5 files changed, 11 insertions(+), 24 deletions(-) diff --git a/Mikado/loci/abstractlocus.py b/Mikado/loci/abstractlocus.py index 637c6908d..f786e2b34 100644 --- a/Mikado/loci/abstractlocus.py +++ b/Mikado/loci/abstractlocus.py @@ -1229,6 +1229,14 @@ def source(self, value): assert isinstance(value, str) self.__source = value + @property + def score(self): + + if len(self.transcripts): + return max(_.score for _ in self.transcripts.values()) + else: + return None + @property def _cds_introntree(self): diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py index 499fbfeb2..c56cf0f2a 100644 --- a/Mikado/loci/locus.py +++ b/Mikado/loci/locus.py @@ -46,7 +46,6 @@ def __init__(self, transcript: Transcript, logger=None, json_conf=None): self.locus_verified_introns = transcript.verified_introns self.metrics_calculated = False self.scores_calculated = False - self.score = transcript.score # A set of the transcript we will ignore during printing # because they are duplications of the original instance. 
Done solely to # get the metrics right. @@ -452,19 +451,8 @@ def calculate_scores(self): Scores are rounded to the nearest integer. """ - if self.scores_calculated is True: - return - - self.scores = dict() - self.metric_lines_store = [] - self.get_metrics() super().calculate_scores() - if not hasattr(self, "logger"): - self.logger = None - self.logger.setLevel("DEBUG") - self.logger.debug("Calculating scores for {0}".format(self.id)) - for index, item in enumerate(reversed(self.metric_lines_store)): if item["tid"] in self.__orf_doubles: del self.metric_lines_store[index] diff --git a/Mikado/loci/monosublocus.py b/Mikado/loci/monosublocus.py index 41c67ebe9..98cea7eba 100644 --- a/Mikado/loci/monosublocus.py +++ b/Mikado/loci/monosublocus.py @@ -26,10 +26,8 @@ def __init__(self, transcript_instance, json_conf=None, logger=None): # this must be defined straight away self.monoexonic = transcript_instance.monoexonic Abstractlocus.add_transcript_to_locus(self, transcript_instance) - self.score = transcript_instance.score self.feature = "Monosublocus" self.parent = None - self.score = transcript_instance.score self.tid = transcript_instance.id self.logger = logger self.attributes = dict() diff --git a/Mikado/loci/monosublocusholder.py b/Mikado/loci/monosublocusholder.py index 773f6b0e7..00c353b6d 100644 --- a/Mikado/loci/monosublocusholder.py +++ b/Mikado/loci/monosublocusholder.py @@ -53,7 +53,6 @@ def __init__(self, monosublocus_instance: Monosublocus, json_conf=None, logger=N self.json_conf = json_conf self.excluded = None self.feature = "MonosublocusHolder" - self.score = monosublocus_instance.score self.scores_calculated = False # Add the transcript to the Locus self.locus_verified_introns = set() @@ -137,7 +136,6 @@ def __str__(self, print_cds=False, source_in_name=True): for attr in ["chrom", 'feature', 'source', 'start', 'end', 'strand']: setattr(self_line, attr, getattr(self, attr)) self.calculate_scores() - self.score = max([_.score for _ in self.transcripts.values()]) self_line.phase, self_line.score = None, self.score if source_in_name is True: @@ -328,13 +326,8 @@ def _transcripts_are_intersecting(cls, :param other: :type other: Transcript - :param cds_only: boolean flag. If set to True, only - the CDS component of the transcripts will be considered to determine - whether they are intersecting or not. - :type cds_only: bool - - :param min_cdna_overlap: float. This is the minimum cDNA overlap for two transcripts to be considered as intersecting, - even when all other conditions fail. + :param min_cdna_overlap: float. This is the minimum cDNA overlap for + two transcripts to be considered as intersecting, even when all other conditions fail. :type min_cdna_overlap: float :param min_cds_overlap: float. This is the minimum CDS overlap for two transcripts to be considered as intersecting, diff --git a/Mikado/loci/sublocus.py b/Mikado/loci/sublocus.py index 92ea0ea42..0f87510ba 100644 --- a/Mikado/loci/sublocus.py +++ b/Mikado/loci/sublocus.py @@ -257,9 +257,9 @@ def load_scores(self, scores): def calculate_scores(self): + self.metric_lines_store = [] super().calculate_scores() self.metric_lines_store = [_ for _ in self.prepare_metrics()] - self.scores_calculated = True def prepare_metrics(self): From 4612973f49afaf0cda5d8bfea5d8a46ab5e880cd Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 15 Feb 2017 18:26:14 +0000 Subject: [PATCH 47/47] Updated the documentation. Ready for version 1 beta 10. 
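Among the documentation updates, the patch below also corrects the ``within``/``not within`` operators of ``AbstractLocus.evaluate`` (see the hunk further down), making both bounds inclusive regardless of the order in which they are given. A sketch of the intended semantics, not the patched expression itself::

    def within(param, bounds):
        """Inclusive range check: True if param lies between the two bounds."""
        low, high = sorted(bounds)
        return param in range(low, high + 1)

    print(within(5, [1, 5]))  # True: the upper bound is now inclusive
    print(within(3, [5, 1]))  # True: the order of the bounds does not matter
    print(within(0, [1, 5]))  # False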
--- MANIFEST.in | 1 + .../configuration_blueprint.json | 1 - Mikado/loci/abstractlocus.py | 4 +- Mikado/tests/assigner_tester.py | 84 ++--- docs/Algorithms.rst | 29 +- docs/Installation.rst | 37 +-- docs/Library/Mikado.loci.rst | 31 -- .../Mikado.loci.transcript_methods.rst | 46 --- docs/Library/Mikado.rst | 1 + docs/Library/Mikado.scales.rst | 8 + docs/Library/Mikado.subprograms.util.rst | 8 + docs/Library/Mikado.tests.rst | 16 - docs/Library/hpc.yaml | 1 - docs/Library/mikado.snakefile | 1 - docs/Library/tr.snakefile | 1 - docs/Usage/Compare.rst | 297 +++++++++--------- docs/Usage/Configure.rst | 237 +++++++++----- docs/Usage/Utilities.rst | 136 +++++++- 18 files changed, 524 insertions(+), 415 deletions(-) delete mode 100644 docs/Library/Mikado.loci.transcript_methods.rst delete mode 120000 docs/Library/hpc.yaml delete mode 120000 docs/Library/mikado.snakefile delete mode 120000 docs/Library/tr.snakefile diff --git a/MANIFEST.in b/MANIFEST.in index e06f37f7e..bad09e307 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -7,6 +7,7 @@ recursive-include . *py recursive-include Mikado *pyx recursive-include Mikado *pxd recursive-include util *py +exclude util/analyse_metrics.py util/create_model.py util/self_training.py recursive-include Mikado *snakefile *json *yaml recursive-include Mikado/tests * recursive-exclude docs/ * \ No newline at end of file diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json index 45634f54d..dd9644076 100644 --- a/Mikado/configuration/configuration_blueprint.json +++ b/Mikado/configuration/configuration_blueprint.json @@ -186,7 +186,6 @@ "maximum": 1, "default": 0.1}, "max_target_seqs": {"type": "integer", "default": 100000, "minimum": 1}, - "discard_definition": {"type": "boolean", "default": false}, "force": {"type": "boolean", "default": false}, "single_thread": {"type": "boolean", "default": false}, "procs": {"type": "integer", "default": 1, "minimum": 1} diff --git a/Mikado/loci/abstractlocus.py b/Mikado/loci/abstractlocus.py index f786e2b34..d2da98a2e 100644 --- a/Mikado/loci/abstractlocus.py +++ b/Mikado/loci/abstractlocus.py @@ -244,9 +244,9 @@ def evaluate(param: str, conf: dict) -> bool: elif conf["operator"] == "not in": comparison = (param not in conf["value"]) elif conf["operator"] == "within": - comparison = (param in range(conf["value"][0], conf["value"][1])) + comparison = (param in range(*sorted([conf["value"][0], conf["value"][1] + 1]))) elif conf["operator"] == "not within": - comparison = (param not in range(conf["value"][0], conf["value"][1])) + comparison = (param not in range(*sorted([conf["value"][0], conf["value"][1] + 1]))) else: raise ValueError("Unknown operator: {0}".format(conf["operator"])) return comparison diff --git a/Mikado/tests/assigner_tester.py b/Mikado/tests/assigner_tester.py index a4910480a..1c390cbd5 100644 --- a/Mikado/tests/assigner_tester.py +++ b/Mikado/tests/assigner_tester.py @@ -402,8 +402,8 @@ def test_alternative(self): def test_mono_intronic(self): """ - R |=====|-------------------|=========| - P |=======| + R |xxxxx|-------------------|xxxxxxxxx| + P |xxxxxxx| Expected class code: i """ @@ -446,13 +446,13 @@ def test_mono_intronic(self): def test_multi_intronic(self): """ - R |=====|-------------------|=========| - P |===|----|====| + R |xxxxx|-------------------|xxxxxxxxx| + P |xxx|----|xxxx| OR - R |=====|----------|=========|---------|=========| - P |===|----------------|====| + R |xxxxx|----------|xxxxxxxxx|---------|xxxxxxxxx| + P 
|xxx|----------------|xxxx| Expected class code: I """ @@ -521,8 +521,8 @@ def test_multi_intronic(self): def test_overlap(self): """ - R |=====|-------|=====| - P |=====|---|====| + R |xxxxx|-------|xxxxx| + P |xxxxx|---|xxxx| No junction in common @@ -568,8 +568,8 @@ def test_ccode_e(self): """Case: - R ---|=====|-------|======|--- - P |======| + R ---|xxxxx|-------|xxxxxx|--- + P |xxxxxx| Exonic and intronic overlap @@ -615,8 +615,8 @@ def test_not_ccode_e(self): """Case: - R ---|=====|-------|======|--- - P |======| + R ---|xxxxx|-------|xxxxxx|--- + P |xxxxxx| Exonic overlap only @@ -660,8 +660,8 @@ def test_not_ccode_e(self): def test_left_extension(self): """ - R |=======|-------|=====| - P |=====|-------|====|-------|====| + R |xxxxxxx|-------|xxxxx| + P |xxxxx|-------|xxxx|-------|xxxx| Expected ccode: j @@ -696,8 +696,8 @@ def test_left_extension(self): def test_left_extension_n(self): """ - R |=======|-------|=====| - P |=====|-------|=====|-------|====| + R |xxxxxxx|-------|xxxxx| + P |xxxxx|-------|xxxxx|-------|xxxx| Expected ccode: j @@ -731,8 +731,8 @@ def test_left_extension_n(self): def test_right_extension(self): """ - R |=======|-------|=====| - P |=====|-------|====|-------|====| + R |xxxxxxx|-------|xxxxx| + P |xxxxx|-------|xxxx|-------|xxxx| Expected ccode: j """ @@ -765,8 +765,8 @@ def test_right_extension(self): def test_left_right_extension(self): """ - R |=======|-------|=====| - P |=====|-------|====|-------|====|------|=====| + R |xxxxxxx|-------|xxxxx| + P |xxxxx|-------|xxxx|-------|xxxx|------|xxxxx| Expected ccode: j """ @@ -799,8 +799,8 @@ def test_left_right_extension(self): def test_left_right_extension_novel(self): """ - R |====|-------|====| - P |=====|----|=======|-------|=====|------|=====| + R |xxxx|-------|xxxx| + P |xxxxx|----|xxxxxxx|-------|xxxxx|------|xxxxx| Expected ccode: n """ @@ -834,8 +834,8 @@ def test_left_right_extension_novel(self): def test_internal_extension(self): """ - R |========|-----------------|========| - P |=====|----|======|-----|=====| + R |xxxxxxxx|-----------------|xxxxxxxx| + P |xxxxx|----|xxxxxx|-----|xxxxx| Expected ccode: j, junction recall: 100% """ @@ -869,8 +869,8 @@ def test_internal_extension(self): def test_internal_external_extension(self): """ - R |======|-----------------|========| - P |=====|----|======|-----|=====|------|=======| + R |xxxxxx|-----------------|xxxxxxxx| + P |xxxxx|----|xxxxxx|-----|xxxxx|------|xxxxxxx| Expected ccode: j, junction recall: 100% """ @@ -903,8 +903,8 @@ def test_internal_external_extension(self): def test_left_right_internal_extension_novel(self): """ - R |====|-------|====| - P |=====|----|=======|--|=|--|=====|------|=====| + R |xxxx|-------|xxxx| + P |xxxxx|----|xxxxxxx|--|x|--|xxxxx|------|xxxxx| Expected ccode: j """ @@ -940,8 +940,8 @@ def test_left_right_internal_extension_novel(self): def test_contained_bleeding(self): """ - R |=======|--------|======|----|====|--------|=======| - P |==========|----|======| + R |xxxxxxx|--------|xxxxxx|----|xxxx|--------|xxxxxxx| + P |xxxxxxxxxx|----|xxxxxx| Expected class code: C :return: @@ -972,8 +972,8 @@ def test_contained_bleeding(self): def test_contained_alternative(self): """ - R |=======|--------|======|----|====|--------|=======| - P |=================|----|======| + R |xxxxxxx|--------|xxxxxx|----|xxxx|--------|xxxxxxx| + P |xxxxxxxxxxxxxxxxx|----|xxxxxx| Expected class code: C :return: @@ -1129,8 +1129,8 @@ def test_false_fusion(self): def test_h_case(self): """ - ===============-------------------============ - 
===================-----------------========= + |xxxxxxxxxxxxxxx|-------------------|xxxxxxxxxxxx| + |xxxxxxxxxxxxxxxxxxx|-----------------|xxxxxxxxx| :return: """ @@ -1160,8 +1160,8 @@ def test_h_case(self): def test_double_h_case(self): """ - ===============---------=======-------============ - ===================-----------------========= + |xxxxxxxxxxxxxxx|---------|xxxxxxx|-------|xxxxxxxxxxxx| + |xxxxxxxxxxxxxxxxxxx|-----------------|xxxxxxxxx| :return: """ @@ -1191,8 +1191,8 @@ def test_double_h_case(self): def test_non_h_case(self): """ - ===============-------------------============ - =====-----========= + |xxxxxxxxxxxxxxx|-------------------|xxxxxxxxxxxx| + |xxxxx|-----|xxxxxxxxx| :return: """ @@ -1223,8 +1223,8 @@ def test_J_and_C_case(self): """ - 1 =========------------=======-----------========-------------============= - 2 =============-----------============ + 1 xxxxxxxxx------------xxxxxxx-----------xxxxxxxx-------------xxxxxxxxxxxxx + 2 xxxxxxxxxxxxx-----------xxxxxxxxxxxx We do expect the comparison to be: @@ -1266,8 +1266,8 @@ def test_J_and_C_case_in_exon(self): """ - 1 =========------------=======-----------========-------------============= - 2 =====================-----------============ + 1 xxxxxxxxx------------xxxxxxx-----------xxxxxxxx-------------xxxxxxxxxxxxx + 2 xxxxxxxxxxxxxxxxxxxxx-----------xxxxxxxxxxxx We do expect the comparison to be: diff --git a/docs/Algorithms.rst b/docs/Algorithms.rst index 3a9e6009f..6d3ab8541 100644 --- a/docs/Algorithms.rst +++ b/docs/Algorithms.rst @@ -20,6 +20,8 @@ Transcripts are scored and selected according to user-defined rules, based on ma The detection and analysis of a locus proceeds as follows: +.. _superloci: + #. When the first transcript is detected, Mikado will create a *superlocus* - a container of transcripts sharing the same genomic location - and assign the transcript to it. #. While traversing the genome, as long as any new transcript is within the maximum allowed flanking distance, it will be added to the superlocus. #. When the last transcript is added, Mikado performs the following preliminary operations: @@ -30,6 +32,7 @@ The detection and analysis of a locus proceeds as follows: * share the same strand * have at least 1bp overlap #. Analyse each of these novel "stranded" superloci separately. +.. _subloci: #. Create *subloci*, ie group transcripts so to minimize the probability of mistakenly merging multiple gene loci due to chimeras. These groups are defined as follows: * if the transcripts are multiexonic, they must share at least one intron, inclusive of the borders * if the transcripts are monoexonic, they must overlap by at least 1bp. @@ -39,18 +42,21 @@ The detection and analysis of a locus proceeds as follows: #. Select as winner the transcript with the highest score and assign it to a *monosublocus* #. Discard any transcript which is overlapping with it, according to the definitions in the point above #. Repeat the procedure from point 2 until no transcript remains in the sublocus -#. *Monosubloci* are gathered together into *monosubloci holders*, ie the seeds for the gene loci. Monosubloci holder have more lenient parameters to group transcripts, as the first phase should have already discarded most chimeras. Once a holder is created by a single *monosublocus*, any subsequent candidate *monosublocus* will be integrated only if the following conditions are satisfied: +.. _monosubloci: +#. *Monosubloci* are gathered together into *monosubloci holders*, ie the seeds for the gene loci. 
Monosubloci holders have more lenient parameters to group transcripts, as the first phase should have already discarded most chimeras. Once a holder is created by a single *monosublocus*, any subsequent candidate *monosublocus* will be integrated only if the following conditions are satisfied: * if the candidate is monoexonic, its exon must overlap at least one exon of a transcript already present in the holder * if the candidate is multiexonic and the holder contains only monoexonic transcripts, apply the same criterion, ie check whether its exons overlap the exons of at least one of the transcripts already present * if the candidate is multiexonic and the holder contains multiexonic transcripts, check whether one of the following conditions is satisfied: * at least one intron of the candidate overlaps with an intron of a transcript in the holder * at least one intron of the candidate is completely contained within an exon of a transcript in the holder * at least one intron of a transcript in the holder is completely contained within an exon of a transcript in the holder. + * the cDNA overlap and CDS overlap between the candidate and the transcript in the holder are over a :ref:`specified threshold `. Optionally, it is possible to tell Mikado to use a simpler algorithm, and integrate together all transcripts that share exon space. Such a simpler algorithm risks, however, chaining together multiple loci - especially in small, compact genomes. #. Once the holders are created, apply the same scoring and selection procedure of the sublocus selection step. The winning transcripts are assigned to the final *loci*. These are called the *primary transcripts of the loci*. #. Once the loci are created, track back to the original transcripts of the superlocus: #. discard any transcript overlapping more than one locus, as these are probably chimeras. #. For those transcripts that are overlapping to a single locus, verify that they are valid alternative splicing events using the :ref:`class code ` of the comparison against the primary transcript. Transcripts are re-scored dynamically when they are re-added in this fashion, to ensure their quality when compared with the primary transcript. +.. _fragments: #. Finally detect and either tag or discard fragments inside the initial *superlocus* (irrespective of strand): #. Check whether the primary transcript of any locus meets the criteria to be defined as a fragment (by default, maximum ORF of 30AA and maximum 2 exons - any transcript exceeding either criterion will be considered as non-fragment by default) #. If so, verify whether they are near enough any valid locus to be considered as a fragment (in general, class codes which constitute the "Intronic", "Fragmentary" and "No overlap" categories). @@ -93,9 +99,11 @@ Not all the available metrics will be necessarily used for scoring; the choice o Scoring files ~~~~~~~~~~~~~ -Mikado employs user-defined configuration files to define the desirable features in genes. These files are in either YAML or JSON format (default YAML) and are composed of two sections: +Mikado employs user-defined configuration files to define the desirable features in genes. These files are in either YAML or JSON format (default YAML) and are composed of four sections: #. a *requirements* section, specifying the minimum requirements that a transcript must satisfy to be considered as valid. **Any transcript failing these requirements will be scored at 0 and purged.** + #. 
a *not_fragmentary* section, specifying the minimum requirements that the primary transcript of a locus has to satisfy in order for the locus **not** to be considered as a putative fragment.
+ #. an *as_requirements* section, which specifies the minimum requirements for transcripts for them to be considered as possible valid alternative splicing events.
 #. a *scoring* section, specifying which features Mikado should look for in transcripts, and how each of them will be weighted.
 
 Conditions are specified using a strict set of :ref:`available operators ` and the values they have to consider.
 
@@ -117,15 +125,22 @@ Mikado allows the following operators to express a relationship inside the scori
 * *ge*: greater or equal than (:math:`\ge`). Valid for comparisons with numbers.
 * *in*: member of (:math:`\in`). Valid for comparisons with arrays or sets.
 * *not in*: not member of (:math:`\notin`). Valid for comparisons with arrays or sets.
+* *within*: value comprised in the range of the two values, inclusive.
+* *not within*: value *not* comprised in the range of the two values, inclusive.
 
 Mikado will fail if an operator not present on this list is specified, or if the operator is assigned to compare against the wrong data type (eg. *eq* with an array).
 
 .. _requirements-section:
 
-The requirements section
-------------------------
+The "requirements", "as_requirements" and "not_fragmentary" sections
+--------------------------------------------------------------------
+
+These sections specify the minimum requirements for a transcript at various stages.
+
+* A transcript failing to pass the *requirements* check will be discarded outright (if "purge" is selected) or given a score of 0 otherwise.
+* If a transcript has not been selected as the primary transcript of a locus, it has to pass the *as_requirements* check to be considered as a valid alternative splicing event.
+* Finally, after loci have been defined, the primary transcripts of loci that do not pass the *not_fragmentary* section mark their loci to be compared against neighbouring loci which have passed this same check.
 
-This section specifies the minimum requirements for a transcript. As transcripts failing to pass these checks will be discarded outright, **it is strongly advised to use lenient parameters in this section**. Being too stringent might end up removing valid models and potentially missing valid loci outright. Typically, transcripts filtered at this step should be obvious fragments, eg monoexonic transcripts produced by RNA-Seq with a total length lower than the *library* fragment length.
+**It is strongly advised to use lenient parameters in the requirements section**, as failing to do so might result in discarding whole loci. Typically, transcripts filtered at this step should be obvious fragments, eg monoexonic transcripts produced by RNA-Seq with a total length lower than the *library* fragment length.
 
 This section is composed of two parts:
 
 * *parameters*: a list of the metrics to be considered. Each metric can be considered multiple times, by suffixing it with a "." construct (eg cdna_length.*mono* vs. cdna_length.*multi* to distinguish two uses of the cdna_length metric - once for monoexonic and once for multiexonic transcripts). Any parameter which is not a :ref:`valid metric name `, after removal of the suffix, **will cause an error**.
Parameters have to specify the following: @@ -169,7 +184,7 @@ In order: (exon_num > 1 and cdna_length >= 100 and max_intron_length <= 200000 and min_intron_length >= 5) or (exon_num == 1 and cdna_length > 50) -Any transcript for which the expression evaluates to :math:`False` will be assigned a score of 0 outright and therefore discarded. +Any transcript for which the expression evaluates to :math:`False` will be assigned a score of 0 outright and discarded, unless the user has chosen to disable the purging of such transcripts. .. _scoring-section: @@ -237,7 +252,7 @@ Metrics belong to one of the following categories: .. hint:: Starting from version 1 beta8, Mikado allows to use externally defined metrics for the transcripts. These can be accessed using the keyword "external." within the configuration file. See the :ref:`relevant section ` for details. -.. important:: Starting from Mikado 1 beta 8, it is possible to use metrics with values between 0 and 1 directly as scores, without rescaling. This feature is available only +.. important:: Starting from Mikado 1 beta 8, it is possible to use metrics with values between 0 and 1 directly as scores, without rescaling. This feature is available only for metrics whose values naturally lie between 0 and 1, or that are boolean in nature. +-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ diff --git a/docs/Installation.rst b/docs/Installation.rst index 9baa9da13..a65fe6d1c 100644 --- a/docs/Installation.rst +++ b/docs/Installation.rst @@ -22,7 +22,7 @@ Mikado is available on PyPI, so it is possible to install it with ``pip3 install mikado`` -The source for the latest release can be obtained with +The source for the latest release on PyPI can be obtained with ``pip3 download mikado`` @@ -49,6 +49,10 @@ Followed by ``pip3 install dist/*whl`` +..note: + If you want to update your installation of Mikado, the command to be executed is + ``pip install -U dist/*whl`` + Testing the installed module ---------------------------- @@ -59,35 +63,6 @@ It is possible to test whether Mikado has been built successfully by opening a p .. block-end -This will run all the tests included in the suite. Although code coverage is not perfect yet, it is over 50% for the whole package and considerably higher for the core components. - -Python Dependencies -------------------- - -Mikado has been written for Python 3.4 and 3.5. It is dependent on the following Python3 modules: - -* wheel>=0.28.0 -* pyyaml [PyYaml]_ -* jsonschema -* Cython>=0.25 [Cython]_ -* numpy [Numpy]_ -* networkx>=1.10 [NetworkX]_ -* sqlalchemy>=1 -* sqlalchemy_utils -* biopython>=1.66 [BioPython]_ -* intervaltree -* nose -* pyfaidx -* scikit-learn>=0.17.0 [SciKit]_ -* scipy>=0.15.0 [Scipy]_ -* frozendict -* python-magic -* drmaa [DRMAA]_ -* snakemake [Snake]_ -* docutils -* tabulate -* ujson - -These dependencies will be installed automatically by PIP. +This will run all the tests included in the suite. Although code coverage is not perfect yet, it is at 70% for the whole package and over 80% for the core components. .. _GitHub: https://github.com/lucventurini/mikado diff --git a/docs/Library/Mikado.loci.rst b/docs/Library/Mikado.loci.rst index f6444520f..ebf4829f7 100644 --- a/docs/Library/Mikado.loci.rst +++ b/docs/Library/Mikado.loci.rst @@ -1,13 +1,6 @@ Mikado.loci package =================== -Subpackages ------------ - -.. 
toctree:: - - Mikado.loci.transcript_methods - Submodules ---------- @@ -19,14 +12,6 @@ Mikado.loci.abstractlocus module :undoc-members: :show-inheritance: -Mikado.loci.clique_methods module ---------------------------------- - -.. automodule:: Mikado.loci.clique_methods - :members: - :undoc-members: - :show-inheritance: - Mikado.loci.excluded module --------------------------- @@ -83,22 +68,6 @@ Mikado.loci.superlocus module :undoc-members: :show-inheritance: -Mikado.loci.transcript module ------------------------------ - -.. automodule:: Mikado.loci.transcript - :members: - :undoc-members: - :show-inheritance: - -Mikado.loci.transcriptchecker module ------------------------------------- - -.. automodule:: Mikado.loci.transcriptchecker - :members: - :undoc-members: - :show-inheritance: - Module contents --------------- diff --git a/docs/Library/Mikado.loci.transcript_methods.rst b/docs/Library/Mikado.loci.transcript_methods.rst deleted file mode 100644 index c5054b2e8..000000000 --- a/docs/Library/Mikado.loci.transcript_methods.rst +++ /dev/null @@ -1,46 +0,0 @@ -Mikado.loci.transcript_methods package -====================================== - -Submodules ----------- - -Mikado.loci.transcript_methods.finalizing module ------------------------------------------------- - -.. automodule:: Mikado.loci.transcript_methods.finalizing - :members: - :undoc-members: - :show-inheritance: - -Mikado.loci.transcript_methods.printing module ----------------------------------------------- - -.. automodule:: Mikado.loci.transcript_methods.printing - :members: - :undoc-members: - :show-inheritance: - -Mikado.loci.transcript_methods.retrieval module ------------------------------------------------ - -.. automodule:: Mikado.loci.transcript_methods.retrieval - :members: - :undoc-members: - :show-inheritance: - -Mikado.loci.transcript_methods.splitting module ------------------------------------------------ - -.. automodule:: Mikado.loci.transcript_methods.splitting - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: Mikado.loci.transcript_methods - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/Library/Mikado.rst b/docs/Library/Mikado.rst index 291967ecf..b4a3561e6 100644 --- a/docs/Library/Mikado.rst +++ b/docs/Library/Mikado.rst @@ -16,6 +16,7 @@ Subpackages Mikado.serializers Mikado.subprograms Mikado.tests + Mikado.transcripts Mikado.utilities Submodules diff --git a/docs/Library/Mikado.scales.rst b/docs/Library/Mikado.scales.rst index b57a77dc5..7af60bea4 100644 --- a/docs/Library/Mikado.scales.rst +++ b/docs/Library/Mikado.scales.rst @@ -20,6 +20,14 @@ Mikado.scales.assigner module :undoc-members: :show-inheritance: +Mikado.scales.class_codes module +-------------------------------- + +.. automodule:: Mikado.scales.class_codes + :members: + :undoc-members: + :show-inheritance: + Mikado.scales.compare module ---------------------------- diff --git a/docs/Library/Mikado.subprograms.util.rst b/docs/Library/Mikado.subprograms.util.rst index 556ecd047..5a70ef1c2 100644 --- a/docs/Library/Mikado.subprograms.util.rst +++ b/docs/Library/Mikado.subprograms.util.rst @@ -12,6 +12,14 @@ Mikado.subprograms.util.awk_gtf module :undoc-members: :show-inheritance: +Mikado.subprograms.util.class_codes module +------------------------------------------ + +.. 
automodule:: Mikado.subprograms.util.class_codes + :members: + :undoc-members: + :show-inheritance: + Mikado.subprograms.util.convert module -------------------------------------- diff --git a/docs/Library/Mikado.tests.rst b/docs/Library/Mikado.tests.rst index d04e8a3cb..d82c140c6 100644 --- a/docs/Library/Mikado.tests.rst +++ b/docs/Library/Mikado.tests.rst @@ -44,14 +44,6 @@ Mikado.tests.parser_testing module :undoc-members: :show-inheritance: -Mikado.tests.test_abstractlocus module --------------------------------------- - -.. automodule:: Mikado.tests.test_abstractlocus - :members: - :undoc-members: - :show-inheritance: - Mikado.tests.test_blast_related module -------------------------------------- @@ -76,14 +68,6 @@ Mikado.tests.test_db_utils module :undoc-members: :show-inheritance: -Mikado.tests.test_excluded module ---------------------------------- - -.. automodule:: Mikado.tests.test_excluded - :members: - :undoc-members: - :show-inheritance: - Mikado.tests.test_external_scores module ---------------------------------------- diff --git a/docs/Library/hpc.yaml b/docs/Library/hpc.yaml deleted file mode 120000 index 9386db352..000000000 --- a/docs/Library/hpc.yaml +++ /dev/null @@ -1 +0,0 @@ -../../Mikado/daijin/hpc.yaml \ No newline at end of file diff --git a/docs/Library/mikado.snakefile b/docs/Library/mikado.snakefile deleted file mode 120000 index 53fa0dd56..000000000 --- a/docs/Library/mikado.snakefile +++ /dev/null @@ -1 +0,0 @@ -../../Mikado/daijin/mikado.snakefile \ No newline at end of file diff --git a/docs/Library/tr.snakefile b/docs/Library/tr.snakefile deleted file mode 120000 index f7502ef8a..000000000 --- a/docs/Library/tr.snakefile +++ /dev/null @@ -1 +0,0 @@ -../../Mikado/daijin/tr.snakefile \ No newline at end of file diff --git a/docs/Usage/Compare.rst b/docs/Usage/Compare.rst index e6ee55fce..4e39fe8f5 100644 --- a/docs/Usage/Compare.rst +++ b/docs/Usage/Compare.rst @@ -105,13 +105,14 @@ TMAP are tabular files that store the information regarding the best match for e #. **e_recall**: Exon recall of the reference model ( TP / (number of exons in the reference)) #. **e_f1**: `F1`_ of recall and precision at the exon level. #. **distance**: Distance of the model from its putative match. +#. **location**: location of the match, with the format :.. 
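Since TMAP files are plain tab-separated tables with a header row matching the column list above, they can be inspected with the standard library alone. A minimal sketch (not part of Mikado; the file name is hypothetical)::

    import csv

    with open("compare.tmap", newline="") as tmap:
        for row in csv.DictReader(tmap, delimiter="\t"):
            # Fusion events carry comma-separated values in the reference columns.
            refs = row["ref_id"].split(",")
            print(row["tid"], row["ccode"], refs, row["location"])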
An example of TMAP file is as follows:: - ref_id ref_gene ccode tid gid tid_num_exons ref_num_exons n_prec n_recall n_f1 j_prec j_recall j_f1 e_prec e_recall e_f1 distance - AT5G66610.1 AT5G66610 = mikado.Chr5G1.2 mikado.Chr5G1 11 11 97.77 99.16 98.46 100.00 100.00 100.00 81.82 81.82 81.82 0 - AT5G66610.1 AT5G66610 j mikado.Chr5G1.1 mikado.Chr5G1 11 11 92.93 94.74 93.82 95.00 95.00 95.00 81.82 81.82 81.82 0 - AT5G66620.1,AT5G66630.1,AT5G66631.1 AT5G66620,AT5G66630,AT5G66631 f,j,J,O st_Stringtie_STAR.21710.6 Stringtie_STAR.21710 22 11,10,1 27.84,34.47,28.63 99.79,99.13,100.00 43.54,51.16,44.52 45.24,42.86,0.00 95.00,100.00,0.00 61.29,60.00,0.00 36.36,36.36,0.00 72.73,80.00,0.00 48.48,50.00,0.00 0 + ref_id ref_gene ccode tid gid tid_num_exons ref_num_exons n_prec n_recall n_f1 j_prec j_recall j_f1 e_prec e_recall e_f1 distance location +AT5G66600.2 AT5G66600 = cuff_cufflinks_star_at.23553.1 cuff_cufflinks_star_at.23553.1.gene 9 9 91.30 81.31 86.02 100.00 100.00 100.00 77.78 77.78 77.78 0 Chr5:26575000..26578163 +AT5G66600.2 AT5G66600 C cl_Chr5.6272 cl_Chr5.6272.gene 7 9 94.95 72.43 82.18 100.00 75.00 85.71 85.71 66.67 75.00 0 Chr5:26575000..26578087 +AT5G66620.1,AT5G66630.1,AT5G66631.1 AT5G66620,AT5G66630,AT5G66631 f,j,j,G st_Stringtie_STAR.21710.15 st_Stringtie_STAR.21710.15.gene 8 11,10,1 19.13,19.95,35.98 54.57,45.65,100.00 28.33,27.76,52.92 28.57,64.29,0.00 20.00,50.00,0.00 23.53,56.25,0.00 12.50,37.50,0.00 9.09,30.00,0.00 10.53,33.33,0.00 0 Chr5:26588402..26598231 You can notice that the third example is particular as the prediction transcript matches not one but multiple reference transcripts. This is a fusion_ event. @@ -136,13 +137,16 @@ RefMap files are tabular files which store the information regarding the best ma #. **best_nF1**: `F1`_ of recall and precision at the nucleotide level, for the best possible comparison. #. **best_jF1**: `F1`_ of recall and precision at the splice junction level, for the best possible comparison. #. **best_eF1**: `F1`_ of recall and precision at the exon level, for the best possible comparison. +#. **location**: location of the match, with the format :.. An example of a RefMap file is as follows:: - ref_id ccode tid gid nF1 jF1 eF1 ref_gene best_ccode best_tid best_gid best_nF1 best_jF1 best_eF1 - AT5G66610.1 = mikado.Chr5G1.2 mikado.Chr5G1 98.46 100.0 81.82 AT5G66610 = mikado.Chr5G1.2 mikado.Chr5G1 98.46 100.0 81.82 - AT5G66610.2 J mikado.Chr5G1.2 mikado.Chr5G1 93.91 94.74 76.19 AT5G66610 = mikado.Chr5G1.2 mikado.Chr5G1 98.46 100.0 81.82 - AT5G66630.1 f,n tr_c58_g1_i4.mrna1.6 c58_g1_i4.path1.6 66.32 94.74 76.19 AT5G66630 f,n tr_c58_g1_i4.mrna1.6 c58_g1_i4.path1.6 66.32 94.74 76.19 + ref_id ccode tid gid nF1 jF1 eF1 ref_gene best_ccode best_tid best_gid best_nF1 best_jF1 best_eF1 location + AT5G66610.1 = mikado.Chr5G4.2 mikado.Chr5G4 98.46 100.0 81.82 AT5G66610 = mikado.Chr5G4.2 mikado.Chr5G4 98.46 100.0 81.82 Chr5:26584780..26587912 +AT5G66610.2 J mikado.Chr5G4.2 mikado.Chr5G4 93.91 94.74 76.19 AT5G66610 = mikado.Chr5G4.2 mikado.Chr5G4 98.46 100.0 81.82 Chr5:26584774..26587912 +AT5G66620.1 j mikado.Chr5G6.1 mikado.Chr5G6 85.51 95.0 72.73 AT5G66620 j mikado.Chr5G6.1 mikado.Chr5G6 85.51 95.0 72.73 Chr5:26588402..26592423 +AT5G66630.1 n mikado.Chr5G8.2 mikado.Chr5G8 93.27 94.74 76.19 AT5G66630 n mikado.Chr5G8.2 mikado.Chr5G8 93.27 94.74 76.19 Chr5:26591981..26595922 + Please note that the third example (AT5G66630.1) has as best possible match a fusion_ event. @@ -241,148 +245,141 @@ All class codes fall within one of the following categories: .. 
topic:: Available class codes - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | Class code | Definition | Is the | Is the | Nucleotide: | Junction: | Reverse | Category | - | | | reference | prediction | Recall, | Recall, | class code | | - | | | transcript | transcript | Precision, | Precision, | | | - | | | multiexonic? | multiexonic? | F1 | F1 | | | - | | | | | | | | | - +==============+==============================+==============+===============+===================+===================+===================+===================+ - | **=** | Complete intron chain match. | True | True | NA | 100%, 100%, 100% | **=** | **Match** | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **_** | Complete match between two | False | False | NA, NA, **>=80%** | NA | **_** | **Match** | - | (underscore) | monoexonic transcripts. | | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **n** | Intron chain extension, ie. | True | True | **100%**, < 100%, | 100%, < 100%, | **c** | **Extension** | - | | both transcripts are | | | < 100% | < 100% | | | - | | multiexonic and the | | | | | | | - | | prediction has novel | | | | | | | - | | splice sites *outside* of | | | | | | | - | | the reference transcript | | | | | | | - | | boundaries. | | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **J** | Intron chain extension, | True | True | < 100%, <= 100%, | **100%**, < 100%, | **C** | **Extension** | - | | both transcripts are | | | < 100% | < 100% | | | - | | multiexonic and the | | | | | | | - | | prediction has novel | | | | | | | - | | splice sites *inside* of the | | | | | | | - | | reference transcript | | | | | | | - | | boundaries. | | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **c** | The prediction | True | NA | < 100%, **100%** | < 100%, **100%** | **n** | **Extension** | - | | is either multiexonic and | | | NA | NA | | | - | | with its intron chain | | | | | | | - | | completely contained within | | | | | | | - | | that of the reference, or | | | | | | | - | | monoexonic and contained | | | | | | | - | | within one of the reference | | | | | | | - | | exons. | | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **C** | The prediction intron chain | True | True | <= 100%, < 100%, | < 100%, **100%**, | **J** or **j** | **Extension** | - | | is completely contained | | | < 100% | < 100% | | | - | | within that of the | | | | | | | - | | reference transcript, but | | | | | | | - | | it partially debords either | | | | | | | - | | into its introns or outside | | | | | | | - | | of the reference boundaries. | | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **j** | Alternative splicing event. 
| True | True | NA | <= 100%, < 100%, | **j** | **Alternative | - | | | | | | < 100% | | splicing** | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **h** | Structural match between two | True | True | > 0%, > 0%, > 0% | 0%, 0%, 0% | **h** | **Alternative | - | | models where no splice site | | | | | | splicing** | - | | is conserved but **at least**| | | | | | | - | | one intron of the reference | | | | | | | - | | and one intron of the | | | | | | | - | | prediction partially overlap.| | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **g** | The monoexonic prediction | True | False | > 0%, > 0%, | 0% | **G** | **Alternative | - | ("mo" before | overlaps one or more exons of| | | between 0 and 100%| | | splicing** | - | release 1) | the reference transcript; the| | | | | | | - | | borders of the prediction | | | | | | | - | | cannot fall inside the | | | | | | | - | | introns of the reference. | | | | | | | - | | The prediction transcript | | | | | | | - | | can bridge multiple exons | | | | | | | - | | of the reference model. | | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **G** | Generic match of a | False | True | > 0%, > 0%, > 0% | 0% | **g** | **Alternative | - | ("O" before | multiexonic prediction | | | | | | splicing** | - | release 1) | transcript versus a | | | | | | | - | | monoexonic reference. | | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **o** | Generic overlap between two | True | True | > 0%, > 0%, > 0% | 0%, 0%, 0% | **o** | **Overlap** | - | | multiexonic transcripts, | | | | | | | - | | which do not share **any** | | | | | | | - | | overlap among their introns. | | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **e** | Single exon transcript | True | False | > 0%, > 0%, | 0% | **G** | **Overlap** | - | | overlapping *one* reference | | | between 0 and 100%| | | | - | | exon and at least 10 bps of a| | | | | | | - | | reference intron, indicating | | | | | | | - | | a possible pre-mRNA fragment.| | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **m** | Generic match between two | False | False | NA, NA, **< 80%** | NA | **m** | **Overlap** | - | | monoexonic transcripts. | | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **i** | Monoexonic prediction | True | False | 0% | 0% | **ri** | **Intronic** | - | | completely contained within | | | | | | | - | | one intron of the reference | | | | | | | - | | transcript. 
| | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **I** | Prediction completely | True | True | 0% | 0% | **rI** | **Intronic** | - | | contained within the introns | | | | | | | - | | of the reference transcript. | | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **rI** | Reference completely | True | True | 0% | 0% | **I** | **Intronic** | - | | contained within the introns | | | | | | | - | | of the prediction transcript.| | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **ri** | Reverse intron transcript - | False | True | 0% | 0% | **i** | **Intronic** | - | | the monoexonic reference is | | | | | | | - | | completely contained within | | | | | | | - | | one intron of the prediction | | | | | | | - | | transcript. | | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **f** | Fusion - this special code | NA | NA | **> 10%**, NA, NA | **> 0%**, NA, NA | NA | **Fusion** | - | | is applied when a prediction | | | | | | | - | | intersects more than one | | | | | | | - | | reference transcript. To be | | | | | | | - | | considered for fusions, | | | | | | | - | | candidate references must | | | | | | | - | | **either** share at least one| | | | | | | - | | splice junction with the | | | | | | | - | | prediction, **or** have at | | | | | | | - | | least 10% of its bases | | | | | | | - | | recalled. If two or more | | | | | | | - | | reference transcripts fit | | | | | | | - | | these constraints, then the | | | | | | | - | | prediction model is | | | | | | | - | | classified as a **fusion**. | | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **x** | Monoexonic match on the | NA | False | >= 0% | 0% | **x** or **X** | **Fragment** | - | | *opposite* strand. | | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **X** | Multiexonic match on the | NA | True | >= 0% | 0% | **x** or **X** | **Fragment** | - | | *opposite* strand. | | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **p** | The prediction is on the same| NA | NA | 0% | 0% | **p** | **No overlap** | - | | strand of a neighbouring but | | | | | | | - | | non-overlapping transcript. | | | | | | | - | | Probable polymerase run-on. | | | | | | | - +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+ - | **P** | The prediction is on the | NA | NA | 0% | 0% | **P** | **No overlap** | - | | *opposite* strand of a | | | | | | | - | | neighbouring but | | | | | | | - | | non-overlapping transcript. | | | | | | | - | | Probable polymerase run-on. 
| | | | | | |
- +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+
- | **u** | Unknown - no suitable model | NA | NA | 0% | 0% | NA | **No overlap** |
- | | has been found near enough | | | | | | |
- | | the prediction to perform a | | | | | | |
- | | comparison. | | | | | | |
- +--------------+------------------------------+--------------+---------------+-------------------+-------------------+-------------------+-------------------+
-
++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+
+| Class code | Definition | Reference multiexonic? | Prediction multiexonic? | Nucleotide: RC, PC, F1 | Junction: RC, PC, F1 | Reverse | Category |
++==============+================================+==========================+===========================+============================+========================+===========+=============+
+| = | Complete intron chain match. | True | True | NA | 100%, 100%, 100% | = | Match |
++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+
+| _ | Complete match between two | False | False | NA, NA, >=80% | NA | _ | Match |
+| | monoexonic transcripts. | | | | | | |
++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+
+| n | Intron chain extension, ie. | True | True | 100%, < 100%, < 100% | 100%, < 100%, < 100% | c | Extension |
+| | both transcripts are | | | | | | |
+| | multiexonic and the | | | | | | |
+| | prediction has novel splice | | | | | | |
+| | sites outside of the reference | | | | | | |
+| | transcript boundaries. | | | | | | |
++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+
+| J | Intron chain extension, ie. | True | True | 100%, <= 100%, < 100% | 100%, < 100%, < 100% | C | Extension |
+| | both transcripts are | | | | | | |
+| | multiexonic and the | | | | | | |
+| | prediction has novel splice | | | | | | |
+| | sites inside of the reference | | | | | | |
+| | transcript boundaries. | | | | | | |
++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+
+| c | The prediction is either | NA | NA | < 100%, 100%, NA | < 100%, 100%, NA | n | Extension |
+| | multiexonic and with its | | | | | | |
+| | intron chain completely | | | | | | |
+| | contained within that of | | | | | | |
+| | the reference, or monoexonic | | | | | | |
+| | and contained within one of | | | | | | |
+| | the reference exons. 
| | | | | | |
++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+
+| C | The prediction intron chain is | True | True | <= 100%, < 100%, < 100% | < 100%, 100%, < 100% | J or j | Extension |
+| | completely contained within | | | | | | |
+| | that of the reference | | | | | | |
+| | transcript, but it partially | | | | | | |
+| | debords either into its | | | | | | |
+| | introns or outside of the | | | | | | |
+| | reference boundaries. | | | | | | |
++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+
+| j | Alternative splicing event. | True | True | NA | <= 100%, 100%, < 100% | j or C | Alternative |
+| | | | | | | | splicing |
++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+
+| h | Structural match between two | True | True | > 0%, > 0%, > 0% | 0%, 0%, 0% | h | Alternative |
+| | models where no splice | | | | | | splicing |
+| | site is conserved but at least | | | | | | |
+| | one intron of the reference | | | | | | |
+| | and one intron of the | | | | | | |
+| | prediction partially overlap. | | | | | | |
++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+
+| g | The monoexonic prediction | True | False | > 0%, > 0%, 0% < F1 < 100% | 0%, 0%, 0% | G | Alternative |
+| | overlaps one or more exons of | | | | | | splicing |
+| | the reference transcript; | | | | | | |
+| | the borders of the prediction | | | | | | |
+| | cannot fall inside the introns | | | | | | |
+| | of the reference. The | | | | | | |
+| | prediction transcript can | | | | | | |
+| | bridge multiple exons of the | | | | | | |
+| | reference model. | | | | | | |
++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+
+| G | Generic match of a multiexonic | False | True | > 0%, > 0%, 0% < F1 < 100% | 0%, 0%, 0% | g | Alternative |
+| | prediction transcript versus a | | | | | | splicing |
+| | monoexonic reference. | | | | | | |
++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+
+| o | Generic overlap between two | True | True | > 0%, > 0%, 0% < F1 < 100% | 0%, 0%, 0% | o | Overlap |
+| | multiexonic transcripts, | | | | | | |
+| | which do not share any overlap | | | | | | |
+| | among their introns. | | | | | | |
++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+
+| e | Single exon transcript | True | False | > 0%, > 0%, 0% < F1 < 100% | 0%, 0%, 0% | G | Overlap |
+| | overlapping one reference exon | | | | | | |
+| | and at least 10 bps of a | | | | | | |
+| | reference intron, indicating a | | | | | | |
+| | possible pre-mRNA fragment. 
| | | | | | | ++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+ +| m | Generic match between two | False | False | NA, NA, < 80% | NA | m | Overlap | +| | monoexonic transcripts. | | | | | | | ++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+ +| i | Monoexonic prediction | True | False | 0%, 0%, 0% | 0%, 0%, 0% | ri | Intronic | +| | completely contained within | | | | | | | +| | one intron of the reference | | | | | | | +| | transcript. | | | | | | | ++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+ +| I | Prediction completely | True | True | 0%, 0%, 0% | 0%, 0%, 0% | rI | Intronic | +| | contained within the introns | | | | | | | +| | of the reference transcript. | | | | | | | ++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+ +| ri | Reverse intron transcript - | False | True | 0%, 0%, 0% | 0%, 0%, 0% | i | Intronic | +| | the monoexonic reference is | | | | | | | +| | completely contained | | | | | | | +| | within one intron of the | | | | | | | +| | prediction transcript. | | | | | | | ++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+ +| rI | Multiexonic reference | True | True | 0%, 0%, 0% | 0%, 0%, 0% | I | Intronic | +| | completely contained within | | | | | | | +| | the introns of the prediction | | | | | | | +| | transcript. | | | | | | | ++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+ +| f | Fusion - this special code is | NA | NA | > 10%, 0%, 0% | > 0%, 0%, 0% | NA | Fusion | +| | applied when a prediction | | | | | | | +| | intersects more than one | | | | | | | +| | reference transcript. To be | | | | | | | +| | considered for fusions, | | | | | | | +| | candidate references must | | | | | | | +| | **either** share at least one | | | | | | | +| | splice junction with the | | | | | | | +| | prediction, **or** have at | | | | | | | +| | least 10% of its bases | | | | | | | +| | recalled. If two or more | | | | | | | +| | reference transcripts fit | | | | | | | +| | these constraints, then the | | | | | | | +| | prediction model is classified | | | | | | | +| | as a fusion. | | | | | | | ++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+ +| x | Monoexonic match on the | NA | False | >0%, >0%, >0% | 0%, 0%, 0% | x or X | Fragment | +| | **opposite** strand. | | | | | | | ++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+ +| X | Multiexonic match on the | NA | True | >0%, >0%, >0% | NA | x or X | Fragment | +| | **opposite** strand. 
| | | | | | |
++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+
+| p | The prediction is on the same | NA | NA | 0%, 0%, 0% | 0%, 0%, 0% | p | Fragment |
+| | strand of a neighbouring but | | | | | | |
+| | non-overlapping transcript. | | | | | | |
+| | Probable polymerase run-on. | | | | | | |
++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+
+| P | The prediction is on the | NA | NA | 0%, 0%, 0% | 0%, 0%, 0% | P | Fragment |
+| | opposite strand of a | | | | | | |
+| | neighbouring but non- | | | | | | |
+| | overlapping transcript. | | | | | | |
+| | Probable polymerase run-on. | | | | | | |
++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+
+| u | Unknown - no suitable model | NA | NA | 0%, 0%, 0% | 0%, 0%, 0% | NA | Unknown |
+| | has been found near enough the | | | | | | |
+| | prediction to perform a | | | | | | |
+| | comparison. | | | | | | |
++--------------+--------------------------------+--------------------------+---------------------------+----------------------------+------------------------+-----------+-------------+
 
 Technical details
 ~~~~~~~~~~~~~~~~~
diff --git a/docs/Usage/Configure.rst b/docs/Usage/Configure.rst
index 8cab6ac1d..87a3e4aab 100644
--- a/docs/Usage/Configure.rst
+++ b/docs/Usage/Configure.rst
@@ -172,10 +172,11 @@ This section of the configuration file deals with the :ref:`prepare stage of Mik
 
 .. _canonical-configuration:
 
 * canonical: this voice specifies the splice site donors and acceptors that are considered canonical for the species. By default, Mikado uses the canonical splice site (GT/AG) and the two semi-canonical pairs (GC/AG and AT/AC). Type: Array of two-element arrays, composed by two-letter strings.
-* lenient: boolean value. If set to *false*, transcripts that only have non-canonical splice sites will be **removed** from the output.
+* keep_redundant: if set to false (default), Mikado will only keep one copy of transcripts that are completely identical.
+* lenient: boolean value. If set to *false*, transcripts that either only have non-canonical splice sites or have a mixture of canonical junctions on *both* strands will be **removed** from the output. Otherwise, they will be left in, properly tagged.
 * minimum_length: minimum length of the transcripts to be kept.
 * procs: number of processors to be used.
-* strand_specific: boolean. If set to *true*, **all** input assemblies will be treated as strand-specific, therefore keeping the strand of monoexonic fragments as it was.
+* strand_specific: boolean. If set to *true*, **all** input assemblies will be treated as strand-specific, therefore keeping the strand of monoexonic fragments as it was. Multiexonic transcripts will not have their strand reversed even if doing that would mean making some or all non-canonical junctions canonical.
 * strip_cds: boolean. If set to *true*, the CDS features will be stripped off the input transcripts. This might be necessary for eg transcripts obtained through alignment with `GMAP `_ [GMAP]_.
 * files: this sub-section is the most important, as it contains among other things the locations and labels for the input files. 
Voices:
 
     * gff: array of the input files, in GFF or GTF format. Please note that only CDS/exon/UTR features will be considered from these files.
 
@@ -224,6 +225,7 @@ This section of the configuration file deals with the :ref:`prepare stage of Mik
       out_fasta: mikado_prepared.fasta
       output_dir: .
       strand_specific_assemblies: []
+      keep_redundant: false
       lenient: false
       minimum_length: 200
       procs: 1
@@ -238,7 +240,7 @@ Settings for the serialisation stage
 
 This section of the configuration file deals with the :ref:`serialisation stage of Mikado `. It specifies the location of the ORF BED12 files from TransDecoder, the location of the XML files from BLAST, the location of portcullis junctions, and other details important at run time. It has the following voices:
 
-* discard_definition: boolean. This is used to specify whether we will use the ID or the definition of the sequences when parsing BLAST results. This is important when BLAST data might have a mock, local identifier for the sequence ("lcl|1") rather than its original ID.
+* discard_definition: boolean. This is used to specify whether we will use the ID or the definition of the sequences when parsing BLAST results. This is important when BLAST data might have a mock, local identifier for the sequence ("lcl|1") rather than its original ID. **Deprecated** since v1 beta 10.
 * force: whether the database should be truncated and rebuilt, or just updated.
 
 .. _max-objects:
 
@@ -290,7 +292,6 @@ This section of the configuration file deals with the :ref:`serialisation stage
     #   - procs: Number of processors to use. Default: 1.
     #   - single_thread: if true, Mikado prepare will force the usage of a single thread
     #   in this step.
-    discard_definition: false
     files:
       blast_targets:
       - ''
@@ -347,77 +348,161 @@ Each subsection of the pick configuration will be explained in its own right.
 
 Parameters regarding the alternative splicing
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-After selecting the best model for each locus, Mikado will backtrack and try to select valid alternative splicing events. This section deals with how Mikado will operate the selection. There are the following available parameters:
+After selecting the best model for each locus, Mikado will backtrack and try to select valid alternative splicing events. This section deals with how Mikado will operate the selection. In order to be considered as valid potential AS events, transcripts have to satisfy the minimum :ref:`requirements specified in the scoring file `. These are the available parameters:
 
 * report: boolean. Whether to calculate and report possible alternative splicing events at all. By default this is set to true; *setting this parameter to false will inactivate all the options in this section*.
 * keep_retained_introns: boolean. It specifies whether transcripts with retained introns will be retained. A retained intron is defined as an exon at least partly non-coding, whose non-coding part falls within the intron of another transcript (so, retained intron events which yield a valid ORF will not be excluded). By default, such transcripts will be excluded.
-* max_fiveutr_length: maximum 5' UTR length of any alternative splicing transcript. By default, this is set to 1Mbps, *de facto* inactivating this filter.
-* max_threeutr_length: maximum 3' UTR length of any alternative splicing transcript. By default, this is set to 1Mbps, *de facto* inactivating this filter.
-* max_utr_length: maximum total UTR length of any alternative splicing transcript. 
By default, this is set to 1Mbps, *de facto* inactivating this filter.
 * min_cdna_overlap: minimum cDNA overlap between the primary transcript and the AS candidate. By default, this is set to 0 and we rely only on the class code and the CDS overlap. It must be a number between 0 and 1.
 * min_cds_overlap: minimum CDS overlap between the primary transcript and the AS candidate. By default this is set to 0.6, ie 60%. It must be a number between 0 and 1.
 * min_score_perc: Minimum percentage of the score of the primary transcript that any candidate AS must have to be considered. By default, this is set to 0.6 (60%). It must be a number between 0 and 1.
 * only_confirmed_introns: boolean. This parameter determines whether to consider only transcripts whose introns are confirmed :ref:`in the dataset of reliable junctions `, or whether to consider all possible candidate transcripts.
 * redundant_ccodes: any candidate AS will be :ref:`compared ` against all the transcripts already retained in the locus. If any of these comparisons returns one of the :ref:`class codes ` specified in this array, **the transcript will be ignored**. Default class codes: =, _, m, c, n, C
-* valid_ccodes: any candidate AS will be :ref:`compared ` against *the primary transcript* to determine the type of AS event. If the :ref:`class code ` is one of those specified in this array, the transcript will be considered further. Default class codes: j, J, g, G, h.
+* valid_ccodes: any candidate AS will be :ref:`compared ` against *the primary transcript* to determine the type of AS event. If the :ref:`class code ` is one of those specified in this array, the transcript will be considered further. Valid class codes are within the categories "Alternative splicing", "Extension" with junction F1 lower than 100%, and Overlap (with the exclusion of "m"). Default class codes: j, J, g, G, h.
+* pad: boolean option. If set to True, Mikado will try to pad transcripts so that they share the same 5'. Disabled by default.
+* ts_max_splices: numerical. When padding is activated, the maximum number of splice junctions that the extended exon is allowed to cross.
+* ts_distance: numerical. When padding is activated, the maximum number of base pairs by which an exon can be extended.
+
 
 .. warning:: the AS transcript event does not need to be a valid AS event for *all* transcripts in the locus, only against the *primary* transcript.
 
 .. code-block:: yaml
 
-    pick:
-      # - scoring_file: a scoring file for the analysis. Default: plants.yaml.
-      # - source_score: a dictionary with pre-defined scores to assign to the transcripts
-      # according to their source. Eg all Cufflinks transcripts from the seed (label:
-      # "cuff_seed") could be assigned a default additional score of 1.
     alternative_splicing:
-      # Parameters related to alternative splicing reporting.
-      # - report: whether to report at all or not the AS events.
-      # - min_cds_overlap: minimum overlap between the CDS of the primary transcript
-      # and any AS event. Default: 60%.
-      # - min_cdna_overlap: minimum overlap between the CDNA of the primary transcript
-      # and any AS event.
-      # Default: 0% i.e. disabled, we check for the CDS overlap.
-      # - keep_retained_introns: Whether to consider as valid AS events where one intron
-      # is retained compared to the primary or any other valid AS. Default: false.
-      # - max_isoforms: Maximum number of isoforms per locus. 1 implies no AS reported.
-      # Default: 3
-      # - valid_ccodes: Valid class codes for AS events. See documentation for details.
-      # Choices:
-      # j, n, O, e, o, h, J, C, mo. 
Default: j, J, O, mo
-      # - max_utr_length: Maximum length of the UTR for AS events. Default: 10e6 (i.e.
-      # no limit)
-      # - max_fiveutr_length: Maximum length of the 5UTR for AS events. Default:
-      # 10e6 (i.e. no limit)
-      # - max_threeutr_length: Maximum length of the 5UTR for AS events. Default:
-      # 10e6 (i.e. no limit)
-      # - min_score_perc: Minimum score threshold for subsequent AS events.
-      # Only transcripts with a score at least (best) * value are retained.
-      # - only_confirmed_introns: bring back AS events only when their introns are
-      # either
-      # present in the primary transcript or in the set of confirmed introns.
-      keep_retained_introns: false
-      max_fiveutr_length: 1000000
-      max_isoforms: 3
-      max_threeutr_length: 1000000
-      max_utr_length: 1000000
-      min_cdna_overlap: 0
-      min_cds_overlap: 0.6
-      min_score_perc: 0.6
-      only_confirmed_introns: false
-      redundant_ccodes:
-      - c
+      # Parameters related to alternative splicing reporting.
+      # - report: whether to report at all or not the AS events.
+      # - min_cds_overlap: minimum overlap between the CDS of the primary transcript
+      # and any AS event. Default: 60%.
+      # - min_cdna_overlap: minimum overlap between the CDNA of the primary transcript
+      # and any AS event.
+      # Default: 0% i.e. disabled, we check for the CDS overlap.
+      # - keep_retained_introns: Whether to consider as valid AS events where one intron
+      # is retained compared to the primary or any other valid AS. Default: false.
+      # - max_isoforms: Maximum number of isoforms per locus. 1 implies no AS reported.
+      # Default: 3
+      # - valid_ccodes: Valid class codes for AS events. Valid codes are in categories
+      # Alternative splicing, Extension (with junction F1 lower than 100%),
+      # and Overlap (excluding m). Default: j, J, g, G, C, h
+      # - max_utr_length: Maximum length of the UTR for AS events. Default: 10e6 (i.e.
+      # no limit)
+      # - max_fiveutr_length: Maximum length of the 5UTR for AS events. Default:
+      # 10e6 (i.e. no limit)
+      # - max_threeutr_length: Maximum length of the 5UTR for AS events. Default:
+      # 10e6 (i.e. no limit)
+      # - min_score_perc: Minimum score threshold for subsequent AS events.
+      # Only transcripts with a score at least (best) * value are retained.
+      # - only_confirmed_introns: bring back AS events only when their introns are
+      # either
+      # present in the primary transcript or in the set of confirmed introns.
+      # - pad: boolean switch. If true, Mikado will pad all the transcripts in a gene
+      # so that their ends are the same
+      # - ts_distance: if padding, this is the maximum distance in base-pairs between
+      # the starts of transcripts
+      # to be considered to be padded together.
+      # - ts_max_splices: if padding, this is the maximum number of splicing junctions
+      # that the transcript to pad
+      # is allowed to cross. If padding would lead to crossing more than this number,
+      # the transcript will not be padded.
+      keep_retained_introns: false
+      max_isoforms: 5
+      min_cdna_overlap: 0.5
+      min_cds_overlap: 0.75
+      min_score_perc: 0.5
+      only_confirmed_introns: true
+      pad: false
+      redundant_ccodes:
+      - c
+      - m
+      - _
+      - '='
+      - n
+      report: true
+      ts_distance: 300
+      ts_max_splices: 1
+      valid_ccodes:
+      - j
+      - J
+      - C
+      - G
+      - g
+      - h
+
+
+.. _clustering_specifics:
+
+Parameters regarding the clustering of transcripts in loci
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. note::
+    New in version 1 beta 10.
+
+This section influences how Mikado clusters transcripts in its multi-stage selection. The available parameters are:
+
+* *flank*: numerical. 
When constructing :ref:`Superloci `, Mikado will use this value as the maximum distance
+between transcripts for them to be integrated within the same superlocus.
+* *cds_only*: boolean. If set to true, during the :ref:`picking stage ` Mikado will consider only the **primary ORF** to evaluate whether two transcripts intersect. Transcripts which eg. share introns in their UTR but have completely unrelated CDSs will be clustered separately. Disabled by default.
+* *purge*: boolean. If true, any transcript failing the :ref:`specified requirements ` will be purged out. Otherwise, they will be assigned a score of 0 and might potentially appear in the final output, if no other transcript is present in the locus.
+* *simple_overlap_for_monoexonic*: boolean. During the :ref:`second clustering `, by default monoexonic transcripts are clustered together even if they have a very slight overlap with another transcript. Manually setting this flag to *false* will cause Mikado to cluster monoexonic transcripts only if they have a minimum amount of cDNA and CDS overlap with the other transcripts in the holder.
+* *min_cdna_overlap*: numerical, between 0 and 1. Minimum cDNA overlap between two multiexonic transcripts for them to be considered as intersecting, if all other conditions fail.
+* *min_cds_overlap*: numerical, between 0 and 1. Minimum CDS overlap between two multiexonic transcripts for them to be considered as intersecting, if all other conditions fail.
+
+.. code-block:: yaml
+
+    clustering:
+      # Parameters related to the clustering of transcripts into loci.
+      # - cds_only: boolean, it specifies whether to cluster transcripts only according
+      # to their CDS (if present).
+      # - min_cds_overlap: minimal CDS overlap for the second clustering.
+      # - min_cdna_overlap: minimal cDNA overlap for the second clustering.
+      # - flank: maximum distance for transcripts to be clustered within the same superlocus.
+      # - remove_overlapping_fragments: boolean, it specifies whether to remove putative
+      # fragments.
+      # - purge: boolean, it specifies whether to remove transcripts which fail the
+      # minimum requirements check - or whether to ignore those requirements altogether.
+      # - simple_overlap_for_monoexonic: boolean. If set to true (default), then any
+      # overlap means inclusion
+      # in a locus for or against a monoexonic transcript. If set to false, normal controls
+      # for the percentage
+      # of overlap will apply.
+      # - max_distance_for_fragments: maximum distance from a valid locus for another
+      # to be considered a fragment.
+      cds_only: false
+      flank: 200
+      min_cdna_overlap: 0.2
+      min_cds_overlap: 0.2
+      purge: true
+      simple_overlap_for_monoexonic: true
+
+.. _fragment_options:
+
+Parameters regarding the detection of putative fragments
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This section determines how Mikado treats :ref:`potential fragments in the output `. Available options:
+
+* *remove*: boolean, default true. If set to true, fragments will be excluded from the final output; otherwise, they will be printed out, but properly tagged.
+* *max_distance*: numerical. For non-overlapping fragments, this value determines the maximum distance from the valid gene. Eg. with the default setting of 2000, a putative fragment at the distance of 1000 will be tagged and dealt with as a fragment; an identical model at a distance of 3000 will be considered as a valid gene and left untouched.
+* *valid_class_codes*: valid :ref:`class codes ` for potential fragments. 
Only Class Codes in the categories Overlap, Intronic, Fragment, with the addition of "_", are considered as valid choices. + +.. code-block:: yaml + + fragments: + # Parameters related to the handling of fragments. + # - remove: boolean. Whether to remove fragments or leave them, properly tagged. + # - max_distance: maximum distance of a putative fragment from a valid gene. + # - valid_class_codes: which class codes will be considered as fragments. Default: + # (p, P, x, X, i, m, _). Choices: _ plus any class code with category + # Intronic, Fragment, or Overlap. + max_distance: 2000 + remove: true + valid_class_codes: + - p + - P + - x + - X + - i - m - _ - - '=' - - n - - C - report: true - valid_ccodes: - - j - - J - - G - - g - - h + + .. _orf_loading: @@ -565,23 +650,19 @@ Generic parameters on the pick run This section deals with other parameters necessary for the run, such as the number of processors to use, but also more important algorithmic parameters such as how to recognise fragments. Parameters: +* *consider_truncated_for_retained*: normally, Mikado considers as retained introns only events in which a partially coding exon on the 3' side becomes non-coding in the middle of a CDS intron of another transcript in the locus. If this option is set to *true*, Mikado will consider as retained intron events also cases when the transcript has its CDS just end within a CDS intron of another model. Useful eg. when dealing with CDS models. * *exclude_cds*: whether to remove CDS/UTR information from the Mikado output. Default: *false*. -* *flank*: when creating superloci, Mikado will gather together all groups of overlapping transcripts that are within this distance. By default, this is set at 1 kbp. This parameter is important to recognize fragments derived from UTRs or misfired transcription in the neighborhood of real transcripts. -* *fragments_maximal_cds*: during the last control on fragments, Mikado will consider as non-fragmentary any transcript with an ORF of at least this value in bps. By default, this is set to 100, ie any transcript with an ORF of 33 AA or more will be considered by default as valid. -* *fragments_maximal_exons*: in addition, any transcript with more than this number of exons will be considered as non-fragmentary by definition. By default, this parameter is set at 2, ie any transcript with 3 or more exons will be considered non-fragmentary by definition. * *intron_range*: tuple that indicates the range of lengths in which most introns should fall. Transcripts with introns either shorter or longer than this interval will be potentially penalised, depending on the scoring scheme. For the paper, this parameter was set to a tuple of integers in which *98%* of the introns of the reference annotation were falling (ie cutting out the 1st and 99th percentiles). * *preload*: boolean. In certain cases, ie when the database is quite small, it might make sense to preload it in memory rather than relying on SQL queries. Set to *false* by default. * *shm*: boolean. In certain cases, especially when disk access is a severely limiting factor, it might make sense to copy a SQLite database into RAM before querying. If this parameter is set to *true*, Mikado will copy the SQLite database into a temporary file in RAM, and query it from there. * *shm_db*: string. If *shm* is set to true and this string is non-empty, Mikado will copy the database in memory to a file with this name *and leave it there for other Mikado runs*. The file will have to be removed manually. 
* *procs*: number of processors to use. Default: 1. * *single_thread*: boolean. If set to true, Mikado will completely disable multiprocessing. Useful mostly for debugging reasons. -* *subloci_from_cds_only*: boolean. If set to true, subloci will be built only using CDS information - therefore, transcripts with overlapping cDNA but discrete CDSs will be analysed separately. Most useful in cases of **compact** genomes, where genes lie near and it might be possible to analyse them together as the UTRs are overlapping. .. warning:: the shared-memory options are available only on Linux platforms. .. code-block:: yaml - pick: run_options: # Generic run options. # - shm: boolean flag. If set and the DB is sqlite, it will be copied onto the @@ -595,40 +676,28 @@ Parameters: # for faster access. Default: false # - exclude_cds: boolean flag. If set, the CDS information will not be printed # in Mikado output. Default: false - # - purge: boolean flag. If set, all loci where all transcripts have a score - # of 0 will be excluded - # from the output. Default: false - # - remove_overlapping_fragments: boolean flag. If set, fragments (defined as - # monoexonic loci - # classified as P,x,i or p compared to another locus, will be removed from - # the output. - # - fragments_maximal_cds: a locus will never be considered a fragment if its - # longest CDS is over - # this length. Default: 100 bps. - # - fragments_maximal_exons: a locus will never be considered a fragment if its - # representative transcript - # has more than this number of exons. Default: 2 # - procs: number of processes to use. Default: 1 # - preload: boolean flag. If set, the whole database will be preloaded into # memory for faster access. Useful when # using SQLite databases. # - single_thread: boolean flag. If set, multithreading will be disabled - useful # for profiling and debugging. + # - consider_truncated_for_retained: boolean. Normally, Mikado considers only + # exons which span a whole intron as possible retained intron events. If this + # flag is set to true, also terminal exons will be considered. + # - remove_overlapping_fragments: DEPRECATED, see clustering. + # - purge: DEPRECATED, see clustering. + consider_truncated_for_retained: false exclude_cds: false - flank: 1000 - fragments_maximal_cds: 100 - fragments_maximal_exons: 2 intron_range: - 60 - 900 preload: false procs: 1 - purge: false - remove_overlapping_fragments: true shm: false shm_db: '' single_thread: false - subloci_from_cds_only: false + Miscellanea ----------- diff --git a/docs/Usage/Utilities.rst b/docs/Usage/Utilities.rst index e6127e77f..baba1892b 100644 --- a/docs/Usage/Utilities.rst +++ b/docs/Usage/Utilities.rst @@ -31,6 +31,33 @@ Usage:: --start START --end END`` +.. _class-codes-command: + +class_codes +~~~~~~~~~~~ + +This utility is used to obtain information about any class code or category thereof. + +Usage:: + + $ mikado util class_codes --help + usage: mikado util class_codes [-h] + [-f {fancy_grid,grid,html,jira,latex,latex_booktabs,mediawiki,moinmoin,orgtbl,pipe,plain,psql,rst,simple,textile,tsv}] + [-c {Intronic,Match,Alternative splicing,Unknown,Fragment,Overlap,Extension,Fusion} [{Intronic,Match,Alternative splicing,Unknown,Fragment,Overlap,Extension,Fusion} ...]] + [-o OUT] + [{,=,_,n,J,c,C,j,h,g,G,o,e,m,i,I,ri,rI,f,x,X,p,P,u} [{,=,_,n,J,c,C,j,h,g,G,o,e,m,i,I,ri,rI,f,x,X,p,P,u} ...]] + + Script to print out the class codes. + + positional arguments: + {[],=,_,n,J,c,C,j,h,g,G,o,e,m,i,I,ri,rI,f,x,X,p,P,u} + Codes to query. 
+ + optional arguments: + -h, --help show this help message and exit + -f {fancy_grid,grid,html,jira,latex,latex_booktabs,mediawiki,moinmoin,orgtbl,pipe,plain,psql,rst,simple,textile,tsv}, --format {fancy_grid,grid,html,jira,latex,latex_booktabs,mediawiki,moinmoin,orgtbl,pipe,plain,psql,rst,simple,textile,tsv} + -c {Intronic,Match,Alternative splicing,Unknown,Fragment,Overlap,Extension,Fusion} [{Intronic,Match,Alternative splicing,Unknown,Fragment,Overlap,Extension,Fusion} ...], --category {Intronic,Match,Alternative splicing,Unknown,Fragment,Overlap,Extension,Fusion} [{Intronic,Match,Alternative splicing,Unknown,Fragment,Overlap,Extension,Fusion} ...] + -o OUT, --out OUT convert ~~~~~~~ @@ -108,7 +135,25 @@ This command generates the documentation regarding the available transcript metr Usage:: - $ mikado util metrics + $ mikado util metrics --help + usage: mikado util metrics [-h] + [-f {fancy_grid,grid,html,jira,latex,latex_booktabs,mediawiki,moinmoin,orgtbl,pipe,plain,psql,rst,simple,textile,tsv}] + [-o OUT] + [-c {CDS,Descriptive,External,Intron,Locus,UTR,cDNA} [{CDS,Descriptive,External,Intron,Locus,UTR,cDNA} ...]] + [metric [metric ...]] + + Simple script to obtain the documentation on the transcript metrics. + + positional arguments: + metric + + optional arguments: + -h, --help show this help message and exit + -f {fancy_grid,grid,html,jira,latex,latex_booktabs,mediawiki,moinmoin,orgtbl,pipe,plain,psql,rst,simple,textile,tsv}, --format {fancy_grid,grid,html,jira,latex,latex_booktabs,mediawiki,moinmoin,orgtbl,pipe,plain,psql,rst,simple,textile,tsv} + Format of the table to be printed out. + -o OUT, --out OUT Optional output file + -c {CDS,Descriptive,External,Intron,Locus,UTR,cDNA} [{CDS,Descriptive,External,Intron,Locus,UTR,cDNA} ...], --category {CDS,Descriptive,External,Intron,Locus,UTR,cDNA} [{CDS,Descriptive,External,Intron,Locus,UTR,cDNA} ...] + Available categories to select from. .. _stat-command: @@ -217,6 +262,24 @@ This script is used to collect statistics obtained with from the :ref:`mikado ut optional arguments: -h, --help show this help message and exit +bam2gtf.py +~~~~~~~~~~ + +This script will use PySam to convert read alignments into a GTF file. Mostly useful to convert from BAM alignment of long reads (eg. PacBio) into a format which Mikado can interpret and use. + +Usage:: + + $ bam2gtf.py --help + usage: Script to convert from BAM to GTF, for PB alignments [-h] bam [out] + + positional arguments: + bam Input BAM file + out Optional output file + + optional arguments: + -h, --help show this help message and exit + + class_run.py ~~~~~~~~~~~~ @@ -270,10 +333,29 @@ Script to extract a list of sequences from a FASTA file, using the `pyfaidx BED12 converter [-h] gff [out] + + positional arguments: + gff + out + + optional arguments: + -h, --help show this help message and exit + grep.py ~~~~~~~ -A script to extract data from *column* files, using a list of targets. More efficient than a standard "grep -f" for this niche case. Usage:: +A script to extract data from *column* files, using a list of targets. More efficient than a standard "grep -f" for this niche case. + +Usage:: $ util/grep.py -h usage: grep.py [-h] [-v] [-s SEPARATOR] [-f FIELD] [-q] ids target [out] @@ -296,6 +378,38 @@ A script to extract data from *column* files, using a list of targets. More effi The field to look in the target file. -q, --quiet No logging. 
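+
+The speed advantage over a plain ``grep -f`` comes from loading the identifiers once
+into a hash set and then testing a single field per target line, rather than scanning
+every line against every pattern. A minimal, hedged Python sketch of the same idea
+(the file names and the choice of the first tab-separated field are illustrative
+assumptions, not the actual code of the script)::
+
+    # Load the identifiers once; set membership tests are O(1) on average.
+    with open("ids.txt") as ids_handle:
+        ids = {line.strip() for line in ids_handle if line.strip()}
+
+    # Print only the rows whose first field matches one of the identifiers.
+    with open("target.tsv") as target:
+        for line in target:
+            if line.split("\t", 1)[0] in ids:
+                print(line, end="")
+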
+merge_junction_bed12.py
+~~~~~~~~~~~~~~~~~~~~~~~
+
+This script will merge [Portcullis]_-like junctions into a single BED12, using the thick start/ends as unique keys.
+
+Usage::
+
+    $ merge_junction_bed12.py --help
+    usage: Script to merge BED12 files *based on the thickStart/End features*.
+    Necessary for merging junction files such as those produced by TopHat
+           [-h] [--delim DELIM] [-t THREADS] [--tophat] [-o OUTPUT] bed [bed ...]
+
+    positional arguments:
+      bed                   Input BED files. Use "-" for stdin.
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      --delim DELIM         Delimiter for merged names. Default: ;
+      -t THREADS, --threads THREADS
+                            Number of threads to use for multiprocessing. Default:
+                            1
+      --tophat              Flag. If set, tophat-like junction style is assumed.
+                            This means that junctions are defined using the
+                            blockSizes rather than thickStart/End. The script will
+                            convert the lines to this latter format. By default,
+                            the script assumes that the intron start/end are
+                            defined using thickStart/End like in portcullis.
+                            Mixed-type input files are not supported.
+      -o OUTPUT, --output OUTPUT
+                            Output file. Default: stdout
+
+
 remove_from_embl.py
 ~~~~~~~~~~~~~~~~~~~
 
@@ -315,6 +429,24 @@ Quick script to remove sequences from a given organism from SwissProt files, and
                         Organism to be excluded
   --format {fasta}      Output format. Choices: fasta. Default: fasta.
 
+sanitize_blast_db.py
+~~~~~~~~~~~~~~~~~~~~
+
+Simple script to clean the headers of FASTA files, so as to avoid runtime errors and incongruences with BLAST and other tools which might be sensitive to long descriptions or the presence of special characters.
+
+Usage::
+
+    $ sanitize_blast_db.py --help
+    usage: sanitize_blast_db.py [-h] [-o OUT] fasta [fasta ...]
+
+    positional arguments:
+      fasta
+
+    optional arguments:
+      -h, --help         show this help message and exit
+      -o OUT, --out OUT
+
+
 split_fasta.py
 ~~~~~~~~~~~~~~