Skip to content

Commit

Permalink
Merge pull request #419 from EI-CoreBioinformatics/perf_issue_ONT_MT
Browse files Browse the repository at this point in the history
Solve performance issues affecting extra-dense superloci in ONT runs.
  • Loading branch information
ljyanesm authored Aug 20, 2021
2 parents 8dfc099 + 05e7660 commit 1bba1e3
Show file tree
Hide file tree
Showing 34 changed files with 1,271 additions and 639 deletions.
55 changes: 34 additions & 21 deletions Mikado/_transcripts/transcript_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,9 +244,9 @@ def __init__(self, *args,
self.loaded_bed12 = []
self.engine, self.session, self.sessionmaker = None, None, None
# Initialisation of the CDS segments used for finding retained introns
self.__cds_tree = IntervalTree()
self._cds_tree = IntervalTree()
self.__expandable = False
self.__segmenttree = IntervalTree()
self._segmenttree = IntervalTree()
self.__cds_introntree = IntervalTree()
self._possibly_without_exons = False
self._accept_undefined_multi = accept_undefined_multi
Expand Down Expand Up @@ -413,17 +413,17 @@ def __initialize_with_bam(self, transcript_row: pysam.AlignedSegment):
del tags["MD"]

# Set the strand
if transcript_row.is_reverse:
self.strand = "-"
else:
self.strand = "+"

if "XS" in tags:
self.strand = tags["XS"]
del tags["XS"]
elif "ts" in tags:
self.strand = tags["ts"] # Minimap2
self.strand = self.strand if tags["ts"] == '+' else self.reverse_strand() # Minimap2
del tags["ts"]
else:
if transcript_row.is_reverse:
self.strand = "-"
else:
self.strand = "+"

self.attributes.update(tags)
self.attributes["coverage"] = coverage
Expand Down Expand Up @@ -574,8 +574,7 @@ def __getstate__(self):

state = dict()
for key, item in self.__dict__.items():
if key in ("_TranscriptBase__segmenttree", "_TranscriptBase__cds_tree",
"_Transcript__segmenttree", "_Transcript__cds_tree"):
if key in ("_segmenttree", "_cds_tree"):
continue
state[key] = item
try:
Expand Down Expand Up @@ -1160,7 +1159,7 @@ def reverse_strand(self):
self.strip_cds()
self.logger.warning("Transcript %s has been assigned to the wrong strand, reversing it.",
self.id)
return
return self.strand

def as_dict(self, remove_attributes=False):

Expand Down Expand Up @@ -2189,24 +2188,26 @@ def cds_tree(self):
"""
This property returns an interval tree of the CDS segments.
"""
if len(self.__cds_tree) != len(self.combined_cds) + len(self.combined_cds_introns):
if not self._is_tree_unchanged(frozenset(self.combined_cds),
frozenset(self.combined_cds_introns),
self._cds_tree):
self._calculate_cds_tree()

return self.__cds_tree
return self._cds_tree

def _calculate_cds_tree(self):

"""
:rtype: IntervalTree
"""

self.__cds_tree = IntervalTree()
self._cds_tree = IntervalTree()

for exon in self.combined_cds:
self.__cds_tree.add(Interval(exon[0], exon[1], value=Interval(exon[0], exon[1], value="CDS")))
self._cds_tree.add(Interval(exon[0], exon[1], value=Interval(exon[0], exon[1], value="CDS")))

for intron in self.combined_cds_introns:
self.__cds_tree.add(Interval(intron[0], intron[1], value=Interval(intron[0], intron[1], value="intron")))
self._cds_tree.add(Interval(intron[0], intron[1], value=Interval(intron[0], intron[1], value="intron")))

return

Expand All @@ -2227,19 +2228,31 @@ def segmenttree(self):
:rtype: IntervalTree
"""

if len(self.__segmenttree) != self.exon_num + len(self.introns):
if not self._is_tree_unchanged(frozenset(self.exons), frozenset(self.introns), self._segmenttree):
self._calculate_segment_tree()

return self.__segmenttree
return self._segmenttree

@staticmethod
@functools.lru_cache(maxsize=100, typed=True)
def _is_tree_unchanged(exons: frozenset, introns: frozenset, tree):
    """Return True if ``tree`` holds exactly the given exons and introns.

    The tree is walked once and every interval is sorted into an "intron"
    or a "non-intron" bucket according to its label; the two buckets are
    then compared with the expected coordinate sets.

    Two label layouts are accepted:

    * a plain label stored directly in ``interval.value`` (e.g. ``"exon"``
      / ``"intron"``);
    * a nested interval stored in ``interval.value`` whose own ``value``
      carries the label (e.g. ``"CDS"`` / ``"intron"``).

    :param exons: expected ``(start, end)`` pairs of the non-intron segments.
    :param introns: expected ``(start, end)`` pairs of the intron segments.
    :param tree: object supporting ``len()`` and ``traverse(callback)``,
        whose intervals expose ``start``, ``end`` and ``value``.
    :return: True when the tree content matches both sets, False otherwise.
    :rtype: bool

    Results are memoised by ``functools.lru_cache``: all three arguments,
    the tree included, must be hashable.
    NOTE(review): because the tree object itself is part of the cache key,
    mutating an already-checked tree in place would return a stale cached
    answer - confirm that callers always build a fresh tree instead.
    """
    # Cheap rejection first: a different interval count can never match.
    if len(tree) != len(exons) + len(introns):
        return False

    found_exons = set()
    found_introns = set()

    def _collect(node):
        # Unwrap a nested interval so that its label, not the wrapper
        # object, is compared against "intron". Plain string labels have
        # no ``value`` attribute and are used as they are.
        label = getattr(node.value, "value", node.value)
        bucket = found_introns if label == "intron" else found_exons
        bucket.add((node.start, node.end))

    tree.traverse(_collect)
    return found_exons == exons and found_introns == introns

def _calculate_segment_tree(self):

self.__segmenttree = IntervalTree()
self._segmenttree = IntervalTree()
for exon in self.exons:
self.__segmenttree.add(Interval(exon[0], exon[1], value="exon"))
self._segmenttree.add(Interval(exon[0], exon[1], value="exon"))

for intron in self.introns:
self.__segmenttree.add(Interval(intron[0], intron[1], value="intron"))
self._segmenttree.add(Interval(intron[0], intron[1], value="intron"))

@property
def derived_children(self):
Expand Down
6 changes: 3 additions & 3 deletions Mikado/configuration/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,14 @@ def select_attribute_for_output(final_config_level: dict, attr_parent, attr_name
is_required = attr_parent.Schema._declared_fields[attr_name].required

# This is not going to happen if we are looking at a nested property
if attr_parent.Schema._declared_fields[attr_name].default == attr_value:
if attr_parent.Schema._declared_fields[attr_name].dump_default == attr_value:
if is_required:
final_config_level[attr_name] = attr_value
else:
return
elif not is_nested:
callable_and_equal_to_default = (callable(attr_parent.Schema._declared_fields[attr_name].default) and
attr_parent.Schema._declared_fields[attr_name].default() == attr_value)
callable_and_equal_to_default = (callable(attr_parent.Schema._declared_fields[attr_name].dump_default) and
attr_parent.Schema._declared_fields[attr_name].dump_default() == attr_value)
if callable_and_equal_to_default and not is_required:
return
else:
Expand Down
Loading

0 comments on commit 1bba1e3

Please sign in to comment.