Fix #186 and #189 (#191)
* Fix #189
* Fix #186
* #183: added a static random seed for pick, settable from the CLI.
* #186: introduced a maximum intron length parameter for mikado prepare (prepare/max_intron_length), with a default value of 1M bps and a minimum value of 20 bps; see the configuration sketch after this list.
* #186: there was a very serious bug in the evaluation of truncated ORFs on the negative strand, which potentially led to many of them being called incorrectly at the serialisation stage. Refactored the function responsible for the mishap and added a unit test that confirms the fix.
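A hedged sketch of where the new prepare/max_intron_length parameter would sit once a configuration has been validated against the updated blueprint; the surrounding keys and the dictionary name are illustrative only:

```python
# Hypothetical illustration, not verbatim from the Mikado codebase.
# After validation against configuration_blueprint.json, the loaded
# configuration dictionary would carry the new key with these bounds:
# default 1,000,000 bps, minimum accepted value 20 bps.
json_conf = {
    "prepare": {
        "minimum_length": 200,         # pre-existing floor on cDNA length
        "max_intron_length": 1000000,  # new: longest intron allowed at the prepare stage
    }
}
```

Non-reference transcripts whose longest intron exceeds this value are discarded during mikado prepare (see the filter added to `load_into_storage` in the diff below).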
lucventurini authored Jul 8, 2019
1 parent 89cb531 commit 9aa33f0
Showing 14 changed files with 212 additions and 57 deletions.
3 changes: 0 additions & 3 deletions .travis.yml
@@ -18,11 +18,8 @@ install:
- sed -i "s/defaults::python.*/defaults::python=$TRAVIS_PYTHON_VERSION/" environment.yml
- conda env create -n env_name --file environment.yml
- source activate env_name
- conda install --yes setuptools cython atlas numpy scipy scikit-learn biopython
- conda install --yes -c bioconda diamond prodigal samtools
- pip install pytest-cov codecov;
- if [[ "$(python -c "import sys; print(sys.version_info.minor)")" == "7" ]]; then wget https://github.com/pytries/datrie/archive/0.7.1.tar.gz; tar xf 0.7.1.tar.gz; cd datrie-0.7.1; ./update_c.sh; python3.7 setup.py build; python3.7 setup.py install; cd ../; fi;
- pip install -r requirements.txt
- python setup.py develop;
script:
- touch plants.yaml && python -c "import Mikado; print(Mikado.__version__)" && rm plants.yaml; # This is to verify we fixed bug #124
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -9,7 +9,7 @@ Users are ***very strongly recommended*** to update Mikado as soon as possible.
**IMPORTANT**: this release has completely overhauled the scoring files. We now provide only two ("plant.yaml" and "mammalian.yaml"). "Plant.yaml" should function also for insect or fungal species, but we have not tested it extensively. Old scoring files can be found under "HISTORIC".

Two of the major highlights of this release are:
- the completion of the "padding" functionality. Briefly, if instructed to do so, Mikado will now be able to make the ends of transcripts within a single locus uniform (similar to what was done for the last _Arabidopsis thaliana_ annotation release). The behaviour is controlled by the "pad" boolean switch, and by the "ts_max_splices" and "ts_distance" parameters under "pick"; see the sketch after this list. Please note that "ts_distance" now refers to the **transcriptomic** distance, i.e., long introns are not considered for this purpose.
- general improvements in speed and multiprocessing, as well as flexibility, for the Mikado compare utility.
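A hedged sketch of the switches named in the padding bullet above. The values shown and the exact nesting of these keys inside the "pick" section are assumptions for illustration, not verbatim from the shipped configuration:

```python
# Illustrative only; consult the shipped configuration files for the
# authoritative layout and defaults of these keys under "pick".
pick_section = {
    "pad": True,          # master switch for the padding behaviour
    "ts_max_splices": 2,  # splice sites a padded end may cross (assumed value)
    "ts_distance": 300,   # maximum transcriptomic distance, in bps (assumed value)
}
```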

With this release, we are also officially dropping support for Python 3.4. Python 3.5 will not be tested automatically, as many Conda dependencies are not up to date, which complicates the Travis setup.
3 changes: 3 additions & 0 deletions Mikado/configuration/configuration_blueprint.json
@@ -239,6 +239,9 @@
"minimum_length": {
"type": "integer", "default": 200, "minimum": 1
},
"max_intron_length": {
"type": "integer", "default": 1000000, "minimum": 20
},
"procs": {"type": "integer", "default": 1},
"files": {
"Comment": ["Options related to the input and output files.",
10 changes: 10 additions & 0 deletions Mikado/loci/locus.py
@@ -39,6 +39,11 @@ def __init__(self, transcript: Transcript, logger=None, json_conf=None, **kwargs):

self.counter = 0
transcript.attributes["primary"] = True
if transcript.is_coding:
transcript.feature = "mRNA"
else:
transcript.feature = "ncRNA"

self.counter = 0 # simple tag to avoid collisions
Abstractlocus.__init__(self, logger=logger, json_conf=json_conf, **kwargs)
# this must be defined straight away
@@ -373,7 +378,12 @@ def add_transcript_to_locus(self, transcript: Transcript, check_in_locus=True,
transcript.id, self.id)
transcript.attributes["primary"] = False

if transcript.is_coding:
transcript.feature = "mRNA"
else:
transcript.feature = "ncRNA"
Abstractlocus.add_transcript_to_locus(self, transcript)

self.locus_verified_introns.update(transcript.verified_introns)

def __check_as_requirements(self, transcript: Transcript) -> bool:
59 changes: 38 additions & 21 deletions Mikado/parsers/bed12.py
@@ -8,7 +8,6 @@

import random
import os
from Bio import SeqIO
from Bio import Seq
import Bio.SeqRecord
from . import Parser
@@ -19,6 +18,7 @@
import re
from ..utilities.log_utils import create_null_logger
from Bio.Data import CodonTable
import pysam

standard = CodonTable.ambiguous_dna_by_id[1]
standard.start_codons = ["ATG"]
@@ -396,16 +396,25 @@ def __check_validity(self, transcriptomic, fasta_index, sequence):
self.validity_checked = True
if sequence is not None:
self.fasta_length = len(sequence)
if isinstance(sequence, str):
sequence = Seq.Seq(sequence)
if hasattr(sequence, "seq"):
sequence = str(sequence.seq)
if not isinstance(sequence, str):
sequence = str(sequence)
# if isinstance(sequence, str):
# sequence = Seq.Seq(sequence)
else:
if self.id not in fasta_index:
self.__in_index = False
return

self.fasta_length = len(fasta_index[self.id])
sequence = fasta_index[self.id].seq
sequence = fasta_index[self.id]
if hasattr(sequence, "seq"):
sequence = str(sequence.seq)
if not isinstance(sequence, str):
sequence = str(sequence)

assert isinstance(sequence, str)
# Just double check that the sequence length is the same as what the BED would suggest
if self.invalid is True:
self.coding = False
@@ -415,8 +424,9 @@ def __check_validity(self, transcriptomic, fasta_index, sequence):
orf_sequence = sequence[
(self.thick_start - 1 if not self.phase else self.start + self.phase - 1):self.thick_end]
else:
orf_sequence = sequence[(self.thick_start - 1):(
self.thick_end if not self.phase else self.end - (3 - self.phase) % 3)].reverse_complement()
orf_sequence = Seq.reverse_complement(
sequence[(self.thick_start - 1):(
self.thick_end if not self.phase else self.end - (3 - self.phase) % 3)])

self.start_codon = str(orf_sequence)[:3].upper()
self.stop_codon = str(orf_sequence[-3:]).upper()
@@ -430,8 +440,8 @@ def __check_validity(self, transcriptomic, fasta_index, sequence):

self.has_start_codon = False
if self.start_adjustment is True:
if self.strand == "-":
sequence = sequence.reverse_complement()
# if self.strand == "-":
# sequence = Seq.reverse_complement(sequence)
self._adjust_start(sequence, orf_sequence)

if self.stop_codon in self.table.stop_codons:
@@ -446,7 +456,7 @@ def __check_validity(self, transcriptomic, fasta_index, sequence):
last_pos = -3 - ((len(orf_sequence)) % 3)

if self.__lenient is False:
translated_seq = orf_sequence[:last_pos].translate(table=self.table, gap='N')
translated_seq = Seq.translate(orf_sequence[:last_pos], table=self.table, gap='N')
self.__internal_stop_codons = str(translated_seq).count("*")

if self.invalid is True:
@@ -457,22 +467,27 @@ def _adjust_start(self, sequence, orf_sequence):
assert len(orf_sequence) == (self.thick_end - self.thick_start + 1)
# Let's check UPstream first.
# This means that we DO NOT have a starting Met and yet we are starting far upstream.
if (self.strand == "+" and self.thick_start > 3) or (self.strand == "-" and self.end - self.thick_end > 3):
if self.strand == "+" and self.thick_start > 3:
for pos in range(self.thick_start, 3, -3):
if self.strand == "+":
self.thick_start -= 3
else:
self.thick_end += 3
self.thick_start -= 3
if sequence[pos - 3:pos] in self.table.start_codons:
# We have found a valid methionine.
break
elif sequence[pos - 3:pos] in self.table.stop_codons:
if self.strand == "+":
self.thick_start += 3
else:
self.thick_end -= 3
self.thick_start += 3
break
continue

elif self.strand == "-" and self.end - self.thick_end > 3:
for pos in range(self.thick_end, self.end - 3, 3):
self.thick_end += 3
if Seq.reverse_complement(sequence[pos - 3:pos]) in self.table.start_codons:
# We have found a valid methionine.
break
elif Seq.reverse_complement(sequence[pos - 3:pos]) in self.table.stop_codons:
self.thick_end -= 3
break
print("Thick end:", self.thick_end)
else:
for pos in range(3,
int(len(orf_sequence) * self.max_regression),
@@ -488,6 +503,8 @@
break
else:
continue
print("Thick end:", self.thick_end)

if self.has_start_codon is False:
# The validity will be automatically checked
if self.strand == "+":
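The refactored minus-strand branch above is the heart of the fix for the truncated-ORF bug. A simplified, standalone restatement of that loop for clarity; the function name and signature are illustrative, not Mikado's API:

```python
from Bio.Seq import reverse_complement

def extend_minus_strand_orf(sequence, thick_end, end, start_codons, stop_codons):
    """Walk from thick_end towards the genomic end of the transcript (its
    5' side on the minus strand) one codon at a time, extending the ORF.
    Stop at the first in-frame start codon; if a stop codon appears first,
    back off one codon and give up."""
    for pos in range(thick_end, end - 3, 3):
        thick_end += 3
        codon = reverse_complement(sequence[pos - 3:pos])
        if codon in start_codons:
            break  # found a valid methionine
        if codon in stop_codons:
            thick_end -= 3
            break
    return thick_end
```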
@@ -863,7 +880,7 @@ def expand(self, sequence, upstream, downstream, expand_orf=False, logger=create
if len(coding_seq) % 3 != 0:
# Only get a multiple of three
coding_seq = coding_seq[:-((len(coding_seq)) % 3)]
prot_seq = coding_seq.translate(table=self.table, gap="N")
prot_seq = Seq.translate(coding_seq, table=self.table, gap="N")
if "*" in prot_seq:
self.thick_end = self.thick_start + self.phase - 1 + (1 + prot_seq.find("*")) * 3
self.stop_codon = coding_seq[prot_seq.find("*") * 3:(1 + prot_seq.find("*")) * 3].upper()
@@ -1020,9 +1037,9 @@ def __init__(self, handle,
elif fasta_index is not None:
if isinstance(fasta_index, str):
assert os.path.exists(fasta_index)
fasta_index = SeqIO.index(fasta_index, "fasta")
fasta_index = pysam.FastaFile(fasta_index)
else:
assert "SeqIO" in repr(fasta_index) and "index" in repr(fasta_index)
assert isinstance(fasta_index, pysam.FastaFile)

self.fasta_index = fasta_index
self.__closed = False
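The move from Bio.SeqIO.index to pysam.FastaFile is also why `__check_validity` above now normalises every sequence to a plain string: pysam hands back str, not SeqRecord objects. A minimal usage sketch, assuming an indexed FASTA file (the file and sequence names are hypothetical):

```python
import pysam

# Requires transcripts.fa.fai alongside the FASTA; pysam creates it if absent.
fasta = pysam.FastaFile("transcripts.fa")

seq = fasta.fetch("transcript_1")                    # plain str, no Seq wrapper
length = fasta.get_reference_length("transcript_1")  # length without fetching
present = "transcript_1" in fasta.references         # membership check
```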
47 changes: 40 additions & 7 deletions Mikado/preparation/annotation_parser.py
@@ -26,12 +26,14 @@ def __init__(self,
logging_queue,
identifier,
min_length=0,
max_intron=3*10**5,
log_level="WARNING",
strip_cds=False):

super().__init__()
self.submission_queue = submission_queue
self.min_length = min_length
self.max_intron = max_intron
self.__strip_cds = strip_cds
self.logging_queue = logging_queue
self.log_level = log_level
@@ -86,6 +88,7 @@ def run(self):
found_ids,
self.logger,
min_length=self.min_length,
max_intron=self.max_intron,
strip_cds=self.__strip_cds,
is_reference=is_reference,
strand_specific=strand_specific)
@@ -96,6 +99,7 @@ def run(self):
found_ids,
self.logger,
min_length=self.min_length,
max_intron=self.max_intron,
is_reference=is_reference,
strip_cds=self.__strip_cds,
strand_specific=strand_specific)
@@ -107,6 +111,7 @@
self.logger,
is_reference=is_reference,
min_length=self.min_length,
max_intron=self.max_intron,
strip_cds=self.__strip_cds,
strand_specific=strand_specific)
else:
@@ -164,7 +169,7 @@ def __raise_invalid(row_id, name, label):
"(label: {0})".format(label) if label != '' else ""))


def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True):
def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True, max_intron=3*10**5):

"""Function to load the exon_lines dictionary into the temporary storage."""

@@ -246,9 +251,17 @@ def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True
else:
raise KeyError(exon_lines[tid]["features"])

tlength = sum(exon[1] + 1 - exon[0] for exon in segments)
start = min((_[0] for _ in segments))
end = max((_[1] for _ in segments))
segments = sorted(segments, key=itemgetter(0))
tlength = 0
start, end = segments[0][0], segments[-1][1]
biggest_intron = -1
num_segments = len(segments)
for pos, segment in enumerate(segments):
if pos < num_segments - 1:
later = segments[pos + 1]
biggest_intron = max(biggest_intron,
later[0] - (segment[1] + 1))
tlength += segment[1] + 1 - segment[0]

# Discard transcript under a certain size
if tlength < min_length:
@@ -260,6 +273,17 @@ def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True
tid, tlength, min_length)
continue

# Discard transcripts with introns over the limit
if biggest_intron > max(-1, max_intron):
if exon_lines[tid]["is_reference"] is True:
logger.info(
"%s retained even if its longest intron is over the limit (%d) as it is a reference transcript.",
tid, biggest_intron)
else:
logger.info("Discarding %s because its longest intron (%d) is over the maximum of %d",
tid, biggest_intron, max_intron)
continue

values = json.dumps(exon_lines[tid])

logger.debug("Inserting %s into shelf %s", tid, shelf_name)
@@ -282,6 +306,7 @@ def load_from_gff(shelf_name,
found_ids,
logger,
min_length=0,
max_intron=3*10**5,
is_reference=False,
strip_cds=False,
strand_specific=False):
@@ -297,6 +322,8 @@
:type logger: logging.Logger
:param min_length: minimum length for a cDNA to be considered as valid
:type min_length: int
:param max_intron: maximum intron length for a cDNA to be considered as valid
:type max_intron: int
:param strip_cds: boolean flag. If true, all CDS lines will be ignored.
:type strip_cds: bool
:param strand_specific: whether the assembly is strand-specific or not.
@@ -422,7 +449,7 @@

logger.info("Starting to load %s", shelf_name)
load_into_storage(shelf_name, exon_lines,
logger=logger, min_length=min_length, strip_cds=strip_cds)
logger=logger, min_length=min_length, strip_cds=strip_cds, max_intron=max_intron)

return new_ids

@@ -433,6 +460,7 @@ def load_from_gtf(shelf_name,
found_ids,
logger,
min_length=0,
max_intron=3*10**5,
is_reference=False,
strip_cds=False,
strand_specific=False):
@@ -448,6 +476,8 @@
:type logger: logging.Logger
:param min_length: minimum length for a cDNA to be considered as valid
:type min_length: int
:param max_intron: maximum intron length for a cDNA to be considered as valid
:type max_intron: int
:param strip_cds: boolean flag. If true, all CDS lines will be ignored.
:type strip_cds: bool
:param strand_specific: whether the assembly is strand-specific or not.
@@ -537,7 +567,7 @@
logger.info("Starting to load %s", shelf_name)
load_into_storage(shelf_name,
exon_lines,
logger=logger, min_length=min_length, strip_cds=strip_cds)
logger=logger, min_length=min_length, strip_cds=strip_cds, max_intron=max_intron)

return new_ids

Expand All @@ -548,6 +578,7 @@ def load_from_bed12(shelf_name,
found_ids,
logger,
min_length=0,
max_intron=3*10**5,
is_reference=False,
strip_cds=False,
strand_specific=False):
@@ -563,6 +594,8 @@
:type logger: logging.Logger
:param min_length: minimum length for a cDNA to be considered as valid
:type min_length: int
:param max_intron: maximum intron length for a cDNA to be considered as valid
:type max_intron: int
:param strip_cds: boolean flag. If true, all CDS lines will be ignored.
:type strip_cds: bool
:param strand_specific: whether the assembly is strand-specific or not.
@@ -621,6 +654,6 @@
new_ids.add(transcript.id)
gff_handle.close()
load_into_storage(shelf_name, exon_lines,
logger=logger, min_length=min_length, strip_cds=strip_cds)
logger=logger, min_length=min_length, strip_cds=strip_cds, max_intron=max_intron)

return new_ids