Fix #186 and #189 (#191)
* Fix #189
* Fix #186
* #183: added a static random seed for pick, settable from the CLI.
* #186: introduced a maximum intron length parameter for mikado prepare (prepare/max_intron_length), with a default value of 1M bps and a minimum value of 20 bps; see the configuration sketch after this list.
* #186: there was a very serious bug in the evaluation of truncated ORFs on the negative strand, which potentially led to many of them being called incorrectly at the serialisation stage. Refactored the function responsible for the mishap and added a unit test that confirms the fix.
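A hedged sketch of where the new prepare/max_intron_length parameter would sit once a configuration has been validated against the updated blueprint; the surrounding keys and the dictionary name are illustrative only:

```python
# Hypothetical illustration, not verbatim from the Mikado codebase.
# After validation against configuration_blueprint.json, the loaded
# configuration dictionary would carry the new key with these bounds:
# default 1,000,000 bps, minimum accepted value 20 bps.
json_conf = {
    "prepare": {
        "minimum_length": 200,         # pre-existing floor on cDNA length
        "max_intron_length": 1000000,  # new: longest intron allowed at the prepare stage
    }
}
```

Non-reference transcripts whose longest intron exceeds this value are discarded during mikado prepare (see the filter added to `load_into_storage` in the diff below).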
lucventurini authored Jul 8, 2019
1 parent 89cb531 commit 9aa33f0
Showing 14 changed files with 212 additions and 57 deletions.
3 changes: 0 additions & 3 deletions .travis.yml
@@ -18,11 +18,8 @@ install:
- sed -i "s/defaults::python.*/defaults::python=$TRAVIS_PYTHON_VERSION/" environment.yml
- conda env create -n env_name --file environment.yml
- source activate env_name
- conda install --yes setuptools cython atlas numpy scipy scikit-learn biopython
- conda install --yes -c bioconda diamond prodigal samtools
- pip install pytest-cov codecov;
- if [[ "$(python -c "import sys; print(sys.version_info.minor)")" == "7" ]]; then wget https://github.com/pytries/datrie/archive/0.7.1.tar.gz; tar xf 0.7.1.tar.gz; cd datrie-0.7.1; ./update_c.sh; python3.7 setup.py build; python3.7 setup.py install; cd ../; fi;
- pip install -r requirements.txt
- python setup.py develop;
script:
- touch plants.yaml && python -c "import Mikado; print(Mikado.__version__)" && rm plants.yaml; # This is to verify we fixed bug #124
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -9,7 +9,7 @@ Users are ***very strongly recommended*** to update Mikado as soon as possible.
**IMPORTANT**: this release has completely overhauled the scoring files. We now provide only two ("plant.yaml" and "mammalian.yaml"). "Plant.yaml" should function also for insect or fungal species, but we have not tested it extensively. Old scoring files can be found under "HISTORIC".

Two of the major highlights of this release are:
- the completion of the "padding" functionality. Briefly, if instructed to do so, Mikado will now be able to make the ends of transcripts within a single locus uniform (similar to what was done for the last _Arabidopsis thaliana_ annotation release). The behaviour is controlled by the "pad" boolean switch, and by the "ts_max_splices" and "ts_distance" parameters under "pick"; see the sketch after this list. Please note that "ts_distance" now refers to the **transcriptomic** distance, i.e., long introns are not considered for this purpose.
- general improvements in speed and multiprocessing, as well as flexibility, for the Mikado compare utility.
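A hedged sketch of the switches named in the padding bullet above. The values shown and the exact nesting of these keys inside the "pick" section are assumptions for illustration, not verbatim from the shipped configuration:

```python
# Illustrative only; consult the shipped configuration files for the
# authoritative layout and defaults of these keys under "pick".
pick_section = {
    "pad": True,          # master switch for the padding behaviour
    "ts_max_splices": 2,  # splice sites a padded end may cross (assumed value)
    "ts_distance": 300,   # maximum transcriptomic distance, in bps (assumed value)
}
```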

With this release, we are also officially dropping support for Python 3.4. Python 3.5 will not be tested automatically, as many Conda dependencies are not up to date, which complicates the Travis setup.
3 changes: 3 additions & 0 deletions Mikado/configuration/configuration_blueprint.json
@@ -239,6 +239,9 @@
"minimum_length": {
"type": "integer", "default": 200, "minimum": 1
},
"max_intron_length": {
"type": "integer", "default": 1000000, "minimum": 20
},
"procs": {"type": "integer", "default": 1},
"files": {
"Comment": ["Options related to the input and output files.",
10 changes: 10 additions & 0 deletions Mikado/loci/locus.py
@@ -39,6 +39,11 @@ def __init__(self, transcript: Transcript, logger=None, json_conf=None, **kwargs):

self.counter = 0
transcript.attributes["primary"] = True
if transcript.is_coding:
transcript.feature = "mRNA"
else:
transcript.feature = "ncRNA"

self.counter = 0 # simple tag to avoid collisions
Abstractlocus.__init__(self, logger=logger, json_conf=json_conf, **kwargs)
# this must be defined straight away
@@ -373,7 +378,12 @@ def add_transcript_to_locus(self, transcript: Transcript, check_in_locus=True,
transcript.id, self.id)
transcript.attributes["primary"] = False

if transcript.is_coding:
transcript.feature = "mRNA"
else:
transcript.feature = "ncRNA"
Abstractlocus.add_transcript_to_locus(self, transcript)

self.locus_verified_introns.update(transcript.verified_introns)

def __check_as_requirements(self, transcript: Transcript) -> bool:
59 changes: 38 additions & 21 deletions Mikado/parsers/bed12.py
@@ -8,7 +8,6 @@

import random
import os
from Bio import SeqIO
from Bio import Seq
import Bio.SeqRecord
from . import Parser
@@ -19,6 +18,7 @@
import re
from ..utilities.log_utils import create_null_logger
from Bio.Data import CodonTable
import pysam

standard = CodonTable.ambiguous_dna_by_id[1]
standard.start_codons = ["ATG"]
@@ -396,16 +396,25 @@ def __check_validity(self, transcriptomic, fasta_index, sequence):
self.validity_checked = True
if sequence is not None:
self.fasta_length = len(sequence)
if isinstance(sequence, str):
sequence = Seq.Seq(sequence)
if hasattr(sequence, "seq"):
sequence = str(sequence.seq)
if not isinstance(sequence, str):
sequence = str(sequence)
# if isinstance(sequence, str):
# sequence = Seq.Seq(sequence)
else:
if self.id not in fasta_index:
self.__in_index = False
return

self.fasta_length = len(fasta_index[self.id])
sequence = fasta_index[self.id].seq
sequence = fasta_index[self.id]
if hasattr(sequence, "seq"):
sequence = str(sequence.seq)
if not isinstance(sequence, str):
sequence = str(sequence)

assert isinstance(sequence, str)
# Just double check that the sequence length is the same as what the BED would suggest
if self.invalid is True:
self.coding = False
@@ -415,8 +424,9 @@ def __check_validity(self, transcriptomic, fasta_index, sequence):
orf_sequence = sequence[
(self.thick_start - 1 if not self.phase else self.start + self.phase - 1):self.thick_end]
else:
orf_sequence = sequence[(self.thick_start - 1):(
self.thick_end if not self.phase else self.end - (3 - self.phase) % 3)].reverse_complement()
orf_sequence = Seq.reverse_complement(
sequence[(self.thick_start - 1):(
self.thick_end if not self.phase else self.end - (3 - self.phase) % 3)])

self.start_codon = str(orf_sequence)[:3].upper()
self.stop_codon = str(orf_sequence[-3:]).upper()
@@ -430,8 +440,8 @@ def __check_validity(self, transcriptomic, fasta_index, sequence):

self.has_start_codon = False
if self.start_adjustment is True:
if self.strand == "-":
sequence = sequence.reverse_complement()
# if self.strand == "-":
# sequence = Seq.reverse_complement(sequence)
self._adjust_start(sequence, orf_sequence)

if self.stop_codon in self.table.stop_codons:
@@ -446,7 +456,7 @@ def __check_validity(self, transcriptomic, fasta_index, sequence):
last_pos = -3 - ((len(orf_sequence)) % 3)

if self.__lenient is False:
translated_seq = orf_sequence[:last_pos].translate(table=self.table, gap='N')
translated_seq = Seq.translate(orf_sequence[:last_pos], table=self.table, gap='N')
self.__internal_stop_codons = str(translated_seq).count("*")

if self.invalid is True:
@@ -457,22 +467,27 @@ def _adjust_start(self, sequence, orf_sequence):
assert len(orf_sequence) == (self.thick_end - self.thick_start + 1)
# Let's check UPstream first.
# This means that we DO NOT have a starting Met and yet we are starting far upstream.
if (self.strand == "+" and self.thick_start > 3) or (self.strand == "-" and self.end - self.thick_end > 3):
if self.strand == "+" and self.thick_start > 3:
for pos in range(self.thick_start, 3, -3):
if self.strand == "+":
self.thick_start -= 3
else:
self.thick_end += 3
self.thick_start -= 3
if sequence[pos - 3:pos] in self.table.start_codons:
# We have found a valid methionine.
break
elif sequence[pos - 3:pos] in self.table.stop_codons:
if self.strand == "+":
self.thick_start += 3
else:
self.thick_end -= 3
self.thick_start += 3
break
continue

elif self.strand == "-" and self.end - self.thick_end > 3:
for pos in range(self.thick_end, self.end - 3, 3):
self.thick_end += 3
if Seq.reverse_complement(sequence[pos - 3:pos]) in self.table.start_codons:
# We have found a valid methionine.
break
elif Seq.reverse_complement(sequence[pos - 3:pos]) in self.table.stop_codons:
self.thick_end -= 3
break
print("Thick end:", self.thick_end)
else:
for pos in range(3,
int(len(orf_sequence) * self.max_regression),
@@ -488,6 +503,8 @@
break
else:
continue
print("Thick end:", self.thick_end)

if self.has_start_codon is False:
# The validity will be automatically checked
if self.strand == "+":
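The refactored minus-strand branch above is the heart of the fix for the truncated-ORF bug. A simplified, standalone restatement of that loop for clarity; the function name and signature are illustrative, not Mikado's API:

```python
from Bio.Seq import reverse_complement

def extend_minus_strand_orf(sequence, thick_end, end, start_codons, stop_codons):
    """Walk from thick_end towards the genomic end of the transcript (its
    5' side on the minus strand) one codon at a time, extending the ORF.
    Stop at the first in-frame start codon; if a stop codon appears first,
    back off one codon and give up."""
    for pos in range(thick_end, end - 3, 3):
        thick_end += 3
        codon = reverse_complement(sequence[pos - 3:pos])
        if codon in start_codons:
            break  # found a valid methionine
        if codon in stop_codons:
            thick_end -= 3
            break
    return thick_end
```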
@@ -863,7 +880,7 @@ def expand(self, sequence, upstream, downstream, expand_orf=False, logger=create
if len(coding_seq) % 3 != 0:
# Only get a multiple of three
coding_seq = coding_seq[:-((len(coding_seq)) % 3)]
prot_seq = coding_seq.translate(table=self.table, gap="N")
prot_seq = Seq.translate(coding_seq, table=self.table, gap="N")
if "*" in prot_seq:
self.thick_end = self.thick_start + self.phase - 1 + (1 + prot_seq.find("*")) * 3
self.stop_codon = coding_seq[prot_seq.find("*") * 3:(1 + prot_seq.find("*")) * 3].upper()
@@ -1020,9 +1037,9 @@ def __init__(self, handle,
elif fasta_index is not None:
if isinstance(fasta_index, str):
assert os.path.exists(fasta_index)
fasta_index = SeqIO.index(fasta_index, "fasta")
fasta_index = pysam.FastaFile(fasta_index)
else:
assert "SeqIO" in repr(fasta_index) and "index" in repr(fasta_index)
assert isinstance(fasta_index, pysam.FastaFile)

self.fasta_index = fasta_index
self.__closed = False
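The move from Bio.SeqIO.index to pysam.FastaFile is also why `__check_validity` above now normalises every sequence to a plain string: pysam hands back str, not SeqRecord objects. A minimal usage sketch, assuming an indexed FASTA file (the file and sequence names are hypothetical):

```python
import pysam

# Requires transcripts.fa.fai alongside the FASTA; pysam creates it if absent.
fasta = pysam.FastaFile("transcripts.fa")

seq = fasta.fetch("transcript_1")                    # plain str, no Seq wrapper
length = fasta.get_reference_length("transcript_1")  # length without fetching
present = "transcript_1" in fasta.references         # membership check
```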
47 changes: 40 additions & 7 deletions Mikado/preparation/annotation_parser.py
@@ -26,12 +26,14 @@ def __init__(self,
logging_queue,
identifier,
min_length=0,
max_intron=3*10**5,
log_level="WARNING",
strip_cds=False):

super().__init__()
self.submission_queue = submission_queue
self.min_length = min_length
self.max_intron = max_intron
self.__strip_cds = strip_cds
self.logging_queue = logging_queue
self.log_level = log_level
@@ -86,6 +88,7 @@ def run(self):
found_ids,
self.logger,
min_length=self.min_length,
max_intron=self.max_intron,
strip_cds=self.__strip_cds,
is_reference=is_reference,
strand_specific=strand_specific)
@@ -96,6 +99,7 @@ def run(self):
found_ids,
self.logger,
min_length=self.min_length,
max_intron=self.max_intron,
is_reference=is_reference,
strip_cds=self.__strip_cds,
strand_specific=strand_specific)
@@ -107,6 +111,7 @@
self.logger,
is_reference=is_reference,
min_length=self.min_length,
max_intron=self.max_intron,
strip_cds=self.__strip_cds,
strand_specific=strand_specific)
else:
@@ -164,7 +169,7 @@ def __raise_invalid(row_id, name, label):
"(label: {0})".format(label) if label != '' else ""))


def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True):
def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True, max_intron=3*10**5):

"""Function to load the exon_lines dictionary into the temporary storage."""

@@ -246,9 +251,17 @@ def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True
else:
raise KeyError(exon_lines[tid]["features"])

tlength = sum(exon[1] + 1 - exon[0] for exon in segments)
start = min((_[0] for _ in segments))
end = max((_[1] for _ in segments))
segments = sorted(segments, key=itemgetter(0))
tlength = 0
start, end = segments[0][0], segments[-1][1]
biggest_intron = -1
num_segments = len(segments)
for pos, segment in enumerate(segments):
if pos < num_segments - 1:
later = segments[pos + 1]
biggest_intron = max(biggest_intron,
later[0] - (segment[1] + 1))
tlength += segment[1] + 1 - segment[0]

# Discard transcript under a certain size
if tlength < min_length:
@@ -260,6 +273,17 @@ def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True
tid, tlength, min_length)
continue

# Discard transcripts with introns over the limit
if biggest_intron > max(-1, max_intron):
if exon_lines[tid]["is_reference"] is True:
logger.info(
"%s retained even if its longest intron is over the limit (%d) as it is a reference transcript.",
tid, biggest_intron)
else:
logger.info("Discarding %s because its longest intron (%d) is over the maximum of %d",
tid, biggest_intron, max_intron)
continue

values = json.dumps(exon_lines[tid])

logger.debug("Inserting %s into shelf %s", tid, shelf_name)
@@ -282,6 +306,7 @@ def load_from_gff(shelf_name,
found_ids,
logger,
min_length=0,
max_intron=3*10**5,
is_reference=False,
strip_cds=False,
strand_specific=False):
@@ -297,6 +322,8 @@
:type logger: logging.Logger
:param min_length: minimum length for a cDNA to be considered as valid
:type min_length: int
:param max_intron: maximum intron length for a cDNA to be considered as valid
:type max_intron: int
:param strip_cds: boolean flag. If true, all CDS lines will be ignored.
:type strip_cds: bool
:param strand_specific: whether the assembly is strand-specific or not.
@@ -422,7 +449,7 @@

logger.info("Starting to load %s", shelf_name)
load_into_storage(shelf_name, exon_lines,
logger=logger, min_length=min_length, strip_cds=strip_cds)
logger=logger, min_length=min_length, strip_cds=strip_cds, max_intron=max_intron)

return new_ids

@@ -433,6 +460,7 @@ def load_from_gtf(shelf_name,
found_ids,
logger,
min_length=0,
max_intron=3*10**5,
is_reference=False,
strip_cds=False,
strand_specific=False):
@@ -448,6 +476,8 @@
:type logger: logging.Logger
:param min_length: minimum length for a cDNA to be considered as valid
:type min_length: int
:param max_intron: maximum intron length for a cDNA to be considered as valid
:type max_intron: int
:param strip_cds: boolean flag. If true, all CDS lines will be ignored.
:type strip_cds: bool
:param strand_specific: whether the assembly is strand-specific or not.
@@ -537,7 +567,7 @@
logger.info("Starting to load %s", shelf_name)
load_into_storage(shelf_name,
exon_lines,
logger=logger, min_length=min_length, strip_cds=strip_cds)
logger=logger, min_length=min_length, strip_cds=strip_cds, max_intron=max_intron)

return new_ids

Expand All @@ -548,6 +578,7 @@ def load_from_bed12(shelf_name,
found_ids,
logger,
min_length=0,
max_intron=3*10**5,
is_reference=False,
strip_cds=False,
strand_specific=False):
@@ -563,6 +594,8 @@
:type logger: logging.Logger
:param min_length: minimum length for a cDNA to be considered as valid
:type min_length: int
:param max_intron: maximum intron length for a cDNA to be considered as valid
:type max_intron: int
:param strip_cds: boolean flag. If true, all CDS lines will be ignored.
:type strip_cds: bool
:param strand_specific: whether the assembly is strand-specific or not.
@@ -621,6 +654,6 @@
new_ids.add(transcript.id)
gff_handle.close()
load_into_storage(shelf_name, exon_lines,
logger=logger, min_length=min_length, strip_cds=strip_cds)
logger=logger, min_length=min_length, strip_cds=strip_cds, max_intron=max_intron)

return new_ids