Unit-tests (EI-CoreBioinformatics#137) seem to say that EI-CoreBioinf…

…ormatics#142 should be solved.
lucventurini · Nov 9, 2018 · 8f18b3c · 8f18b3c
1 parent 2c014de
commit 8f18b3c
Show file tree

Hide file tree

Showing 2 changed files with 102 additions and 30 deletions.
diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py
@@ -943,7 +943,16 @@ def ts_max_splices(self):
 def expand_transcript(transcript: Transcript,
                       start_transcript: [Transcript, bool],
                       end_transcript: [Transcript, bool],
-                      fai, logger):
+                      fai: pyfaidx.Fasta,
+                      logger):
+
+    """This method will enlarge the coordinates and exon structure of a transcript, given:
+    :param transcript: the transcript to modify.
+    :param start_transcript: the template transcript for the 5' end.
+    :param end_transcript: the template transcript for the 3' end.
+    :param fai: the indexed genomic sequence.
+    :param logger: the logger to be used in the function.
+    """
 
     # If there is nothing to do, just get out
     if not start_transcript and not end_transcript:
@@ -965,18 +974,19 @@ def expand_transcript(transcript: Transcript,
     downstream = 0
     down_exons = []
 
-    upstream, up_exons, new_first_exon = _enlarge_start(transcript, backup, start_transcript)
-    downstream, up_exons, down_exons = _enlarge_end(transcript, backup, end_transcript, up_exons, new_first_exon)
+    upstream, up_exons, new_first_exon, up_remove = _enlarge_start(transcript, backup, start_transcript)
+    downstream, up_exons, down_exons, down_remove = _enlarge_end(transcript,
+                                                               backup, end_transcript, up_exons, new_first_exon)
 
     first_exon, last_exon = transcript.exons[0], transcript.exons[-1]
 
     assert upstream >= 0 and downstream >= 0
 
-    if upstream > 0:
+    if up_remove is True:
         # Remove the first exon
         transcript.remove_exon(first_exon)
-    if downstream > 0:
-        if not (upstream > 0 and first_exon == last_exon):
+    if down_remove is True:
+        if not (up_remove is True and first_exon == last_exon):
             transcript.remove_exon(last_exon)
 
     new_exons = up_exons + down_exons
@@ -1023,11 +1033,23 @@ def expand_transcript(transcript: Transcript,
 
 def _enlarge_start(transcript: Transcript,
                    backup: Transcript,
-                   start_transcript: Transcript) -> (int, list, [None, tuple]):
+                   start_transcript: Transcript) -> (int, list, [None, tuple], bool):
+
+    """This method will enlarge the transcript at the 5' end, using another transcript as the template.
+    :param transcript: the original transcript to modify.
+    :param backup: a copy of the transcript. As we are modifying the original one, we do need a hard copy.
+    :param start_transcript: the template transcript.
+
+    The function returns the following:
+    :returns: the upstream modification, the list of upstream exons to add, the new first exon (if any),
+              a boolean flag indicating whether the first exon of the transcript should be removed.
+    """
+
 
     upstream = 0
     up_exons = []
     new_first_exon = None
+    to_remove = False
     if start_transcript:
         transcript.start = start_transcript.start
         upstream_exons = sorted([ _ for _ in
@@ -1047,6 +1069,7 @@ def _enlarge_start(transcript: Transcript,
             if new_first_exon != transcript.exons[0]:
                 upstream += backup.start - new_first_exon[0]
                 up_exons.append(new_first_exon)
+                to_remove = True
             else:
                 new_first_exon = None
             if intersecting_upstream[0] in upstream_exons:
@@ -1063,17 +1086,33 @@ def _enlarge_start(transcript: Transcript,
             up_exons.extend([(_[0], _[1]) for _ in upstream_exons])
             up_exons.append(new_first_exon)
 
-    return upstream, up_exons, new_first_exon
+    return upstream, up_exons, new_first_exon, to_remove
 
 
 def _enlarge_end(transcript: Transcript,
                  backup: Transcript,
                  end_transcript: Transcript,
                  up_exons: list,
-                 new_first_exon: [None, tuple]):
+                 new_first_exon: [None, tuple]) -> [int, list, list, bool]:
+
+    """
+    This method will enlarge the transcript at the 3' end, using another transcript as the template.
+    :param transcript: the original transcript to modify.
+    :param backup: a copy of the transcript. As we are modifying the original one, we do need a hard copy.
+    :param end_transcript: the template transcript.
+    :param up_exons: the list of exons added at the 5' end.
+    :param new_first_exon: the new coordinates of what used to be the first exon of the transcript.
+                           This is necessary because if the transcript is monoexonic, we might need to re-modify it.
+
+    The function returns the following:
+    :returns: the downstream modification, the (potentially modified) list of upstream exons to add,
+              the list of downstream exons to add, a boolean flag indicating whether the last exon of the transcript
+              should be removed.
+    """
 
     downstream = 0
     down_exons = []
+    to_remove = False
 
     if end_transcript:
         transcript.end = end_transcript.end
@@ -1093,12 +1132,14 @@ def _enlarge_end(transcript: Transcript,
                     up_exons.remove(new_first_exon)
                     downstream += new_exon[1] - backup.end
                     down_exons.append(new_exon)
+                    to_remove = True
             else:
                 new_exon = (transcript.exons[-1][0],
                             max(intersecting_downstream[-1][1], transcript.exons[-1][1]))
                 if new_exon != transcript.exons[-1]:
                     downstream += new_exon[1] - backup.end
                     down_exons.append(new_exon)
+                    to_remove = True
 
             if intersecting_downstream[-1] in downstream_exons:
                 downstream_exons.remove(intersecting_downstream[-1])
@@ -1112,18 +1153,34 @@ def _enlarge_end(transcript: Transcript,
             if transcript.monoexonic and new_first_exon is not None:
                 new_exon = (new_first_exon[0], downstream_exon[1])
                 up_exons.remove(new_first_exon)
+                to_remove = True
             else:
                 new_exon = (transcript.exons[-1][0], downstream_exon[1])
+                to_remove = True
             downstream_exons.remove(downstream_exon)
             downstream += new_exon[1] - backup.end
             downstream += sum(_[1] - _[0] + 1 for _ in downstream_exons)
             down_exons.extend([(_[0], _[1]) for _ in downstream_exons])
             down_exons.append(new_exon)
 
-    return downstream, up_exons, down_exons
+    return downstream, up_exons, down_exons, to_remove
 
 
-def check_expanded(transcript, backup, start_transcript, end_transcript, fai, upstream, downstream, logger):
+def check_expanded(transcript, backup, start_transcript, end_transcript, fai, upstream, downstream, logger) -> str:
+
+    """
+    This function checks that the expanded transcript is valid, and it also calculates and returns its cDNA sequence.
+    :param transcript: the modified transcript.
+    :param backup: The original transcript, before expansion.
+    :param start_transcript: the transcript used as template at the 5' end.
+    :param end_transcript: the transcript used as template at the 3' end.
+    :param fai: The pyfaidx.Fasta object indexing the genome.
+    :param upstream: the amount of transcriptomic base-pairs added to the transcript at its 5' end.
+    :param downstream: the amount of transcriptomic base-pairs added to the transcript at its 3' end.
+    :param logger: the logger to use.
+    :returns: the cDNA of the modified transcript, as a standard Python string.
+    """
+
     assert transcript.exons != backup.exons
 
     assert transcript.end <= len(fai[transcript.chrom]), (transcript.end, len(fai[transcript.chrom]))
@@ -1141,11 +1198,14 @@ def check_expanded(transcript, backup, start_transcript, end_transcript, fai, up
         error = [len(seq), backup.cdna_length + upstream + downstream,
                  backup.cdna_length, upstream, downstream,
                  (transcript.start, transcript.end), (backup.id, backup.start, backup.end),
-                 (None if not start_transcript else (start_transcript.id, start_transcript.end)),
-                 (None if not end_transcript else (end_transcript.id, end_transcript.end)),
-                 (backup.exons,
-                  None if not start_transcript else start_transcript.exons,
-                  None if not end_transcript else end_transcript.exons),
+                 (None if not start_transcript else (start_transcript.id, (start_transcript.start,
+                                                                           start_transcript.end))),
+                 (None if not end_transcript else (end_transcript.id, (end_transcript.start,
+                                                                       end_transcript.end))),
+                 (backup.id, backup.exons),
+                 None if not start_transcript else (start_transcript.id, start_transcript.exons),
+                 None if not end_transcript else (end_transcript.id, end_transcript.exons),
+                 (transcript.id + "_expanded", transcript.exons),
                  set.difference(set(transcript.exons), set(backup.exons)),
                  set.difference(set(backup.exons), set(transcript.exons))
                  ]
@@ -1161,6 +1221,18 @@ def enlarge_orfs(transcript: Transcript,
                  downstream: int,
                  logger) -> Transcript:
 
+    """
+    This method will take an expanded transcript and recalculate its ORF(s). As a consequence of the expansion,
+    truncated transcripts might become whole.
+    :param transcript: the expanded transcript.
+    :param backup: the original transcript. Used to extract the original ORF(s).
+    :param seq: the new cDNA sequence of the expanded transcript.
+    :param upstream: the amount of expansion that happened at the 5'.
+    :param downstream: the amount of expansion that happened at the 3'.
+    :param logger: the logger.
+    :returns: the modified transcript with the ORF(s) recalculated.
+    """
+
     if backup.combined_cds_length > 0:
         try:
             internal_orfs = list(backup.get_internal_orf_beds())

diff --git a/docs/Algorithms.rst b/docs/Algorithms.rst
@@ -617,30 +617,30 @@ Padding transcripts
 
 Mikado can optionally "pad" transcripts so as to make their starts and stops as uniform as possible. The procedure is as follows:
 
-#. Transcripts can be padded on one end if there is a **template** transcript for which the extension:
+1. Transcripts can be padded on one end if there is a **template** transcript for which the extension:
   - would have a *genomic* distance to the current end equal to *at most* a number of base-pairs specified under "ts_distance"
   - would not require crossing more splice junctions in the template than the number specified under "ts_max_splices"
-#. Create a copy of the transcripts in the locus, for backtracking.
-#. After selecting the templates and the attached transcripts, expand the transcript.
-  #. Create a copy of the transcript for backtracking
-  #. Calculate whether the 5' terminal exon should be enlarged:
+2. Create a copy of the transcripts in the locus, for backtracking.
+3. After selecting the templates and the attached transcripts, expand the transcript.
+  a. Create a copy of the transcript for backtracking
+  b. Calculate whether the 5' terminal exon should be enlarged:
     - if the transcript exon terminally overlaps a template exon, enlarge it until the end of the template
     - If the template transcript has multiple exons upstream of the expanded exon, add those to the transcript.
     - Calculate the number of bases that have been added upstream to the cDNA of the transcript
-  #. Calculate whether the 3' terminal exon should be enlarged:
+  c. Calculate whether the 3' terminal exon should be enlarged:
     - if the transcript exon terminally overlaps a template exon, enlarge it until the end of the template
     - If the template transcript has multiple exons downstream of the expanded exon, add those to the transcript.
     - Calculate the number of bases that have been added downstream to the cDNA of the transcript
-  #. If the transcript is coding:
-    #. Calculate the new putative CDS positions in the transcript, using the memoized amount of added basepairs downstream and upstream
-    #. Calculate the new CDS, **keeping the same frame as the original transcript**. If the transcript is incomplete, this might lead to find the proper start and stop codons
-    #. If we find an in-frame stop codon, the expansion would lead to an invalid transcript. Backtrack.
-#. Recalculate metrics and scores.
-#. Check whether we have made any transcript an invalid alternative splicing event; possible common causes include:
+  d. If the transcript is coding:
+    I. Calculate the new putative CDS positions in the transcript, using the memoized amount of added basepairs downstream and upstream
+    II. Calculate the new CDS, **keeping the same frame as the original transcript**. If the transcript is incomplete, this might lead to finding the proper start and stop codons.
+    III. If we find an in-frame stop codon, the expansion would lead to an invalid transcript. Backtrack.
+4. Recalculate metrics and scores.
+5. Check whether we have made any transcript an invalid alternative splicing event; possible common causes include:
   - Having created a retained intron
   - Having expanded the number or size of the UTR so that the transcripts are no longer viable
-#. If any of the non-viable transcripts is either the primary transcript or one of the templates, remove the current templates from the locus and restart the analysis.
-#. Discard all the non-viable transcripts that are neither the primary nor templates.
+6. If any of the non-viable transcripts is either the primary transcript or one of the templates, remove the current templates from the locus and restart the analysis.
+7. Discard all the non-viable transcripts that are neither the primary nor templates.
 
 This option is normally disabled. It has been written for using Mikado in conjunction with *ab initio* predictions, but it can be used fruitfully also with transcript assemblies.
 Please note that some of the metrics might become invalid after the padding. In particular, BLASTX results will be invalid as the query sequence will have changed.