From ab878b9824f95fc005aa5c18193750c38934047f Mon Sep 17 00:00:00 2001
From: Luca Venturini <lucventurini@gmail.com>
Date: Wed, 10 Oct 2018 12:45:20 +0100
Subject: [PATCH] Fixed #34

---
 CHANGELOG.md                                  |  1 +
 .../configuration_blueprint.json              | 16 +++++++++++++++-
 Mikado/parsers/bed12.py                       | 10 +++++++---
 Mikado/serializers/orf.py                     |  4 +++-
 Mikado/transcripts/transcript.py              | 19 ++++++++++++++++---
 .../transcript_methods/printing.py            |  2 +-
 .../transcript_methods/splitting.py           |  2 +-
 Mikado/transcripts/transcriptchecker.py       |  4 ----
 8 files changed, 44 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 17d7d4564..1fb856af7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ Bugfixes and improvements:
 - Fixed a bug which caused some loci to crash at the last part of the picking stage
 - Now coding and non-coding transcripts will be in different loci.
 - Mikado prepare now can accept models that lack any exon features but still have valid CDS/UTR features
+- Fixed [#34](https://github.com/lucventurini/mikado/issues/34): Mikado now accepts a user-specified codon table, chosen among those provided by [NCBI through BioPython](ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt). The default is "0", i.e. the Standard table but with only the canonical "ATG" being accepted as a valid start codon.
 - Fixed [#126](https://github.com/lucventurini/mikado/issues/126): now reversing the strand of a model will cause its CDS to be stripped.
 - Fixed [#127](https://github.com/lucventurini/mikado/issues/127): previously, Mikado _prepare_ only considered cDNA coordinates when determining the redundancy of two models. In some edge cases, two models could be identical but have a different ORF called. Now Mikado will also consider the CDS before deciding whether to discard a model as redundant.
 - [#129](https://github.com/lucventurini/mikado/issues/129): Mikado is now capable of correctly padding the transcripts so to uniform their ends in a single locus. This will also have the effect of trying to enlarge the ORF of a transcript if it is truncated to begin with.
diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json
index 16f5efe0c..a92896aaf 100644
--- a/Mikado/configuration/configuration_blueprint.json
+++ b/Mikado/configuration/configuration_blueprint.json
@@ -189,7 +189,21 @@
         "max_target_seqs": {"type": "integer", "default": 100000, "minimum": 1},
         "force": {"type": "boolean", "default": false},
         "single_thread": {"type": "boolean", "default": false},
-        "procs": {"type": "integer", "default": 1, "minimum": 1}
+        "procs": {"type": "integer", "default": 1, "minimum": 1},
+        "codon_table": {
+              "default": 0,
+              "enum": [0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                  "Standard", "SGC0", "Vertebrate Mitochondrial", "SGC1", "Yeast Mitochondrial", "SGC2", "Mold Mitochondrial",
+                  "Protozoan Mitochondrial", "Coelenterate Mitochondrial", "Mycoplasma", "Spiroplasma", "SGC3",
+                  "Invertebrate Mitochondrial", "SGC4", "Ciliate Nuclear", "Dasycladacean Nuclear",
+                  "Hexamita Nuclear", "SGC5", "Echinoderm Mitochondrial", "Flatworm Mitochondrial",
+                  "SGC8", "Euplotid Nuclear", "SGC9", "Bacterial", "Archaeal", "Plant Plastid",
+                  "Alternative Yeast Nuclear", "Ascidian Mitochondrial", "Alternative Flatworm Mitochondrial",
+                  "Blepharisma Macronuclear", "Chlorophycean Mitochondrial", "Trematode Mitochondrial",
+                  "Scenedesmus obliquus Mitochondrial", "Thraustochytrium Mitochondrial", "Pterobranchia Mitochondrial",
+                  "Candidate Division SR1", "Gracilibacteria", "Pachysolen tannophilus Nuclear", "Karyorelict Nuclear",
+                  "Condylostoma Nuclear", "Mesodinium Nuclear", "Peritrich Nuclear", "Blastocrithidia Nuclear"]
+            }
       }
     },
     "prepare":{
diff --git a/Mikado/parsers/bed12.py b/Mikado/parsers/bed12.py
index ff5622db7..2ea34046d 100644
--- a/Mikado/parsers/bed12.py
+++ b/Mikado/parsers/bed12.py
@@ -912,7 +912,8 @@ def __init__(self, handle,
                  transcriptomic=False,
                  max_regression=0,
                  is_gff=False,
-                 coding=False):
+                 coding=False,
+                 table=0):
         """
         Constructor method.
         :param handle: the input BED file.
@@ -949,6 +950,7 @@ def __init__(self, handle,
         self.fasta_index = fasta_index
         self.__closed = False
         self.header = False
+        self.__table = table
         self._is_bed12 = (not is_gff)
 
     def __iter__(self):
@@ -976,7 +978,8 @@ def bed_next(self):
                           fasta_index=self.fasta_index,
                           transcriptomic=self.transcriptomic,
                           max_regression=self._max_regression,
-                          coding=self.coding)
+                          coding=self.coding,
+                          table=self.__table)
         return bed12
 
     def gff_next(self):
@@ -998,7 +1001,8 @@ def gff_next(self):
             bed12 = BED12(line,
                           fasta_index=self.fasta_index,
                           transcriptomic=self.transcriptomic,
-                          max_regression=self._max_regression)
+                          max_regression=self._max_regression,
+                          table=self.__table)
         # raise NotImplementedError("Still working on this!")
         return bed12
 
diff --git a/Mikado/serializers/orf.py b/Mikado/serializers/orf.py
index 0dceccd4d..14c51c9d0 100644
--- a/Mikado/serializers/orf.py
+++ b/Mikado/serializers/orf.py
@@ -172,6 +172,7 @@ def __init__(self,
 
         fasta_index = json_conf["serialise"]["files"]["transcripts"]
         self._max_regression = json_conf["serialise"]["max_regression"]
+        self._table = json_conf["serialise"]["codon_table"]
 
         if isinstance(fasta_index, str):
             assert os.path.exists(fasta_index)
@@ -194,7 +195,8 @@ def __init__(self,
                                               fasta_index=fasta_index,
                                               is_gff=(not self.is_bed12),
                                               transcriptomic=True,
-                                              max_regression=self._max_regression)
+                                              max_regression=self._max_regression,
+                                              table=self._table)
 
         self.engine = connect(json_conf, logger)
 
diff --git a/Mikado/transcripts/transcript.py b/Mikado/transcripts/transcript.py
index a7b70b987..8fc464569 100644
--- a/Mikado/transcripts/transcript.py
+++ b/Mikado/transcripts/transcript.py
@@ -736,7 +736,8 @@ def get_internal_orf_beds(self):
         else:
             seq = None
 
-        row = BED12(transcriptomic=True, coding=True, start_adjustment=False, max_regression=0)
+        row = BED12(transcriptomic=True, coding=True, start_adjustment=False, max_regression=0,
+                    table=self.codon_table)
         row.header = False
         row.chrom = self.id
         row.strand = "+"
@@ -752,7 +753,9 @@ def get_internal_orf_beds(self):
             row.block_count = 0
             row.block_starts = [0]
             row.block_sizes = [0]
-            row = BED12(row, seq, coding=False, transcriptomic=True, max_regression=0, start_adjustment=False)
+            row = BED12(row, seq,
+                        coding=False, transcriptomic=True, max_regression=0, start_adjustment=False,
+                        table=self.codon_table)
             assert row.invalid is False, ("\n".join([str(row), row.invalid_reason]))
             yield row
 
@@ -793,7 +796,8 @@ def get_internal_orf_beds(self):
                 new_row = BED12(new_row,
                                 sequence=seq,
                                 phase=phase,
-                                coding=True, transcriptomic=True, max_regression=0, start_adjustment=False)
+                                coding=True, transcriptomic=True, max_regression=0, start_adjustment=False,
+                                table=self.codon_table)
                 if (cds_len - phase) % 3 != 0 and cds_end not in (self.start, self.end):
                     raise AssertionError("Invalid CDS length for {}:\n{}\n{}".format(self.id,
                                                                                      iorf,
@@ -1959,6 +1963,15 @@ def __calculate_cds_tree(self):
         self.__cds_tree = IntervalTree.from_tuples(
             [(cds[0], max(cds[1], cds[0] + 1)) for cds in self.combined_cds])
 
+    @property
+    def codon_table(self):
+        """Codon table read from json_conf["serialise"]["codon_table"]. Falls back to 0 (Standard, but only ATG is
+        considered a valid start codon) when json_conf is None or when either key is missing from it."""
+
+        if self.json_conf is None:
+            return 0
+        return self.json_conf.get("serialise", {}).get("codon_table", 0)
+
     @property
     def segmenttree(self):
 
diff --git a/Mikado/transcripts/transcript_methods/printing.py b/Mikado/transcripts/transcript_methods/printing.py
index 302e85809..993f625ae 100644
--- a/Mikado/transcripts/transcript_methods/printing.py
+++ b/Mikado/transcripts/transcript_methods/printing.py
@@ -278,7 +278,7 @@ def as_bed12(transcript, transcriptomic=False):
     """
 
     transcript.finalize()
-    bed12 = BED12()
+    bed12 = BED12(table=transcript.codon_table)
     bed12.transcriptomic = False
     bed12.header = False
     bed12.chrom = transcript.chrom
diff --git a/Mikado/transcripts/transcript_methods/splitting.py b/Mikado/transcripts/transcript_methods/splitting.py
index eacca84a6..5b57d625f 100644
--- a/Mikado/transcripts/transcript_methods/splitting.py
+++ b/Mikado/transcripts/transcript_methods/splitting.py
@@ -707,7 +707,7 @@ def __relocate_orfs(transcript, bed12_objects, tstart, tend):
     for obj in bed12_objects:
         # import copy
         # obj = copy.deepcopy(obj)
-        new = BED12()
+        new = BED12(table=transcript.codon_table)
         new.transcriptomic = True
         # Phase is necessary for truncated models
         for attr in ["chrom", "start", "end", "strand", "thick_start", "thick_end",
diff --git a/Mikado/transcripts/transcriptchecker.py b/Mikado/transcripts/transcriptchecker.py
index e7c5bfd4d..d8b5ce185 100644
--- a/Mikado/transcripts/transcriptchecker.py
+++ b/Mikado/transcripts/transcriptchecker.py
@@ -335,10 +335,6 @@ def check_orf(self):
 
             orf = orfs[0]
             assert isinstance(orf, BED12)
-            # orf = BED12(str(orf), transcriptomic=True, sequence=self.cdna, max_regression=0, start_adjustment=False)
-            # orf.max_regression = 0
-            # orf.start_adjustment = False
-            # orf.sequence = self.cdna
 
             if orf.invalid:
                 self.logger.warning("Invalid ORF for %s (reason: %s)", self.id, orf.invalid_reason)