From ab878b9824f95fc005aa5c18193750c38934047f Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 10 Oct 2018 12:45:20 +0100 Subject: [PATCH] Fixed #34 --- CHANGELOG.md | 1 + .../configuration_blueprint.json | 16 +++++++++++++++- Mikado/parsers/bed12.py | 10 +++++++--- Mikado/serializers/orf.py | 4 +++- Mikado/transcripts/transcript.py | 19 ++++++++++++++++--- .../transcript_methods/printing.py | 2 +- .../transcript_methods/splitting.py | 2 +- Mikado/transcripts/transcriptchecker.py | 4 ---- 8 files changed, 44 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 17d7d4564..1fb856af7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Bugfixes and improvements: - Fixed a bug which caused some loci to crash at the last part of the picking stage - Now coding and non-coding transcripts will be in different loci. - Mikado prepare now can accept models that lack any exon features but still have valid CDS/UTR features +- Fixed [#34](https://github.com/lucventurini/mikado/issues/34): a valid codon table can now be specified, chosen among those provided by [NCBI through BioPython](ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt). The default is "0", i.e. the Standard table but with only the canonical "ATG" accepted as a valid start codon. - Fixed [#126](https://github.com/lucventurini/mikado/issues/126): now reversing the strand of a model will cause its CDS to be stripped. - Fixed [#127](https://github.com/lucventurini/mikado/issues/127): previously, Mikado _prepare_ only considered cDNA coordinates when determining the redundancy of two models. In some edge cases, two models could be identical but have a different ORF called. Now Mikado will also consider the CDS before deciding whether to discard a model as redundant. - [#129](https://github.com/lucventurini/mikado/issues/129): Mikado is now capable of correctly padding the transcripts so to uniform their ends in a single locus. 
This will also have the effect of trying to enlarge the ORF of a transcript if it is truncated to begin with. diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json index 16f5efe0c..a92896aaf 100644 --- a/Mikado/configuration/configuration_blueprint.json +++ b/Mikado/configuration/configuration_blueprint.json @@ -189,7 +189,21 @@ "max_target_seqs": {"type": "integer", "default": 100000, "minimum": 1}, "force": {"type": "boolean", "default": false}, "single_thread": {"type": "boolean", "default": false}, - "procs": {"type": "integer", "default": 1, "minimum": 1} + "procs": {"type": "integer", "default": 1, "minimum": 1}, + "codon_table": { + "enum": [0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + "Standard", "SGC0", "Vertebrate Mitochondrial", "SGC1", "Yeast Mitochondrial", "SGC2", "Mold Mitochondrial", + "Protozoan Mitochondrial", "Coelenterate Mitochondrial", "Mycoplasma", "Spiroplasma", "SGC3", + "Invertebrate Mitochondrial", "SGC4", "Ciliate Nuclear", "Dasycladacean Nuclear", + "Hexamita Nuclear", "SGC5", "Echinoderm Mitochondrial", "Flatworm Mitochondrial", + "SGC8", "Euplotid Nuclear", "SGC9", "Bacterial", "Archaeal", "Plant Plastid", + "Alternative Yeast Nuclear", "Ascidian Mitochondrial", "Alternative Flatworm Mitochondrial", + "Blepharisma Macronuclear", "Chlorophycean Mitochondrial", "Trematode Mitochondrial", + "Scenedesmus obliquus Mitochondrial", "Thraustochytrium Mitochondrial", + "Pterobranchia Mitochondrial", "Candidate Division SR1", "Gracilibacteria", + "Pachysolen tannophilus Nuclear", "Karyorelict Nuclear", "Condylostoma Nuclear", + "Mesodinium Nuclear", "Peritrich Nuclear", "Blastocrithidia Nuclear"] + } } }, "prepare":{ diff --git a/Mikado/parsers/bed12.py b/Mikado/parsers/bed12.py index ff5622db7..2ea34046d 100644 --- a/Mikado/parsers/bed12.py +++ b/Mikado/parsers/bed12.py @@ -912,7 +912,8 @@ def __init__(self, handle, 
transcriptomic=False, max_regression=0, is_gff=False, - coding=False): + coding=False, + table=0): """ Constructor method. :param handle: the input BED file. @@ -949,6 +950,7 @@ def __init__(self, handle, self.fasta_index = fasta_index self.__closed = False self.header = False + self.__table = table self._is_bed12 = (not is_gff) def __iter__(self): @@ -976,7 +978,8 @@ def bed_next(self): fasta_index=self.fasta_index, transcriptomic=self.transcriptomic, max_regression=self._max_regression, - coding=self.coding) + coding=self.coding, + table=self.__table) return bed12 def gff_next(self): @@ -998,7 +1001,8 @@ def gff_next(self): bed12 = BED12(line, fasta_index=self.fasta_index, transcriptomic=self.transcriptomic, - max_regression=self._max_regression) + max_regression=self._max_regression, + table=self.__table) # raise NotImplementedError("Still working on this!") return bed12 diff --git a/Mikado/serializers/orf.py b/Mikado/serializers/orf.py index 0dceccd4d..14c51c9d0 100644 --- a/Mikado/serializers/orf.py +++ b/Mikado/serializers/orf.py @@ -172,6 +172,7 @@ def __init__(self, fasta_index = json_conf["serialise"]["files"]["transcripts"] self._max_regression = json_conf["serialise"]["max_regression"] + self._table = json_conf["serialise"]["codon_table"] if isinstance(fasta_index, str): assert os.path.exists(fasta_index) @@ -194,7 +195,8 @@ def __init__(self, fasta_index=fasta_index, is_gff=(not self.is_bed12), transcriptomic=True, - max_regression=self._max_regression) + max_regression=self._max_regression, + table=self._table) self.engine = connect(json_conf, logger) diff --git a/Mikado/transcripts/transcript.py b/Mikado/transcripts/transcript.py index a7b70b987..8fc464569 100644 --- a/Mikado/transcripts/transcript.py +++ b/Mikado/transcripts/transcript.py @@ -736,7 +736,8 @@ def get_internal_orf_beds(self): else: seq = None - row = BED12(transcriptomic=True, coding=True, start_adjustment=False, max_regression=0) + row = BED12(transcriptomic=True, coding=True, 
start_adjustment=False, max_regression=0, + table=self.codon_table) row.header = False row.chrom = self.id row.strand = "+" @@ -752,7 +753,9 @@ def get_internal_orf_beds(self): row.block_count = 0 row.block_starts = [0] row.block_sizes = [0] - row = BED12(row, seq, coding=False, transcriptomic=True, max_regression=0, start_adjustment=False) + row = BED12(row, seq, + coding=False, transcriptomic=True, max_regression=0, start_adjustment=False, + table=self.codon_table) assert row.invalid is False, ("\n".join([str(row), row.invalid_reason])) yield row @@ -793,7 +796,8 @@ def get_internal_orf_beds(self): new_row = BED12(new_row, sequence=seq, phase=phase, - coding=True, transcriptomic=True, max_regression=0, start_adjustment=False) + coding=True, transcriptomic=True, max_regression=0, start_adjustment=False, + table=self.codon_table) if (cds_len - phase) % 3 != 0 and cds_end not in (self.start, self.end): raise AssertionError("Invalid CDS length for {}:\n{}\n{}".format(self.id, iorf, @@ -1959,6 +1963,15 @@ def __calculate_cds_tree(self): self.__cds_tree = IntervalTree.from_tuples( [(cds[0], max(cds[1], cds[0] + 1)) for cds in self.combined_cds]) + @property + def codon_table(self): + """This property returns the codon table for the project. 
Default: 0 (Standard, but only ATG is considered + a valid start codon).""" + + if self.json_conf is None: + return 0 + return self.json_conf.get("serialise", {}).get("codon_table", 0) + @property def segmenttree(self): diff --git a/Mikado/transcripts/transcript_methods/printing.py b/Mikado/transcripts/transcript_methods/printing.py index 302e85809..993f625ae 100644 --- a/Mikado/transcripts/transcript_methods/printing.py +++ b/Mikado/transcripts/transcript_methods/printing.py @@ -278,7 +278,7 @@ def as_bed12(transcript, transcriptomic=False): """ transcript.finalize() - bed12 = BED12() + bed12 = BED12(table=transcript.codon_table) bed12.transcriptomic = False bed12.header = False bed12.chrom = transcript.chrom diff --git a/Mikado/transcripts/transcript_methods/splitting.py b/Mikado/transcripts/transcript_methods/splitting.py index eacca84a6..5b57d625f 100644 --- a/Mikado/transcripts/transcript_methods/splitting.py +++ b/Mikado/transcripts/transcript_methods/splitting.py @@ -707,7 +707,7 @@ def __relocate_orfs(transcript, bed12_objects, tstart, tend): for obj in bed12_objects: # import copy # obj = copy.deepcopy(obj) - new = BED12() + new = BED12(table=transcript.codon_table) new.transcriptomic = True # Phase is necessary for truncated models for attr in ["chrom", "start", "end", "strand", "thick_start", "thick_end", diff --git a/Mikado/transcripts/transcriptchecker.py b/Mikado/transcripts/transcriptchecker.py index e7c5bfd4d..d8b5ce185 100644 --- a/Mikado/transcripts/transcriptchecker.py +++ b/Mikado/transcripts/transcriptchecker.py @@ -335,10 +335,6 @@ def check_orf(self): orf = orfs[0] assert isinstance(orf, BED12) - # orf = BED12(str(orf), transcriptomic=True, sequence=self.cdna, max_regression=0, start_adjustment=False) - # orf.max_regression = 0 - # orf.start_adjustment = False - # orf.sequence = self.cdna if orf.invalid: self.logger.warning("Invalid ORF for %s (reason: %s)", self.id, orf.invalid_reason)