Skip to content

Commit

Permalink
Fixed #34
Browse files Browse the repository at this point in the history
  • Loading branch information
lucventurini committed Oct 10, 2018
1 parent 0bea266 commit ab878b9
Show file tree
Hide file tree
Showing 8 changed files with 44 additions and 14 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Bugfixes and improvements:
- Fixed a bug which caused some loci to crash at the last part of the picking stage
- Now coding and non-coding transcripts will be in different loci.
- Mikado prepare now can accept models that lack any exon features but still have valid CDS/UTR features
- Fixed [#34](https://github.com/lucventurini/mikado/issues/34): Mikado now lets the user specify a valid codon table among those provided by [NCBI through BioPython](ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt). The default is "0", i.e. the Standard table but with only the canonical "ATG" being accepted as a valid start codon.
- Fixed [#126](https://github.com/lucventurini/mikado/issues/126): now reversing the strand of a model will cause its CDS to be stripped.
- Fixed [#127](https://github.com/lucventurini/mikado/issues/127): previously, Mikado _prepare_ only considered cDNA coordinates when determining the redundancy of two models. In some edge cases, two models could be identical but have a different ORF called. Now Mikado will also consider the CDS before deciding whether to discard a model as redundant.
- [#129](https://github.com/lucventurini/mikado/issues/129): Mikado is now capable of correctly padding the transcripts so as to make their ends uniform within a single locus. This will also have the effect of trying to enlarge the ORF of a transcript if it is truncated to begin with.
Expand Down
16 changes: 15 additions & 1 deletion Mikado/configuration/configuration_blueprint.json
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,21 @@
"max_target_seqs": {"type": "integer", "default": 100000, "minimum": 1},
"force": {"type": "boolean", "default": false},
"single_thread": {"type": "boolean", "default": false},
"procs": {"type": "integer", "default": 1, "minimum": 1}
"procs": {"type": "integer", "default": 1, "minimum": 1},
"codon_table": {
"enum": [0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
"Standard", "SGC0", "Vertebrate Mitochondrial", "SGC1", "Yeast Mitochondrial", "SGC2", "Mold Mitochondrial",
"Protozoan Mitochondrial", "Coelenterate Mitochondrial", "Mycoplasma", "Spiroplasma", "SGC3",
"Invertebrate Mitochondrial", "SGC4", "Ciliate Nuclear", "Dasycladacean Nuclear",
"Hexamita Nuclear", "SGC5", "Echinoderm Mitochondrial", "Flatworm Mitochondrial",
"SGC8", "Euplotid Nuclear", "SGC9", "Bacterial", "Archaeal", "Plant Plastid",
"Alternative Yeast Nuclear", "Ascidian Mitochondrial", "Alternative Flatworm Mitochondrial",
"Blepharisma Macronuclear", "Chlorophycean Mitochondrial", "Trematode Mitochondrial",
"Scenedesmus obliquus Mitochondrial", "Thraustochytrium Mitochondrial",
"Pterobranchia Mitochondrial", "Candidate Division SR1", "Gracilibacteria",
"Pachysolen tannophilus Nuclear", "Karyorelict Nuclear", "Condylostoma Nuclear",
"Mesodinium Nuclear", "Peritrich Nuclear", "Blastocrithidia Nuclear"]
}
}
},
"prepare":{
Expand Down
10 changes: 7 additions & 3 deletions Mikado/parsers/bed12.py
Original file line number Diff line number Diff line change
Expand Up @@ -912,7 +912,8 @@ def __init__(self, handle,
transcriptomic=False,
max_regression=0,
is_gff=False,
coding=False):
coding=False,
table=0):
"""
Constructor method.
:param handle: the input BED file.
Expand Down Expand Up @@ -949,6 +950,7 @@ def __init__(self, handle,
self.fasta_index = fasta_index
self.__closed = False
self.header = False
self.__table = table
self._is_bed12 = (not is_gff)

def __iter__(self):
Expand Down Expand Up @@ -976,7 +978,8 @@ def bed_next(self):
fasta_index=self.fasta_index,
transcriptomic=self.transcriptomic,
max_regression=self._max_regression,
coding=self.coding)
coding=self.coding,
table=self.__table)
return bed12

def gff_next(self):
Expand All @@ -998,7 +1001,8 @@ def gff_next(self):
bed12 = BED12(line,
fasta_index=self.fasta_index,
transcriptomic=self.transcriptomic,
max_regression=self._max_regression)
max_regression=self._max_regression,
table=self.__table)
# raise NotImplementedError("Still working on this!")
return bed12

Expand Down
4 changes: 3 additions & 1 deletion Mikado/serializers/orf.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ def __init__(self,

fasta_index = json_conf["serialise"]["files"]["transcripts"]
self._max_regression = json_conf["serialise"]["max_regression"]
self._table = json_conf["serialise"]["codon_table"]

if isinstance(fasta_index, str):
assert os.path.exists(fasta_index)
Expand All @@ -194,7 +195,8 @@ def __init__(self,
fasta_index=fasta_index,
is_gff=(not self.is_bed12),
transcriptomic=True,
max_regression=self._max_regression)
max_regression=self._max_regression,
table=self._table)

self.engine = connect(json_conf, logger)

Expand Down
19 changes: 16 additions & 3 deletions Mikado/transcripts/transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -736,7 +736,8 @@ def get_internal_orf_beds(self):
else:
seq = None

row = BED12(transcriptomic=True, coding=True, start_adjustment=False, max_regression=0)
row = BED12(transcriptomic=True, coding=True, start_adjustment=False, max_regression=0,
table=self.codon_table)
row.header = False
row.chrom = self.id
row.strand = "+"
Expand All @@ -752,7 +753,9 @@ def get_internal_orf_beds(self):
row.block_count = 0
row.block_starts = [0]
row.block_sizes = [0]
row = BED12(row, seq, coding=False, transcriptomic=True, max_regression=0, start_adjustment=False)
row = BED12(row, seq,
coding=False, transcriptomic=True, max_regression=0, start_adjustment=False,
table=self.codon_table)
assert row.invalid is False, ("\n".join([str(row), row.invalid_reason]))
yield row

Expand Down Expand Up @@ -793,7 +796,8 @@ def get_internal_orf_beds(self):
new_row = BED12(new_row,
sequence=seq,
phase=phase,
coding=True, transcriptomic=True, max_regression=0, start_adjustment=False)
coding=True, transcriptomic=True, max_regression=0, start_adjustment=False,
table=self.codon_table)
if (cds_len - phase) % 3 != 0 and cds_end not in (self.start, self.end):
raise AssertionError("Invalid CDS length for {}:\n{}\n{}".format(self.id,
iorf,
Expand Down Expand Up @@ -1959,6 +1963,15 @@ def __calculate_cds_tree(self):
self.__cds_tree = IntervalTree.from_tuples(
[(cds[0], max(cds[1], cds[0] + 1)) for cds in self.combined_cds])

@property
def codon_table(self):
    """Codon table configured for the project.

    The value is looked up under the "serialise" -> "codon_table" keys of
    ``self.json_conf``. When no configuration is attached, or either key is
    missing, the default 0 is returned (Standard table, but only ATG is
    considered a valid start codon).
    """

    configuration = self.json_conf
    if configuration is not None:
        serialise_section = configuration.get("serialise", {})
        return serialise_section.get("codon_table", 0)
    return 0

@property
def segmenttree(self):

Expand Down
2 changes: 1 addition & 1 deletion Mikado/transcripts/transcript_methods/printing.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ def as_bed12(transcript, transcriptomic=False):
"""

transcript.finalize()
bed12 = BED12()
bed12 = BED12(table=transcript.codon_table)
bed12.transcriptomic = False
bed12.header = False
bed12.chrom = transcript.chrom
Expand Down
2 changes: 1 addition & 1 deletion Mikado/transcripts/transcript_methods/splitting.py
Original file line number Diff line number Diff line change
Expand Up @@ -707,7 +707,7 @@ def __relocate_orfs(transcript, bed12_objects, tstart, tend):
for obj in bed12_objects:
# import copy
# obj = copy.deepcopy(obj)
new = BED12()
new = BED12(table=transcript.codon_table)
new.transcriptomic = True
# Phase is necessary for truncated models
for attr in ["chrom", "start", "end", "strand", "thick_start", "thick_end",
Expand Down
4 changes: 0 additions & 4 deletions Mikado/transcripts/transcriptchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,10 +335,6 @@ def check_orf(self):

orf = orfs[0]
assert isinstance(orf, BED12)
# orf = BED12(str(orf), transcriptomic=True, sequence=self.cdna, max_regression=0, start_adjustment=False)
# orf.max_regression = 0
# orf.start_adjustment = False
# orf.sequence = self.cdna

if orf.invalid:
self.logger.warning("Invalid ORF for %s (reason: %s)", self.id, orf.invalid_reason)
Expand Down

0 comments on commit ab878b9

Please sign in to comment.