Skip to content

Commit

Permalink
First step to fix #34: now BED12 objects accept codon table definitions.
Browse files Browse the repository at this point in the history
  • Loading branch information
lucventurini committed Oct 10, 2018
1 parent 93f9855 commit 0bea266
Show file tree
Hide file tree
Showing 2 changed files with 144 additions and 35 deletions.
111 changes: 81 additions & 30 deletions Mikado/parsers/bed12.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@
from typing import Union
import re
from ..utilities.log_utils import create_null_logger
from Bio.Data import CodonTable
import pickle

# Default codon table used when no table is requested: the Biopython
# ambiguous-DNA table with id 1, with its start codons restricted to "ATG".
# NOTE: no copy is made here, so the assignment below modifies the very object
# stored in CodonTable.ambiguous_dna_by_id[1]; any other code fetching that
# entry will see the restricted start-codon list as well.
standard = CodonTable.ambiguous_dna_by_id[1]
standard.start_codons = ["ATG"]

# import numpy as np


Expand All @@ -36,7 +42,8 @@ def __init__(self, *args: Union[str, list, tuple, GffLine],
transcriptomic=False,
max_regression=0,
start_adjustment=True,
coding=True):
coding=True,
table=0):

"""
:param args: the BED12 line.
Expand Down Expand Up @@ -144,6 +151,9 @@ def __init__(self, *args: Union[str, list, tuple, GffLine],
self.max_regression = max_regression
self.start_adjustment = start_adjustment
self.coding = coding
self.__table = standard
self.__table_index = 0
self.table = table

if len(args) == 0:
self.header = True
Expand All @@ -165,7 +175,7 @@ def __init__(self, *args: Union[str, list, tuple, GffLine],
self.header = True
return
elif isinstance(self._line, type(self)): # Re-initialising with another object
self.__set_values_from_bed12()
self.__set_values_from_bed12(args[0])
elif isinstance(self._line, GffLine):
if self._line.header is True:
self.header = True
Expand Down Expand Up @@ -210,6 +220,7 @@ def __init__(self, *args: Union[str, list, tuple, GffLine],
self.name = groups["ID"]

self.__check_validity(transcriptomic, fasta_index, sequence)

if self.invalid and self.coding:
self.coding = False

Expand All @@ -236,12 +247,50 @@ def gene(self):
def parent(self):
return self.__parent

@property
def table(self):
    """Return the codon table object currently associated with this instance."""
    return self.__table

@table.setter
def table(self, table):
    """
    Select the codon table used by this object.

    :param table: one of
        - None or 0: the module-level ``standard`` table;
        - a non-zero integer: key into ``CodonTable.ambiguous_dna_by_id``;
        - a str or bytes name: key into ``CodonTable.ambiguous_dna_by_name``
          (bytes are decoded first).

    :raises ValueError: if ``table`` is of any other type.
    :raises KeyError: propagated from the Biopython lookup when the
        identifier or name is unknown.

    The numeric identifier of the chosen table is remembered in
    ``__table_index`` so that ``__setstate__`` can restore the same table;
    storing 0 for every integer (as was done before) made non-standard
    tables revert to the default after a pickle/copy round trip.
    """
    if table is None:
        self.__table = standard
        self.__table_index = 0
    elif isinstance(table, int):
        if table == 0:
            self.__table = standard
        else:
            self.__table = CodonTable.ambiguous_dna_by_id[table]
        # Remember the id actually requested, not a hard-coded 0.
        self.__table_index = table
    elif isinstance(table, (str, bytes)):
        name = table.decode() if isinstance(table, bytes) else table
        self.__table = CodonTable.ambiguous_dna_by_name[name]
        self.__table_index = self.__table._codon_table.id
    else:
        raise ValueError("Invalid table: {} (type: {})".format(
            table, type(table)))

@parent.setter
def parent(self, parent):
    """
    Store the parent identifier, wrapped in a single-element list.

    :param parent: a string or None.
    :raises TypeError: if ``parent`` is neither None nor a string; the
        exception argument is the offending type.
    """
    acceptable = parent is None or isinstance(parent, str)
    if not acceptable:
        raise TypeError(type(parent))
    self.__parent = [parent]

def __getstate__(self):
    """
    Return a deep copy of the instance dictionary without the codon table.

    The codon table object is left out of the state; ``__setstate__``
    rebuilds it from the stored table index instead.

    The attribute is filtered both by its name-mangled key and by type.
    The previous test, ``key not in ("_BED12_table")``, compared against a
    plain string (no tuple comma), so it was a substring check, and it also
    spelled the mangled name with a single underscore.
    """
    state = {
        key: val for key, val in self.__dict__.items()
        if key != "_BED12__table"
        and not isinstance(val, CodonTable.CodonTable)
    }
    return copy.deepcopy(state)

def __setstate__(self, state):
    """
    Restore the instance from ``state`` and re-create the codon table.

    :param state: mapping of attribute names to values, as produced by
        ``__getstate__``.

    The table object itself is not part of the state, so it is re-selected
    through the ``table`` setter using the restored table index.
    """
    vars(self).update(state)
    self.table = self.__table_index

def __set_values_from_fields(self):

"""
Expand Down Expand Up @@ -276,20 +325,22 @@ def __set_values_from_fields(self):
self.fasta_length = len(self)
return

def __set_values_from_bed12(self):

for attr in ["chrom", "start", "end", "name", "score", "strand",
"thick_start", "thick_end", "rgb",
"block_count", "block_starts", "block_sizes"]:
setattr(self, attr, getattr(self._line, attr))

intern(self.chrom)
self.has_start_codon = None
self.has_stop_codon = None
self.start_codon = None
self.stop_codon = None
self.fasta_length = len(self)
self.transcriptomic = self._line.transcriptomic
def __set_values_from_bed12(self, line):
    """
    Initialise this object from another instance by copying its state.

    :param line: the object to copy from; its ``__getstate__`` output is
        fed to this object's ``__setstate__``.
    """
    copied_state = line.__getstate__()
    self.__setstate__(copied_state)

def __set_values_from_gff(self, fasta_length):
Expand Down Expand Up @@ -363,7 +414,7 @@ def __check_validity(self, transcriptomic, fasta_index, sequence):
self.start_codon = str(orf_sequence)[:3].upper()
self.stop_codon = str(orf_sequence[-3:]).upper()

if self.start_codon == "ATG":
if self.start_codon in self.table.start_codons:
self.has_start_codon = True
self.phase = 0
else:
Expand All @@ -375,7 +426,7 @@ def __check_validity(self, transcriptomic, fasta_index, sequence):
if self.start_adjustment is True:
self._adjust_start(orf_sequence)

if self.stop_codon in ("TAA", "TGA", "TAG"):
if self.stop_codon in self.table.stop_codons:
self.has_stop_codon = True
else:
self.has_stop_codon = False
Expand All @@ -394,7 +445,7 @@ def _adjust_start(self, orf_sequence):
for pos in range(3,
int(len(orf_sequence) * self.max_regression),
3):
if orf_sequence[pos:pos + 3] == "ATG":
if orf_sequence[pos:pos + 3] in self.table.start_codons:
# Now we have to shift the start accordingly
self.has_start_codon = True
if self.strand == "+":
Expand Down Expand Up @@ -722,7 +773,7 @@ def expand(self, sequence, upstream, downstream, expand_orf=False, logger=create
self.thick_start += upstream
self.thick_end += upstream
if expand_orf is True:
if str(self.start_codon) != "ATG":
if str(self.start_codon) not in self.table.start_codons:
for pos in range(self.thick_start - self.phase,
0,
-3):
Expand All @@ -734,22 +785,22 @@ def expand(self, sequence, upstream, downstream, expand_orf=False, logger=create
self.__has_start = True
break

if self.start_codon != "ATG":
if self.start_codon not in self.table.start_codons:
self.phase = self.thick_start % 3
self.thick_start = 1
else:
self.phase = 0
self.__has_start = True

for pos in range(self.thick_start + self.phase - 1, self.end, 3):
codon = sequence[pos:pos + 3]
if codon in ("TAA", "TGA", "TAG"):
self.thick_end = pos + 3
self.stop_codon = codon
self.__has_stop = True
logger.debug("New stop codon for %s: %s", self.name, self.thick_end)
break
if self.stop_codon not in ("TAA", "TGA", "TAG"):
coding_seq = Seq.Seq(sequence[self.thick_start + self.phase - 1:self.end])
prot_seq = coding_seq.translate(table=self.table, gap="N")
if "*" in prot_seq:
self.thick_end = self.thick_start + self.phase - 1 + (1 + prot_seq.find("*")) * 3
self.stop_codon = coding_seq[prot_seq.find("*") * 3:(1 + prot_seq.find("*")) * 3]
self.__has_stop = True
logger.debug("New stop codon for %s: %s", self.name, self.thick_end)

if self.stop_codon not in self.table.stop_codons:
logger.debug("No valid stop codon found for %s", self.name)
self.thick_end = self.end

Expand Down
68 changes: 63 additions & 5 deletions util/add_transcript_feature_to_gtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,85 @@
"""

import sys
from Mikado.parsers.GTF import GTF
from Mikado.parsers.GTF import GTF, GtfLine
from Mikado.transcripts import Transcript
from Mikado.utilities import overlap
from copy import deepcopy
import operator
import argparse
from collections import defaultdict
from typing import List, Generator


class Obj:
    """Bare attribute holder: an empty class used as a simple container."""


def create_transcript(tid: str,
                      lines: List[GtfLine],
                      args: argparse.Namespace) -> Generator[Transcript, None, None]:

    """
    Yield Transcript objects built from the GTF lines of one transcript ID.

    :param tid: transcript identifier; only used to derive the identifier
        passed down when the lines have to be split by chromosome
        (``tid + "." + chrom``).
    :param lines: the GTF lines assigned to this transcript.
    :param args: parsed command line options; ``min_intron``, ``max_intron``
        and ``split`` are read here.

    Lines spanning several chromosomes are grouped by chromosome and each
    group is processed recursively. A single-exon group yields one finalized
    Transcript.

    NOTE(review): the multi-exon branch is unfinished. It adjusts exon
    starts and collects ``new_exons`` but never yields a transcript, so
    multi-exon input currently produces no output. The two "split" cases
    are placeholders.
    """

    chroms = defaultdict(list)
    for line in lines:
        chroms[line.chrom].append(line)

    if len(chroms) > 1:
        # Handle each chromosome on its own, then stop: without this return
        # the mixed-chromosome lines would be processed a second time below.
        for chrom, chrom_lines in chroms.items():
            yield from create_transcript(tid + "." + chrom, chrom_lines, args)
        return

    # From here on there is at most one chromosome.
    exons = sorted((line for line in lines if line.is_exon),
                   key=operator.attrgetter("chrom", "start", "end"))

    if len(exons) == 1:
        transcript = Transcript(exons[0])
        transcript.finalize()
        yield transcript
        return

    # NOTE(review): assumes args.min_intron / args.max_intron are integers;
    # confirm against the argument parser, which must convert them.
    new_exons = []
    for prev, exon in zip(exons, exons[1:]):
        gap = exon.start - prev.end + 1
        if overlap((prev.start, prev.end), (exon.start, exon.end)) > 0:
            # Merge the two exons by extending the current one backwards.
            exon.start = prev.start
        elif gap < args.min_intron:
            if args.split is False:
                exon.start = prev.start
            else:
                # TODO: split the transcript at this position.
                pass
        elif gap > args.max_intron:
            # TODO: split the transcript at this position.
            pass
        else:
            new_exons.append(prev)
    # TODO: build and yield transcript(s) from the merged exons; nothing is
    # emitted for multi-exon input yet.





def main():
"""
Main script function.
"""

parser = argparse.ArgumentParser("Script to add a transcript feature to e.g. Cufflinks GTFs")
parser.add_argument("-mai", "--max-intron", dest="max_intron",
help="Maximum intron length before splitting a transcript into different pieces.")
parser.add_argument("-mi", "--min-intron", dest="min_intron",
help="""Minimum intron length; intron lengths lower than this will cause two consecutive exons
to be merged.""")
parser.add_argument("--split-small-introns", dest="split", action="store_true", default=False,
help="""Flag. If set, transcripts with very small introns will end up
split into two (or more) transcripts rather than having their exons merged.""")
parser.add_argument("gtf", type=argparse.FileType(),
help="Input GTF")
parser.add_argument("out", default=sys.stdout, nargs="?",
Expand All @@ -40,10 +100,8 @@ def main():
args.gtf.close()
transcripts = list()

for tid in transcript_lines:
transcript = Transcript(transcript_lines[tid][0])
transcript.add_exons(transcript_lines[tid])
transcripts.append(transcript)
for tid, lines in transcript_lines.items():
transcripts.extend(*create_transcript(tid, lines, args))

for transcript in sorted(transcripts):
print(transcript.format("gtf"), file=args.out)
Expand Down

0 comments on commit 0bea266

Please sign in to comment.