Skip to content

Commit

Permalink
Starting to work on #389
Browse files Browse the repository at this point in the history
  • Loading branch information
lucventurini committed Mar 17, 2021
1 parent 9a5a5dc commit eea03cd
Show file tree
Hide file tree
Showing 9 changed files with 84 additions and 71 deletions.
47 changes: 27 additions & 20 deletions Mikado/_transcripts/transcript_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,30 @@ def __initialize_with_line(self, transcript_row):
else:
raise TypeError("Invalid data type: {0}".format(type(transcript_row)))

@staticmethod
def __parse_attributes(attributes):
new_attributes = dict()
booleans = {"True": True, "False": False, "None": None}
for key, val in attributes.items():
if not isinstance(val, Hashable):
pass
elif val in booleans:
val = booleans[val]
elif isinstance(val, bool):
pass
else:
try:
val = int(val)
except (ValueError, OverflowError):
try:
val = float(val)
except ValueError:
pass
except TypeError:
pass
new_attributes[intern(key)] = val
return new_attributes

def __initialize_with_bed12(self, transcript_row: BED12):

"""
Expand Down Expand Up @@ -333,6 +357,8 @@ def __initialize_with_bed12(self, transcript_row: BED12):
cds.append((int(max(exon[0], transcript_row.thick_start)),
int(min(exon[1], transcript_row.thick_end))))
self.add_exons(cds, features="CDS")

self.attributes = self.__parse_attributes(transcript_row.attributes)
self.finalize()

def __initialize_with_bam(self, transcript_row: pysam.AlignedSegment):
Expand Down Expand Up @@ -423,26 +449,7 @@ def __initialize_with_gf(self, transcript_row: (GffLine, GtfLine)):
self.score = transcript_row.score
self.scores = dict()

booleans = {"True": True, "False": False, "None": None}

for key, val in transcript_row.attributes.items():
if not isinstance(val, Hashable):
pass
elif val in booleans:
val = booleans[val]
elif isinstance(val, bool):
pass
else:
try:
val = int(val)
except (ValueError, OverflowError):
try:
val = float(val)
except ValueError:
pass
except TypeError:
pass
self.attributes[intern(key)] = val
self.attributes = self.__parse_attributes(transcript_row.attributes)

self.blast_hits = []
if transcript_row.is_transcript is False:
Expand Down
22 changes: 6 additions & 16 deletions Mikado/_transcripts/transcript_methods/printing.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,22 +377,8 @@ def as_bed12(transcript, transcriptomic=False, with_cds=True):
except KeyError:
raise KeyError((transcript.selected_cds[-1], transcript.phases))

name = "ID={ID};coding={coding};phase={phase}".format(
ID=transcript.id,
coding=transcript.is_coding,
# Now we have to get the phase of the first CDS exon ..
phase=phase)
else:
name = "ID={ID};coding={coding}".format(
ID=transcript.id,
coding=transcript.is_coding,
# Now we have to get the phase of the first CDS exon ..
)

if transcript.alias is not None and transcript.alias != transcript.id:
name += ";alias={}".format(transcript.alias)

bed12.name = name
bed12.coding = transcript.is_coding
bed12.name = transcript.id
bed12.score = transcript.score if transcript.score else 0
bed12.strand = transcript.strand
if transcript.is_coding and with_cds is True:
Expand All @@ -407,6 +393,9 @@ def as_bed12(transcript, transcriptomic=False, with_cds=True):
bed12.thick_end = bed12.end
bed12.coding = False

for key, val in transcript.attributes.items():
bed12.attributes[key] = val

bed12.block_count = transcript.exon_num
bed12.block_sizes = [exon[1] - exon[0] + 1 for exon in transcript.exons]
_introns = np.concatenate([np.array([intron[1] - intron[0] + 1 for intron in sorted(transcript.introns)],
Expand All @@ -419,6 +408,7 @@ def as_bed12(transcript, transcriptomic=False, with_cds=True):
bed12 = bed12.to_transcriptomic(alias=transcript.alias, start_adjustment=False,
coding=(transcript.is_coding and with_cds))
bed12.chrom = transcript.id

return bed12


Expand Down
1 change: 1 addition & 0 deletions Mikado/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from . import bed12
from . import blast_utils
from . import bam_parser
from .bed12 import BED12


def parser_factory(string, input_format=None):
Expand Down
42 changes: 26 additions & 16 deletions Mikado/parsers/bed12.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import os
from Bio import Seq
import Bio.SeqRecord
from .parser import Parser
from .parser import Parser, _attribute_definition
from sys import intern
import copy
from ..exceptions import InvalidParsingFormat
Expand Down Expand Up @@ -339,6 +339,7 @@ def __init__(self, *args: Union[str, list, tuple, GffLine],
self.alias = None
self.__logger = create_null_logger()
self.logger = logger
self.attributes = dict()
self.logger.debug("Set the basic properties for %s", self.chrom)

if len(args) == 0:
Expand Down Expand Up @@ -390,6 +391,7 @@ def __init__(self, *args: Union[str, list, tuple, GffLine],
raise InvalidParsingFormat("I need an ordered array, not {0}".format(type(self._line)))
else:
self._fields = self._line
print("Line", self._fields)
self.__set_values_from_fields()

self.__check_validity(transcriptomic, fasta_index, sequence)
Expand All @@ -404,7 +406,6 @@ def __init__(self, *args: Union[str, list, tuple, GffLine],
@property
def is_transcript(self):
"""Tell whether this record describes a transcript.

BED12 files are always transcripts for Mikado, so this property
unconditionally returns ``True``.
"""

return True

@property
Expand Down Expand Up @@ -495,6 +496,7 @@ def _parse_attributes(self, attributes):
"""

self.attribute_order = []
print("Parsing", attributes)

infolist = self._attribute_pattern.findall(attributes.rstrip().rstrip(";"))

Expand All @@ -514,6 +516,8 @@ def _parse_attributes(self, attributes):
elif key.lower() == "id":
self.name = val
else:
print(key.capitalize(), val)
self.attributes[key.capitalize()] = _attribute_definition(val)
continue

def __set_values_from_fields(self):
Expand Down Expand Up @@ -546,9 +550,10 @@ def __set_values_from_fields(self):
self.block_starts = [int(x) for x in block_starts.split(",") if x]
else:
self.block_starts = [int(x) for x in block_starts]
self._parse_attributes(self.name)
to_parse = self.name
if len(self._fields) == 13:
self._parse_attributes(self._fields[-1])
to_parse = ";".join([to_parse, self._fields[-1]])
self._parse_attributes(to_parse)
self.has_start_codon = False
self.has_stop_codon = False
self.start_codon = None
Expand Down Expand Up @@ -834,18 +839,12 @@ def __str__(self):
return "#"

line = [self.chrom, self.start - 1, self.end]

if self.transcriptomic is True:
name = "ID={};coding={}".format(self.id, self.coding)
if self.coding:
name += ";phase={}".format(self.phase)
if self.alias is not None and self.alias != self.id:
name += ";alias={}".format(self.alias)

line.append(name)
else:
line.append(self.name)

name = "ID={};coding={}".format(self.id, self.coding)
if self.coding:
name += ";phase={}".format(self.phase)
if self.alias is not None and self.alias != self.id:
name += ";alias={}".format(self.alias)
line.append(name)
if not self.score:
line.append(0)
else:
Expand All @@ -862,6 +861,17 @@ def __str__(self):
line.append(self.block_count)
line.append(",".join([str(x) for x in self.block_sizes]))
line.append(",".join([str(x) for x in self.block_starts]))
attributes = dict((key.lower(), val) for key, val in self.attributes.items() if key.lower() not in
("geneid", "gene_id", "name", "phase", "coding", "alias", "id"))
if self.parent is not None:
attributes["Parent"] = self.parent[0]
assert "Parent" in attributes
elif "parent" in attributes:
par = attributes["parent"] if isinstance(attributes["parent"], str) else attributes["parent"][0]
del attributes["parent"]
attributes["Parent"] = par
if attributes:
line.append(";".join(f"{key}={val}" for key, val in attributes.items()))
return "\t".join([str(x) for x in line])

def __eq__(self, other):
Expand Down
17 changes: 1 addition & 16 deletions Mikado/parsers/gfannotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import copy
from sys import intern
import re
from .parser import _attribute_definition


__author__ = 'Luca Venturini'
Expand All @@ -18,22 +19,6 @@
[intern(_) for _ in ["+", "-", "?", "true", "True", "false", "False"]]


def _attribute_definition(val):
try:
val = float(val)
if val.is_integer():
return int(val)
return val
except (ValueError, TypeError):
if val.lower() in ("true", "false"):
val = val.capitalize()
if val == "True":
return True
else:
return False
return val


# This class has exactly as many attributes as it needs
# pylint: disable=too-many-instance-attributes
class GFAnnotation(metaclass=abc.ABCMeta):
Expand Down
16 changes: 16 additions & 0 deletions Mikado/parsers/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,19 @@ def __setstate__(self, state):
self.__dict__.update(state)
self._handle = self.__get_handle(state["_handle"], position=position)


def _attribute_definition(val):
try:
val = float(val)
if val.is_integer():
return int(val)
return val
except (ValueError, TypeError):
if val.lower() in ("true", "false"):
val = val.capitalize()
if val == "True":
return True
else:
return False
return val

2 changes: 1 addition & 1 deletion Mikado/tests/test_bed12.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@ def test_tran_to_bed12_neg(self):
t.add_exons([(71, 100), (201, end)], features="CDS", phases=[0, phase])
t.finalize()
r = t.as_bed12()
self.assertEqual(r.name, "ID={};coding={};phase={}".format(t.id, True, phase))
self.assertEqual(r.name, t.id)
self.assertEqual(r.phase, phase, (end, phase))
self.assertEqual(r.thick_end, end)
self.assertFalse(r.invalid)
Expand Down
2 changes: 2 additions & 0 deletions Mikado/tests/test_transcript_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,8 @@ def setUp(self):
transcript.name = transcript.id
self.transcripts[transcript.id] = transcript

for tid, transcript in self.transcripts.items():
self.assertFalse(any(key.lower() == "id=id" for key in transcript.attributes), (tid, transcript.attributes))
self.assertEqual(len(self.transcripts), 4)

def test_bed12(self):
Expand Down
6 changes: 4 additions & 2 deletions Mikado/tests/test_transcript_negative.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,12 @@ def test_print(self):
str(self.tr),
diff)

g_bed12 = "Chr1 5927 8737 ID=AT1G01020.1;coding=True;phase=0 0 - 6914 8666 0 10 336,633,76,67,86,74,46,90,48,167 0,509,1229,1456,1636,1834,2014,2308,2489,2643"
g_bed12 = "Chr1 5927 8737 ID=AT1G01020.1;coding=True;phase=0 0 - 6914 8666 0"\
" 10 336,633,76,67,86,74,46,90,48,167 0,509,1229,1456,1636,1834,2014,2308,2489,2643\tParent=AT1G01020"
self.assertEqual(g_bed12, self.tr.format("bed12", transcriptomic=False))

t_bed12 = "AT1G01020.1 0 1623 ID=AT1G01020.1;coding=True;phase=0 0 + 71 809 0 10 167,48,90,46,74,86,67,76,633,336 0,167,215,305,351,425,511,578,654,1287"
t_bed12 = "AT1G01020.1 0 1623 ID=AT1G01020.1;coding=True;phase=0 0 + 71 809 0 10 "\
"167,48,90,46,74,86,67,76,633,336 0,167,215,305,351,425,511,578,654,1287"
self.assertEqual(t_bed12, self.tr.format("bed12", transcriptomic=True))

def test_empty(self):
Expand Down

0 comments on commit eea03cd

Please sign in to comment.