Skip to content

Commit

Permalink
Further fixes for #126. Now a monoexonic model is always kept on its…
Browse files Browse the repository at this point in the history
… strand if it is coding, while a multiexonic model is rejected outright if it is coding and its strand would need to be swapped.
  • Loading branch information
lucventurini committed Sep 26, 2018
1 parent fdc54e3 commit e59ce90
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 3 deletions.
6 changes: 6 additions & 0 deletions Mikado/preparation/checking.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def create_transcript(lines,
canonical_splices=(("GT", "AG"),
("GC", "AG"),
("AT", "AC")),
force_keep_cds=False,
logger=None):
"""Function to create the checker.
Expand All @@ -40,6 +41,10 @@ def create_transcript(lines,
:param canonical_splices: the splices considered as canonical for the species.
:type canonical_splices: list[tuple]
:param force_keep_cds: boolean. If set to true, coding transcripts that would be flipped are instead excluded.
The intention is that this flag will mirror strip_cds.
:type force_keep_cds: bool
:param logger: optional logger to use during processing.
:rtype: (None|TranscriptChecker)
Expand Down Expand Up @@ -72,6 +77,7 @@ def create_transcript(lines,
lenient=lenient,
strand_specific=strand_specific,
canonical_splices=canonical_splices,
force_keep_cds=force_keep_cds,
logger=logger)
logger.debug("Finished adding exon lines to %s", lines["tid"])
transcript_object.finalize()
Expand Down
3 changes: 2 additions & 1 deletion Mikado/preparation/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@ def perform_check(keys, shelve_stacks, args, logger):
lenient=args.json_conf["prepare"]["lenient"],
# strand_specific=args.json_conf["prepare"]["strand_specific"],
canonical_splices=args.json_conf["prepare"]["canonical"],
logger=logger)
logger=logger,
force_keep_cds= not args.json_conf["prepare"]["strip_cds"])

for tid, chrom, key in keys:
tid, shelf_name = tid
Expand Down
59 changes: 58 additions & 1 deletion Mikado/tests/test_system_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@
from Mikado.scales.compare import compare, load_index
from Mikado.subprograms.util.stats import Calculator
from Mikado.transcripts.transcript import Namespace
from Mikado.utilities.log_utils import create_null_logger
from Mikado.utilities.log_utils import create_null_logger, create_default_logger
from Mikado.parsers.GFF import GffLine
from Mikado.parsers import to_gff
from Mikado.transcripts import Transcript


class PrepareCheck(unittest.TestCase):
Expand Down Expand Up @@ -185,6 +188,60 @@ def test_prepare_trinity_and_cufflinks(self):
os.remove(os.path.join(self.conf["prepare"]["files"]["output_dir"],
"mikado_prepared.fasta.fai"))

def test_prepare_with_cds(self):
    """Check how ``prepare`` treats coding models with and without ``strip_cds``.

    The reference annotation is run through ``prepare`` twice: once as-is and
    once with the strand of every feature inverted. For each input both
    ``strip_cds`` settings are tried:

    * with ``strip_cds=True`` two transcripts are expected in the output and
      none of them may be coding;
    * with ``strip_cds=False`` the original annotation must yield two coding
      transcripts, while the strand-inverted copy must yield none.
    """

    rev_strand = {"+": "-", "-": "+"}

    self.conf["prepare"]["files"]["labels"] = ["ann"]
    ann_gff3 = pkg_resources.resource_filename("Mikado.tests", "annotation.gff3")

    # The context manager guarantees the temporary file is closed (and hence
    # removed) even when one of the assertions below fails.
    with tempfile.NamedTemporaryFile(suffix=".gff3", mode="wt") as rev_ann_gff3:
        with open(ann_gff3) as ann:
            for line in ann:
                line = GffLine(line)
                if line.header is True:
                    continue
                line.strand = rev_strand[line.strand]  # Invert strand.
                print(line, file=rev_ann_gff3)
        # Make sure the content is on disk before prepare reads it by name.
        rev_ann_gff3.flush()

        self.conf["prepare"]["files"]["gff"] = []
        self.conf["prepare"]["files"]["output_dir"] = tempfile.gettempdir()
        self.conf["prepare"]["files"]["out_fasta"] = "mikado_prepared.fasta"
        self.conf["prepare"]["files"]["out"] = "mikado_prepared.gtf"
        args = Namespace()
        args.json_conf = self.conf

        out_dir = self.conf["prepare"]["files"]["output_dir"]
        out_fasta = os.path.join(out_dir, "mikado_prepared.fasta")
        out_gtf = os.path.join(out_dir, "mikado_prepared.gtf")

        for fname in [ann_gff3, rev_ann_gff3.name]:
            for strip in (True, False):
                with self.subTest(fname=fname, strip=strip):
                    self.conf["prepare"]["files"]["gff"] = [fname]
                    args.json_conf["prepare"]["strip_cds"] = strip
                    prepare.prepare(args, self.logger)
                    self.assertTrue(os.path.exists(out_fasta))

                    fa = pyfaidx.Fasta(out_fasta)
                    try:
                        num_sequences = len(fa.keys())
                    finally:
                        # Do not leave the FASTA/index handles open.
                        fa.close()

                    # Only the strand-inverted input with the CDS retained
                    # is expected to produce no output at all.
                    if strip or fname == ann_gff3:
                        self.assertEqual(num_sequences, 2)
                    else:
                        self.assertEqual(num_sequences, 0)

                    # Now verify that the coding status of the models
                    # matches the strip_cds setting.
                    models = dict()
                    for line in to_gff(out_gtf):
                        if line.header:
                            continue
                        elif line.is_transcript:
                            models[line.id] = Transcript(line)
                        else:
                            models[line.parent[0]].add_exon(line)

                    for model in models.values():
                        model.finalize()

                    for model in models.values():
                        if strip is False:
                            self.assertTrue(model.is_coding, model.format("gtf"))
                        else:
                            self.assertFalse(model.is_coding, model.format("gtf"))


class CompareCheck(unittest.TestCase):

Expand Down
44 changes: 44 additions & 0 deletions Mikado/tests/test_transcript_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from Mikado.parsers.GFF import GffLine
from Mikado.parsers.GTF import GtfLine
from Mikado.transcripts.transcript import Transcript
from Mikado.exceptions import InvalidTranscript


class TChekerTester(unittest.TestCase):
Expand Down Expand Up @@ -132,6 +133,27 @@ def test_monoexonic(self):
tcheck.check_strand()
self.assertEqual(tcheck.strand, "-")

def test_monoexonic_cds(self):
    """A coding monoexonic model must keep strand and CDS with ``force_keep_cds``.

    The checker is created with ``strand_specific=False`` and
    ``force_keep_cds=True``; after ``check_strand`` the checked transcript
    must still be on the strand it was given and must still be coding.
    """

    # Chr5 tair10 exon 26584797 26584879 . + . ID=c58_g1_i3.mrna1.19.exon1;Parent=c58_g1_i3.mrna1.19
    for strand in ("+", "-"):
        with self.subTest(strand=strand):
            exon = self.gff_lines[1]
            transcript_line = self.gff_lines[0]
            # Only one exon is added below, so the transcript is made to end with it.
            transcript_line.end = exon.end
            transcript_line.strand = strand
            exon.strand = strand
            model = Transcript(transcript_line)
            model.add_exon(exon)
            model.add_exon((exon.start + 2, exon.end), feature="CDS")
            model.finalize()
            self.assertTrue(model.is_coding)
            fasta = self.fasta[model.chrom][model.start - 1: model.end]
            tcheck = TranscriptChecker(model.copy(), fasta, force_keep_cds=True, strand_specific=False)
            tcheck.check_strand()
            # The checker works on a copy of the model: the assertions have
            # to inspect the checker itself, otherwise they cannot detect
            # any change made by check_strand.
            self.assertEqual(tcheck.strand, strand)
            self.assertTrue(tcheck.is_coding)
            # The source model must be left untouched as well.
            self.assertEqual(model.strand, strand)
            self.assertTrue(model.is_coding)

def test_negative(self):

gtf_lines = """Chr5 Cufflinks transcript 26575364 26578163 1000 - . gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";exon_number "1";FPKM "2.9700103727";conf_hi "3.260618";frac "0.732092";cov "81.895309";conf_lo "2.679403";
Expand Down Expand Up @@ -225,6 +247,10 @@ def test_reverse_with_cds_negative(self):
check_model.check_strand()
self.assertEqual(check_model.strand, "-")
self.assertFalse(check_model.is_coding)
check_model = TranscriptChecker(model, model_fasta, force_keep_cds=True)
# Check that if we want to keep the CDS, this will raise an error
with self.assertRaises(InvalidTranscript):
check_model.check_strand()

def test_reverse_with_cds_positive(self):

Expand All @@ -245,6 +271,9 @@ def test_reverse_with_cds_positive(self):
check_model.check_strand()
self.assertEqual(check_model.strand, "+")
self.assertFalse(check_model.is_coding)
check_model = TranscriptChecker(model, model_fasta, force_keep_cds=True)
with self.assertRaises(InvalidTranscript):
check_model.check_strand()

def test_monoexonic_suspicious(self):

Expand Down Expand Up @@ -287,6 +316,21 @@ def test_monoexonic_suspicious(self):
self.assertFalse(model.suspicious_splicing)
self.assertFalse(model.only_non_canonical_splicing)

def test_sequence_reversed(self):
    """Reversing the strand of a checker must reverse-complement its sequence."""

    chrom, start, end = "Chr5", 1001, 1500

    transcript = Transcript()
    transcript.chrom, transcript.start, transcript.end, transcript.strand = chrom, start, end, "+"
    transcript.add_exon((start, end))
    transcript.id, transcript.parent = "foo.1", "foo"
    transcript.finalize()

    # Genomic sequence underlying the transcript (0-based slice).
    seq = str(self.fasta[chrom][start - 1:end].seq)
    self.assertEqual(len(seq), len(transcript))

    checker = TranscriptChecker(transcript, seq, strand_specific=True)
    checker.reverse_strand()

    # Drop the first line of the FASTA record and join the remaining lines.
    _header, *sequence_lines = checker.fasta.split("\n")
    reversed_sequence = "".join(sequence_lines)

    self.assertEqual(checker.strand, "-")
    self.assertEqual(reversed_sequence, TranscriptChecker.rev_complement(seq))


class StopCodonChecker(unittest.TestCase):

Expand Down
9 changes: 8 additions & 1 deletion Mikado/transcripts/transcriptchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class TranscriptChecker(Transcript):
def __init__(self, gffline, seq,
strand_specific=False, lenient=False,
canonical_splices=(("GT", "AG"), ("GC", "AG"), ("AT", "AC")),
force_keep_cds=False,
logger=None):

"""
Expand Down Expand Up @@ -70,6 +71,7 @@ def __init__(self, gffline, seq,
self.mixed_splices = False
self.reversed = False
self.canonical_splices = []
self.__force_keep_cds = force_keep_cds
if not isinstance(canonical_splices, (tuple, list)):
raise ValueError("Canonical splices should be provided as lists or tuples")

Expand Down Expand Up @@ -178,7 +180,7 @@ def check_strand(self):
if self.checked is True:
return

if self.strand_specific is False and self.monoexonic is True:
if self.strand_specific is False and self.monoexonic is True and self.__force_keep_cds is False:
self.strand = None

elif self.monoexonic is False:
Expand Down Expand Up @@ -244,6 +246,11 @@ def check_strand(self):
self.checked = True
return

def reverse_strand(self):
    """Reverse the strand of the transcript, unless its CDS has to be kept.

    :raises InvalidTranscript: if the transcript is coding and the checker
        was created with ``force_keep_cds=True``.
    """
    coding = self.is_coding is True
    if coding and self.__force_keep_cds is True:
        raise InvalidTranscript("I cannot reverse the strand of a coding transcript.")
    # Nothing forbids the swap: defer to the parent class implementation.
    super().reverse_strand()

def _check_intron(self, intron):

"""
Expand Down

0 comments on commit e59ce90

Please sign in to comment.