From e59ce900047c7da68a36cfdf2c908967d28f2a30 Mon Sep 17 00:00:00 2001 From: Luca Venturini Date: Wed, 26 Sep 2018 12:36:05 +0100 Subject: [PATCH] Further fixes for #126. Now a monoexonic model is always kept on its strand if it is coding, while a multiexonic model is rejected outright if it is coding and its strand would need to be swapped. --- Mikado/preparation/checking.py | 6 +++ Mikado/preparation/prepare.py | 3 +- Mikado/tests/test_system_calls.py | 59 ++++++++++++++++++++++++- Mikado/tests/test_transcript_checker.py | 44 ++++++++++++++++++ Mikado/transcripts/transcriptchecker.py | 9 +++- 5 files changed, 118 insertions(+), 3 deletions(-) diff --git a/Mikado/preparation/checking.py b/Mikado/preparation/checking.py index b3a0a8f10..62826ad55 100644 --- a/Mikado/preparation/checking.py +++ b/Mikado/preparation/checking.py @@ -21,6 +21,7 @@ def create_transcript(lines, canonical_splices=(("GT", "AG"), ("GC", "AG"), ("AT", "AC")), + force_keep_cds=False, logger=None): """Function to create the checker. @@ -40,6 +41,10 @@ def create_transcript(lines, :param canonical_splices: the splices considered as canonical for the species. :type canonical_splices: list[tuple] + :param force_keep_cds: boolean. If set to true, coding transcripts that would be flipped are instead excluded. + The intention is that this flag will mirror strip_cds. + :type force_keep_cds: bool + :param logger: optional logger to use during processing. 
:rtype: (None|TranscriptChecker) @@ -72,6 +77,7 @@ def create_transcript(lines, lenient=lenient, strand_specific=strand_specific, canonical_splices=canonical_splices, + force_keep_cds=force_keep_cds, logger=logger) logger.debug("Finished adding exon lines to %s", lines["tid"]) transcript_object.finalize() diff --git a/Mikado/preparation/prepare.py b/Mikado/preparation/prepare.py index d492ef1c9..4bff5ad40 100644 --- a/Mikado/preparation/prepare.py +++ b/Mikado/preparation/prepare.py @@ -148,7 +148,8 @@ def perform_check(keys, shelve_stacks, args, logger): lenient=args.json_conf["prepare"]["lenient"], # strand_specific=args.json_conf["prepare"]["strand_specific"], canonical_splices=args.json_conf["prepare"]["canonical"], - logger=logger) + logger=logger, + force_keep_cds= not args.json_conf["prepare"]["strip_cds"]) for tid, chrom, key in keys: tid, shelf_name = tid diff --git a/Mikado/tests/test_system_calls.py b/Mikado/tests/test_system_calls.py index d4bab7adb..d58febf36 100644 --- a/Mikado/tests/test_system_calls.py +++ b/Mikado/tests/test_system_calls.py @@ -20,7 +20,10 @@ from Mikado.scales.compare import compare, load_index from Mikado.subprograms.util.stats import Calculator from Mikado.transcripts.transcript import Namespace -from Mikado.utilities.log_utils import create_null_logger +from Mikado.utilities.log_utils import create_null_logger, create_default_logger +from Mikado.parsers.GFF import GffLine +from Mikado.parsers import to_gff +from Mikado.transcripts import Transcript class PrepareCheck(unittest.TestCase): @@ -185,6 +188,60 @@ def test_prepare_trinity_and_cufflinks(self): os.remove(os.path.join(self.conf["prepare"]["files"]["output_dir"], "mikado_prepared.fasta.fai")) + def test_prepare_with_cds(self): + + rev_strand = {"+": "-", "-": "+"} + + self.conf["prepare"]["files"]["labels"] = ["ann"] + ann_gff3 = pkg_resources.resource_filename("Mikado.tests", "annotation.gff3") + rev_ann_gff3 = tempfile.NamedTemporaryFile(suffix=".gff3", mode="wt") + 
with open(ann_gff3) as ann: + for line in ann: + line = GffLine(line) + if line.header is True: + continue + line.strand = rev_strand[line.strand] # Invert strand. + print(line, file=rev_ann_gff3) + rev_ann_gff3.flush() + + self.conf["prepare"]["files"]["gff"] = [] + self.conf["prepare"]["files"]["output_dir"] = tempfile.gettempdir() + self.conf["prepare"]["files"]["out_fasta"] = "mikado_prepared.fasta" + self.conf["prepare"]["files"]["out"] = "mikado_prepared.gtf" + args = Namespace() + args.json_conf = self.conf + + for fname in [ann_gff3, rev_ann_gff3.name]: + for strip in (True, False): + with self.subTest(fname=fname, strip=strip): + self.conf["prepare"]["files"]["gff"] = [fname] + args.json_conf["prepare"]["strip_cds"] = strip + prepare.prepare(args, self.logger) + self.assertTrue(os.path.exists(os.path.join(self.conf["prepare"]["files"]["output_dir"], + "mikado_prepared.fasta"))) + fa = pyfaidx.Fasta(os.path.join(self.conf["prepare"]["files"]["output_dir"], + "mikado_prepared.fasta")) + if strip is True or (strip is False and fname == ann_gff3): + self.assertEqual(len(fa.keys()), 2) + else: + self.assertEqual(len(fa.keys()), 0) + # Now verify that no model has CDS + gtf = os.path.join(self.conf["prepare"]["files"]["output_dir"], "mikado_prepared.gtf") + models = dict() + for line in to_gff(gtf): + if line.header: + continue + elif line.is_transcript: + models[line.id] = Transcript(line) + else: + models[line.parent[0]].add_exon(line) + [models[model].finalize() for model in models] + for model in models: + if strip is False: + self.assertTrue(models[model].is_coding, models[model].format("gtf")) + else: + self.assertFalse(models[model].is_coding, models[model].format("gtf")) + class CompareCheck(unittest.TestCase): diff --git a/Mikado/tests/test_transcript_checker.py b/Mikado/tests/test_transcript_checker.py index 0d68faeac..b5d85c8eb 100644 --- a/Mikado/tests/test_transcript_checker.py +++ b/Mikado/tests/test_transcript_checker.py @@ -8,6 +8,7 @@ from 
Mikado.parsers.GFF import GffLine from Mikado.parsers.GTF import GtfLine from Mikado.transcripts.transcript import Transcript +from Mikado.exceptions import InvalidTranscript class TChekerTester(unittest.TestCase): @@ -132,6 +133,27 @@ def test_monoexonic(self): tcheck.check_strand() self.assertEqual(tcheck.strand, "-") + def test_monoexonic_cds(self): + + # Chr5 tair10 exon 26584797 26584879 . + . ID=c58_g1_i3.mrna1.19.exon1;Parent=c58_g1_i3.mrna1.19 + for strand in ("+", "-"): + with self.subTest(strand=strand): + exon = self.gff_lines[1] + transcript_line = self.gff_lines[0] + transcript_line.end = exon.end + transcript_line.strand = strand + exon.strand = strand + model = Transcript(transcript_line) + model.add_exon(exon) + model.add_exon((exon.start + 2, exon.end), feature="CDS") + model.finalize() + self.assertTrue(model.is_coding) + fasta = self.fasta[model.chrom][model.start - 1: model.end] + tcheck = TranscriptChecker(model.copy(), fasta, force_keep_cds=True, strand_specific=False) + tcheck.check_strand() + self.assertEqual(model.strand, strand) + self.assertTrue(model.is_coding) + def test_negative(self): gtf_lines = """Chr5 Cufflinks transcript 26575364 26578163 1000 - . 
gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";exon_number "1";FPKM "2.9700103727";conf_hi "3.260618";frac "0.732092";cov "81.895309";conf_lo "2.679403"; @@ -225,6 +247,10 @@ def test_reverse_with_cds_negative(self): check_model.check_strand() self.assertEqual(check_model.strand, "-") self.assertFalse(check_model.is_coding) + check_model = TranscriptChecker(model, model_fasta, force_keep_cds=True) + # Check that if we want to keep the CDS, this will raise an error + with self.assertRaises(InvalidTranscript): + check_model.check_strand() def test_reverse_with_cds_positive(self): @@ -245,6 +271,9 @@ def test_reverse_with_cds_positive(self): check_model.check_strand() self.assertEqual(check_model.strand, "+") self.assertFalse(check_model.is_coding) + check_model = TranscriptChecker(model, model_fasta, force_keep_cds=True) + with self.assertRaises(InvalidTranscript): + check_model.check_strand() def test_monoexonic_suspicious(self): @@ -287,6 +316,21 @@ def test_monoexonic_suspicious(self): self.assertFalse(model.suspicious_splicing) self.assertFalse(model.only_non_canonical_splicing) + def test_sequence_reversed(self): + + model = Transcript() + model.chrom, model.start, model.end, model.strand = "Chr5", 1001, 1500, "+" + model.add_exon((1001, 1500)) + model.id, model.parent = "foo.1", "foo" + model.finalize() + seq = str(self.fasta["Chr5"][1001-1:1500].seq) + self.assertEqual(len(seq), len(model)) + model = TranscriptChecker(model, seq, strand_specific=True) + model.reverse_strand() + fasta = "".join(model.fasta.split("\n")[1:]) + self.assertEqual(model.strand, "-") + self.assertEqual(fasta, TranscriptChecker.rev_complement(seq)) + class StopCodonChecker(unittest.TestCase): diff --git a/Mikado/transcripts/transcriptchecker.py b/Mikado/transcripts/transcriptchecker.py index d2fedd04b..9e0bbad1d 100644 --- a/Mikado/transcripts/transcriptchecker.py +++ b/Mikado/transcripts/transcriptchecker.py @@ -33,6 +33,7 @@ class 
TranscriptChecker(Transcript): def __init__(self, gffline, seq, strand_specific=False, lenient=False, canonical_splices=(("GT", "AG"), ("GC", "AG"), ("AT", "AC")), + force_keep_cds=False, logger=None): """ @@ -70,6 +71,7 @@ def __init__(self, gffline, seq, self.mixed_splices = False self.reversed = False self.canonical_splices = [] + self.__force_keep_cds = force_keep_cds if not isinstance(canonical_splices, (tuple, list)): raise ValueError("Canonical splices should be provided as lists or tuples") @@ -178,7 +180,7 @@ def check_strand(self): if self.checked is True: return - if self.strand_specific is False and self.monoexonic is True: + if self.strand_specific is False and self.monoexonic is True and self.__force_keep_cds is False: self.strand = None elif self.monoexonic is False: @@ -244,6 +246,11 @@ def check_strand(self): self.checked = True return + def reverse_strand(self): + if self.is_coding is True and self.__force_keep_cds is True: + raise InvalidTranscript("I cannot reverse the strand of a coding transcript.") + super().reverse_strand() + def _check_intron(self, intron): """