From e59ce900047c7da68a36cfdf2c908967d28f2a30 Mon Sep 17 00:00:00 2001
From: Luca Venturini <lucventurini@gmail.com>
Date: Wed, 26 Sep 2018 12:36:05 +0100
Subject: [PATCH] Further fixes for #126. Now a monoexonic model is always
 kept on its strand if it is coding, while a multiexonic model is rejected
 outright if it is coding and its strand would have to be swapped.

---
 Mikado/preparation/checking.py          |  6 +++
 Mikado/preparation/prepare.py           |  3 +-
 Mikado/tests/test_system_calls.py       | 59 ++++++++++++++++++++++++-
 Mikado/tests/test_transcript_checker.py | 44 ++++++++++++++++++
 Mikado/transcripts/transcriptchecker.py |  9 +++-
 5 files changed, 118 insertions(+), 3 deletions(-)

diff --git a/Mikado/preparation/checking.py b/Mikado/preparation/checking.py
index b3a0a8f10..62826ad55 100644
--- a/Mikado/preparation/checking.py
+++ b/Mikado/preparation/checking.py
@@ -21,6 +21,7 @@ def create_transcript(lines,
                       canonical_splices=(("GT", "AG"),
                                          ("GC", "AG"),
                                          ("AT", "AC")),
+                      force_keep_cds=False,
                       logger=None):
     """Function to create the checker.
 
@@ -40,6 +41,10 @@ def create_transcript(lines,
     :param canonical_splices: the splices considered as canonical for the species.
     :type canonical_splices: list[tuple]
 
+    :param force_keep_cds: boolean. If True, coding transcripts whose strand would be flipped are excluded instead.
+                           This flag is meant to be the logical negation of strip_cds.
+    :type force_keep_cds: bool
+
     :param logger: optional logger to use during processing.
 
     :rtype: (None|TranscriptChecker)
@@ -72,6 +77,7 @@ def create_transcript(lines,
                                               lenient=lenient,
                                               strand_specific=strand_specific,
                                               canonical_splices=canonical_splices,
+                                              force_keep_cds=force_keep_cds,
                                               logger=logger)
         logger.debug("Finished adding exon lines to %s", lines["tid"])
         transcript_object.finalize()
diff --git a/Mikado/preparation/prepare.py b/Mikado/preparation/prepare.py
index d492ef1c9..4bff5ad40 100644
--- a/Mikado/preparation/prepare.py
+++ b/Mikado/preparation/prepare.py
@@ -148,7 +148,8 @@ def perform_check(keys, shelve_stacks, args, logger):
             lenient=args.json_conf["prepare"]["lenient"],
             # strand_specific=args.json_conf["prepare"]["strand_specific"],
             canonical_splices=args.json_conf["prepare"]["canonical"],
-            logger=logger)
+            logger=logger,
+            force_keep_cds= not args.json_conf["prepare"]["strip_cds"])
 
         for tid, chrom, key in keys:
             tid, shelf_name = tid
diff --git a/Mikado/tests/test_system_calls.py b/Mikado/tests/test_system_calls.py
index d4bab7adb..d58febf36 100644
--- a/Mikado/tests/test_system_calls.py
+++ b/Mikado/tests/test_system_calls.py
@@ -20,7 +20,10 @@
 from Mikado.scales.compare import compare, load_index
 from Mikado.subprograms.util.stats import Calculator
 from Mikado.transcripts.transcript import Namespace
-from Mikado.utilities.log_utils import create_null_logger
+from Mikado.utilities.log_utils import create_null_logger, create_default_logger
+from Mikado.parsers.GFF import GffLine
+from Mikado.parsers import to_gff
+from Mikado.transcripts import Transcript
 
 
 class PrepareCheck(unittest.TestCase):
@@ -185,6 +188,60 @@ def test_prepare_trinity_and_cufflinks(self):
                 os.remove(os.path.join(self.conf["prepare"]["files"]["output_dir"],
                                        "mikado_prepared.fasta.fai"))
 
+    def test_prepare_with_cds(self):
+
+        rev_strand = {"+": "-", "-": "+"}
+
+        self.conf["prepare"]["files"]["labels"] = ["ann"]
+        ann_gff3 = pkg_resources.resource_filename("Mikado.tests", "annotation.gff3")
+        rev_ann_gff3 = tempfile.NamedTemporaryFile(suffix=".gff3", mode="wt")
+        with open(ann_gff3) as ann:
+            for line in ann:
+                line = GffLine(line)
+                if line.header is True:
+                    continue
+                line.strand = rev_strand[line.strand]  # Invert strand.
+                print(line, file=rev_ann_gff3)
+        rev_ann_gff3.flush()
+
+        self.conf["prepare"]["files"]["gff"] = []
+        self.conf["prepare"]["files"]["output_dir"] = tempfile.gettempdir()
+        self.conf["prepare"]["files"]["out_fasta"] = "mikado_prepared.fasta"
+        self.conf["prepare"]["files"]["out"] = "mikado_prepared.gtf"
+        args = Namespace()
+        args.json_conf = self.conf
+
+        for fname in [ann_gff3, rev_ann_gff3.name]:
+            for strip in (True, False):
+                with self.subTest(fname=fname, strip=strip):
+                    self.conf["prepare"]["files"]["gff"] = [fname]
+                    args.json_conf["prepare"]["strip_cds"] = strip
+                    prepare.prepare(args, self.logger)
+                    self.assertTrue(os.path.exists(os.path.join(self.conf["prepare"]["files"]["output_dir"],
+                                                                "mikado_prepared.fasta")))
+                    fa = pyfaidx.Fasta(os.path.join(self.conf["prepare"]["files"]["output_dir"],
+                                                    "mikado_prepared.fasta"))
+                    if strip is True or (strip is False and fname == ann_gff3):
+                        self.assertEqual(len(fa.keys()), 2)
+                    else:
+                        self.assertEqual(len(fa.keys()), 0)
+                    # Now verify that the models are coding if and only if strip_cds is False
+                    gtf = os.path.join(self.conf["prepare"]["files"]["output_dir"], "mikado_prepared.gtf")
+                    models = dict()
+                    for line in to_gff(gtf):
+                        if line.header:
+                            continue
+                        elif line.is_transcript:
+                            models[line.id] = Transcript(line)
+                        else:
+                            models[line.parent[0]].add_exon(line)
+                    [models[model].finalize() for model in models]
+                    for model in models:
+                        if strip is False:
+                            self.assertTrue(models[model].is_coding, models[model].format("gtf"))
+                        else:
+                            self.assertFalse(models[model].is_coding, models[model].format("gtf"))
+
 
 class CompareCheck(unittest.TestCase):
 
diff --git a/Mikado/tests/test_transcript_checker.py b/Mikado/tests/test_transcript_checker.py
index 0d68faeac..b5d85c8eb 100644
--- a/Mikado/tests/test_transcript_checker.py
+++ b/Mikado/tests/test_transcript_checker.py
@@ -8,6 +8,7 @@
 from Mikado.parsers.GFF import GffLine
 from Mikado.parsers.GTF import GtfLine
 from Mikado.transcripts.transcript import Transcript
+from Mikado.exceptions import InvalidTranscript
 
 
 class TChekerTester(unittest.TestCase):
@@ -132,6 +133,27 @@ def test_monoexonic(self):
         tcheck.check_strand()
         self.assertEqual(tcheck.strand, "-")
 
+    def test_monoexonic_cds(self):
+
+        # Chr5	tair10	exon	26584797	26584879	.	+	.	ID=c58_g1_i3.mrna1.19.exon1;Parent=c58_g1_i3.mrna1.19
+        for strand in ("+", "-"):
+            with self.subTest(strand=strand):
+                exon = self.gff_lines[1]
+                transcript_line = self.gff_lines[0]
+                transcript_line.end = exon.end
+                transcript_line.strand = strand
+                exon.strand = strand
+                model = Transcript(transcript_line)
+                model.add_exon(exon)
+                model.add_exon((exon.start + 2, exon.end), feature="CDS")
+                model.finalize()
+                self.assertTrue(model.is_coding)
+                fasta = self.fasta[model.chrom][model.start - 1: model.end]
+                tcheck = TranscriptChecker(model.copy(), fasta, force_keep_cds=True, strand_specific=False)
+                tcheck.check_strand()
+                self.assertEqual(model.strand, strand)
+                self.assertTrue(model.is_coding)
+
     def test_negative(self):
 
         gtf_lines = """Chr5	Cufflinks	transcript	26575364	26578163	1000	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";exon_number "1";FPKM "2.9700103727";conf_hi "3.260618";frac "0.732092";cov "81.895309";conf_lo "2.679403";
@@ -225,6 +247,10 @@ def test_reverse_with_cds_negative(self):
         check_model.check_strand()
         self.assertEqual(check_model.strand, "-")
         self.assertFalse(check_model.is_coding)
+        check_model = TranscriptChecker(model, model_fasta, force_keep_cds=True)
+        # Check that if we want to keep the CDS, this will raise an error
+        with self.assertRaises(InvalidTranscript):
+            check_model.check_strand()
 
     def test_reverse_with_cds_positive(self):
 
@@ -245,6 +271,9 @@ def test_reverse_with_cds_positive(self):
         check_model.check_strand()
         self.assertEqual(check_model.strand, "+")
         self.assertFalse(check_model.is_coding)
+        check_model = TranscriptChecker(model, model_fasta, force_keep_cds=True)
+        with self.assertRaises(InvalidTranscript):
+            check_model.check_strand()
 
     def test_monoexonic_suspicious(self):
 
@@ -287,6 +316,21 @@ def test_monoexonic_suspicious(self):
         self.assertFalse(model.suspicious_splicing)
         self.assertFalse(model.only_non_canonical_splicing)
 
+    def test_sequence_reversed(self):
+
+        model = Transcript()
+        model.chrom, model.start, model.end, model.strand = "Chr5", 1001, 1500, "+"
+        model.add_exon((1001, 1500))
+        model.id, model.parent = "foo.1", "foo"
+        model.finalize()
+        seq = str(self.fasta["Chr5"][1001-1:1500].seq)
+        self.assertEqual(len(seq), len(model))
+        model = TranscriptChecker(model, seq, strand_specific=True)
+        model.reverse_strand()
+        fasta = "".join(model.fasta.split("\n")[1:])
+        self.assertEqual(model.strand, "-")
+        self.assertEqual(fasta, TranscriptChecker.rev_complement(seq))
+
 
 class StopCodonChecker(unittest.TestCase):
 
diff --git a/Mikado/transcripts/transcriptchecker.py b/Mikado/transcripts/transcriptchecker.py
index d2fedd04b..9e0bbad1d 100644
--- a/Mikado/transcripts/transcriptchecker.py
+++ b/Mikado/transcripts/transcriptchecker.py
@@ -33,6 +33,7 @@ class TranscriptChecker(Transcript):
     def __init__(self, gffline, seq,
                  strand_specific=False, lenient=False,
                  canonical_splices=(("GT", "AG"), ("GC", "AG"), ("AT", "AC")),
+                 force_keep_cds=False,
                  logger=None):
 
         """
@@ -70,6 +71,7 @@ def __init__(self, gffline, seq,
         self.mixed_splices = False
         self.reversed = False
         self.canonical_splices = []
+        self.__force_keep_cds = force_keep_cds
         if not isinstance(canonical_splices, (tuple, list)):
             raise ValueError("Canonical splices should be provided as lists or tuples")
 
@@ -178,7 +180,7 @@ def check_strand(self):
         if self.checked is True:
             return
 
-        if self.strand_specific is False and self.monoexonic is True:
+        if self.strand_specific is False and self.monoexonic is True and self.__force_keep_cds is False:
             self.strand = None
 
         elif self.monoexonic is False:
@@ -244,6 +246,11 @@ def check_strand(self):
         self.checked = True
         return
 
+    def reverse_strand(self):
+        if self.is_coding is True and self.__force_keep_cds is True:
+            raise InvalidTranscript("I cannot reverse the strand of a coding transcript.")
+        super().reverse_strand()
+
     def _check_intron(self, intron):
 
         """