Added configuration option for minimal ORF length, as requested by Gemy

EI-CoreBioinformatics · Sep 8, 2015 · 6b9d5c5 · 6b9d5c5
1 parent 3499eb4
commit 6b9d5c5
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 2 deletions.
diff --git a/mikado_lib/json_utils.py b/mikado_lib/json_utils.py
@@ -442,13 +442,21 @@ def check_orf_loading(json_conf):
         json_conf["orf_loading"] = dict()
         json_conf["orf_loading"]["strand_specific"] = False
         json_conf["orf_loading"]["minimal_secondary_orf_length"] = 0
+        json_conf["orf_loading"]["minimal_orf_length"] = 0
     else:
         if "strand_specific" not in json_conf:
             json_conf["orf_loading"]["strand_specific"] = False
         else:
             if not type(json_conf["orf_loading"]["strand_specific"]) is bool:
                 raise mikado_lib.exceptions.InvalidJson(
                     "Invalid strand_specific value: {0}".format(json_conf["orf_loading"]["strand_specific"]))
+        if "minimal_orf_length" not in json_conf["orf_loading"]:
+            json_conf["orf_loading"]["minimal_orf_length"] = 0
+        else:
+            if not type(json_conf["orf_loading"]["minimal_orf_length"]) is int:
+                raise mikado_lib.exceptions.InvalidJson("Invalid minimal_orf_length value: {0}".format(
+                    json_conf["orf_loading"]["minimal_orf_length"]))
+
         if "minimal_secondary_orf_length" not in json_conf["orf_loading"]:
             json_conf["orf_loading"]["minimal_secondary_orf_length"] = 0
         else:

diff --git a/mikado_lib/loci_objects/transcript.py b/mikado_lib/loci_objects/transcript.py
@@ -104,6 +104,7 @@ class Transcript:
 
     orf_baked = bakery(lambda session: session.query(mikado_lib.serializers.orf.Orf))
     orf_baked += lambda q: q.filter(mikado_lib.serializers.orf.Orf.query_id == bindparam("query_id"))
+    orf_baked += lambda q: q.filter(mikado_lib.serializers.orf.Orf.cds_len >= bindparam("cds_len"))
     orf_baked += lambda q: q.order_by(desc(mikado_lib.serializers.orf.Orf.cds_len))
 
     # ######## Class special methods ####################
@@ -1120,10 +1121,11 @@ def retrieve_from_dict(self, data_dict):
 
         # ORF data
         trust_strand = self.json_dict["orf_loading"]["strand_specific"]
+        min_cds_len = self.json_dict["orf_loading"]["minimal_orf_length"]
 
         self.logger.debug("Retrieving ORF information from DB dictionary for {0}".format(self.id))
         if self.id in data_dict["orfs"]:
-            candidate_orfs = data_dict["orfs"][self.id]
+            candidate_orfs = list(filter(lambda orf: orf.cds_len >=min_cds_len, data_dict["orfs"][self.id]))
         else:
             candidate_orfs = []
 
@@ -1203,8 +1205,10 @@ def retrieve_orfs(self):
             return []
 
         trust_strand = self.json_dict["orf_loading"]["strand_specific"]
+        min_cds_len = self.json_dict["orf_loading"]["minimal_orf_length"]
 
-        orf_results = self.orf_baked(self.session).params(query_id=self.query_id)
+        orf_results = self.orf_baked(self.session).params(query_id=self.query_id,
+                                                          cds_len=min_cds_len)
 
         if (self.monoexonic is False) or (self.monoexonic is True and trust_strand is True):
             # Remove negative strand ORFs for multiexonic transcripts, or monoexonic strand-specific transcripts

diff --git a/sample_data/configuration.yaml b/sample_data/configuration.yaml
@@ -30,6 +30,7 @@ alternative_splicing:
   max_isoforms: 3
 orf_loading:
   minimal_secondary_orf_length: 200
+  minimal_orf_length: 50
   strand_specific: true
 run_options:
   shm: false