lucventurini · lucventurini · Oct 20, 2019 · Oct 17, 2019 · Oct 17, 2019 · Oct 19, 2019
diff --git a/Mikado/picking/_merge_loci_utils.py b/Mikado/picking/_merge_loci_utils.py
@@ -5,6 +5,8 @@
 from ..loci import Locus
 import sys
 import collections
+import itertools
+import numpy as np
 from ._locus_line_creator import _create_locus_lines
 
 
@@ -83,39 +85,34 @@ def manage_index(data, dumps, source):
 def __create_gene_counters(common_index: dict) -> (dict, int):
     """Function to assign to each counter in the database the correct base and maximum number of genes.
     This allows to parallelise the printing.
+    The common index has the following structure:
+
+    d[counter] = (database index, chrom, number of genes in locus)
     """
 
-    chroms, nums = list(zip(*[common_index[index][1:3] for index in range(1, max(common_index.keys()) + 1)]))
-    total_genes = sum(nums)
+    chroms = []
+    num_genes = []
+
+    for index in range(1, max(common_index.keys()) + 1):
+        _, chrom, n_genes = common_index[index]
+        chroms.append(chrom)
+        num_genes.append(n_genes)
+
+    chroms = np.array(chroms)
+    num_genes = np.array(num_genes)
+
     gene_counters = dict()
+    total_genes = sum(num_genes)
+
     chrom_tots = collections.defaultdict(list)
-    assert len(chroms) == len(common_index), (len(chroms), len(common_index))
-    for pos in range(len(chroms)):
-        key = pos + 1
-        chrom, num = chroms[pos], nums[pos]
-        if chrom == '' and pos > 0:
-            assert num == 0
-            former = gene_counters[pos][0]
-        elif pos == 0 or chrom != chroms[pos - 1]:
-            if chroms[pos - 1] != "":
-                former = 0
-            else:  # The previous one is wrong ..
-                prev_pos = pos - 1
-                prev_chrom = chroms[prev_pos]
-                while prev_chrom == "":
-                    prev_pos -= 1
-                    if prev_pos < 0:
-                        break
-                    prev_chrom = chroms[prev_pos]
-                if prev_chrom == "" or prev_chrom != chrom:
-                    former = 0
-                else:
-                    former = gene_counters[pos][0] + gene_counters[pos][1]
-        else:
-            former = gene_counters[pos][0] + gene_counters[pos][1]
-        gene_counters[key] = (former, num)
-        if chrom:
-            chrom_tots[chrom].extend(list(range(former + 1, former + num + 1)))
+    for chrom in np.unique(chroms):
+        index = np.where(chroms == chrom)
+        totals = num_genes[index]
+        cumu = totals.cumsum()
+        for counter, former, num in zip(index[0], itertools.chain([0], cumu[:-1]), totals):
+            gene_counters[counter + 1] = (former, num)
+            if chrom:
+                chrom_tots[chrom].extend(list(range(former + 1, former + num + 1)))
 
     tot_found = 0
     for chrom in chrom_tots:
@@ -137,9 +134,11 @@ def __create_gene_counters(common_index: dict) -> (dict, int):
             tot_found += chrom_tots[chrom][-1]
 
     assert tot_found == total_genes, (tot_found, total_genes)
-    new_common = dict()
+
     assert min(common_index) == 1
 
+    new_common = dict()
     for key in common_index:
+        # DbIndex
         new_common[key] = (common_index[key][0], gene_counters[key][0], gene_counters[key][1])
     return new_common, total_genes
diff --git a/Mikado/picking/loci_processer.py b/Mikado/picking/loci_processer.py
@@ -61,12 +61,17 @@ def merge_loci(num_temp, out_handles,
         checker.update(counters)
         logger.fatal("%d double indices found!", len([_ for _ in checker if checker[_] > 1]))
 
+    # Start iterating the output dictionaries ("cursors")
     for dbindex, cursor in enumerate(cursors):
-        d = dict((index[0], (dbindex, index[1], index[2])) for index in cursor.execute(
-            "SELECT counter, chrom, genes FROM loci").fetchall())
-        assert not set.intersection(set(d.keys()), set(common_index.keys())), set.intersection(
-            set(d.keys()), set(common_index.keys()))
-
+        # Get the counter (this is the dictionary key), chromosome, and number of genes
+        d = dict()
+        doubles = set()
+        for counter, chrom, genes in cursor.execute("SELECT counter, chrom, genes FROM loci"):
+            if counter in common_index:
+                doubles.add(counter)
+            d[counter] = (dbindex, chrom, genes)
+        if len(doubles) > 0:
+            raise AssertionError("Double indices found: {}".format(doubles))
         common_index.update(d)
 
     print_subloci = (out_handles[1][0] is not None)
@@ -75,10 +80,10 @@ def merge_loci(num_temp, out_handles,
         raise KeyError("I am missing some loci! {} vs {}".format(
             max_counter, max(common_index.keys())))
 
-    assert set(common_index.keys()) == set(range(1, max(common_index.keys()) + 1)), (
-        set.difference(set(range(1, max(common_index.keys()) + 1)), set(common_index.keys()))
-    )
-    assert len(common_index.keys()) == len(set(common_index.keys()))
+    __valid = set(range(1, max(common_index.keys()) + 1))
+    if set(common_index.keys()) != __valid:
+        missing = set.difference(__valid, set(common_index.keys()))
+        raise AssertionError("Missing the following loci: {}".format(missing))
 
     new_common, total_genes = __create_gene_counters(common_index)
 

diff --git a/Mikado/scales/assigner.py b/Mikado/scales/assigner.py
@@ -434,6 +434,8 @@ def __check_for_fusions(self, prediction, matches, fuzzymatch=None):
             for gene in genes:
                 if gene not in new_matches:
                     continue
+                elif new_matches[gene] == []:
+                    continue
                 # The best match is the first
                 if new_matches[gene][0].j_f1[0] == 0 and new_matches[gene][0].n_recall[0] < 10:
                     dubious.add(gene)

diff --git a/sample_data/Snakefile b/sample_data/Snakefile
@@ -38,7 +38,7 @@ configfile: "configuration.yaml"
 
 rule complete:
     input: "compare.stats", "compare_subloci.stats", "compare_input.stats", "check.ok",
-         "check_metrics.ok", "daijin_test/mikado.yaml", "g11.ok", "refmap_check.ok",
+         "check_metrics.ok", "daijin_test/mikado.yaml", "g11.ok", "refmap_check.ok", "refmap_check_pc.ok",
          "external.ok"
     output: touch("finished.ok")
 
@@ -129,16 +129,42 @@ rule compare_subloci:
     message: """mikado compare -r {input[reference]} -p {input[prediction]} -o compare_subloci -l {log}"""
     shell: """mikado compare -r {input[reference]} -p {input[prediction]} -o compare_subloci -l {log}"""
 
+rule compare_subloci_pc:
+    input:
+        reference="reference.gff3",
+        prediction=rules.daijin.output.sub,
+        midx=rules.index_reference.output
+    output: stats="compare_subloci_pc.stats", tmap="compare_subloci_pc.tmap", refmap="compare_subloci_pc.refmap"
+    log: "compare_subloci_pc.log"
+    message: """mikado compare -r {input[reference]} -p {input[prediction]} -o compare_subloci_pc -l {log}"""
+    shell: """mikado compare -r {input[reference]} -p {input[prediction]} -o compare_subloci_pc -l {log}"""
+
 rule check_refmap:
     input:
         refmap=rules.compare_input.output.refmap
     output: touch("refmap_check.ok")
     run:
         import pandas as pd
-        refmap = pd.read_csv(input["refmap"], delimiter="\t")
+        refmap = pd.read_csv(input["refmap"], delimiter="\t", index_col=0)
         assert refmap.location.str.contains("^Chr5:", regex=True).all()
-        assert refmap.ccode.str.contains("^(=|_)$", regex=True).all()
-        assert (refmap.tid.str.replace("^at_", "", regex=True) == refmap["ref_id"]).all()
+        # Account for the non-protein coding gene: AT5G66650 rows are checked separately below
+        assert refmap.loc[~refmap.index.str.contains("AT5G66650")].ccode.str.contains("^(=|_)$", regex=True).all()
+        assert refmap.loc[refmap.index.str.contains("AT5G66650")].ccode.str.contains("^(f,=|f,_)$", regex=True).all()
+        assert (refmap.index == refmap.tid.str.replace("^at_", "", regex=True)).all(), refmap.tid
+
+rule check_refmap_pc:
+    input:
+        refmap=rules.compare_subloci_pc.output.refmap
+    output: touch("refmap_check_pc.ok")
+    run:
+        import pandas as pd
+        table = pd.read_csv(input["refmap"], delimiter="\t", index_col=0)
+        assert table["location"].str.contains("^Chr5:", regex=True).all()
+        # Account for the non-protein coding: only rows not matching AT5G66650 are checked
+        coding = table.loc[~table.index.str.contains("AT5G66650")]
+        assert coding["ccode"].str.contains("^(=|_)$", regex=True).all()
+        assert (coding["tid"].str.replace("^at_", "", regex=True) == coding.index).all()
+
 
 rule check_logs:
     input:
@@ -332,6 +358,7 @@ rule clean:
                                     glob.glob("*.log"), glob.glob("*xml"), ["chr5.fas"],
 				                    glob.glob("*.ok"),
                                     "refmap_check.ok",
+                                    "refmap_check_pc.ok",
                                     ["configuration.yaml"]):
             if os.path.exists(filename) and filename not in (".", ".."):
                 shutil.rmtree(filename) if os.path.isdir(filename) else os.remove(filename)

diff --git a/sample_data/reference.gff3 b/sample_data/reference.gff3
@@ -273,6 +273,11 @@ Chr5	TAIR10	CDS	26603204	26603638	.	-	0	Parent=AT5G66650.1
 Chr5	TAIR10	three_prime_UTR	26603003	26603203	.	-	.	Parent=AT5G66650.1
 Chr5	TAIR10	exon	26603003	26603638	.	-	.	Parent=AT5G66650.1
 ###
+Chr5	TAIR10	gene	26603503	26604876	.	-	.	ID=nc_AT5G66650;Name=nc_AT5G66650;
+Chr5	TAIR10	ncRNA	26603503	26604876	.	-	.	ID=nc_AT5G66650.1;Parent=nc_AT5G66650;Name=nc_AT5G66650.1;index=1
+Chr5	TAIR10	exon	26604227	26604876	.	-	.	Parent=nc_AT5G66650.1
+Chr5	TAIR10	exon	26603503	26604138	.	-	.	Parent=nc_AT5G66650.1
+###
 Chr5	TAIR10	gene	26608316	26608866	.	+	.	ID=AT5G66658;Name=AT5G66658;note=protein_coding_gene
 Chr5	TAIR10	mRNA	26608316	26608866	.	+	.	ID=AT5G66658.1;Parent=AT5G66658;Name=AT5G66658.1;index=1
 Chr5	TAIR10	protein	26608329	26608553	.	+	.	ID=AT5G66658.1-Protein;Parent=AT5G66658.1;Name=AT5G66658.1;derives_from=AT5G66658.1