Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 241 #9

Merged
merged 3 commits into from
Oct 20, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 29 additions & 30 deletions Mikado/picking/_merge_loci_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from ..loci import Locus
import sys
import collections
import itertools
import numpy as np
from ._locus_line_creator import _create_locus_lines


Expand Down Expand Up @@ -83,39 +85,34 @@ def manage_index(data, dumps, source):
def __create_gene_counters(common_index: dict) -> (dict, int):
"""Function to assign to each counter in the database the correct base and maximum number of genes.
This allows to parallelise the printing.
The common index has the following structure:

d[counter] = (database index, chrom, number of genes in locus)
"""

chroms, nums = list(zip(*[common_index[index][1:3] for index in range(1, max(common_index.keys()) + 1)]))
total_genes = sum(nums)
chroms = []
num_genes = []

for index in range(1, max(common_index.keys()) + 1):
_, chrom, n_genes = common_index[index]
chroms.append(chrom)
num_genes.append(n_genes)

chroms = np.array(chroms)
num_genes = np.array(num_genes)

gene_counters = dict()
total_genes = sum(num_genes)

chrom_tots = collections.defaultdict(list)
assert len(chroms) == len(common_index), (len(chroms), len(common_index))
for pos in range(len(chroms)):
key = pos + 1
chrom, num = chroms[pos], nums[pos]
if chrom == '' and pos > 0:
assert num == 0
former = gene_counters[pos][0]
elif pos == 0 or chrom != chroms[pos - 1]:
if chroms[pos - 1] != "":
former = 0
else: # The previous one is wrong ..
prev_pos = pos - 1
prev_chrom = chroms[prev_pos]
while prev_chrom == "":
prev_pos -= 1
if prev_pos < 0:
break
prev_chrom = chroms[prev_pos]
if prev_chrom == "" or prev_chrom != chrom:
former = 0
else:
former = gene_counters[pos][0] + gene_counters[pos][1]
else:
former = gene_counters[pos][0] + gene_counters[pos][1]
gene_counters[key] = (former, num)
if chrom:
chrom_tots[chrom].extend(list(range(former + 1, former + num + 1)))
for chrom in np.unique(chroms):
index = np.where(chroms == chrom)
totals = num_genes[index]
cumu = totals.cumsum()
for counter, former, num in zip(index[0], itertools.chain([0], cumu[:-1]), totals):
gene_counters[counter + 1] = (former, num)
if chrom:
chrom_tots[chrom].extend(list(range(former + 1, former + num + 1)))

tot_found = 0
for chrom in chrom_tots:
Expand All @@ -137,9 +134,11 @@ def __create_gene_counters(common_index: dict) -> (dict, int):
tot_found += chrom_tots[chrom][-1]

assert tot_found == total_genes, (tot_found, total_genes)
new_common = dict()

assert min(common_index) == 1

new_common = dict()
for key in common_index:
# DbIndex
new_common[key] = (common_index[key][0], gene_counters[key][0], gene_counters[key][1])
return new_common, total_genes
23 changes: 14 additions & 9 deletions Mikado/picking/loci_processer.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,17 @@ def merge_loci(num_temp, out_handles,
checker.update(counters)
logger.fatal("%d double indices found!", len([_ for _ in checker if checker[_] > 1]))

# Start iterating the output dictionaries ("cursors")
for dbindex, cursor in enumerate(cursors):
d = dict((index[0], (dbindex, index[1], index[2])) for index in cursor.execute(
"SELECT counter, chrom, genes FROM loci").fetchall())
assert not set.intersection(set(d.keys()), set(common_index.keys())), set.intersection(
set(d.keys()), set(common_index.keys()))

# Get the counter (this is the dictionary key), chromosome, and number of genes
d = dict()
doubles = set()
for counter, chrom, genes in cursor.execute("SELECT counter, chrom, genes FROM loci"):
if counter in common_index:
doubles.add(counter)
d[counter] = (dbindex, chrom, genes)
if len(doubles) > 0:
raise AssertionError("Double indices found: {}".format(doubles))
common_index.update(d)

print_subloci = (out_handles[1][0] is not None)
Expand All @@ -75,10 +80,10 @@ def merge_loci(num_temp, out_handles,
raise KeyError("I am missing some loci! {} vs {}".format(
max_counter, max(common_index.keys())))

assert set(common_index.keys()) == set(range(1, max(common_index.keys()) + 1)), (
set.difference(set(range(1, max(common_index.keys()) + 1)), set(common_index.keys()))
)
assert len(common_index.keys()) == len(set(common_index.keys()))
__valid = set(range(1, max(common_index.keys()) + 1))
if set(common_index.keys()) != __valid:
missing = set.difference(__valid, set(common_index.keys()))
raise AssertionError("Missing the following loci: {}".format(missing))

new_common, total_genes = __create_gene_counters(common_index)

Expand Down
2 changes: 2 additions & 0 deletions Mikado/scales/assigner.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,8 @@ def __check_for_fusions(self, prediction, matches, fuzzymatch=None):
for gene in genes:
if gene not in new_matches:
continue
elif new_matches[gene] == []:
continue
# The best match is the first
if new_matches[gene][0].j_f1[0] == 0 and new_matches[gene][0].n_recall[0] < 10:
dubious.add(gene)
Expand Down
35 changes: 31 additions & 4 deletions sample_data/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ configfile: "configuration.yaml"

# Aggregate target: every comparison/check output below must exist before
# "finished.ok" is touched.
# Each input is listed once, one per line, with an explicit comma between
# entries: two adjacent string literals without a comma are concatenated into a
# single (non-existent) file name.
rule complete:
    input:
        "compare.stats",
        "compare_subloci.stats",
        "compare_input.stats",
        "check.ok",
        "check_metrics.ok",
        "daijin_test/mikado.yaml",
        "g11.ok",
        "refmap_check.ok",
        "refmap_check_pc.ok",
        "external.ok"
    output: touch("finished.ok")

Expand Down Expand Up @@ -129,16 +129,42 @@ rule compare_subloci:
message: """mikado compare -r {input[reference]} -p {input[prediction]} -o compare_subloci -l {log}"""
shell: """mikado compare -r {input[reference]} -p {input[prediction]} -o compare_subloci -l {log}"""

# Run `mikado compare` with reference.gff3 as reference and the daijin "sub"
# output as prediction, writing the compare_subloci_pc.{stats,tmap,refmap} files.
# The `midx` input is not referenced by the command itself; it is only declared
# as an input of this rule.
rule compare_subloci_pc:
    input:
        reference="reference.gff3",
        prediction=rules.daijin.output.sub,
        midx=rules.index_reference.output
    output:
        stats="compare_subloci_pc.stats",
        tmap="compare_subloci_pc.tmap",
        refmap="compare_subloci_pc.refmap"
    log:
        "compare_subloci_pc.log"
    message:
        """mikado compare -r {input[reference]} -p {input[prediction]} -o compare_subloci_pc -l {log}"""
    shell:
        """mikado compare -r {input[reference]} -p {input[prediction]} -o compare_subloci_pc -l {log}"""

# Sanity-check the refmap written by the compare_input rule; "refmap_check.ok"
# is touched only if every assertion below holds.
rule check_refmap:
    input:
        refmap=rules.compare_input.output.refmap
    output: touch("refmap_check.ok")
    run:
        import pandas as pd

        # The first column becomes the index; the final assertion compares it
        # against the transcript IDs.
        refmap = pd.read_csv(input["refmap"], delimiter="\t", index_col=0)

        # Every entry must be located on Chr5.
        assert refmap.location.str.contains("^Chr5:", regex=True).all()

        # Account for the non-protein-coding gene: rows whose index mentions
        # AT5G66650 are expected to carry a fusion ("f,") class code, all other
        # rows a plain "=" or "_".
        is_at5g66650 = refmap.index.str.contains("AT5G66650")
        assert refmap.loc[~is_at5g66650].ccode.str.contains("^(=|_)$", regex=True).all()
        assert refmap.loc[is_at5g66650].ccode.str.contains("^(f,=|f,_)$", regex=True).all()

        # "^at_" is a regular expression; regex=True is passed explicitly because
        # recent pandas versions treat the pattern as a literal string by default.
        assert (refmap.tid.index == refmap.tid.str.replace("^at_", "", regex=True)).all(), refmap.tid

# Sanity-check the refmap written by the compare_subloci_pc rule;
# "refmap_check_pc.ok" is touched only if every assertion below holds.
rule check_refmap_pc:
    input:
        refmap=rules.compare_subloci_pc.output.refmap
    output: touch("refmap_check_pc.ok")
    run:
        import pandas as pd

        table = pd.read_csv(input["refmap"], delimiter="\t", index_col=0)

        # Every entry must be located on Chr5.
        on_chr5 = table.location.str.contains("^Chr5:", regex=True)
        assert on_chr5.all()

        # Account for the non-protein coding: rows whose index mentions
        # AT5G66650 are excluded from the two checks below.
        others = table.loc[~table.index.str.contains("AT5G66650")]
        assert others.ccode.str.contains("^(=|_)$", regex=True).all()

        # With the "at_" prefix removed, each transcript ID must equal its index label.
        checker = others.tid.str.replace("^at_", "", regex=True)
        assert (checker == checker.index).all()


rule check_logs:
input:
Expand Down Expand Up @@ -332,6 +358,7 @@ rule clean:
glob.glob("*.log"), glob.glob("*xml"), ["chr5.fas"],
glob.glob("*.ok"),
"refmap_check.ok",
"refmap_check_pc.ok",
["configuration.yaml"]):
if os.path.exists(filename) and filename not in (".", ".."):
shutil.rmtree(filename) if os.path.isdir(filename) else os.remove(filename)
Expand Down
5 changes: 5 additions & 0 deletions sample_data/reference.gff3
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,11 @@ Chr5 TAIR10 CDS 26603204 26603638 . - 0 Parent=AT5G66650.1
Chr5 TAIR10 three_prime_UTR 26603003 26603203 . - . Parent=AT5G66650.1
Chr5 TAIR10 exon 26603003 26603638 . - . Parent=AT5G66650.1
###
Chr5 TAIR10 gene 26603503 26604876 . - . ID=nc_AT5G66650;Name=nc_AT5G66650;
Chr5 TAIR10 ncRNA 26603503 26604876 . - . ID=nc_AT5G66650.1;Parent=nc_AT5G66650;Name=nc_AT5G66650.1;index=1
Chr5 TAIR10 exon 26604227 26604876 . - . Parent=nc_AT5G66650.1
Chr5 TAIR10 exon 26603503 26604138 . - . Parent=nc_AT5G66650.1
###
Chr5 TAIR10 gene 26608316 26608866 . + . ID=AT5G66658;Name=AT5G66658;note=protein_coding_gene
Chr5 TAIR10 mRNA 26608316 26608866 . + . ID=AT5G66658.1;Parent=AT5G66658;Name=AT5G66658.1;index=1
Chr5 TAIR10 protein 26608329 26608553 . + . ID=AT5G66658.1-Protein;Parent=AT5G66658.1;Name=AT5G66658.1;derives_from=AT5G66658.1
Expand Down