Issue 240 (#245)

* Fix #240, #243
* Solved a bug that caused boolean values to be converted into integers for `pick`.
lucventurini authored Oct 25, 2019
1 parent b451404 commit 160a3a3
Showing 17 changed files with 1,217 additions and 73 deletions.
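For context on the second bullet of the commit message: in Python, `bool` is a subclass of `int`, so any code path that normalises metric values numerically can silently turn `True`/`False` into `1`/`0`. A minimal illustration of the pitfall (plain language behaviour, not Mikado code — the exact mechanism of the fixed bug is not shown in this diff):

    import numpy as np

    print(isinstance(True, int))        # True: bool subclasses int in Python
    print(int(True), int(False))        # 1 0
    print(np.array([True, False]) + 0)  # [1 0]: arithmetic coerces booleans to integers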
4 changes: 4 additions & 0 deletions Mikado/configuration/configuration_blueprint.json
@@ -639,6 +639,10 @@
                 "type": "boolean",
                 "default": false
             },
+            "check_references": {
+                "type": "boolean",
+                "default": false
+            },
             "single_thread": {
                 "type": "boolean",
                 "default": false
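The new flag is consumed at `json_conf["pick"]["run_options"]["check_references"]` (see the `abstractlocus.py` and `locus.py` hunks below). A minimal sketch of how it could be enabled in a pick configuration file, assuming the YAML layout mirrors the blueprint path:

    # Hypothetical excerpt of a Mikado configuration file
    pick:
      run_options:
        check_references: true  # also apply requirement/fragment checks to reference transcripts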
2 changes: 1 addition & 1 deletion Mikado/configuration/daijin_schema.json
@@ -276,7 +276,7 @@
       "- identity: minimum identity for any alignment. Default: 95%",
       "- coverage: minimum coverage for any alignment. Default: 70%"],
     "properties": {
-      "max_mem": {"type": "integer", "default": 6000, "minimum": 1000, "required": true},
+      "max_mem": {"type": "integer", "default": 6000, "minimum": 1000},
       "npaths": {"type": "integer", "default": 0},
       "identity": {"type": "number", "default": 0.95, "minimum": 0, "maximum": 1},
       "coverage": {"type": "number", "default": 0.70, "minimum": 0, "maximum": 1}
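A boolean `required` on an individual property is draft-03 JSON Schema syntax; from draft-04 onwards, validators expect `required` to be an array on the enclosing object, which is presumably why the flag is dropped here. Had the constraint been kept, the draft-04 form would look roughly like:

    "properties": {
        "max_mem": {"type": "integer", "default": 6000, "minimum": 1000}
    },
    "required": ["max_mem"]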
6 changes: 5 additions & 1 deletion Mikado/daijin/__init__.py
@@ -372,6 +372,7 @@ def assemble_transcripts_pipeline(args):
                                             prefix="assemble")
         yaml.dump(doc, yaml_file)
         yaml_file.flush()
+        shutil.copystat(args.config, yaml_file.name)

     if args.latency_wait is not None:
         latency = abs(args.latency_wait)
@@ -400,7 +401,8 @@ def assemble_transcripts_pipeline(args):
         "printdag": args.dag,
         "forceall": args.dag,
         "forcerun": args.forcerun,
-        "lock": (not args.nolock)
+        "lock": (not args.nolock),
+        "printreason": True
     }

     if "configfile" in inspect.getfullargspec(snakemake.snakemake).args:
@@ -494,6 +496,7 @@ def mikado_pipeline(args):
         )
         yaml.dump(doc, yaml_file)
         yaml_file.flush()
+        shutil.copystat(args.config, yaml_file.name)

     if SCHEDULER == "local":
         hpc_conf = None
@@ -541,6 +544,7 @@ def mikado_pipeline(args):
         "forceall": args.dag,
         "forcerun": args.forcerun,
         "lock": (not args.nolock),
+        "printreason": True
     }

     if "configfile" in inspect.getfullargspec(snakemake.snakemake).args:
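The two `shutil.copystat` additions propagate the original configuration file's metadata (permission bits, access and modification times) onto the temporary YAML copy handed to Snakemake; the file's contents are untouched. A standalone sketch of that behaviour, using throw-away files rather than Mikado's (an illustration, not project code):

    import os
    import shutil
    import tempfile

    with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as dst:
        dst.write("dummy: true\n")  # stand-in for the dumped configuration

    src = __file__  # any existing file serves as the "original" config here
    shutil.copystat(src, dst.name)  # copies timestamps and mode, not contents
    print(os.stat(src).st_mtime_ns == os.stat(dst.name).st_mtime_ns)  # True
    os.unlink(dst.name)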
44 changes: 31 additions & 13 deletions Mikado/loci/abstractlocus.py
@@ -570,6 +570,11 @@ def remove_transcript_from_locus(self, tid: str):
             self.initialized = False

         self.logger.debug("Deleted %s from %s", tid, self.id)
+        if tid in self._metrics:
+            del self._metrics[tid]
+        if tid in self.scores:
+            del self.scores[tid]
+
         self.metrics_calculated = False
         self.scores_calculated = False

@@ -987,6 +992,7 @@ def get_metrics(self):

         if self.metrics_calculated is True:
             return
+        self._metrics = dict()
         cds_bases = sum(_[1] - _[0] + 1 for _ in merge_ranges(
             itertools.chain(*[
                 self.transcripts[_].combined_cds for _ in self.transcripts
@@ -1135,19 +1141,20 @@ def _check_not_passing(self, previous_not_passing=set()):
             assert self.transcripts[tid].json_conf["prepare"]["files"][\
                 "reference"] == self.json_conf["prepare"]["files"]["reference"]

-            if self.transcripts[tid].is_reference is True:
-                # Reference transcripts should be kept in, no matter what.
-                self.logger.debug("Skipping %s from the requirement check as it is a reference transcript")
-                continue
-            elif self.transcripts[tid].original_source in self.json_conf["prepare"]["files"]["reference"]:
-                self.transcripts[tid].is_reference = True  # Bug
-                self.logger.debug("Skipping %s from the requirement check as it is a reference transcript", tid)
-                continue
-            else:
+            is_reference = ((self.transcripts[tid].is_reference is True) or
+                            (self.transcripts[tid].original_source in self.json_conf["prepare"]["files"]["reference"]))
+
+            if is_reference is False:
                 self.logger.debug("Transcript %s (source %s) is not a reference transcript (references: %s; in it: %s)",
                                   tid, self.transcripts[tid].original_source,
                                   self.json_conf["prepare"]["files"]["reference"],
-                                  self.transcripts[tid].original_source in self.json_conf["prepare"]["files"]["reference"])
+                                  self.transcripts[tid].original_source in self.json_conf["prepare"]["files"][
+                                      "reference"])
+            elif is_reference is True and self.json_conf["pick"]["run_options"]["check_references"] is False:
+                self.logger.debug("Skipping %s from the requirement check as it is a reference transcript", tid)
+                continue
+            elif is_reference is True and self.json_conf["pick"]["run_options"]["check_references"] is True:
+                self.logger.debug("Performing the requirement check for %s even if it is a reference transcript", tid)

         evaluated = dict()
         for key in self.json_conf["requirements"]["parameters"]:
@@ -1333,16 +1340,27 @@ def _calculate_score(self, param):
             try:
                 # metric = rgetattr(self.transcripts[tid], param)
                 if tid not in self._metrics and transcript.alias in self._metrics:
-                    metric = self._metrics[transcript.alias][param]
+                    if param in self._metrics[transcript.alias]:
+                        metric = self._metrics[transcript.alias][param]
+                    else:
+                        metric = rgetattr(self.transcripts[tid], param)
+                        self._metrics[transcript.alias][param] = metric
                 else:
-                    metric = self._metrics[tid][param]
+                    if tid not in self._metrics:
+                        self._metrics[tid] = dict()
+                    if param in self._metrics[tid]:
+                        metric = self._metrics[tid][param]
+                    else:
+                        metric = rgetattr(self.transcripts[tid], param)
+                        self._metrics[tid][param] = metric
                 if isinstance(metric, (tuple, list)):
                     metric = metric[0]
                 metrics[tid] = metric
             except TypeError:
                 raise TypeError(param)
             except KeyError:
-                raise KeyError(param)
+                metric = rgetattr(self.transcripts[tid], param)
+                raise KeyError((tid, param, metric))
             except AttributeError:
                 raise AttributeError(param)

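The rewritten `_calculate_score` block turns `self._metrics` into a lazy per-transcript cache: look the value up, and on a miss compute it once via `rgetattr` (by its name and usage, a recursive `getattr` that resolves dotted metric names) and memoise it before use. A standalone sketch of the same pattern, with hypothetical names (`cache`, `get_cached_metric`) rather than Mikado's own:

    from functools import reduce

    def rgetattr(obj, attr):
        """Recursive getattr, so dotted names like "a.b.c" resolve."""
        return reduce(getattr, attr.split("."), obj)

    def get_cached_metric(cache, transcripts, tid, param):
        # On a miss, compute the metric once and store it before returning.
        per_tid = cache.setdefault(tid, dict())
        if param not in per_tid:
            per_tid[param] = rgetattr(transcripts[tid], param)
        return per_tid[param]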
14 changes: 10 additions & 4 deletions Mikado/loci/locus.py
@@ -349,6 +349,10 @@ def __remove_redundant_after_padding(self):
                 continue
         return

+    def as_dict(self):
+        self.calculate_scores()
+        return super().as_dict()
+
     def remove_transcript_from_locus(self, tid: str):

         """Overloading of the AbstractLocus class, in order to ensure that the primary transcript will *not*
@@ -484,7 +488,8 @@ def is_putative_fragment(self):
         """This method will use the expression in the "not_fragmentary" section
         of the configuration to determine whether it is itself a putative fragment."""

-        if any(self.transcripts[tid].is_reference is True for tid in self.transcripts):
+        if not self.json_conf["pick"]["run_options"]["check_references"] and \
+                any(self.transcripts[tid].is_reference is True for tid in self.transcripts):
             return False

         self.json_conf["not_fragmentary"]["compiled"] = compile(
@@ -1065,7 +1070,7 @@ def __set_id(self, string):
             return
         primary_id = "{0}.1".format(string)
         old_primary = self.primary_transcript.id
-        self.primary_transcript.attributes["Alias"] = self.primary_transcript.id
+        self.primary_transcript.attributes["alias"] = self.primary_transcript.id
         self.primary_transcript.id = primary_id
         self.transcripts[primary_id] = self.primary_transcript
         self.primary_transcript_id = primary_id
@@ -1078,7 +1083,7 @@ def __set_id(self, string):

         for counter, tid in enumerate(order):
             counter += 2
-            self.transcripts[tid].attributes["Alias"] = tid
+            self.transcripts[tid].attributes["alias"] = tid
             new_id = "{0}.{1}".format(string, counter)
             self.transcripts[tid].id = new_id
             self.transcripts[new_id] = self.transcripts.pop(tid)
@@ -1087,6 +1092,7 @@ def __set_id(self, string):
         if self.scores_calculated is True:
             for tid in mapper:
                 self.scores[mapper[tid]] = self.scores.pop(tid)
+                self._metrics[mapper[tid]] = self._metrics.pop(tid)
         if self.metrics_calculated is True:
             for index in range(len(self.metric_lines_store)):
                 self.metric_lines_store[index]["tid"] = mapper[self.metric_lines_store[index]["tid"]]
@@ -1153,7 +1159,7 @@ def ts_max_splices(self):

     @property
     def has_reference_transcript(self):
-        return any(self.transcripts[transcript].is_reference is True for transcript in self)
+        return any(self.transcripts[transcript].is_reference for transcript in self)

     def _get_alternative_splicing_codes(self):
         """Method to retrieve the currently valid alternative splicing event codes"""
1 change: 1 addition & 0 deletions Mikado/loci/superlocus.py
@@ -937,6 +937,7 @@ def define_subloci(self):

         if self.subloci_defined is True:
             return
+
         self.compile_requirements()
         self.subloci = []
59 changes: 29 additions & 30 deletions Mikado/picking/_merge_loci_utils.py
@@ -5,6 +5,8 @@
 from ..loci import Locus
 import sys
 import collections
+import itertools
+import numpy as np
 from ._locus_line_creator import _create_locus_lines


@@ -83,39 +85,34 @@ def manage_index(data, dumps, source):
 def __create_gene_counters(common_index: dict) -> (dict, int):
     """Function to assign to each counter in the database the correct base and maximum number of genes.
     This allows to parallelise the printing.
+    The common index has the following structure:
+    d[counter] = (database index, chrom, number of genes in locus)
     """

-    chroms, nums = list(zip(*[common_index[index][1:3] for index in range(1, max(common_index.keys()) + 1)]))
-    total_genes = sum(nums)
+    chroms = []
+    num_genes = []
+
+    for index in range(1, max(common_index.keys()) + 1):
+        _, chrom, n_genes = common_index[index]
+        chroms.append(chrom)
+        num_genes.append(n_genes)
+
+    chroms = np.array(chroms)
+    num_genes = np.array(num_genes)
+
     gene_counters = dict()
+    total_genes = sum(num_genes)

     chrom_tots = collections.defaultdict(list)
-    assert len(chroms) == len(common_index), (len(chroms), len(common_index))
-    for pos in range(len(chroms)):
-        key = pos + 1
-        chrom, num = chroms[pos], nums[pos]
-        if chrom == '' and pos > 0:
-            assert num == 0
-            former = gene_counters[pos][0]
-        elif pos == 0 or chrom != chroms[pos - 1]:
-            if chroms[pos - 1] != "":
-                former = 0
-            else:  # The previous one is wrong ..
-                prev_pos = pos - 1
-                prev_chrom = chroms[prev_pos]
-                while prev_chrom == "":
-                    prev_pos -= 1
-                    if prev_pos < 0:
-                        break
-                    prev_chrom = chroms[prev_pos]
-                if prev_chrom == "" or prev_chrom != chrom:
-                    former = 0
-                else:
-                    former = gene_counters[pos][0] + gene_counters[pos][1]
-        else:
-            former = gene_counters[pos][0] + gene_counters[pos][1]
-        gene_counters[key] = (former, num)
-        if chrom:
-            chrom_tots[chrom].extend(list(range(former + 1, former + num + 1)))
+    for chrom in np.unique(chroms):
+        index = np.where(chroms == chrom)
+        totals = num_genes[index]
+        cumu = totals.cumsum()
+        for counter, former, num in zip(index[0], itertools.chain([0], cumu[:-1]), totals):
+            gene_counters[counter + 1] = (former, num)
+            if chrom:
+                chrom_tots[chrom].extend(list(range(former + 1, former + num + 1)))

     tot_found = 0
     for chrom in chrom_tots:
@@ -137,9 +134,11 @@ def __create_gene_counters(common_index: dict) -> (dict, int):
         tot_found += chrom_tots[chrom][-1]

     assert tot_found == total_genes, (tot_found, total_genes)
-    new_common = dict()

+    assert min(common_index) == 1
+
+    new_common = dict()
     for key in common_index:
         # DbIndex
         new_common[key] = (common_index[key][0], gene_counters[key][0], gene_counters[key][1])
     return new_common, total_genes
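To see what the numpy rewrite computes, here is a self-contained worked example with made-up input (three loci on Chr1, two on Chr2). Per chromosome, `cumsum` over the per-locus gene counts yields each locus's base offset ("former"), so gene numbering restarts at 1 on every chromosome and each locus can be printed independently:

    import itertools
    import numpy as np

    # Hypothetical input, ordered by locus counter: chromosome and gene count per locus.
    chroms = np.array(["Chr1", "Chr1", "Chr1", "Chr2", "Chr2"])
    num_genes = np.array([2, 3, 1, 4, 2])

    gene_counters = dict()
    for chrom in np.unique(chroms):
        index = np.where(chroms == chrom)
        totals = num_genes[index]
        cumu = totals.cumsum()
        # "former" = genes already assigned on this chromosome before each locus.
        for counter, former, num in zip(index[0], itertools.chain([0], cumu[:-1]), totals):
            gene_counters[int(counter) + 1] = (int(former), int(num))

    print(gene_counters)
    # {1: (0, 2), 2: (2, 3), 3: (5, 1), 4: (0, 4), 5: (4, 2)}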
23 changes: 14 additions & 9 deletions Mikado/picking/loci_processer.py
@@ -61,12 +61,17 @@ def merge_loci(num_temp, out_handles,
             checker.update(counters)
         logger.fatal("%d double indices found!", len([_ for _ in checker if checker[_] > 1]))

+    # Start iterating the output dictionaries ("cursors")
     for dbindex, cursor in enumerate(cursors):
-        d = dict((index[0], (dbindex, index[1], index[2])) for index in cursor.execute(
-            "SELECT counter, chrom, genes FROM loci").fetchall())
-        assert not set.intersection(set(d.keys()), set(common_index.keys())), set.intersection(
-            set(d.keys()), set(common_index.keys()))
-
+        # Get the counter (this is the dictionary key), chromosome, and number of genes
+        d = dict()
+        doubles = set()
+        for counter, chrom, genes in cursor.execute("SELECT counter, chrom, genes FROM loci"):
+            if counter in common_index:
+                doubles.add(counter)
+            d[counter] = (dbindex, chrom, genes)
+        if len(doubles) > 0:
+            raise AssertionError("Double indices found: {}".format(doubles))
         common_index.update(d)

     print_subloci = (out_handles[1][0] is not None)
@@ -75,10 +80,10 @@ def merge_loci(num_temp, out_handles,
         raise KeyError("I am missing some loci! {} vs {}".format(
             max_counter, max(common_index.keys())))

-    assert set(common_index.keys()) == set(range(1, max(common_index.keys()) + 1)), (
-        set.difference(set(range(1, max(common_index.keys()) + 1)), set(common_index.keys()))
-    )
-    assert len(common_index.keys()) == len(set(common_index.keys()))
+    __valid = set(range(1, max(common_index.keys()) + 1))
+    if set(common_index.keys()) != __valid:
+        missing = set.difference(__valid, set(common_index.keys()))
+        raise AssertionError("Missing the following loci: {}".format(missing))

     new_common, total_genes = __create_gene_counters(common_index)

4 changes: 1 addition & 3 deletions Mikado/serializers/external.py
@@ -48,7 +48,7 @@ def __init__(self, source, rtype, valid_raw):
         elif np.dtype("complex") == rtype:
             rtype = "complex"
         else:
-            raise ValueError("Invalid source rtype: {}".format(rtype))
+            raise ValueError("Invalid source rtype for {}: {}".format(source, rtype))

         self.rtype = rtype
         self.valid_raw = valid_raw
@@ -138,8 +138,6 @@ def __init__(self, handle,
                 type(fasta_index))
             self.logger.warning(error)

-
-
         try:
             self.data = pd.read_csv(self.handle, delimiter=delimiter, index_col=["tid"])
         except ValueError: