From 14e14179e64087b30fd0a7d010bf891a40a387f8 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Fri, 14 May 2021 19:29:02 -0600 Subject: [PATCH 001/133] Log cache pickling error instead of failing --- src/ymp/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ymp/common.py b/src/ymp/common.py index e834fddb..7a5df4e5 100644 --- a/src/ymp/common.py +++ b/src/ymp/common.py @@ -245,6 +245,8 @@ def store(self, cache, key, obj): VALUES (?, ?, ?) """, [cache, key, pickle.dumps(obj)] ) + except pickle.PicklingError: + log.error("Failed to pickle %s", obj) except FileNotFoundError: pass From 5f75c955abf59e07342c9c5e61093429c5a00dee Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Sat, 15 May 2021 18:49:16 -0600 Subject: [PATCH 002/133] Add Snakemake 6.3.0 to white list --- src/ymp/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ymp/__init__.py b/src/ymp/__init__.py index 1c562f1c..e3a811f6 100644 --- a/src/ymp/__init__.py +++ b/src/ymp/__init__.py @@ -49,7 +49,7 @@ #: List of versions this version of YMP has been verified to work with snakemake_versions = [ - '6.0.5', '6.1.0', '6.1.1', '6.2.1' + '6.0.5', '6.1.0', '6.1.1', '6.2.1', '6.3.0' ] From 5e89feda60b8827a7f6ef000a32c5f19c82f5cc3 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 18 May 2021 11:22:12 -0600 Subject: [PATCH 003/133] Stage polish_pilon: parametrize fix types --- src/ymp/rules/pilon.rules | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/ymp/rules/pilon.rules b/src/ymp/rules/pilon.rules index b6b362d5..821a2899 100644 --- a/src/ymp/rules/pilon.rules +++ b/src/ymp/rules/pilon.rules @@ -6,6 +6,10 @@ with Stage("polish_pilon") as S: Requires fasta.gz and sorted.bam files as input. """) + S.add_param("S", typ="flag", name="fix_snps", value="snps") + S.add_param("I", typ="flag", name="fix_indels", value="indels") + S.add_param("G", typ="flag", name="fix_gaps", value="gaps") + S.add_param("L", typ="flag", name="fix_local", value="local") rule pilon_polish: message: @@ -42,6 +46,7 @@ with Stage("polish_pilon") as S: " echo > {output.changes};" " exit 0;" "fi;" + "FIX=$(echo {params.fix_snps} {params.fix_indels} {params.fix_gaps} {params.fix_local} | tr ' ' ,);" "pilon" " -Xmx{resources.mem_mb}m" " -Xms{resources.mem_mb}m" @@ -52,6 +57,7 @@ with Stage("polish_pilon") as S: " --vcf" " {params.bamopts}" " --iupac" + " --fix ${{FIX:-all}}" ";" "pigz " From f8fef72679f237d94ebd0e8d6288f1f3c4ddc755 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 5 Aug 2021 23:30:37 -0600 Subject: [PATCH 004/133] Add basic show stage params --- src/ymp/cli/stage.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/ymp/cli/stage.py b/src/ymp/cli/stage.py index e412eb0f..40f31848 100644 --- a/src/ymp/cli/stage.py +++ b/src/ymp/cli/stage.py @@ -41,10 +41,14 @@ def stage(): "--types", "-t", "type_opt", is_flag=True, help="Show input/output types" ) +@click.option( + "--params", "-p", "param_opt", is_flag=True, + help="Show parameters" +) @click.argument( "stage_opt", metavar="STAGE", nargs=-1 ) -def ls(long_opt, short_opt, stage_opt, code_opt, type_opt): +def ls(long_opt, short_opt, stage_opt, code_opt, type_opt, param_opt): """ List available stages """ @@ -98,10 +102,16 @@ def ls(long_opt, short_opt, stage_opt, code_opt, type_opt): else: dtypes = "" - print("{name:<{width}}{summary}{description}{code}{dtypes}\n" + if param_opt: + params = wrap(" params: ", map(str, stage.params)) + else: + params = "" + + 
print("{name:<{width}}{summary}{description}{code}{dtypes}{params}\n" "".format(name=stage.name, width=name_width, summary=summary, code=code, dtypes=dtypes, + params=params, description=description)) From 018432ce8bb25ac52193bc79b6f9f33cd1030505 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 5 Aug 2021 23:31:16 -0600 Subject: [PATCH 005/133] Allow choice param with name --- src/ymp/stage/params.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ymp/stage/params.py b/src/ymp/stage/params.py index 1a660b7f..0b9c52ef 100644 --- a/src/ymp/stage/params.py +++ b/src/ymp/stage/params.py @@ -50,7 +50,7 @@ def __init_subclass__(cls, **kwargs) -> None: def make(cls, stage: BaseStage, typ: str, key: str, name: str, value, default) -> "Param": if typ not in cls.types: raise YmpRuleError(stage, f"Unknown stage Parameter type '{typ}'") - return cls.types[typ](stage, key,name, value, default) + return cls.types[typ](stage, key, name, value, default) @property def wildcard(self): @@ -121,13 +121,13 @@ def add_param(self, key, typ, name, value=None, default=None) -> bool: if key and param.key == key: raise YmpRuleError( self, - f"Keys must be uninque. Key '{key}' already used by {param}.\n" + f"Keys must be unique. Key '{key}' already used by {param}.\n" f" while trying to add {new_param}" ) if param.name == name: raise YmpRuleError( self, - f"Names must be uninque. Name '{name}' already used by {param}.\n" + f"Names must be unique. Name '{name}' already used by {param}.\n" f" while trying to add {new_param}" ) self.__params.append(new_param) @@ -213,7 +213,7 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) if self.default is not None: self.value = list(self.value) + [""] - self.regex = f"({self.key}({'|'.join(self.value)}))" + self.regex = f"({self.key}({'|'.join(self.value)}))?" 
class ParamRef(Param): From 117dd5d6cf44a9b0b991e90c5c98c6ddb156910b Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 5 Aug 2021 23:53:32 -0600 Subject: [PATCH 006/133] Allow references to be directories --- src/ymp/rules/00_download.rules | 9 ++++++++- src/ymp/stage/reference.py | 19 ++++++++++++++----- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/ymp/rules/00_download.rules b/src/ymp/rules/00_download.rules index fd80422b..65a096c3 100644 --- a/src/ymp/rules/00_download.rules +++ b/src/ymp/rules/00_download.rules @@ -56,7 +56,7 @@ with Stage("references") as S: message: "Preparing {output}" input: - files = lambda wc: ymp.get_config().ref[wc.refname].get_file(wc.path) + files = lambda wc: ymp.get_config().ref[wc.refname].get_file(wc.path, isdir=False) output: "{:dir.references:}/{refname}/{path}" wildcard_constraints: @@ -93,6 +93,13 @@ with Stage("references") as S: os.symlink(input_relpath, output[0]) + localrules: prepare_reference_dir + rule prepare_reference_dir: # ymp: extends prepare_reference + input: + files = lambda wc: ymp.get_config().ref[wc.refname].get_file(wc.path, isdir=True) + output: + directory("{:dir.references:}/{refname}/{path}") + localrules: unpack_archive rule unpack_archive: """ diff --git a/src/ymp/stage/reference.py b/src/ymp/stage/reference.py index f88a6961..fb8f9ff7 100644 --- a/src/ymp/stage/reference.py +++ b/src/ymp/stage/reference.py @@ -150,12 +150,19 @@ def add_resource(self, rsc): isurl = local_path != maybeurl if not isurl: local_path = rsc.get_path("url") - - type_name = rsc.get('type', 'fasta').lower() + id = "ALL" if 'id' in rsc: - self._ids.add(rsc['id']) + id = rsc["id"] + self._ids.add(id) - if type_name in ("fasta", "fastp"): + type_name = rsc.get('type', 'fasta').lower() + if type_name == "direct": + if not "extension" in rsc: + raise YmpConfigError( + rsc, "Reference resource of type direct must have 'extension' field" + ) + self.files[".".join((id, rsc["extension"]))] = local_path + elif type_name in ("fasta", "fastp"): self.files[f"ALL.{type_name}.gz"] = local_path elif type_name in ("gtf", "snp", "tsv", "csv"): self.files[f"ALL.{type_name}"] = local_path @@ -199,9 +206,11 @@ def get_path(self, _stack): def get_all_targets(self, stack: "StageStack") -> List[str]: return [os.path.join(self.dir, fname) for fname in self.files] - def get_file(self, filename): + def get_file(self, filename, isdir=False): local_path = self.files.get(filename) if local_path: + if os.path.isdir(local_path) != isdir: + return "YMP_THIS_FILE_MUST_NOT_EXIST" return local_path log.error(f"{self!r}: Failed to find {filename}") log.warning(f" Available: {self.files}") From 4be7cac3531d107eeff7dcb4968de0143c0ba790 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 5 Aug 2021 23:54:01 -0600 Subject: [PATCH 007/133] Add count_subread, count_htseq and quant_salmon --- src/ymp/rules/htseq.rules | 39 ++++++++++++++++++++++++++++++++++ src/ymp/rules/salmon.rules | 41 ++++++++++++++++++++++++++++++++++++ src/ymp/rules/subreads.rules | 26 +++++++++++++++++++++++ 3 files changed, 106 insertions(+) create mode 100644 src/ymp/rules/htseq.rules create mode 100644 src/ymp/rules/salmon.rules create mode 100644 src/ymp/rules/subreads.rules diff --git a/src/ymp/rules/htseq.rules b/src/ymp/rules/htseq.rules new file mode 100644 index 00000000..8baa1419 --- /dev/null +++ b/src/ymp/rules/htseq.rules @@ -0,0 +1,39 @@ +Env(name="htseq", base="bioconda", packages="htseq>0.13") + +with Stage("count_htseq"): + rule htseq_count: + message: + "Counting per gene 
reads with htseq-count" + input: + bam = "{:prev:}/{:target:}.sorted.bam", + gtf = "{:prev:}/{:target:}.gtf" + output: + counts = "{:this:}/{target}.htseq_counts", + log: + "{:this:}/{target}.log" + params: + max_reads_in_buffer = 30000000, # 30m + stranded = "reverse", # yes, no, reverse + minaqual = 20, + mode = "intersection-nonempty", + nonunique = "none", + threads: + 1 ## like fastqc, only 1 thread per file + conda: + "htseq" + shell: + "exec >/dev/null 2>&1;" + "htseq-count" + " --nprocesses={threads}" + " --format=bam" + " --order=pos" + " --max-reads-in-buffer={params.max_reads_in_buffer}" + " --stranded={params.stranded}" + " -a={params.minaqual}" + # --type=exon + # --idattr=gene_id + " --mode={params.mode}" + " --nonunique={params.nonunique}" + " {input.bam}" + " {input.gtf}" + " >{output.counts}" diff --git a/src/ymp/rules/salmon.rules b/src/ymp/rules/salmon.rules new file mode 100644 index 00000000..03afb7ed --- /dev/null +++ b/src/ymp/rules/salmon.rules @@ -0,0 +1,41 @@ +Env(name="salmon", base="bioconda", packages=["salmon>1.5"]) + +with Stage("quant_salmon") as S: + S.doc(""" + """) + S.add_param("L", typ="choice", name="libtype", default="A", + value=["A", "IU", "MU", "OU", "ISF", "ISR", "MSF", "MSR", "OSF", "OSR", + "U", "SF", "SR"]) + rule salmon_quant: + message: "{:name:}: {output.quant}" + input: + index = directory("{:prev:}/{:target:}.salmon_index"), + fq = "{:prev:}/{:target:}.{:pairnames:}.fq.gz", + output: + quant = "{:this:}/{target}.salmon/quant.sf", + unmapped = "{:this:}/{target}.salmon/aux_info/unmapped_names.txt", + benchmark: + "benchmarks/{:name:}/{:this:}/{target}.txt" + log: + "{:this:}/{target}.log", + params: + libtypex = "A" + conda: + "salmon" + + threads: + 32 + shell: + "exec >{log} 2>&1;" + "salmon quant" + " --libType {params.libtype}" + " --threads {threads}" + " --seqBias" + " --gcBias" + " --writeUnmappedNames" + " --index {input.index}" + " --mates1 {input.fq[0]}" + " --mates2 {input.fq[1]}" + " --output $(dirname {output.quant})" + + diff --git a/src/ymp/rules/subreads.rules b/src/ymp/rules/subreads.rules new file mode 100644 index 00000000..ec6c3225 --- /dev/null +++ b/src/ymp/rules/subreads.rules @@ -0,0 +1,26 @@ +Env(name="subread", base="bioconda", packages="subread") + +with Stage("count_subread"): + rule subread_featureCounts: + message: + "Counting reads with subreads featureCounts" + input: + bam = "{:prev:}/{:target:}.bam", + gtf = "{:prev:}/{:target:}.gtf", + output: + counts = "{:this:}/{target}.subread_counts", + log: + "{:this:}/{target}.log" + params: + minqual = 20, + threads: + 8 + conda: + "subread" + shell: + "exec >{log} 2>&1;" + "featureCounts" + " -a {input.gtf}" + " -o {output.counts}" + " -Q {params.minqual}" + " {input.bam}" From 52c50e0a5b80e138222695c84e4fb8b998e63233 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Fri, 27 Aug 2021 20:08:52 -0600 Subject: [PATCH 008/133] Add {:all_prevs:} --- src/ymp/stage/base.py | 16 ++++------ src/ymp/stage/pipeline.py | 67 +++++++++++++++++++++++++++++---------- src/ymp/stage/stack.py | 25 +++++++++++++++ src/ymp/stage/stage.py | 8 +++++ 4 files changed, 89 insertions(+), 27 deletions(-) diff --git a/src/ymp/stage/base.py b/src/ymp/stage/base.py index 15fd9f7d..83c75421 100644 --- a/src/ymp/stage/base.py +++ b/src/ymp/stage/base.py @@ -6,7 +6,7 @@ import os import re -from typing import Set, Dict, Union, List, Optional +from typing import Set, Dict, Union, List, Optional, Tuple from snakemake.rules import Rule from snakemake.workflow import Workflow @@ -83,19 +83,15 @@ def 
outputs(self) -> Union[Set[str], Dict[str, str]]: """ return set() - def get_outputs(self, path: str) -> Dict[str, str]: + def get_outputs(self, path: str) -> Dict[str, List[Tuple[str,bool]]]: """Returns a dictionary of outputs""" outputs = self.outputs - if isinstance(outputs, set): - return {output: path for output in outputs} - path, _, _ = path.rpartition("." + self.name) - # false positive - pylint: disable=no-member return { - output: path + p - for output, p in outputs.items() + output: [(path, False)] + for output in self.outputs } - def can_provide(self, inputs: Set[str]) -> Dict[str, str]: + def can_provide(self, inputs: Set[str], full_stack: bool = False) -> Dict[str, str]: """Determines which of ``inputs`` this stage can provide. Returns a dictionary with the keys a subset of ``inputs`` and @@ -105,7 +101,7 @@ def can_provide(self, inputs: Set[str]) -> Dict[str, str]: """ return { - output: "" + output: [("",False)] if full_stack else "" for output in inputs.intersection(self.outputs) } diff --git a/src/ymp/stage/pipeline.py b/src/ymp/stage/pipeline.py index 67cecc77..83d91342 100644 --- a/src/ymp/stage/pipeline.py +++ b/src/ymp/stage/pipeline.py @@ -9,7 +9,7 @@ from collections import OrderedDict from collections.abc import Mapping -from typing import Dict, List, Set, Optional +from typing import Dict, List, Set, Optional, Tuple from ymp.stage import StageStack, find_stage from ymp.stage.base import ConfigStage @@ -110,16 +110,16 @@ def params(self): self._params = params return super().params - def get_path(self, stack, typ=None): + def get_path(self, stack, typ=None, pipeline=None): pipeline_parameters = self.parse(stack.stage_name) param_map = { key.format(**pipeline_parameters): value for key, value in self._params.items() } - if typ is None: - pipeline = self.pipeline - else: + if typ is not None: pipeline = self.outputs[typ] + if pipeline is None: + pipeline = self.pipeline pipeline = pipeline.format(**pipeline_parameters) stages = [] path = "" @@ -140,17 +140,33 @@ def get_path(self, stack, typ=None): prefix = stack.name.rsplit(".", 1)[0] return ".".join([prefix]+stages) - def _make_outputs(self) -> Dict[str, str]: + def _make_outputs(self) -> Dict[str, List[Tuple[str,bool]]]: + """Collects outputs from all stages within pipeline + + Returns: { suffix: (stack_suffix, is_hidden) } + """ outputs = {} for stage_path, cfg in self.stages.items(): - if cfg.get("hide", self.hide_outputs): - continue stage_name = stage_path.rsplit(".", 1)[-1] stage = find_stage(stage_name) - new_outputs = stage.get_outputs(stage_path) - outputs.update(new_outputs) + ourhide = cfg.get("hide", self.hide_outputs) + for output, pathlist in stage.get_outputs(stage_path).items(): + ourpathlist = outputs.setdefault(output, []) + for path, hide in pathlist: + ourpathlist.append((path, hide|ourhide)) return outputs + def get_outputs(self, path: str) -> Dict[str, List[Tuple[str,bool]]]: + """Returns a dictionary of outputs""" + if self._outputs is None: + self._outputs = self._make_outputs() + path, _, _, = path.rpartition("." 
+ self.name) + return { + output: [(path + lpath, hidden) for lpath, hidden in pathlist] + for output, pathlist in self._outputs.items() + } + + @property def outputs(self) -> Dict[str, str]: """The outputs of a pipeline are the sum of the outputs @@ -159,18 +175,35 @@ def outputs(self) -> Dict[str, str]: """ if self._outputs is None: self._outputs = self._make_outputs() - return self._outputs + res = {} + for output, pathlist in self._outputs.items(): + for path, hidden in reversed(pathlist): + if hidden: + continue + res[output] = path + break + return res - def can_provide(self, inputs: Set[str]) -> Dict[str, str]: + def can_provide(self, inputs: Set[str], full_stack: bool = False) -> Dict[str, str]: """Determines which of ``inputs`` this stage can provide. The result dictionary values will point to the "real" output. """ - res = { - output: path - for output, path in self.outputs.items() - if output in inputs - } + if full_stack: + if self._outputs is None: + self._outputs = self._make_outputs() + + res = { + output: pathlist + for output, pathlist in self._outputs.items() + if output in inputs + } + else: + res = { + output: path + for output, path in self.outputs.items() + if output in inputs + } return res def get_all_targets(self, stack): diff --git a/src/ymp/stage/stack.py b/src/ymp/stage/stack.py index 6f88f99b..650daf21 100644 --- a/src/ymp/stage/stack.py +++ b/src/ymp/stage/stack.py @@ -243,6 +243,31 @@ def prev(self, _args=None, kwargs=None) -> "StageStack": return self.prevs[suffix] + def all_prevs(self, _args=None, kwargs=None) -> List["StageStack"]: + if not kwargs or "wc" not in kwargs: + raise ExpandLateException() + + _, _, suffix = kwargs['item'].partition("{:all_prevs:}") + suffix = norm_wildcards(suffix) + + stage_names = copy.copy(self.stage_names) + stage_names.pop() + + prevs = [] + while stage_names: + path = ".".join(stage_names) + prev_stack = self.instance(path) + prev_stage = find_stage(stage_names.pop()) + ## FIXME: using prev_stack.stage instead of finding anew leads to deadlock?! + pathlist = prev_stage.can_provide(set((suffix,)), full_stack = True).get(suffix, []) + for ppath, hidden in pathlist: + if ppath: + npath = prev_stage.get_path(prev_stack, pipeline=ppath) + prevs.append(self.instance(npath)) + else: + prevs.append(prev_stack) + + return prevs def get_ids(self, select_cols, where_cols=None, where_vals=None): if not self.debug: diff --git a/src/ymp/stage/stage.py b/src/ymp/stage/stage.py index 16bf9195..300cfaf2 100644 --- a/src/ymp/stage/stage.py +++ b/src/ymp/stage/stage.py @@ -172,6 +172,14 @@ def prev(self, _args, kwargs) -> None: """ self.register_inout("prev", self._inputs, kwargs['item']) + def all_prevs(self, _args, kwargs) -> None: + """Gathers {:all_prevs:} calls from rules + + We register this as input as if called {:prev:}, assuming at + least one instance is required. 
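+
+        Example (cf. the MultiQC rules later in this series): an input
+        declared as ``conf = "{:all_prevs:}/multiqc_config.yaml"`` expands
+        to one such file for every prior stage in the stack providing it.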
+ """ + self.register_inout("all_prevs", self._inputs, kwargs['item']) + def this(self, args=None, kwargs=None): """Replaces {:this:} in rules From 8dd271c6d66d94f5b65d5edc9e1198e2fc74fa3a Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Fri, 27 Aug 2021 20:09:12 -0600 Subject: [PATCH 009/133] Add file type tx.fasta --- src/ymp/stage/reference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ymp/stage/reference.py b/src/ymp/stage/reference.py index fb8f9ff7..f542285d 100644 --- a/src/ymp/stage/reference.py +++ b/src/ymp/stage/reference.py @@ -162,7 +162,7 @@ def add_resource(self, rsc): rsc, "Reference resource of type direct must have 'extension' field" ) self.files[".".join((id, rsc["extension"]))] = local_path - elif type_name in ("fasta", "fastp"): + elif type_name in ("fasta", "fastp", "tx.fasta"): self.files[f"ALL.{type_name}.gz"] = local_path elif type_name in ("gtf", "snp", "tsv", "csv"): self.files[f"ALL.{type_name}"] = local_path From f2d0d904399b84fb9d6326fc288fc4d654c8a4eb Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Fri, 27 Aug 2021 20:09:55 -0600 Subject: [PATCH 010/133] Fix TimeoutError != asyncio.TimeoutError (!!!) --- src/ymp/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ymp/download.py b/src/ymp/download.py index 9582b2cd..b9d88c78 100644 --- a/src/ymp/download.py +++ b/src/ymp/download.py @@ -147,7 +147,7 @@ async def _download(self, session: aiohttp.ClientSession, destfile, md5): return True break - except TimeoutError as e: + except asyncio.TimeoutError as e: exc = e return False From e20e7ad42190e9f4592a232dcd6320dbb5e58073 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Fri, 27 Aug 2021 20:16:17 -0600 Subject: [PATCH 011/133] Add MultiQC stage --- src/ymp/rules/bowtie2.rules | 25 +++++++++++++ src/ymp/rules/fastqc.rules | 26 ++++++++++++++ src/ymp/rules/hisat2.rules | 27 +++++++++++++- src/ymp/rules/htseq.rules | 25 +++++++++++++ src/ymp/rules/multiqc.rules | 68 +++++++++++++++++++++++++----------- src/ymp/rules/subreads.rules | 30 ++++++++++++++++ 6 files changed, 180 insertions(+), 21 deletions(-) diff --git a/src/ymp/rules/bowtie2.rules b/src/ymp/rules/bowtie2.rules index 9d53bda0..e443c371 100644 --- a/src/ymp/rules/bowtie2.rules +++ b/src/ymp/rules/bowtie2.rules @@ -121,3 +121,28 @@ with Stage("map_bowtie2") as S: r1 = filter_input("r1", join=","), r2 = "" + localrules: bowtie2_map_multiqc_cfg + rule bowtie2_map_multiqc_cfg: + message: + "{:name:}: Writing MultiQC config" + input: + "{:this:}/{:targets:}.log" + output: + "{:this:}/multiqc_config.yaml" + params: + this = "{:this:}" + run: + from ruamel.yaml import YAML + yaml = YAML(typ="rt") + data = { + "run_modules": [ "bowtie2" ], + "module_order": [{ + "bowtie2": { + "name": f"Bowtie2 ({params.this})", + "path_filters": f"{params.this}/*.log" + } + }] + } + with open(output[0], "w") as out: + yaml.dump(data, out) + diff --git a/src/ymp/rules/fastqc.rules b/src/ymp/rules/fastqc.rules index 27535774..d7923007 100644 --- a/src/ymp/rules/fastqc.rules +++ b/src/ymp/rules/fastqc.rules @@ -37,3 +37,29 @@ with Stage("qc_fastqc") as S: -k {params.k} \ >{log} 2>&1 """ + + localrules: fastqc_multiqc + rule fastqc_multiqc: + message: + "{:name:}: Writing MultiQC config" + input: + "{:this:}/{:targets:}.{:pairnames:}_fastqc.zip" + output: + "{:this:}/multiqc_config.yaml" + params: + this = "{:this:}" + run: + from ruamel.yaml import YAML + yaml = YAML(typ="rt") + data = { + "run_modules": [ "fastqc" ], + "module_order": [{ + "fastqc": { + "name": 
f"FastQC ({params.this})", + "path_filters": f"{params.this}/*_fastqc.zip" + } + }] + } + + with open(output[0], "w") as out: + yaml.dump(data, out) diff --git a/src/ymp/rules/hisat2.rules b/src/ymp/rules/hisat2.rules index a6b8fd09..78a78d83 100644 --- a/src/ymp/rules/hisat2.rules +++ b/src/ymp/rules/hisat2.rules @@ -4,7 +4,7 @@ HT2IDX_SUFFIXES = ["{}.ht2".format(n+1) for n in range(8)] with Stage("map_hisat2") as S: S.doc(""" - Map reads using Hisat2 + Map reads using HISAT2 """) rule hisat2_map: """ @@ -43,3 +43,28 @@ with Stage("map_hisat2") as S: " -p {threads} " " 2>{log}" " | samtools view -b -o {output.bam} -" + + localrules: hisat2_map_multiqc_cfg + rule hisat2_map_multiqc_cfg: + message: + "{:name:}: Writing MultiQC config" + input: + "{:this:}/{:targets:}.stats" + output: + "{:this:}/multiqc_config.yaml" + params: + this = "{:this:}" + run: + from ruamel.yaml import YAML + yaml = YAML(typ="rt") + data = { + "run_modules": [ "hisat2" ], + "module_order": [{ + "hisat2": { + "name": f"HISAT2 ({params.this})", + "path_filters": f"{params.this}/*.stats" + } + }] + } + with open(output[0], "w") as out: + yaml.dump(data, out) diff --git a/src/ymp/rules/htseq.rules b/src/ymp/rules/htseq.rules index 8baa1419..29dd89d0 100644 --- a/src/ymp/rules/htseq.rules +++ b/src/ymp/rules/htseq.rules @@ -37,3 +37,28 @@ with Stage("count_htseq"): " {input.bam}" " {input.gtf}" " >{output.counts}" + + localrules: htseq_count_multiqc_cfg + rule htseq_count_multiqc_cfg: + message: + "{:name:}: Writing MultiQC config" + input: + "{:this:}/{:targets:}.htseq_counts" + output: + "{:this:}/multiqc_config.yaml" + params: + this = "{:this:}" + run: + from ruamel.yaml import YAML + yaml = YAML(typ="rt") + data = { + "run_modules": [ "htseq" ], + "module_order": [{ + "fastqc": { + "name": f"HTSeq-Count ({params.this})", + "path_filters": f"{params.this}/*.htseq_counts" + } + }] + } + with open(output[0], "w") as out: + yaml.dump(data, out) diff --git a/src/ymp/rules/multiqc.rules b/src/ymp/rules/multiqc.rules index 56198133..86234efd 100644 --- a/src/ymp/rules/multiqc.rules +++ b/src/ymp/rules/multiqc.rules @@ -6,34 +6,62 @@ with Stage("qc_multiqc") as S: S.doc(""" Aggregate QC reports using MultiQC """) - rule multiqc_fastqc: + rule multiqc_merge_configs: + message: + "Aggregating MultiQC configs for {:this:}" + input: + conf = "{:all_prevs:}/multiqc_config.yaml" + output: + conf = "{:this:}/merged_multiqc_config.yaml" + run: + from ruamel.yaml import YAML + yaml = YAML(typ="rt") + run_modules = [] + sp = {} + module_order = [] + for conffile in input.conf: + with open(conffile, "r") as fd: + data = yaml.load(fd) + run_modules.extend(data.get("run_modules", [])) + sp.update(data.get("sp", {})) ## FIXME check conflicts! 
+ module_order.extend(data.get("module_order", [])) + run_modules = list(set(run_modules)) + conf = { + "run_modules": run_modules, + "sp": sp, + "module_order": module_order, + } + print("writing to ", output.conf) + with open(output.conf, "w") as fd: + yaml.dump(conf, fd) + print("done") + + rule multiqc_report: """Assemble report on all FQ files in a directory""" message: - "Aggregating QC reports for {params.pdir}" + "Aggregating QC reports for {:this:}" input: - fastqc = "{:prev:}/{:fq_names:}_fastqc.zip" + conf = "{:this:}/merged_multiqc_config.yaml", + parts = "{:all_prevs:}/multiqc_config.yaml" output: - flist = "{:this:}/file_list.txt", - report = "{:this:}/multiqc_report.html", - log: - "{:this:}/multiqc.log" + report = "{:this:}/multiqc_report.html", + stamp = touch("{:this:}/all_targets.stamp") params: - pdir = "{:prev:}" + dirs = lambda wc, input: [os.path.dirname(p) for p in input.parts] + log: + "{:this:}/multiqc.log" threads: 1 conda: "multiqc" - shell: """ - echo {input.fastqc} | tr ' ' '\n' > {output.flist} - multiqc \ - --verbose \ - --module fastqc \ - --file-list {output.flist} \ - --filename {output.report} \ - --title {params.pdir} \ - --force \ - > {log} 2>&1 - cp {output.report} {output.report2} - """ + shell: + "exec >{log} 2>&1;" + "multiqc" + " --verbose" + " --force" + " --config {input.conf}" + " --filename {output.report}" + " {params.dirs}" + diff --git a/src/ymp/rules/subreads.rules b/src/ymp/rules/subreads.rules index ec6c3225..f93e731e 100644 --- a/src/ymp/rules/subreads.rules +++ b/src/ymp/rules/subreads.rules @@ -24,3 +24,33 @@ with Stage("count_subread"): " -o {output.counts}" " -Q {params.minqual}" " {input.bam}" + + localrules: subread_featureCounts_multiqc_cfg + rule subread_featureCounts_multiqc_cfg: + message: + "{:name:}: Writing MultiQC config" + input: + "{:this:}/{:targets:}.subread_counts" + output: + "{:this:}/multiqc_config.yaml" + params: + this = "{:this:}" + run: + from ruamel.yaml import YAML + yaml = YAML(typ="rt") + data = { + "run_modules": [ "featurecounts" ], + "sp": { + "featurecounts": { + "fn": "subread_counts" + } + }, + "module_order": [{ + "fastqc": { + "name": f"featureCounts ({params.this})", + "path_filters": f"{params.this}/*.subread_counts" + } + }] + } + with open(output[0], "w") as out: + yaml.dump(data, out) From 0ed87528c85fb3962800b80121f8618c09e59915 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Fri, 27 Aug 2021 20:23:10 -0600 Subject: [PATCH 012/133] Update STAR, Salmon and RSEM stages --- src/ymp/rules/rsem.rules | 99 ++++++++++++++++-------------------- src/ymp/rules/salmon.rules | 75 +++++++++++++++++++++++++--- src/ymp/rules/star.rules | 100 +++++++++++++++++-------------------- 3 files changed, 159 insertions(+), 115 deletions(-) diff --git a/src/ymp/rules/rsem.rules b/src/ymp/rules/rsem.rules index 3e9eadd3..8dfd5346 100644 --- a/src/ymp/rules/rsem.rules +++ b/src/ymp/rules/rsem.rules @@ -2,28 +2,32 @@ Env(name="rsem", base="bioconda", packages="rsem") RSEM_IDX = "chrlist grp idx.fa n2g.idx.fa seq ti transcripts.fa".split() -rule rsem_index: - """Build Genome Index for RSEM""" - message: - "RSEM: Indexing {input.contigs}" - input: - contigs = "{path}/{source}.fasta", - gtf = "{path}/{source}.gtf" - output: - index = expand("{{params.index}}.{ext}", ext=RSEM_IDX) - log: - "{params.index}.log" - params: - index = "{path}.index/{source}.rsem", - resources: - mem = "20g", - threads: - 1 - conda: - "rsem" - shell: """ - rsem-prepare-reference --gtf {input.gtf} {input.contigs} {params.index} >{log} 2>&1 - """ 
+with Stage("index_rsem") as S: + rule rsem_index: + """Build Genome Index for RSEM""" + message: + "RSEM: Indexing {input.contigs}" + input: + contigs = "{:prev:}/{:target:}.fasta.gz", + gtf = "{:prev:}/{:target:}.gtf" + output: + index = expand("{{:this:}}/{{target}}.rsem.{ext}", ext=RSEM_IDX) + log: + "{:this:}/{target}.log" + params: + index = "{:this:}/{target}.rsem" + resources: + mem = "20g", + shadow: + "shallow" + threads: + 1 + conda: + "rsem" + shell: """ + gzip -dc {input.contigs} > contigs.fa + rsem-prepare-reference --gtf {input.gtf} contigs.fa {params.index} >{log} 2>&1 + """ with Stage("quant_rsem") as S: S.doc(""" @@ -33,18 +37,18 @@ with Stage("quant_rsem") as S: message: "RSEM: calculating expression" input: - bam = "{:prev:}/{target}-annotated.{source}.bam", - idx = expand("{{:reference.dir:}}.index/{{target}}.rsem.{ext}", + bam = "{:prev:}/{:target:}.tx.bam", + idx = expand("{{:prev:}}/{{:target:}}.rsem.{ext}", ext=RSEM_IDX) output: - "{params.outprefix}.genes.results", - "{params.outprefix}.isoforms.results" + "{:this:}/{target}.genes.results", + "{:this:}/{target}.isoforms.results", log: - "{params.outprefix}.log" + "{:this:}/{target}.log", params: - index = "{:reference.dir:}.index/{target}.rsem", - outprefix = "{:this:}/{target}.{source}", - forward_prob = 0, # P of having fwd read + outprefix = "{:this:}/{target}", + index = lambda wc, input: input.idx[0][:-len(RSEM_IDX[0])-1], + forward_prob = 1.0, # P of having fwd read resources: mem = "16G", threads: @@ -52,35 +56,18 @@ with Stage("quant_rsem") as S: conda: "rsem" shell: - "rsem-calculate-expression " - " -p {threads} " + "rsem-calculate-expression" + " -p {threads}" " --bam " - " --no-bam-output " - " --estimate-rspd " # estimate read start position + " --no-bam-output" + " --estimate-rspd" # estimate read start position " --calc-ci" # calculate 95% credibility intervals and posterior mean estimates - " --ci-memory $(({resources.mem_mb} / 16 * 10)) " - " --forward-prob {params.forward_prob} " - " --paired-end " - " {input.bam} " - " {params.index} " + " --ci-memory $(({resources.mem_mb} / 16 * 10))" + " --forward-prob {params.forward_prob}" + " --paired-end" + " {input.bam}" + " {params.index}" " {params.outprefix} " " >{log} 2>&1 " - rule rsem_all_for_target: - message: - "RSEM: finished {output}" - input: - "{:this:}/{target}.{:sources:}.genes.results", - output: - touch("{:this:}/all_{target}") - - rule rsem_all: - message: - "RSEM: finished {output}" - input: - "{:this:}/all_{:targets:}" - output: - touch("{:this:}/all_targets.stamp") - - # TODO: SE mode diff --git a/src/ymp/rules/salmon.rules b/src/ymp/rules/salmon.rules index 03afb7ed..ab7ef252 100644 --- a/src/ymp/rules/salmon.rules +++ b/src/ymp/rules/salmon.rules @@ -1,12 +1,42 @@ Env(name="salmon", base="bioconda", packages=["salmon>1.5"]) -with Stage("quant_salmon") as S: +with Stage("index_salmon") as S: + S.doc(""" + """) + S.add_param("G", typ="flag", name="gencode", value="--gencode") + + rule salmon_index: + message: "{:name:}: FIXME" + input: + txfa = "{:prev:}/{:target:}.tx.fasta.gz", + output: + index = directory("{:this:}/{target}.salmon_index"), + benchmark: + "benchmarks/{:name:}/{:this:}/{target}.txt", + log: + "{:this:}/{target}.log", + params: + kmerlen = 31, + conda: + "salmon" + threads: + 32 + shell: + "exec >{log} 2>&1;" + "salmon index" + " --transcripts {input.txfa}" + " --kmerLen {params.kmerlen}" + " --index {output.index}" + " {params.gencode}" + + +with Stage("quant_salmon_sa") as S: S.doc(""" """) S.add_param("L", 
typ="choice", name="libtype", default="A", value=["A", "IU", "MU", "OU", "ISF", "ISR", "MSF", "MSR", "OSF", "OSR", "U", "SF", "SR"]) - rule salmon_quant: + rule salmon_sa_quant: message: "{:name:}: {output.quant}" input: index = directory("{:prev:}/{:target:}.salmon_index"), @@ -15,14 +45,11 @@ with Stage("quant_salmon") as S: quant = "{:this:}/{target}.salmon/quant.sf", unmapped = "{:this:}/{target}.salmon/aux_info/unmapped_names.txt", benchmark: - "benchmarks/{:name:}/{:this:}/{target}.txt" + "benchmarks/{:name:}/{:this:}/{target}.txt", log: "{:this:}/{target}.log", - params: - libtypex = "A" conda: "salmon" - threads: 32 shell: @@ -39,3 +66,39 @@ with Stage("quant_salmon") as S: " --output $(dirname {output.quant})" +with Stage("quant_salmon") as S: + S.doc(""" + """) + S.add_param("L", typ="choice", name="libtype", default="A", + value=["A", "IU", "MU", "OU", "ISF", "ISR", "MSF", "MSR", "OSF", "OSR", + "U", "SF", "SR"]) + S.add_param("G", typ="flag", name="gencode", value="--gencode") + + rule salmon_quant: + message: "{:name:}: {output.quant}" + input: + bam = "{:prev:}/{:target:}.tx.bam", + txfa = "{:prev:}/{:target:}.tx.fasta.gz" + output: + quant = "{:this:}/{target}.salmon/quant.sf", + benchmark: + "benchmarks/{:name:}/{:this:}/{target}.txt", + log: + "{:this:}/{target}.log", + conda: + "salmon" + threads: + 32 + shell: + "exec >{log} 2>&1;" + "salmon quant" + " --libType {params.libtype}" + " --threads {threads}" + " --seqBias" + " --gcBias" + " --writeUnmappedNames" + " --alignments {input.bam}" + " --targets {input.txfa}" + " --output $(dirname {output.quant})" + " --minAssignedFrags 0" + " {params.gencode}" diff --git a/src/ymp/rules/star.rules b/src/ymp/rules/star.rules index 920adbef..d9f661b2 100644 --- a/src/ymp/rules/star.rules +++ b/src/ymp/rules/star.rules @@ -1,42 +1,41 @@ Env(name="star", base="bioconda", packages="star") -rule star_index: - """Build Genome Index for Star""" - message: - "Star: Indexing {input.contigs}" - input: - contigs = "{path}/{source}.fasta", - gtf = "{path}/{source}.gtf" - output: - gdir = "{path}.index/{source}.star/", - index = "{path}.index/{source}.star/SA" - log: - std = "{path}.index/{source}.star.log", - log = "{path}.index/{source}.star/Log.txt" - threads: - 16 - params: - overhang = 100 - resources: - mem = "32g", - shadow: - "shallow" - conda: - "star" - shell: """ - STAR \ - --runThreadN {threads} \ - --limitGenomeGenerateRAM $(({resources.mem_mb}-1000))000000 \ - --runMode genomeGenerate \ - --genomeDir {output.gdir} \ - --genomeFastaFiles {input.contigs} \ - --sjdbGTFfile {input.gtf} \ - --sjdbOverhang {params.overhang} \ - >{log.std} 2>&1 - mv Log.txt {log.log} - """ - # TODO: - # - pass --genomeSAindexNbases =min(14, math.log2(genomelen)/2-1) +with Stage("index_star") as S: + rule star_index: + """Build Genome Index for Star""" + message: + "Star: Indexing {input.contigs}" + input: + contigs = "{:prev:}/{:target:}.fasta.gz", + gtf = "{:prev:}/{:target:}.gtf", + output: + gdir = directory("{:this:}/{target}.staridx"), + log: + "{:this:}/{target}.log", + threads: + 16 + params: + overhang = 100, + resources: + mem = "32g", + shadow: + "shallow" + conda: + "star" + shell: """ + gzip -dc {input.contigs} > genome.fa; + STAR \ + --runMode genomeGenerate \ + --runThreadN {threads} \ + --limitGenomeGenerateRAM $(({resources.mem_mb}-1000))000000 \ + --sjdbOverhang {params.overhang} \ + --genomeFastaFiles genome.fa \ + --sjdbGTFfile {input.gtf} \ + --genomeDir {output.gdir} \ + >{log} 2>&1 + """ + # TODO: + # - pass --genomeSAindexNbases 
=min(14, math.log2(genomelen)/2-1) with Stage("map_star") as S: @@ -45,35 +44,32 @@ with Stage("map_star") as S: """) rule star_map: input: - index = "{:reference.dir:}.index/{target}.star/SA", - fq = "{:prev:}/{source}.{:pairnames:}.fq.gz" + index = directory("{:prev:}/{:target:}.staridx"), + fq = "{:prev:}/{:target:}.{:pairnames:}.fq.gz" output: - bamgn = "{:this:}/{target}.{source}.bam", - bamtr = "{:this:}/{target}-annotated.{source}.bam", - sj = "{:this:}/{target}.{source}.SJ.out.tab" + bamgn = "{:this:}/{target}.bam", + bamtr = "{:this:}/{target}.tx.bam", log: - std = "{:this:}/{target}.{source}.log", - log = "{:this:}/{target}.{source}.Log.out", - prg = "{:this:}/{target}.{source}.Log.progress.out", - fin = "{:this:}/{target}.{source}.Log.final.out" + std = "{:this:}/{target}.log", params: - outprefix = "{:this:}/{target}.{source}.", + outprefix = "{:this:}/{target}.star.", multimap_nmax = 10, quantmode = "TranscriptomeSAM", - tmpdir = "{params.outprefix}_STAR_tmp" + tmpdir = "{:dir.tmp:}/star/{:this:}/{target}" resources: mem = "32g", threads: - 16 + 32 conda: "star" shell: """ + mkdir -p {params.tmpdir}; rmdir {params.tmpdir}; STAR \ - --genomeDir $(dirname {input.index}) \ + --genomeDir {input.index} \ --genomeLoad NoSharedMemory \ --runThreadN {threads} \ --readFilesIn {input.fq} \ - --readFilesCommand zcat \ + --readFilesCommand "gzip -dc" \ --outFileNamePrefix {params.outprefix} \ --outSAMtype BAM Unsorted \ --outSAMunmapped Within \ @@ -85,5 +81,3 @@ with Stage("map_star") as S: mv {params.outprefix}Aligned.out.bam {output.bamgn} mv {params.outprefix}Aligned.toTranscriptome.out.bam {output.bamtr} """ - - # TODO: SE mode From 0cbd498e9ff398be047851b9e77cff98447dac6f Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Fri, 27 Aug 2021 20:25:55 -0600 Subject: [PATCH 013/133] Add FastP --- src/ymp/rules/fastp.rules | 72 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 src/ymp/rules/fastp.rules diff --git a/src/ymp/rules/fastp.rules b/src/ymp/rules/fastp.rules new file mode 100644 index 00000000..402e5f41 --- /dev/null +++ b/src/ymp/rules/fastp.rules @@ -0,0 +1,72 @@ +Env(name="fastp", base="bioconda", packages=["fastp"]) + +with Stage("trim_fastp") as S: + S.doc(""" + Trims reads with `fastp ` + + >>>ymp make toy.trim_fastp + + """) + S.add_param("L", typ="int", name="length", default=20) + S.add_param("Q", typ="int", name="qual", default=20) + S.add_param("O", typ="flag", name="overrepresentcheck", value="--overrepresentation_analysis") + S.add_param("C", typ="flag", name="correction", value="--correction") + + rule fastp_trim: + message: + "{:name:}: Trimming {input[0]}" + input: + fq = "{:prev:}/{:target:}.{:pairnames:}.fq.gz", + output: + fq = "{:this:}/{target}.{:pairnames:}.fq.gz", + json = "{:this:}/{target}.fastp.json" + log: + "{:this:}/{target}.log", + benchmark: + "benchmarks/{:name:}/{:this:}/{target}.txt", + params: + resources: + mem = "2g", + threads: 4 + conda: "fastp" + shell: + "exec >{log} 2>&1;" + "fastp" + " --in1 {input.fq[0]}" + " --in2 {input.fq[1]}" + " --out1 {output.fq[0]}" + " --out2 {output.fq[1]}" + " --json {output.json}" + " --length_required {params.length}" + " --cut_mean_quality {params.qual}" + " --cut_tail" + " --thread {threads}" + " {params.overrepresentcheck}" + " {params.correction}" + + + localrules: fastp_multiqc + rule fastp_multiqc: + message: + "{:name:}: Writing MultiQC config" + input: + "{:this:}/{:targets:}.fastp.json" + output: + "{:this:}/multiqc_config.yaml" + params: + this = 
"{:this:}" + run: + from ruamel.yaml import YAML + yaml = YAML(typ="rt") + data = { + "run_modules": [ "fastp" ], + "module_order": [{ + "fastp": { + "name": f"FastP ({params.this})", + "path_filters": f"{params.this}/*.fastp.json" + } + }] + } + + with open(output[0], "w") as out: + yaml.dump(data, out) From d8232801e3d9c6041772b2810f85b2e977ffc3fc Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Fri, 27 Aug 2021 20:33:39 -0600 Subject: [PATCH 014/133] RSEM: turn of credibility interval calculation - takes very long --- src/ymp/rules/rsem.rules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ymp/rules/rsem.rules b/src/ymp/rules/rsem.rules index 8dfd5346..2fb5c75f 100644 --- a/src/ymp/rules/rsem.rules +++ b/src/ymp/rules/rsem.rules @@ -61,7 +61,7 @@ with Stage("quant_rsem") as S: " --bam " " --no-bam-output" " --estimate-rspd" # estimate read start position - " --calc-ci" # calculate 95% credibility intervals and posterior mean estimates + #" --calc-ci" # calculate 95% credibility intervals and posterior mean estimates " --ci-memory $(({resources.mem_mb} / 16 * 10))" " --forward-prob {params.forward_prob}" " --paired-end" From 06ab1308ad8fd870696806b58889322175f325ee Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Sat, 28 Aug 2021 17:28:09 -0600 Subject: [PATCH 015/133] Allow references to be leftmost item in stacks --- src/ymp/stage/reference.py | 10 ++++++++++ src/ymp/stage/stack.py | 5 ++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/ymp/stage/reference.py b/src/ymp/stage/reference.py index f542285d..fb6f11a5 100644 --- a/src/ymp/stage/reference.py +++ b/src/ymp/stage/reference.py @@ -85,6 +85,7 @@ def __init__(self, name, cfg): self.archives = [] self._ids: Set[str] = set() self._outputs = None + self.cfg = cfg import ymp self.dir = os.path.join(ymp.get_config().dir.references, name) @@ -235,3 +236,12 @@ def this(self, args=None, kwargs=None): def prev(self, args=None, kwargs=None): return self.dir + + def minimize_variables(self, groups): + """Removes redundant groupings + + This allows the reference to be used as a project, starting a pipeline" + """ + if groups != []: + raise YmpConfigError(self.cfg, "Reference may not be (re)grouped") + return groups, [] diff --git a/src/ymp/stage/stack.py b/src/ymp/stage/stack.py index 650daf21..065ca592 100644 --- a/src/ymp/stage/stack.py +++ b/src/ymp/stage/stack.py @@ -106,7 +106,10 @@ def __init__(self, path): #: This is needed for grouping variables currently. self.project = cfg.projects.get(self.stage_names[0]) if not self.project: - raise YmpStageError(f"No project for stage stack {path} found") + if self.stage_names[0].startswith("ref_"): + self.project = cfg.references.get(self.stage_names[0][4:]) + if not self.project: + raise YmpStageError(f"No project for stage stack {path} found") #: Mapping of each input type required by the stage of this stack #: to the prefix stack providing it. 
From f96b3a754cfabf9d782d97bbfcccad8cb4c49666 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 8 Sep 2021 22:04:28 -0600 Subject: [PATCH 016/133] Fix part of exception written to stdout, not stderr --- src/ymp/exceptions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ymp/exceptions.py b/src/ymp/exceptions.py index ef843bf3..b77ef465 100644 --- a/src/ymp/exceptions.py +++ b/src/ymp/exceptions.py @@ -1,5 +1,5 @@ """Exceptions raised by YMP""" - +import sys import textwrap from inspect import stack from typing import Optional, Tuple @@ -60,6 +60,8 @@ def get_fileline(self) -> Tuple[str, int]: def show(self, file=None) -> None: super().show(file) + if file is None: + file = sys.stderr fname, line = self.get_fileline() if fname: if line is None: From fbed67a9bbb5cd46bc995e258572fdd8790a9a1b Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 8 Sep 2021 22:04:57 -0600 Subject: [PATCH 017/133] Fix YmpConfigException not showing correct lines for sequences --- src/ymp/yaml.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/ymp/yaml.py b/src/ymp/yaml.py index ae0bb843..4f570f4d 100644 --- a/src/ymp/yaml.py +++ b/src/ymp/yaml.py @@ -111,7 +111,7 @@ def get_files(self): return [fn for fn, layer in self._maps] def get_linenos(self): - return [layer._yaml_line_col.line + return [layer._yaml_line_col.line + 1 for fn, layer in self._maps] def get_fileline(self, key = None): @@ -315,7 +315,7 @@ def __repr__(self): def __str__(self): return "+".join(f"{m}" for _, m in self._maps) - def _finditem(self, index): + def _locateitem(self, index): if isinstance(index, slice): raise NotImplementedError() if isinstance(index, str): @@ -327,10 +327,14 @@ def _finditem(self, index): if index >= len(smap): index -= len(smap) else: - return [(fn, smap[index])] + return fn, smap, index else: raise IndexError() + def _finditem(self, index): + fn, smap, index = self._locateitem(index) + return [(fn, smap[index])] + def __radd__(self, other): return self.__add__(other) @@ -356,6 +360,12 @@ def extend(self, item): def get_paths(self, absolute=False): return [self.get_path(i, absolute) for i in range(len(self))] + def get_fileline(self, key = None): + if key is None: + return ";".join(self.get_files()), next(iter(self.get_linenos()), None) + fn, smap, index = self._locateitem(key) + return fn, smap._yaml_line_col.data[index][0] + 1 + class LayeredConfProxy(MultiMapProxy): """Layered configuration""" From cfdff90f12afff9078341079be4950dcb9367994 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 8 Sep 2021 22:05:24 -0600 Subject: [PATCH 018/133] Comment --- src/ymp/util.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ymp/util.py b/src/ymp/util.py index e674865b..7f025074 100644 --- a/src/ymp/util.py +++ b/src/ymp/util.py @@ -13,6 +13,10 @@ def make_local_path(icfg, url: str): + """Rewrites remote URLs to point to downloads folder so they will be + retrieved by the download rules + + """ url_match = re.match("^(http|https|ftp|ftps)://", url) if url_match: return os.path.join( From 2e79841d5d1b88497168b331f42f38229a22678e Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 8 Sep 2021 22:05:38 -0600 Subject: [PATCH 019/133] Increase RSEM threads --- src/ymp/rules/rsem.rules | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ymp/rules/rsem.rules b/src/ymp/rules/rsem.rules index 2fb5c75f..3b609ca0 100644 --- a/src/ymp/rules/rsem.rules +++ b/src/ymp/rules/rsem.rules @@ -21,7 +21,7 @@ with Stage("index_rsem") 
as S: shadow: "shallow" threads: - 1 + 32 conda: "rsem" shell: """ @@ -52,7 +52,7 @@ with Stage("quant_rsem") as S: resources: mem = "16G", threads: - 16 + 32 conda: "rsem" shell: From b35f77c61c6199e9b920d0f6dfd70f870818109c Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 8 Sep 2021 22:05:59 -0600 Subject: [PATCH 020/133] Increase download block size --- src/ymp/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ymp/download.py b/src/ymp/download.py index b9d88c78..7e1b0457 100644 --- a/src/ymp/download.py +++ b/src/ymp/download.py @@ -31,7 +31,7 @@ class FileDownloader(object): alturls: List of regexps modifying URLs retry: Number of times to retry download """ - def __init__(self, block_size: int=4096, timeout: int=300, parallel: int=4, + def __init__(self, block_size: int=8192, timeout: int=300, parallel: int=4, loglevel: int=logging.WARNING, alturls=None, retry: int=3): self._block_size = block_size self._timeout = timeout From df750e66e059069f829703cfd2d54dec8530ce0e Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 8 Sep 2021 22:06:19 -0600 Subject: [PATCH 021/133] Log cancelled or timeout during download --- src/ymp/download.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ymp/download.py b/src/ymp/download.py index 7e1b0457..59e02b74 100644 --- a/src/ymp/download.py +++ b/src/ymp/download.py @@ -223,6 +223,7 @@ async def _download_one(self, session, name, url, dest, md5): return False return True except (asyncio.CancelledError, asyncio.TimeoutError): + self.error("Download failed: %s (cancelled or timed out)", name) if os.path.exists(part): os.unlink(part) raise From b27478fdb5d03f8d14607cbc4f1e84fa3a98465c Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 8 Sep 2021 23:49:15 -0600 Subject: [PATCH 022/133] Refactor reference stage system --- src/ymp/etc/defaults.yml | 1 - src/ymp/rules/00_download.rules | 4 +- src/ymp/stage/reference.py | 401 +++++++++++++++++++++++--------- tests/test_reference.py | 372 +++++++++++++++++++++++++++++ 4 files changed, 661 insertions(+), 117 deletions(-) create mode 100644 tests/test_reference.py diff --git a/src/ymp/etc/defaults.yml b/src/ymp/etc/defaults.yml index 2190cf6b..1f11a78c 100644 --- a/src/ymp/etc/defaults.yml +++ b/src/ymp/etc/defaults.yml @@ -63,7 +63,6 @@ references: - url: ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/data/grch38_snp_tran.tar.gz strip_components: 1 type: dir - stage: .index files: ALL.1.ht2: genome_snp_tran.1.ht2 ALL.2.ht2: genome_snp_tran.2.ht2 diff --git a/src/ymp/rules/00_download.rules b/src/ymp/rules/00_download.rules index 65a096c3..a499b43d 100644 --- a/src/ymp/rules/00_download.rules +++ b/src/ymp/rules/00_download.rules @@ -115,6 +115,8 @@ with Stage("references") as S: ruleorder: unpack_archive > prepare_reference for ref in ymp.get_config().ref.values(): - for unpack_rule in ref.make_unpack_rules(workflow._rules['unpack_archive']): + for unpack_rule in ref.generate_rules( + unpack_archive=workflow._rules['unpack_archive'], + ): unpack_rule diff --git a/src/ymp/stage/reference.py b/src/ymp/stage/reference.py index fb6f11a5..b55796e9 100644 --- a/src/ymp/stage/reference.py +++ b/src/ymp/stage/reference.py @@ -24,94 +24,325 @@ class Archive(object): strip_components = None files = None - def __init__(self, name, dirname, tar, url, strip, files): + def __init__(self, name, dirname, tar, strip, files): self.name = name self.dirname = dirname self.tar = tar - self.url = url self.strip = strip self.files = files - self.hash = 
sha1(self.tar.encode("utf-8")).hexdigest()[:8]
        self.prefix = os.path.join(self.dirname, "_unpacked_" + self.hash)

    def get_files(self):
        if isinstance(self.files, Sequence):
            return {fn: os.path.join(self.prefix, fn) for fn in self.files}
        elif isinstance(self.files, Mapping):
            return {
                fn_ymp: os.path.join(self.prefix, fn_arch)
                for fn_ymp, fn_arch in self.files.items()
            }
        else:
            raise Exception("unknown data type for reference.files")

    def make_unpack_rule(self, baserule: "Rule"):
        docstr_tpl = """
        Unpacks {} archive:

        - Files:
        """

        item_tpl = """
        - {}
        """
        docstr = "\n".join(
            [docstr_tpl.format(self.name)] + [item_tpl.format(fn) for fn in self.files]
        )
        return make_rule(
            name="unpack_{}_{}".format(self.name, self.hash),
            docstring=docstr,
            lineno=0,
            snakefile=__name__,
            parent=baserule,
            input=([], {"tar": self.tar}),
            output=([], {"files": list(self.get_files().values())}),
            params=([], {"strip": self.strip, "prefix": self.prefix}),
        )


class Resource:
    """References comprise files, possibly remote, specified as
    "resources". These could e.g. be an archive (tar.gz), a local
    directory or individual files. This is the base class for resource
    types that can be configured.

    """

    _registry = {}

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)  # recurse up if subsubclass
        if not getattr(cls, "type_names", []):  # no type names, no registering
            return
        for name in cls.type_names:
            if name in Resource._registry:
                raise ValueError(
                    f"Resource class '{cls.__name__}' defines duplicate type name '{name}'"
                    f" already registered for "
                    f"'{Resource._registry[name].__name__}'."
                )
        Resource._registry.update({name: cls for name in cls.type_names})

    def __init__(self, ref, cfg):
        self.reference = ref
        self.cfg = cfg
        self.type_name = self.get_type_name(cfg)
        self._ids: Set[str] = set()
        self.id_name = self.get_id_name(cfg)

    @classmethod
    def make_from_cfg(cls, ref, cfg, num):
        rsc = cfg[num]
        if rsc is None:
            raise YmpConfigError(cfg, "Empty reference resource config?!", key=num)
        if not isinstance(rsc, Mapping):
            raise YmpConfigError(
                cfg, "Reference resource config must be a key-value mapping", key=num
            )
        type_name = Resource.get_type_name(rsc)
        klass = Resource._registry.get(type_name)
        if klass is None:
            raise YmpConfigError(rsc, f"Unknown type {type_name}", key="type")
        return klass(ref, rsc)

    @staticmethod
    def get_type_name(rsc):
        return rsc.get("type", "fasta").lower()

    def get_local_path(self, rsc, field="url"):
        """Extract local file path from a config field

        - Paths for remote URLs are rewritten to point into the
          configured downloads folder, so that the download is handled
          by the download rule.
        - Relative paths are interpreted relative to the config file
          defining the url, unless ``!workdir`` is prefixed, in which
          case it's relative to the main ymp.yml.
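
        For example, ``url: https://host.example/genome.fa.gz`` (a
        made-up URL) would be mapped to a path below the configured
        downloads directory and fetched by the download rules, while a
        plain ``url: genome.fa.gz`` stays local and is resolved
        relative to the declaring YAML file.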
+ + """ + if not "url" in rsc: + raise YmpConfigError( + rsc, + f"Reference resource of type '{self.type_name}' must have '{field}' field", + ) + import ymp + + cfg = ymp.get_config() + local_path = make_local_path(cfg, str(rsc[field])) + if not local_path != rsc[field]: + # unchanged by download redirect, honor the relative path: + local_path = rsc.get_path(field) + return local_path + + def get_id_name(self, rsc): + id = "ALL" + if "id" in rsc: + id = rsc["id"] + self._ids.add(id) + return id + + def generate_rules(self, **kwargs_): + """Generate special rules needed for the resource type""" + yield None + + +class UrlResource(Resource): + def __init__(self, *args): + super().__init__(*args) + self.local_path = self.get_local_path(self.cfg) + + +class FileResource(UrlResource): + type_names = ["file", "direct"] + + def __init__(self, *args): + super().__init__(*args) + self.extension = self.get_extension(self.cfg) + self.files = {f"{self.id_name}.{self.extension}": self.local_path} + + def get_extension(self, cfg): + ext = cfg.get("extension") + if not ext: + raise YmpConfigError( + cfg, "Reference resource of type direct must have 'extension' field" + ) + return ext + + +class NamedResource(FileResource): + type_names = ["fasta", "fastp", "tx.fasta"] + + def get_extension(self, cfg): + return self.type_name + ".gz" + + +class NamedUnpackedResources(FileResource): + type_names = ["gtf", "snp", "tsv", "csv"] + + def get_extension(self, cfg): + return self.type_name + + +class ArchiveResource(UrlResource): + type_names = ["archive", "dir"] + + def __init__(self, *args): + super().__init__(*args) + if not "files" in self.cfg: + raise YmpConfigError( + self.cfg, "Reference resource of type archive must have 'files' field" + ) + files = self.cfg.get("files") + if ( + not isinstance(files, Mapping) + and not isinstance(files, Sequence) + or isinstance(files, str) + ): + raise YmpConfigError( + self.cfg, "Archive 'files' must be mapping", key="files" + ) + self.archive = Archive( + name="NAME", + dirname=self.reference.canonical_location(), + tar=self.local_path, + files=files, + strip=self.cfg.get("strip_components", 0), ) + self.files = self.archive.get_files() + + def generate_rules(self, **kwargs): + yield self.archive.make_unpack_rule(kwargs["unpack_archive"]) + + +class LocalDirResource(UrlResource): + type_names = ["localdir", "dirx"] + + def __init__(self, *args): + super().__init__(*args) + if not "files" in self.cfg: + raise YmpConfigError( + self.cfg, "Reference resource of type localdir must have 'files' field" + ) + files = self.cfg.get("files") + if not isinstance(files, Mapping): + raise YmpConfigError( + self.cfg, "Localdir 'files' must be mapping", key="files" + ) + + self.files = { + key: os.path.join(self.local_path, val) for key, val in files.items() + } + + +class RegexLocalDirResource(UrlResource): + type_names = ["path"] + + def __init__(self, *args): + super().__init__(*args) + if not "match" in self.cfg: + raise YmpConfigError( + self.cfg, "Reference resource of type path must have 'match' field" + ) + matchlist = self.cfg.get("match") + if not isinstance(matchlist, Sequence) or isinstance(matchlist, str): + raise YmpConfigError(self.cfg, "Path 'match' must be list", key="match") + + try: + filenames = os.listdir(self.local_path) + except FileNotFoundError: + raise YmpConfigError( + self.cfg, "Directory required by path resource inaccessible" + ) + self.dir = self.local_path.rstrip("/") + + self.files = {} + for num, regex in enumerate(matchlist): + try: + comp_regex = 
re.compile(regex) + except re.error as exc: + raise YmpConfigError( + matchlist, f"Regex failed to compile: {exc}", key=num + ) from exc + + if list(comp_regex.groupindex) != ["sample"]: + raise YmpConfigError( + matchlist, + "Path resource match regexp's must have exactly one " + "named wildcard called 'sample'", + key=num, + ) + + for filename in filenames: + match = comp_regex.fullmatch(filename) + if match: + self._ids.add(match.group("sample")) + self.files[filename] = os.path.join(self.local_path, filename) + + if not self.files: + raise YmpConfigError( + self.cfg, "Reference resource of type path found no files!" + ) class Reference(Activateable, ConfigStage): """ Represents (remote) reference file/database configuration """ + def __init__(self, name, cfg): super().__init__("ref_" + name, cfg) #: Files provided by the reference. Keys are the file names #: within ymp ("target.extension"), symlinked into dir.ref/ref_name/ and #: values are the path to the reference file from workspace root. self.files: Dict[str, str] = {} + #: Name without the ref_ prefix + self.plainname = name self.archives = [] - self._ids: Set[str] = set() self._outputs = None self.cfg = cfg - import ymp - self.dir = os.path.join(ymp.get_config().dir.references, name) + self.dir = self.canonical_location() - if isinstance(cfg, Mapping): - self.add_resource(cfg) - elif isinstance(cfg, Sequence) and not isinstance(cfg, str): - for item in cfg: - self.add_resource(item) - else: - raise YmpConfigError(cfg, "Reference config must list or key-value mapping") + if not isinstance(cfg, Sequence) or isinstance(cfg, str): + raise YmpConfigError(cfg, "Reference config must be list") + + self._resources = [ + Resource.make_from_cfg(self, cfg, num) for num in range(len(cfg)) + ] + + self._ids: Set[str] = set.union(*(rsc._ids for rsc in self._resources)) + self._files: Dict[str, str] = {} + for rsc in self._resources: + for name, path in rsc.files.items(): + if name in self._files: + raise YmpConfigError(rsc.cfg, "Duplicate File") + self._files[name] = path # Copy rules defined in primary references stage stage_references = Stage.get_registry().get("references") if not stage_references: raise YmpConfigError( - cfg, - "Reference base stage not found. Main rules not loaded?" + cfg, "Reference base stage not found. Main rules not loaded?" ) self.rules = stage_references.rules.copy() - def get_group( - self, - stack: "StageStack", - default_groups: List[str] - ) -> List[str]: + def canonical_location(self): + import ymp + + cfg = ymp.get_config() + basedir = cfg.dir.references + return os.path.join(basedir, self.plainname) + + def get_group(self, stack: "StageStack", default_groups: List[str]) -> List[str]: if len(self._ids) > 1: groups = [self.name] else: @@ -119,11 +350,11 @@ def get_group( return super().get_group(stack, groups) def get_ids( - self, - stack: "StageStack", - groups: List[str], - match_groups: Optional[List[str]] = None, - match_value: Optional[str] = None + self, + stack: "StageStack", + groups: List[str], + match_groups: Optional[List[str]] = None, + match_value: Optional[str] = None, ) -> List[str]: if self._ids: return list(self._ids) @@ -134,103 +365,43 @@ def outputs(self) -> Union[Set[str], Dict[str, str]]: if self._outputs is None: keys = self._ids if self._ids else ["ALL"] self._outputs = { - "/" + re.sub(f"(^|.)({'|'.join(keys)})\.", r"\1{sample}.", fname) : "."+self.name - for fname in self.files + "/" + + re.sub(f"(^|.)({'|'.join(keys)})\.", r"\1{sample}.", fname): "." 
+ + self.name + for fname in self._files } return self._outputs - def add_resource(self, rsc): - if not isinstance(rsc, Mapping): - raise YmpConfigError(rsc, "Reference resource config must be a key-value mapping") - - if not "url" in rsc: - raise YmpConfigError(rsc, "Reference resource must have 'url' field") - maybeurl = str(rsc["url"]) - import ymp - local_path = make_local_path(ymp.get_config(), maybeurl) - isurl = local_path != maybeurl - if not isurl: - local_path = rsc.get_path("url") - id = "ALL" - if 'id' in rsc: - id = rsc["id"] - self._ids.add(id) - - type_name = rsc.get('type', 'fasta').lower() - if type_name == "direct": - if not "extension" in rsc: - raise YmpConfigError( - rsc, "Reference resource of type direct must have 'extension' field" - ) - self.files[".".join((id, rsc["extension"]))] = local_path - elif type_name in ("fasta", "fastp", "tx.fasta"): - self.files[f"ALL.{type_name}.gz"] = local_path - elif type_name in ("gtf", "snp", "tsv", "csv"): - self.files[f"ALL.{type_name}"] = local_path - elif type_name == 'dir': - archive = Archive( - name=self.name, - dirname=self.dir, - tar=local_path, - url=maybeurl, - files=rsc['files'], - strip=rsc.get('strip_components', 0) - ) - self.files.update(archive.get_files()) - self.archives.append(archive) - elif type_name == 'dirx': - self.files.update({ - key: os.path.join(local_path, val) - for key, val in rsc.get('files', {}).items() - }) - elif type_name == 'path': - self.dir = local_path.rstrip("/") - try: - filenames = os.listdir(local_path) - except FileNotFoundError: - log.error("Directory %s required by %s %s does not exist", - local_path, self.__class__.__name__, self.name) - filenames = [] - for filename in filenames: - for regex in rsc.get('match', []): - match = re.fullmatch(regex, filename) - if not match: - continue - self._ids.add(match.group('sample')) - self.files[filename] = os.path.join(local_path, filename) - else: - raise YmpConfigError(rsc, f"Unknown type {type_name}", key="type") - def get_path(self, _stack): return self.dir def get_all_targets(self, stack: "StageStack") -> List[str]: - return [os.path.join(self.dir, fname) for fname in self.files] + return [os.path.join(self.dir, fname) for fname in self._files] def get_file(self, filename, isdir=False): - local_path = self.files.get(filename) + local_path = self._files.get(filename) if local_path: if os.path.isdir(local_path) != isdir: return "YMP_THIS_FILE_MUST_NOT_EXIST" return local_path log.error(f"{self!r}: Failed to find {filename}") - log.warning(f" Available: {self.files}") - return ("YMP_FILE_NOT_FOUND__" + - "No file {} in Reference {}" - "".format(filename, self.name).replace(" ", "_")) + log.warning(f" Available: {self._files}") + return "YMP_FILE_NOT_FOUND__" + "No file {} in Reference {}" "".format( + filename, self.name + ).replace(" ", "_") - def make_unpack_rules(self, baserule: 'Rule'): - for archive in self.archives: - yield archive.make_unpack_rule(baserule) + def generate_rules(self, **kwargs): + for rsc in self._resources: + yield from rsc.generate_rules(**kwargs) def __str__(self): return os.path.join(self.dir, "ALL") def this(self, args=None, kwargs=None): - item = kwargs['item'] - if kwargs.get('field') == 'output': - suffix = self.register_inout("this", set(), item).lstrip('/') - self.files[suffix] = os.path.join(self.dir, suffix) + item = kwargs["item"] + if kwargs.get("field") == "output": + suffix = self.register_inout("this", set(), item).lstrip("/") + self._files[suffix] = os.path.join(self.dir, suffix) self._outputs = None 
# will need refresh return self.dir diff --git a/tests/test_reference.py b/tests/test_reference.py new file mode 100644 index 00000000..56a07bf0 --- /dev/null +++ b/tests/test_reference.py @@ -0,0 +1,372 @@ +import logging +import os + +import pytest + +import ymp +from ymp import yaml +from ymp.stage import Reference, StageStack +from ymp.stage.reference import Resource +from ymp.exceptions import YmpConfigError + + +def make_cfg(text, *args): + fname = "test.yml" + with open(fname, "w") as f: + f.write("\n".join(["ref:"] + [" " + a for a in text.splitlines() + list(args)])) + cfg = yaml.load([fname]) + return cfg["ref"] + + +@pytest.fixture() +def check_show(capsys): + def checker(exc, substr): + exc.show() + log = capsys.readouterr() + assert substr in log.err + assert not log.out + + return checker + + +def test_not_list(saved_cwd, check_show): + with pytest.raises(YmpConfigError) as excinfo: + Reference("test", make_cfg("asd:")) + assert excinfo.match("must be list") + check_show(excinfo.value, "line 2") + + +def test_empty_ref(saved_cwd, check_show): + with pytest.raises(YmpConfigError) as excinfo: + Reference("test", make_cfg("-")) + assert excinfo.match("Empty") + check_show(excinfo.value, "line 2") + + +def test_empty_unknown_type(saved_cwd, check_show): + with pytest.raises(YmpConfigError) as excinfo: + Reference("test", make_cfg("- type: mountain")) + assert excinfo.match("Unknown type") + assert excinfo.match("mountain") + check_show(excinfo.value, "line 2") + + +def test_fasta_no_url(saved_cwd, check_show): + with pytest.raises(YmpConfigError) as excinfo: + ref = Reference("test", make_cfg("- type: fasta")) + assert excinfo.match("fasta") + assert excinfo.match("must have 'url'") + check_show(excinfo.value, "line 2") + + +def test_fasta_with_url(saved_cwd, check_show): + ref = Reference("test", make_cfg("- type: fasta", " url: somewhere")) + + +def test_duplicate_resource(saved_cwd): + from ymp.stage.reference import FileResource + + with pytest.raises(ValueError) as excinfo: + + class duplicate(FileResource): + pass + + assert excinfo.match("'file'") + assert excinfo.match("duplicate type") + + +def test_resource_not_mapping(saved_cwd, check_show): + with pytest.raises(YmpConfigError) as excinfo: + ref = Reference("test", make_cfg("- []")) + assert excinfo.match("mapping") + check_show(excinfo.value, "line 2") + + +def test_resource_not_mapping_third(saved_cwd, check_show): + with pytest.raises(YmpConfigError) as excinfo: + ref = Reference( + "test", + make_cfg( + "- type: fasta", + " url: somewhere", + "- type: fasta", + " url: somewhere", + "- []", + ), + ) + assert excinfo.match("mapping") + check_show(excinfo.value, "line 6") + + +def test_get_id_name(saved_cwd): + ref = Reference( + "test", make_cfg("- type: fasta", " id: customid", " url: somewhere") + ) + # FIXME, check IDs in reference, this just triggers resource + + +def test_file_resource_no_extension(saved_cwd, check_show): + with pytest.raises(YmpConfigError) as excinfo: + ref = Reference("test", make_cfg("- type: file", " url: somewhere")) + assert excinfo.match("must have") + assert excinfo.match("extension") + check_show(excinfo.value, "line 2") + + +def test_file_resource(saved_cwd): + ref = Reference( + "test", make_cfg("- type: file", " url: somewhere", " extension: bam") + ) + + +def test_named_unpacked_resource(saved_cwd): + ref = Reference("test", make_cfg("- type: gtf", " url: somewhere")) + + +def test_archive_resource_no_url(saved_cwd, check_show): + with pytest.raises(YmpConfigError) as excinfo: + 
ref = Reference("test", make_cfg(" - type: archive")) + assert excinfo.match("must have") + assert excinfo.match("url") + check_show(excinfo.value, "line 2") + + +def test_archive_resource_no_files(saved_cwd, check_show): + with pytest.raises(YmpConfigError) as excinfo: + ref = Reference("test", make_cfg(" - type: archive", " url: somwhere")) + assert excinfo.match("must have") + assert excinfo.match("files") + check_show(excinfo.value, "line 2") + + +def test_archive_resource_files_not_mapping(saved_cwd, check_show): + with pytest.raises(YmpConfigError) as excinfo: + ref = Reference( + "test", make_cfg(" - type: archive", " url: somwhere", " files:") + ) + assert excinfo.match("must be mapping") + assert excinfo.match("files") + check_show(excinfo.value, "line 4") + + +def test_archive_resource_no_url(saved_cwd, check_show): + ref = Reference( + "test", + make_cfg( + " - type: archive", + " url: somwhere", + " files:", + " ALL.bam: some.bam", + ), + ) + + +def test_localdir_resource_no_files(saved_cwd, check_show): + with pytest.raises(YmpConfigError) as excinfo: + ref = Reference("test", make_cfg(" - type: localdir", " url: somwhere")) + assert excinfo.match("must have") + assert excinfo.match("files") + check_show(excinfo.value, "line 2") + + +def test_localdir_resource_files_not_mapping(saved_cwd, check_show): + with pytest.raises(YmpConfigError) as excinfo: + ref = Reference( + "test", make_cfg(" - type: localdir", " url: somwhere", " files:") + ) + assert excinfo.match("must be mapping") + assert excinfo.match("files") + check_show(excinfo.value, "line 4") + + +def test_localdir_resource(saved_cwd): + ref = Reference( + "test", + make_cfg( + " - type: localdir", + " url: somewhere", + " files:", + " ALL.bam: some.bam", + ), + ) + + +def test_regexlocaldir_directory_missing(saved_cwd, check_show): + with pytest.raises(YmpConfigError) as excinfo: + ref = Reference( + "test", + make_cfg( + " - type: path", + " url: somewhere", + " match: [something]", + ), + ) + assert excinfo.match("Directory") + check_show(excinfo.value, "line 2") + + +def test_regexlocaldir_no_match(saved_cwd, check_show): + os.mkdir("somewhere") + with pytest.raises(YmpConfigError) as excinfo: + ref = Reference( + "test", + make_cfg( + " - type: path", + " url: somewhere", + ), + ) + assert excinfo.match("must have") + assert excinfo.match("match") + check_show(excinfo.value, "line 2") + + +def test_regexlocaldir_match_not_list(saved_cwd, check_show): + os.mkdir("somewhere") + with pytest.raises(YmpConfigError) as excinfo: + ref = Reference( + "test", + make_cfg(" - type: path", " url: somewhere", " match: something"), + ) + assert excinfo.match("must be") + assert excinfo.match("match") + check_show(excinfo.value, "line 4") + + +def test_regexlocaldir_match_no_files(saved_cwd, check_show): + os.mkdir("somewhere") + with pytest.raises(YmpConfigError) as excinfo: + ref = Reference( + "test", + make_cfg(" - type: path", " url: somewhere", " match: [(?P)]"), + ) + assert excinfo.match("no files") + check_show(excinfo.value, "line 2") + + +def test_regexlocaldir_match_broken_regex(saved_cwd, check_show): + os.mkdir("somewhere") + with pytest.raises(YmpConfigError) as excinfo: + ref = Reference( + "test", + make_cfg(" - type: path", " url: somewhere", " match: [(?P]"), + ) + assert excinfo.match("compile") + assert excinfo.match("missing \)") + check_show(excinfo.value, "line 4") + + +def test_regexlocaldir_match_regex_no_sample(saved_cwd, check_show): + os.mkdir("somewhere") + with pytest.raises(YmpConfigError) as 
excinfo:
+        ref = Reference(
+            "test",
+            make_cfg(
+                " - type: path",
+                " url: somewhere",
+                " match:",
+                " - (?P<sample>.)",
+                " - (?P<notsample>.)",
+            ),
+        )
+    assert excinfo.match("must have")
+    assert excinfo.match("sample")
+    check_show(excinfo.value, "line 6")
+
+
+def test_regexlocaldir_resource(saved_cwd):
+    os.mkdir("somewhere")
+    open("somewhere/test.file", "a").close()
+    ref = Reference(
+        "test",
+        make_cfg(
+            " - type: path",
+            " url: somewhere",
+            " match:",
+            " - (?P<sample>[^.]*)\.file",
+        ),
+    )
+
+
+def test_get_path(demo_dir):
+    ref = Reference(
+        "test",
+        make_cfg(
+            "- type: fasta",
+            " url: somewhere",
+        ),
+    )
+    assert ref.get_path(None) == "references/test"
+    ## FIXME: Do we need the below feature at all?9
+    assert str(ref) == "references/test/ALL"
+
+
+def test_get_all_targets(demo_dir):
+    ref = Reference(
+        "test",
+        make_cfg(
+            "- type: fasta",
+            " url: somewhere",
+        ),
+    )
+    assert ref.get_all_targets(None) == ["references/test/ALL.fasta.gz"]
+
+
+def test_no_ids(demo_dir):
+    ref = Reference("test", make_cfg("- type: fasta", " url: somewhere"))
+    stack = StageStack("toy")
+    groups = ref.get_group(stack, ["bla"])
+    assert groups == []
+    ids = ref.get_ids(stack, groups)
+    assert ids == ["ALL"]
+
+
+def test_with_ids(demo_dir):
+    ref = Reference(
+        "test",
+        make_cfg(
+            "- type: fasta",
+            " url: somewhere/1.fasta",
+            " id: one",
+            "- type: fasta",
+            " url: elsewhere/2.fasta",
+            " id: two",
+        ),
+    )
+    stack = StageStack("toy")
+    groups = ref.get_group(stack, ["bla"])
+    assert groups == ["ref_test"]
+    ids = ref.get_ids(stack, groups)
+    assert set(ids) == set(["one", "two"])
+    assert ref.outputs == {"/{sample}.fasta.gz": ".ref_test"}
+
+
+def test_duplicate_file(saved_cwd, check_show):
+    with pytest.raises(YmpConfigError) as excinfo:
+        ref = Reference(
+            "test",
+            make_cfg(
+                "- type: fasta", " url: somewhere", "- type: fasta", " url: somewhere"
+            ),
+        )
+    assert excinfo.match("Duplicate")
+    check_show(excinfo.value, "line 4")
+
+
+def test_get_file(saved_cwd):
+    ref = Reference("test", make_cfg("- type: fasta", " url: somewhere.fasta.gz"))
+    assert ref.get_file("ALL.fasta.gz") == "somewhere.fasta.gz"
+    assert ref.get_file("ALL.fasta.gz", isdir=True) == "YMP_THIS_FILE_MUST_NOT_EXIST"
+    assert ref.get_file("blabla").startswith("YMP_FILE_NOT_FOUND")
+
+
+def test_add_rule(saved_cwd):
+    ref = Reference("test", make_cfg("- type: fasta", " url: somewhere.fasta.gz"))
+    assert ref.prev() == "references/test"
+    assert ref.get_file("ALL.sometype").startswith("YMP_FILE_NOT_FOUND")
+    kwargs = {"item": "{:this:}/{:target:}.sometype"}
+    assert ref.this(kwargs=kwargs) == "references/test"
+    assert ref.get_file("ALL.sometype").startswith("YMP_FILE_NOT_FOUND")
+    kwargs["field"] = "output"
+    ref.set_active(ref)
+    assert ref.this(kwargs=kwargs) == "references/test"
+    assert ref.get_file("{sample}.sometype") == "references/test/{sample}.sometype"

From ad9549a1e71e3c23d4636f99a2ed568cbd18faa2 Mon Sep 17 00:00:00 2001
From: Elmar Pruesse 
Date: Thu, 9 Sep 2021 17:06:21 -0600
Subject: [PATCH 023/133] Merge Archive into ArchiveResource

---
 src/ymp/stage/reference.py | 109 ++++++++++++++-----------------------
 1 file changed, 41 insertions(+), 68 deletions(-)

diff --git a/src/ymp/stage/reference.py b/src/ymp/stage/reference.py
index b55796e9..de1d3023 100644
--- a/src/ymp/stage/reference.py
+++ b/src/ymp/stage/reference.py
@@ -16,60 +16,6 @@
 log = logging.getLogger(__name__)  # pylint: disable=invalid-name
 
 
-class Archive(object):
-    name = None
-    hash = None
-    tar = None
-    dirname = None
-    strip_components = None
-    files 
= None - - def __init__(self, name, dirname, tar, strip, files): - self.name = name - self.dirname = dirname - self.tar = tar - self.strip = strip - self.files = files - - self.hash = sha1(self.tar.encode("utf-8")).hexdigest()[:8] - self.prefix = os.path.join(self.dirname, "_unpacked_" + self.hash) - - def get_files(self): - if isinstance(self.files, Sequence): - return {fn: os.path.join(self.prefix, fn) for fn in self.files} - elif isinstance(self.files, Mapping): - return { - fn_ymp: os.path.join(self.prefix, fn_arch) - for fn_ymp, fn_arch in self.files.items() - } - else: - raise Exception("unknown data type for reference.files") - - def make_unpack_rule(self, baserule: "Rule"): - docstr_tpl = """ - Unpacks {} archive: - - Files: - """ - - item_tpl = """ - - {} - """ - docstr = "\n".join( - [docstr_tpl.format(self.name)] + [item_tpl.format(fn) for fn in self.files] - ) - return make_rule( - name="unpack_{}_{}".format(self.name, self.hash), - docstring=docstr, - lineno=0, - snakefile=__name__, - parent=baserule, - input=([], {"tar": self.tar}), - output=([], {"files": list(self.get_files().values())}), - params=([], {"strip": self.strip, "prefix": self.prefix}), - ) - - class Resource: """References comprise files, possibly remote, spefied as "resources". These could e.g. be a archive (tar.gz), a local @@ -197,29 +143,56 @@ class ArchiveResource(UrlResource): def __init__(self, *args): super().__init__(*args) + + # Generate hash from tarfile name + self.fnhash = sha1(self.local_path.encode("utf-8")).hexdigest()[:8] + # Compute output prefix + self.prefix = os.path.join( + self.reference.canonical_location(), "_unpacked_" + self.fnhash + ) + + # Collect files if not "files" in self.cfg: raise YmpConfigError( self.cfg, "Reference resource of type archive must have 'files' field" ) files = self.cfg.get("files") - if ( - not isinstance(files, Mapping) - and not isinstance(files, Sequence) - or isinstance(files, str) - ): + if isinstance(files, Sequence) and not isinstance(files, str): + self.files = {fn: os.path.join(self.prefix, fn) for fn in files} + elif isinstance(files, Mapping): + self.files = { + fn_ymp: os.path.join(self.prefix, fn_arch) + for fn_ymp, fn_arch in files.items() + } + else: raise YmpConfigError( self.cfg, "Archive 'files' must be mapping", key="files" ) - self.archive = Archive( - name="NAME", - dirname=self.reference.canonical_location(), - tar=self.local_path, - files=files, - strip=self.cfg.get("strip_components", 0), - ) - self.files = self.archive.get_files() - def generate_rules(self, **kwargs): + # Collect strip components parameter for untar + self.strip = self.cfg.get("strip_components", 0) + + def generate_rules(self, unpack_archive=None, **kwargs): + docstr = f""" + Unpacks {self.reference.name} archive: + + Files: + """ + + item_tpl = """ + - {} + """ + docstr = "\n".join([docstr] + [item_tpl.format(fn) for fn in self.files]) + return make_rule( + name=f"unpack_{self.reference.name}_{self.fnhash}", + docstring=docstr, + lineno=0, + snakefile=__name__, + parent=unpack_archive, + input=([], {"tar": self.local_path}), + output=([], {"files": list(self.files.values())}), + params=([], {"strip": self.strip, "prefix": self.prefix}), + ) yield self.archive.make_unpack_rule(kwargs["unpack_archive"]) From a9e53abe777b468387dd0e3dd9f5bfc1b7bb17e4 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 9 Sep 2021 22:35:22 -0600 Subject: [PATCH 024/133] Allow pipelines to stack within references --- src/ymp/stage/reference.py | 78 ++++++++++++++++++++++++++++---------- 
src/ymp/stage/stack.py | 5 +++ 2 files changed, 63 insertions(+), 20 deletions(-) diff --git a/src/ymp/stage/reference.py b/src/ymp/stage/reference.py index de1d3023..89ff19b4 100644 --- a/src/ymp/stage/reference.py +++ b/src/ymp/stage/reference.py @@ -9,7 +9,7 @@ from ymp.snakemake import make_rule from ymp.util import make_local_path -from ymp.stage import ConfigStage, Activateable, Stage +from ymp.stage import ConfigStage, Activateable, Stage, Pipeline from ymp.exceptions import YmpConfigError @@ -266,6 +266,23 @@ def __init__(self, *args): ) +class StageResource(Resource): + type_names = ["pipeline"] + + def __init__(self, *args): + super().__init__(*args) + self.pipeline = Pipeline("NAME", self.cfg) + self._files = None + + @property + def files(self): + if self._files is None: + self._files = {} + for name, path in self.pipeline.outputs.items(): + self._files[name.lstrip("/")] = path + return self._files + + class Reference(Activateable, ConfigStage): """ Represents (remote) reference file/database configuration @@ -276,7 +293,7 @@ def __init__(self, name, cfg): #: Files provided by the reference. Keys are the file names #: within ymp ("target.extension"), symlinked into dir.ref/ref_name/ and #: values are the path to the reference file from workspace root. - self.files: Dict[str, str] = {} + self._files: Dict[str, str] = None #: Name without the ref_ prefix self.plainname = name self.archives = [] @@ -293,12 +310,6 @@ def __init__(self, name, cfg): ] self._ids: Set[str] = set.union(*(rsc._ids for rsc in self._resources)) - self._files: Dict[str, str] = {} - for rsc in self._resources: - for name, path in rsc.files.items(): - if name in self._files: - raise YmpConfigError(rsc.cfg, "Duplicate File") - self._files[name] = path # Copy rules defined in primary references stage stage_references = Stage.get_registry().get("references") @@ -333,32 +344,54 @@ def get_ids( return list(self._ids) return super().get_ids(stack, groups, match_groups, match_value) + @property + def files(self): + if self._files is None: + self._files = {} + for rsc in self._resources: + for name, path in rsc.files.items(): + if name in self._files: + raise YmpConfigError(rsc.cfg, "Duplicate File") + self._files[name] = path + return self._files + @property def outputs(self) -> Union[Set[str], Dict[str, str]]: if self._outputs is None: keys = self._ids if self._ids else ["ALL"] - self._outputs = { - "/" - + re.sub(f"(^|.)({'|'.join(keys)})\.", r"\1{sample}.", fname): "." 
- + self.name - for fname in self._files - } + self._outputs = {} + for fname, target in self.files.items(): + if "{sample}" in fname: + self._outputs["/" + fname] = target + else: + normname = "/" + re.sub( + f"(^|.)({'|'.join(keys)})\.", r"\1{sample}.", fname + ) + self._outputs[normname] = "" return self._outputs - def get_path(self, _stack): - return self.dir + def can_provide(self, inputs: Set[str], full_stack: bool = False) -> Dict[str, str]: + res = { + output: path for output, path in self.outputs.items() if output in inputs + } + return res + + def get_path(self, _stack=None, typ=None): + if typ is None: + return self.dir + return self.name + self.outputs[typ] def get_all_targets(self, stack: "StageStack") -> List[str]: - return [os.path.join(self.dir, fname) for fname in self._files] + return [os.path.join(self.dir, fname) for fname in self.files] def get_file(self, filename, isdir=False): - local_path = self._files.get(filename) + local_path = self.files.get(filename) if local_path: if os.path.isdir(local_path) != isdir: return "YMP_THIS_FILE_MUST_NOT_EXIST" return local_path log.error(f"{self!r}: Failed to find {filename}") - log.warning(f" Available: {self._files}") + log.warning(f" Available: {self.files}") return "YMP_FILE_NOT_FOUND__" + "No file {} in Reference {}" "".format( filename, self.name ).replace(" ", "_") @@ -374,7 +407,8 @@ def this(self, args=None, kwargs=None): item = kwargs["item"] if kwargs.get("field") == "output": suffix = self.register_inout("this", set(), item).lstrip("/") - self._files[suffix] = os.path.join(self.dir, suffix) + ## FIXME + self.files[suffix] = os.path.join(self.dir, suffix) self._outputs = None # will need refresh return self.dir @@ -389,3 +423,7 @@ def minimize_variables(self, groups): if groups != []: raise YmpConfigError(self.cfg, "Reference may not be (re)grouped") return groups, [] + + @property + def variables(self): + return [] diff --git a/src/ymp/stage/stack.py b/src/ymp/stage/stack.py index 065ca592..838d545d 100644 --- a/src/ymp/stage/stack.py +++ b/src/ymp/stage/stack.py @@ -39,6 +39,11 @@ def find_stage(name): if refname in cfg.ref: return cfg.ref[refname] raise YmpStageError(f"Unknown reference '{refname}'") + if name.startswith(cfg.dir.references): + refname = name[len(cfg.dir.references):].lstrip("/") + if refname in cfg.ref: + return cfg.ref[refname] + raise YmpStageError(f"Unknown reference '{refname}'") if name in cfg.projects: return cfg.projects[name] for stage in registry.values(): From f8b1ac0fd8bfc5945080a6d154060bbc56538585 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Sun, 16 Aug 2020 10:52:09 -0600 Subject: [PATCH 025/133] Add install with Bioconda badge --- README.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index c08896bb..713c4221 100644 --- a/README.rst +++ b/README.rst @@ -1,9 +1,10 @@ YMP - a Flexible Omics Pipeline =============================== +|Install with Bioconda| |Github Unit Tests| |Read the Docs| |Codacy grade| |Codecov| -|Github Unit Tests| |Read the Docs| |Codacy grade| |Codecov| - +.. |Install with Bioconda| image:: https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat + :target: http://bioconda.github.io/recipes/ymp/README.html) .. |Github Unit Tests| image:: https://github.com/epruesse/ymp/workflows/Unit%20Tests/badge.svg :target: https://github.com/epruesse/ymp/actions?query=workflow%3A%22Unit+Tests%22 .. 
|CircleCI| image:: https://img.shields.io/circleci/project/github/epruesse/ymp.svg?label=CircleCI From 762321621a1fbdaa3428c9ba2d8b7bb0b42e1206 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Sun, 16 Aug 2020 10:52:36 -0600 Subject: [PATCH 026/133] fix typo --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 713c4221..cd520ee1 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ YMP - a Flexible Omics Pipeline |Install with Bioconda| |Github Unit Tests| |Read the Docs| |Codacy grade| |Codecov| .. |Install with Bioconda| image:: https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat - :target: http://bioconda.github.io/recipes/ymp/README.html) + :target: http://bioconda.github.io/recipes/ymp/README.html .. |Github Unit Tests| image:: https://github.com/epruesse/ymp/workflows/Unit%20Tests/badge.svg :target: https://github.com/epruesse/ymp/actions?query=workflow%3A%22Unit+Tests%22 .. |CircleCI| image:: https://img.shields.io/circleci/project/github/epruesse/ymp.svg?label=CircleCI From 98f10d6ca7e4839cfc06d89b067240415b65ddcf Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Fri, 13 Nov 2020 12:12:36 +0000 Subject: [PATCH 027/133] Bump sphinx from 3.2.1 to 3.3.1 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 3.2.1 to 3.3.1. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/3.x/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v3.2.1...v3.3.1) Signed-off-by: dependabot-preview[bot] --- doc/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/requirements.txt b/doc/requirements.txt index 978a6b4a..e1dc7db5 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -1,4 +1,4 @@ -sphinx ==3.2.1 +sphinx ==3.3.1 cloud_sptheme setuptools_scm sphinxcontrib-fulltoc From 7b1aba6a6ae0e091dd1ae1f50f8dd2ae2674f302 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Fri, 10 Sep 2021 12:55:22 -0600 Subject: [PATCH 028/133] Add r_tximport stage --- src/ymp/rules/tximport.rules | 55 +++++++++++++++++++++ src/ymp/rules/tximport_rsem.R | 86 +++++++++++++++++++++++++++++++++ src/ymp/rules/tximport_salmon.R | 86 +++++++++++++++++++++++++++++++++ 3 files changed, 227 insertions(+) create mode 100644 src/ymp/rules/tximport.rules create mode 100644 src/ymp/rules/tximport_rsem.R create mode 100644 src/ymp/rules/tximport_salmon.R diff --git a/src/ymp/rules/tximport.rules b/src/ymp/rules/tximport.rules new file mode 100644 index 00000000..be9d693b --- /dev/null +++ b/src/ymp/rules/tximport.rules @@ -0,0 +1,55 @@ +Env(name="tximport",base="bioconda", packages=[ + "bioconductor-tximport", + "bioconductor-tximeta", + "r-readr" # faster read + ]) + +with Stage("r_tximport") as S: + S.doc(""" + """) + + S.require( + counts = [ + ["isoforms.results", "genes.results"], # RSEM output + ["salmon/quant.sf"] # Salmon output + ], + gtf = [["gtf"]], + ) + + rule tximport_rsem: + message: + "{:name:}: Importing counts from RSEM" + input: + counts = "{:prev:}/{:target:}.genes.results", + transcripts = "{:prev:}/{:target:}.isoforms.results", + gtf = "{:prev:}/{:target:}.gtf", + output: + counts = "{:this:}/{target}.gene_counts.rds", + transcripts = "{:this:}/{target}.tx_counts.rds", + log: + "{:this:}/{target}.log", + threads: + 1 + conda: + "tximport" + script: + "tximport_rsem.R" + + + rule tximport_salmon: + message: + "{:name:}: 
Importing counts from Salmon" + input: + counts = "{:prev:}/{:target:}.salmon/quant.sf", + gtf = "{:prev:}/{:target:}.gtf", + output: + counts = "{:this:}/{target}.gene_counts.rds", + transcripts = "{:this:}/{target}.tx_counts.rds", + log: + "{:this:}/{target}.log" + threads: + 1 + conda: + "tximport" + script: + "tximport_salmon.R" diff --git a/src/ymp/rules/tximport_rsem.R b/src/ymp/rules/tximport_rsem.R new file mode 100644 index 00000000..ceae26d1 --- /dev/null +++ b/src/ymp/rules/tximport_rsem.R @@ -0,0 +1,86 @@ +#!/usr/bin/env Rscript + +#' We expect to be called from snakemake script directive, so having +#' `snakemake` object with `snakemake@input` etc containing paths. + +#' We also need to redirect our output to log ourselves... + +R.home() + +logfile <- file(snakemake@log[[1]], open="wt") +sink(logfile) +sink(logfile, type="message") + +R.home() + +message("Importing RSEM gene and isoform count files into R using tximport") + + +message("1. ----------- Loading packages ----------") +library(tximport) +library(readr) +library(GenomicFeatures) +library(rtracklayer) +library(SummarizedExperiment) + +message("2. ----------- Loading GTF ----------") +message("Filename = ", snakemake@input$gtf) +gr <- rtracklayer::import.gff(snakemake@input$gtf) + +message("3. ----------- Loading per transcript count files ----------") +samples <- gsub(".genes.results", "", basename(snakemake@input$counts)) +tx_files <- setNames(snakemake@input$transcripts, samples) +txi <- tximport(tx_files, type = "rsem", txIn = TRUE, txOut = TRUE) + +message("4. ----------- Assembling SummarizedExperiment w/ rowData ----------") +txmeta <- mcols(gr)[mcols(gr)$type=="transcript", ] # only transcript rows +txmeta <- subset(txmeta, select = -type) +rownames(txmeta) <- txmeta$transcript_id # set names +txmeta <- txmeta[rownames(txi$counts), ] # only rows for which we have counts +txmeta <- Filter(function(x)!all(is.na(x)), txmeta) # remove all-NA columns + +se <- SummarizedExperiment( + assays = txi[c("counts", "abundance", "length")], + rowData = txmeta, + metadata = list( + countsFromAbundance = txi$countsFromAbundance + ) +) + +message("5. ----------- Writing RDS with transcript se object ----------") +message("Filename = ", snakemake@output$transcripts) +saveRDS(se, snakemake@output$transcripts) + +message("6. ----------- Loading per gene count files ----------") +gene_files <- setNames(snakemake@input$counts, samples) +txi_genes <- tximport(gene_files, type = "rsem", txIn = FALSE, txOut = FALSE) + +## Something inside of tximport seems to reset the log sink on the +## second call. Resetting it here: +sink(logfile) +sink(logfile, type="message") + +message("7. ----------- Assembling SummarizedExperiment w/ rowData ----------") +gmeta <- mcols(gr)[mcols(gr)$type=="gene", ] # only transcript rows +gmeta <- subset(gmeta, select = -type) +rownames(gmeta) <- gmeta$gene_id # set names +gmeta <- gmeta[rownames(txi_genes$counts), ] # only rows for which we have counts +gmeta <- Filter(function(x)!all(is.na(x)), gmeta) # remove all-NA columns + +gse <- SummarizedExperiment( + assays = txi_genes[c("counts", "abundance", "length")], + rowData = gmeta, + metadata = list( + countsFromAbundance = txi_genes$countsFromAbundance + ) +) + +message("Rounding counts to keep DESeq2 happy") +assay(gse) <- round(assay(gse)) +mode(assay(gse)) <- "integer" + +message("8. 
----------- Writing RDS with gene se object ----------")
+message("Filename = ", snakemake@output$counts)
+saveRDS(gse, snakemake@output$counts)
+
+message("done")
diff --git a/src/ymp/rules/tximport_salmon.R b/src/ymp/rules/tximport_salmon.R
new file mode 100644
index 00000000..1ffc6c1b
--- /dev/null
+++ b/src/ymp/rules/tximport_salmon.R
@@ -0,0 +1,86 @@
+#!/usr/bin/env Rscript
+
+#' We expect to be called from snakemake script directive, so having
+#' `snakemake` object with `snakemake@input` etc containing paths.
+
+#' We also need to redirect our output to log ourselves...
+R.home()
+logfile <- file(snakemake@log[[1]], open="wt")
+sink(logfile)
+sink(logfile, type="message")
+
+R.home()
+
+message("Importing Salmon data into R using tximport")
+
+message("1. ----------- Loading packages ----------")
+library(tximport)
+library(readr)
+library(GenomicFeatures)
+library(rtracklayer)
+library(SummarizedExperiment)
+
+message("2. ----------- Loading GTF ----------")
+message("Filename = ", snakemake@input$gtf)
+gr <- rtracklayer::import.gff(snakemake@input$gtf)
+
+message("3. ----------- Loading quant.sf files ----------")
+files <- snakemake@input$counts
+names(files) <- gsub(".salmon", "", basename(dirname(snakemake@input$counts)))
+txi <- tximport(files, type="salmon", txOut=TRUE)
+
+message("4. ----------- Assembling SummarizedExperiment w/ rowData ----------")
+txmeta <- mcols(gr)[mcols(gr)$type=="transcript", ]  # only transcript rows
+txmeta <- subset(txmeta, select = -type)
+rownames(txmeta) <- txmeta$transcript_id  # set names
+txmeta <- txmeta[rownames(txi$counts), ]  # only rows for which we have counts
+txmeta <- Filter(function(x)!all(is.na(x)), txmeta)  # remove all-NA columns
+
+se <- SummarizedExperiment(
+    assays = txi[c("counts", "abundance", "length")],
+    rowData = txmeta,
+    metadata = list(
+        countsFromAbundance = txi$countsFromAbundance  # should be no
+    )
+)
+
+message("5. ----------- Writing RDS with transcript se object ----------")
+message("Filename = ", snakemake@output$transcripts)
+saveRDS(se, snakemake@output$transcripts)
+
+
+message("6. ----------- Summarizing transcript counts to gene counts ----------")
+txi_genes <- summarizeToGene(txi, txmeta[,c("transcript_id", "gene_id")])
+
+message("7. ----------- Assembling SummarizedExperiment w/ rowData ----------")
+gmeta <- mcols(gr)[mcols(gr)$type=="gene", ]  # only gene rows
+gmeta <- subset(gmeta, select = -type)
+rownames(gmeta) <- gmeta$gene_id  # set names
+gmeta <- gmeta[rownames(txi_genes$counts), ]  # only rows for which we have counts
+gmeta <- Filter(function(x)!all(is.na(x)), gmeta)  # remove all-NA columns
+
+gse <- SummarizedExperiment(
+    assays = txi_genes[c("counts", "abundance", "length")],
+    rowData = gmeta,
+    metadata = list(
+        countsFromAbundance = txi_genes$countsFromAbundance  # should be no
+    )
+)
+
+message("Rounding counts to keep DESeq2 happy")
+assay(gse) <- round(assay(gse))
+mode(assay(gse)) <- "integer"
+
+## Rename length assay IFF we are having counts, not TPM
+## (not sure if otherwise is possible with Salmon, but since this is
+## checked inside of deseq/tximeta, let's do check here as well).
+if (txi_genes$countsFromAbundance == "no") {
+    message("Renaming length assay to avgTxLength so DESeq2 will use for size estimation")
+    assayNames(gse)[assayNames(gse) == "length"] <- "avgTxLength"
+}
+
+message("8. 
----------- Writing RDS with gene se object ----------")
+message("Filename = ", snakemake@output$counts)
+saveRDS(gse, snakemake@output$counts)
+
+message("done")

From bac66acbc4b5ea958a4764e1e74ca9dd92b89c0c Mon Sep 17 00:00:00 2001
From: Elmar Pruesse 
Date: Fri, 10 Sep 2021 17:21:35 -0600
Subject: [PATCH 029/133] Fix multiqc path filter should be list not string

---
 src/ymp/rules/bowtie2.rules | 2 +-
 src/ymp/rules/fastqc.rules  | 2 +-
 src/ymp/rules/hisat2.rules  | 2 +-
 src/ymp/rules/multiqc.rules | 4 +---
 4 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/ymp/rules/bowtie2.rules b/src/ymp/rules/bowtie2.rules
index e443c371..ba414465 100644
--- a/src/ymp/rules/bowtie2.rules
+++ b/src/ymp/rules/bowtie2.rules
@@ -139,7 +139,7 @@ with Stage("map_bowtie2") as S:
                 "module_order": [{
                     "bowtie2": {
                         "name": f"Bowtie2 ({params.this})",
-                        "path_filters": f"{params.this}/*.log"
+                        "path_filters": [f"{params.this}/*.log"]
                     }
                 }]
             }
diff --git a/src/ymp/rules/fastqc.rules b/src/ymp/rules/fastqc.rules
index d7923007..84fb5c6f 100644
--- a/src/ymp/rules/fastqc.rules
+++ b/src/ymp/rules/fastqc.rules
@@ -56,7 +56,7 @@ with Stage("qc_fastqc") as S:
                 "module_order": [{
                     "fastqc": {
                         "name": f"FastQC ({params.this})",
-                        "path_filters": f"{params.this}/*_fastqc.zip"
+                        "path_filters": [f"{params.this}/*_fastqc.zip"]
                     }
                 }]
             }
diff --git a/src/ymp/rules/hisat2.rules b/src/ymp/rules/hisat2.rules
index 78a78d83..a7b1fe2b 100644
--- a/src/ymp/rules/hisat2.rules
+++ b/src/ymp/rules/hisat2.rules
@@ -62,7 +62,7 @@ with Stage("map_hisat2") as S:
                 "module_order": [{
                     "hisat2": {
                         "name": f"HISAT2 ({params.this})",
-                        "path_filters": f"{params.this}/*.stats"
+                        "path_filters": [f"{params.this}/*.stats"]
                     }
                 }]
             }
diff --git a/src/ymp/rules/multiqc.rules b/src/ymp/rules/multiqc.rules
index 86234efd..e7299609 100644
--- a/src/ymp/rules/multiqc.rules
+++ b/src/ymp/rules/multiqc.rules
@@ -12,7 +12,7 @@ with Stage("qc_multiqc") as S:
         input:
             conf = "{:all_prevs:}/multiqc_config.yaml"
         output:
-            conf = "{:this:}/merged_multiqc_config.yaml"
+            conf = temp("{:this:}/merged_multiqc_config.yaml")
         run:
             from ruamel.yaml import YAML
            yaml = YAML(typ="rt")
@@ -31,10 +31,8 @@ with Stage("qc_multiqc") as S:
                 "sp": sp,
                 "module_order": module_order,
             }
-            print("writing to ", output.conf)
            with open(output.conf, "w") as fd:
                 yaml.dump(conf, fd)
-            print("done")
 
     rule multiqc_report:
         """Assemble report on all FQ files in a directory"""

From b7733fbac962fa1f45234fb68a5b6cbe1225bcbf Mon Sep 17 00:00:00 2001
From: Elmar Pruesse 
Date: Fri, 10 Sep 2021 18:09:48 -0600
Subject: [PATCH 030/133] More multiqc

---
 src/ymp/rules/fastp.rules       |  2 +-
 src/ymp/rules/multiqc.rules     |  4 +++-
 src/ymp/rules/rsem.rules        | 27 +++++++++++++++++++++++++++
 src/ymp/rules/sambamba.rules    | 25 +++++++++++++++++++++++++
 src/ymp/rules/sickle.rules      | 25 +++++++++++++++++++++++++
 src/ymp/rules/star.rules        | 26 ++++++++++++++++++++++++++
 src/ymp/rules/trimmomatic.rules | 25 +++++++++++++++++++++++++
 7 files changed, 132 insertions(+), 2 deletions(-)

diff --git a/src/ymp/rules/fastp.rules b/src/ymp/rules/fastp.rules
index 402e5f41..672e5c41 100644
--- a/src/ymp/rules/fastp.rules
+++ b/src/ymp/rules/fastp.rules
@@ -63,7 +63,7 @@ with Stage("trim_fastp") as S:
                 "module_order": [{
                     "fastp": {
                         "name": f"FastP ({params.this})",
-                        "path_filters": f"{params.this}/*.fastp.json"
+                        "path_filters": [f"{params.this}/*.fastp.json"]
                     }
                 }]
             }
diff --git a/src/ymp/rules/multiqc.rules b/src/ymp/rules/multiqc.rules
index e7299609..be81afd5 100644
--- a/src/ymp/rules/multiqc.rules
+++ 
b/src/ymp/rules/multiqc.rules @@ -6,13 +6,14 @@ with Stage("qc_multiqc") as S: S.doc(""" Aggregate QC reports using MultiQC """) + localrules: multiqc_merge_configs rule multiqc_merge_configs: message: "Aggregating MultiQC configs for {:this:}" input: conf = "{:all_prevs:}/multiqc_config.yaml" output: - conf = temp("{:this:}/merged_multiqc_config.yaml") + conf = "{:this:}/merged_multiqc_config.yaml" run: from ruamel.yaml import YAML yaml = YAML(typ="rt") @@ -34,6 +35,7 @@ with Stage("qc_multiqc") as S: with open(output.conf, "w") as fd: yaml.dump(conf, fd) + localrules: multiqc_report rule multiqc_report: """Assemble report on all FQ files in a directory""" message: diff --git a/src/ymp/rules/rsem.rules b/src/ymp/rules/rsem.rules index 3b609ca0..e4678463 100644 --- a/src/ymp/rules/rsem.rules +++ b/src/ymp/rules/rsem.rules @@ -43,6 +43,9 @@ with Stage("quant_rsem") as S: output: "{:this:}/{target}.genes.results", "{:this:}/{target}.isoforms.results", + "{:this:}/{target}.stats/{target}.cnt", + "{:this:}/{target}.stats/{target}.model", + "{:this:}/{target}.stats/{target}.theta", log: "{:this:}/{target}.log", params: @@ -71,3 +74,27 @@ with Stage("quant_rsem") as S: " >{log} 2>&1 " + localrules: rsem_quant_multiqc_cfg + rule rsem_quant_multiqc_cfg: + message: + "{:name:}: Writing MultiQC config" + input: + "{:this:}/{:targets:}.genes.results" + output: + "{:this:}/multiqc_config.yaml" + params: + this = "{:this:}" + run: + from ruamel.yaml import YAML + yaml = YAML(typ="rt") + data = { + "run_modules": [ "rsem" ], + "module_order": [{ + "rsem": { + "name": f"RSEM ({params.this})", + "path_filters": [f"{params.this}/*.stats/*.cnt"] + } + }] + } + with open(output[0], "w") as out: + yaml.dump(data, out) diff --git a/src/ymp/rules/sambamba.rules b/src/ymp/rules/sambamba.rules index b52a3fd4..157bd118 100644 --- a/src/ymp/rules/sambamba.rules +++ b/src/ymp/rules/sambamba.rules @@ -66,3 +66,28 @@ with Stage("markdup_sambamba") as S: "sambamba index" " --nthreads={threads}" " {output.bam} {output.bai};" + + localrules: sambamba_markdup_multiqc_cfg + rule sambamba_markdup_multiqc_cfg: + message: + "{:name:}: Writing MultiQC config" + input: + "{:this:}/{:targets:}.sorted.bam.log" + output: + "{:this:}/multiqc_config.yaml" + params: + this = "{:this:}" + run: + from ruamel.yaml import YAML + yaml = YAML(typ="rt") + data = { + "run_modules": [ "sambamba" ], + "module_order": [{ + "sambamba": { + "name": f"Sambamba Markdup ({params.this})", + "path_filters": [f"{params.this}/*.log"] + } + }] + } + with open(output[0], "w") as out: + yaml.dump(data, out) diff --git a/src/ymp/rules/sickle.rules b/src/ymp/rules/sickle.rules index 3fd36262..880d930b 100644 --- a/src/ymp/rules/sickle.rules +++ b/src/ymp/rules/sickle.rules @@ -58,3 +58,28 @@ with Stage("trim_sickle") as S: touch("{:this:}/all_targets.stamp") input: "{:this:}/{:fq_names:}.fq.gz" + + localrules: sickle_multiqc_cfg + rule sickle_multiqc_cfg: + message: + "{:name:}: Writing MultiQC config" + input: + "{:this:}/{:targets:}.log" + output: + "{:this:}/multiqc_config.yaml" + params: + this = "{:this:}" + run: + from ruamel.yaml import YAML + yaml = YAML(typ="rt") + data = { + "run_modules": [ "sickle" ], + "module_order": [{ + "sickle": { + "name": f"Sickle ({params.this})", + "path_filters": [f"{params.this}/*.log"] + } + }] + } + with open(output[0], "w") as out: + yaml.dump(data, out) diff --git a/src/ymp/rules/star.rules b/src/ymp/rules/star.rules index d9f661b2..f6d8ebcc 100644 --- a/src/ymp/rules/star.rules +++ b/src/ymp/rules/star.rules @@ 
-51,6 +51,7 @@ with Stage("map_star") as S: bamtr = "{:this:}/{target}.tx.bam", log: std = "{:this:}/{target}.log", + final = "{:this:}/{target}.star.Log.final.out", params: outprefix = "{:this:}/{target}.star.", multimap_nmax = 10, @@ -81,3 +82,28 @@ with Stage("map_star") as S: mv {params.outprefix}Aligned.out.bam {output.bamgn} mv {params.outprefix}Aligned.toTranscriptome.out.bam {output.bamtr} """ + + localrules: star_map_multiqc_cfg + rule star_map_multiqc_cfg: + message: + "{:name:}: Writing MultiQC config" + input: + "{:this:}/{:targets:}.log" + output: + "{:this:}/multiqc_config.yaml" + params: + this = "{:this:}" + run: + from ruamel.yaml import YAML + yaml = YAML(typ="rt") + data = { + "run_modules": [ "star" ], + "module_order": [{ + "star": { + "name": f"STAR ({params.this})", + "path_filters": [f"{params.this}/*.star.Log.final.out"] + } + }] + } + with open(output[0], "w") as out: + yaml.dump(data, out) diff --git a/src/ymp/rules/trimmomatic.rules b/src/ymp/rules/trimmomatic.rules index 9dfe4d77..b01f4205 100644 --- a/src/ymp/rules/trimmomatic.rules +++ b/src/ymp/rules/trimmomatic.rules @@ -69,3 +69,28 @@ with Stage("trim_trimmomatic") as S: touch("{:this:}/all_targets.stamp") input: "{:this:}/{:fq_names:}.fq.gz" + + localrules: trimmomatic_adapter_multiqc_cfg + rule trimmomatic_adapter_multiqc_cfg: + message: + "{:name:}: Writing MultiQC config" + input: + "{:this:}/{:targets:}.log" + output: + "{:this:}/multiqc_config.yaml" + params: + this = "{:this:}" + run: + from ruamel.yaml import YAML + yaml = YAML(typ="rt") + data = { + "run_modules": [ "trimmomatic" ], + "module_order": [{ + "trimmomatic": { + "name": f"Trimmomatic ({params.this})", + "path_filters": [f"{params.this}/*.log"] + } + }] + } + with open(output[0], "w") as out: + yaml.dump(data, out) From 67ab5515c574bc0ccaf076eee64297c2862cd9b9 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Fri, 10 Sep 2021 19:58:10 -0600 Subject: [PATCH 031/133] More multiqc (2) --- src/ymp/rules/multiqc.rules | 4 +++ src/ymp/rules/salmon.rules | 58 +++++++++++++++++++++++++++++++++++++ src/ymp/rules/star.rules | 3 +- 3 files changed, 64 insertions(+), 1 deletion(-) diff --git a/src/ymp/rules/multiqc.rules b/src/ymp/rules/multiqc.rules index be81afd5..27371ff8 100644 --- a/src/ymp/rules/multiqc.rules +++ b/src/ymp/rules/multiqc.rules @@ -20,17 +20,21 @@ with Stage("qc_multiqc") as S: run_modules = [] sp = {} module_order = [] + sample_names_replace = {} for conffile in input.conf: with open(conffile, "r") as fd: data = yaml.load(fd) run_modules.extend(data.get("run_modules", [])) sp.update(data.get("sp", {})) ## FIXME check conflicts! 
module_order.extend(data.get("module_order", [])) + sample_names_replace.update(data.get("sample_names_replace", {})) run_modules = list(set(run_modules)) conf = { "run_modules": run_modules, "sp": sp, "module_order": module_order, + "sample_names_replace": sample_names_replace, + "sample_names_replace_regex": True, } with open(output.conf, "w") as fd: yaml.dump(conf, fd) diff --git a/src/ymp/rules/salmon.rules b/src/ymp/rules/salmon.rules index ab7ef252..6624a392 100644 --- a/src/ymp/rules/salmon.rules +++ b/src/ymp/rules/salmon.rules @@ -64,6 +64,35 @@ with Stage("quant_salmon_sa") as S: " --mates1 {input.fq[0]}" " --mates2 {input.fq[1]}" " --output $(dirname {output.quant})" + + localrules: salmon_sa_quant_multiqc_cfg + rule salmon_sa_quant_multiqc_cfg: + message: + "{:name:}: Writing MultiQC config" + input: + "{:this:}/{:targets:}.log" + output: + "{:this:}/multiqc_config.yaml" + params: + this = "{:this:}" + run: + from ruamel.yaml import YAML + yaml = YAML(typ="rt") + data = { + "run_modules": [ "salmon" ], + "module_order": [{ + "salmon": { + "name": f"Salmon SA ({params.this})", + "path_filters": [ + f"{params.this}/*.salmon/aux_info/meta_info.json", + f"{params.this}/*.salmon/libParams/flenDist.txt", + ] + } + }], + "sample_names_replace": {"(.*)\\.salmon": "\\1"}, + } + with open(output[0], "w") as out: + yaml.dump(data, out) with Stage("quant_salmon") as S: @@ -102,3 +131,32 @@ with Stage("quant_salmon") as S: " --output $(dirname {output.quant})" " --minAssignedFrags 0" " {params.gencode}" + + localrules: salmon_quant_multiqc_cfg + rule salmon_quant_multiqc_cfg: + message: + "{:name:}: Writing MultiQC config" + input: + "{:this:}/{:targets:}.log" + output: + "{:this:}/multiqc_config.yaml" + params: + this = "{:this:}" + run: + from ruamel.yaml import YAML + yaml = YAML(typ="rt") + data = { + "run_modules": [ "salmon" ], + "module_order": [{ + "salmon": { + "name": f"Salmon ({params.this})", + "path_filters": [ + f"{params.this}/*.salmon/aux_info/meta_info.json", + f"{params.this}/*.salmon/libParams/flenDist.txt", + ], + } + }], + "sample_names_replace": {"(.*)\\.salmon": "\\1"}, + } + with open(output[0], "w") as out: + yaml.dump(data, out) diff --git a/src/ymp/rules/star.rules b/src/ymp/rules/star.rules index f6d8ebcc..0d83b491 100644 --- a/src/ymp/rules/star.rules +++ b/src/ymp/rules/star.rules @@ -103,7 +103,8 @@ with Stage("map_star") as S: "name": f"STAR ({params.this})", "path_filters": [f"{params.this}/*.star.Log.final.out"] } - }] + }], + "sample_names_replace": {"(.*)\\\\.star": "\\\\1"}, } with open(output[0], "w") as out: yaml.dump(data, out) From 0d92fca83b26baf814cfa797f7cafaac6b9c0182 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Fri, 10 Sep 2021 19:59:15 -0600 Subject: [PATCH 032/133] Bump required multiqc version --- src/ymp/rules/multiqc.rules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ymp/rules/multiqc.rules b/src/ymp/rules/multiqc.rules index 27371ff8..779fd701 100644 --- a/src/ymp/rules/multiqc.rules +++ b/src/ymp/rules/multiqc.rules @@ -1,5 +1,5 @@ Env(name="multiqc", base="bioconda", packages=[ - "multiqc >=1.4" + "multiqc >=1.11" ]) with Stage("qc_multiqc") as S: From a3e79de27833959853691af5f73bac7179931ef1 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Mon, 13 Sep 2021 18:34:34 -0600 Subject: [PATCH 033/133] Improve PathResource missing path exception (fixes #169) --- src/ymp/stage/reference.py | 3 ++- tests/test_reference.py | 16 +++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git 
a/src/ymp/stage/reference.py b/src/ymp/stage/reference.py index 89ff19b4..dcb8ad6e 100644 --- a/src/ymp/stage/reference.py +++ b/src/ymp/stage/reference.py @@ -233,7 +233,8 @@ def __init__(self, *args): filenames = os.listdir(self.local_path) except FileNotFoundError: raise YmpConfigError( - self.cfg, "Directory required by path resource inaccessible" + self.cfg, + f"Directory '{self.local_path}' required by path resource inaccessible", ) self.dir = self.local_path.rstrip("/") diff --git a/tests/test_reference.py b/tests/test_reference.py index 56a07bf0..c675ddcc 100644 --- a/tests/test_reference.py +++ b/tests/test_reference.py @@ -202,6 +202,7 @@ def test_regexlocaldir_directory_missing(saved_cwd, check_show): ), ) assert excinfo.match("Directory") + assert excinfo.match("somewhere") check_show(excinfo.value, "line 2") @@ -337,17 +338,18 @@ def test_with_ids(demo_dir): assert groups == ["ref_test"] ids = ref.get_ids(stack, groups) assert set(ids) == set(["one", "two"]) - assert ref.outputs == {"/{sample}.fasta.gz": ".ref_test"} + assert ref.outputs == {"/{sample}.fasta.gz": ""} def test_duplicate_file(saved_cwd, check_show): + ref = Reference( + "test", + make_cfg( + "- type: fasta", " url: somewhere", "- type: fasta", " url: somewhere" + ), + ) with pytest.raises(YmpConfigError) as excinfo: - ref = Reference( - "test", - make_cfg( - "- type: fasta", " url: somewhere", "- type: fasta", " url: somewhere" - ), - ) + ref.files assert excinfo.match("Duplicate") check_show(excinfo.value, "line 4") From 695208cdfd9ed52f6eb1faa01b6b2f9ba43ffd52 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Mon, 13 Sep 2021 19:06:16 -0600 Subject: [PATCH 034/133] Use a heavy checksum so sonarcloud doesn't complain --- src/ymp/stage/reference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ymp/stage/reference.py b/src/ymp/stage/reference.py index dcb8ad6e..aaf034bc 100644 --- a/src/ymp/stage/reference.py +++ b/src/ymp/stage/reference.py @@ -1,7 +1,7 @@ import logging import os import re -from hashlib import sha1 +from hashlib import sha512 from typing import Dict, Optional, Union, Set, List from collections.abc import Mapping, Sequence @@ -145,7 +145,7 @@ def __init__(self, *args): super().__init__(*args) # Generate hash from tarfile name - self.fnhash = sha1(self.local_path.encode("utf-8")).hexdigest()[:8] + self.fnhash = sha512(self.local_path.encode("utf-8")).hexdigest()[:8] # Compute output prefix self.prefix = os.path.join( self.reference.canonical_location(), "_unpacked_" + self.fnhash From 35a0f45b0f9b24e15fd867be91058c6aef644547 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Mon, 13 Sep 2021 19:16:46 -0600 Subject: [PATCH 035/133] More placate sonarcloud --- tests/test_reference.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/tests/test_reference.py b/tests/test_reference.py index c675ddcc..b729ea5a 100644 --- a/tests/test_reference.py +++ b/tests/test_reference.py @@ -9,6 +9,8 @@ from ymp.stage.reference import Resource from ymp.exceptions import YmpConfigError +references_test = "references/test" # place sonarcloud + def make_cfg(text, *args): fname = "test.yml" @@ -53,7 +55,7 @@ def test_empty_unknown_type(saved_cwd, check_show): def test_fasta_no_url(saved_cwd, check_show): with pytest.raises(YmpConfigError) as excinfo: - ref = Reference("test", make_cfg("- type: fasta")) + Reference("test", make_cfg("- type: fasta")) assert excinfo.match("fasta") assert excinfo.match("must have 'url'") 
check_show(excinfo.value, "line 2") @@ -67,7 +69,6 @@ def test_duplicate_resource(saved_cwd): from ymp.stage.reference import FileResource with pytest.raises(ValueError) as excinfo: - class duplicate(FileResource): pass @@ -77,14 +78,14 @@ class duplicate(FileResource): def test_resource_not_mapping(saved_cwd, check_show): with pytest.raises(YmpConfigError) as excinfo: - ref = Reference("test", make_cfg("- []")) + Reference("test", make_cfg("- []")) assert excinfo.match("mapping") check_show(excinfo.value, "line 2") def test_resource_not_mapping_third(saved_cwd, check_show): with pytest.raises(YmpConfigError) as excinfo: - ref = Reference( + Reference( "test", make_cfg( "- type: fasta", @@ -107,7 +108,7 @@ def test_get_id_name(saved_cwd): def test_file_resource_no_extension(saved_cwd, check_show): with pytest.raises(YmpConfigError) as excinfo: - ref = Reference("test", make_cfg("- type: file", " url: somewhere")) + Reference("test", make_cfg("- type: file", " url: somewhere")) assert excinfo.match("must have") assert excinfo.match("extension") check_show(excinfo.value, "line 2") @@ -121,11 +122,12 @@ def test_file_resource(saved_cwd): def test_named_unpacked_resource(saved_cwd): ref = Reference("test", make_cfg("- type: gtf", " url: somewhere")) + assert ref.files == {"ALL.gtf": "somewhere"} def test_archive_resource_no_url(saved_cwd, check_show): with pytest.raises(YmpConfigError) as excinfo: - ref = Reference("test", make_cfg(" - type: archive")) + Reference("test", make_cfg(" - type: archive")) assert excinfo.match("must have") assert excinfo.match("url") check_show(excinfo.value, "line 2") @@ -133,7 +135,7 @@ def test_archive_resource_no_url(saved_cwd, check_show): def test_archive_resource_no_files(saved_cwd, check_show): with pytest.raises(YmpConfigError) as excinfo: - ref = Reference("test", make_cfg(" - type: archive", " url: somwhere")) + Reference("test", make_cfg(" - type: archive", " url: somwhere")) assert excinfo.match("must have") assert excinfo.match("files") check_show(excinfo.value, "line 2") @@ -141,7 +143,7 @@ def test_archive_resource_no_files(saved_cwd, check_show): def test_archive_resource_files_not_mapping(saved_cwd, check_show): with pytest.raises(YmpConfigError) as excinfo: - ref = Reference( + Reference( "test", make_cfg(" - type: archive", " url: somwhere", " files:") ) assert excinfo.match("must be mapping") @@ -159,6 +161,9 @@ def test_archive_resource_no_url(saved_cwd, check_show): " ALL.bam: some.bam", ), ) + assert list(ref.files.keys()) == ["ALL.bam"] + assert ref.files["ALL.bam"].endswith("/some.bam") + assert ref.files["ALL.bam"].startswith(references_test) def test_localdir_resource_no_files(saved_cwd, check_show): @@ -296,7 +301,7 @@ def test_get_path(demo_dir): " url: somewhere", ), ) - assert ref.get_path(None) == "references/test" + assert ref.get_path(None) == references_test ## FIXME: Do we need the below feature at all?9 assert str(ref) == "references/test/ALL" @@ -363,12 +368,12 @@ def test_get_file(saved_cwd): def test_add_rule(saved_cwd): ref = Reference("test", make_cfg("- type: fasta", " url: somewhere.fasta.gz")) - assert ref.prev() == "references/test" + assert ref.prev() == references_test assert ref.get_file("ALL.sometype").startswith("YMP_FILE_NOT_FOUND") kwargs = {"item": "{:this:}/{:target:}.sometype"} - assert ref.this(kwargs=kwargs) == "references/test" + assert ref.this(kwargs=kwargs) == references_test assert ref.get_file("ALL.sometype").startswith("YMP_FILE_NOT_FOUND") kwargs["field"] = "output" ref.set_active(ref) - assert 
ref.this(kwargs=kwargs) == "references/test" - assert ref.get_file("{sample}.sometype") == "references/test/{sample}.sometype" + assert ref.this(kwargs=kwargs) == references_test + assert ref.get_file("{sample}.sometype") == references_test + "/{sample}.sometype" From dedd92f176cdccbd4ce83cc6f1e41149c164e72e Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Mon, 13 Sep 2021 19:50:44 -0600 Subject: [PATCH 036/133] Fix some tests --- tests/test_cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 9de8b058..c17597b3 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -296,7 +296,7 @@ def test_env_run(invoker, demo_dir, mock_conda, mock_downloader, capfd): res = invoker.call("env", "run", "bbmap", "true") assert res.exit_code == 0 cap = capfd.readouterr() - assert "bin/activate: No such file " in cap.err + assert "No such file or directory" in cap.err @pytest.mark.parametrize( @@ -317,7 +317,7 @@ def test_env_run(invoker, demo_dir, mock_conda, mock_downloader, capfd): }], ["ymp make toy.assemble_megahit.", -1, { "toy.assemble_megahit.trim_", - "toy.assemble_megahit.map_" + "toy.assemble_megahit.ref_" }], ["ymp make toy.assemble_megahit.map_", -1, { "toy.assemble_megahit.map_bbmap", From 6b4c5c0b991776a27840782a5df0c39a1ff152a7 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Mon, 13 Sep 2021 19:53:02 -0600 Subject: [PATCH 037/133] Fix unreachable code --- src/ymp/stage/reference.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ymp/stage/reference.py b/src/ymp/stage/reference.py index aaf034bc..887a794e 100644 --- a/src/ymp/stage/reference.py +++ b/src/ymp/stage/reference.py @@ -183,7 +183,7 @@ def generate_rules(self, unpack_archive=None, **kwargs): - {} """ docstr = "\n".join([docstr] + [item_tpl.format(fn) for fn in self.files]) - return make_rule( + yield make_rule( name=f"unpack_{self.reference.name}_{self.fnhash}", docstring=docstr, lineno=0, @@ -193,7 +193,6 @@ def generate_rules(self, unpack_archive=None, **kwargs): output=([], {"files": list(self.files.values())}), params=([], {"strip": self.strip, "prefix": self.prefix}), ) - yield self.archive.make_unpack_rule(kwargs["unpack_archive"]) class LocalDirResource(UrlResource): From ce8d320989924592423e0f45fb6b055ba3e9d861 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 30 Sep 2021 16:33:47 -0600 Subject: [PATCH 038/133] trim_fastp: Mark output as temp --- src/ymp/rules/fastp.rules | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ymp/rules/fastp.rules b/src/ymp/rules/fastp.rules index 672e5c41..b6049006 100644 --- a/src/ymp/rules/fastp.rules +++ b/src/ymp/rules/fastp.rules @@ -7,8 +7,8 @@ with Stage("trim_fastp") as S: >>>ymp make toy.trim_fastp """) - S.add_param("L", typ="int", name="length", default=20) S.add_param("Q", typ="int", name="qual", default=20) + S.add_param("L", typ="int", name="length", default=20) S.add_param("O", typ="flag", name="overrepresentcheck", value="--overrepresentation_analysis") S.add_param("C", typ="flag", name="correction", value="--correction") @@ -18,7 +18,8 @@ with Stage("trim_fastp") as S: input: fq = "{:prev:}/{:target:}.{:pairnames:}.fq.gz", output: - fq = "{:this:}/{target}.{:pairnames:}.fq.gz", + fq = [temp("{:this:}/{target}.{:pairnames[0]:}.fq.gz"), + temp("{:this:}/{target}.{:pairnames[1]:}.fq.gz")], json = "{:this:}/{target}.fastp.json" log: "{:this:}/{target}.log", From 514beaf3bdc3064312063619e81eb6ad73b06b39 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: 
Thu, 30 Sep 2021 16:34:59 -0600 Subject: [PATCH 039/133] map_star: Mark output as temp --- src/ymp/rules/star.rules | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ymp/rules/star.rules b/src/ymp/rules/star.rules index 0d83b491..abb0a0d3 100644 --- a/src/ymp/rules/star.rules +++ b/src/ymp/rules/star.rules @@ -13,7 +13,7 @@ with Stage("index_star") as S: log: "{:this:}/{target}.log", threads: - 16 + 32 params: overhang = 100, resources: @@ -43,12 +43,14 @@ with Stage("map_star") as S: Map RNA-Seq reads with STAR """) rule star_map: + message: + "STAR: mapping {input.fq[0]} to {input.index}" input: index = directory("{:prev:}/{:target:}.staridx"), fq = "{:prev:}/{:target:}.{:pairnames:}.fq.gz" output: - bamgn = "{:this:}/{target}.bam", - bamtr = "{:this:}/{target}.tx.bam", + bamgn = temp("{:this:}/{target}.bam"), + bamtr = temp("{:this:}/{target}.tx.bam"), log: std = "{:this:}/{target}.log", final = "{:this:}/{target}.star.Log.final.out", @@ -58,7 +60,7 @@ with Stage("map_star") as S: quantmode = "TranscriptomeSAM", tmpdir = "{:dir.tmp:}/star/{:this:}/{target}" resources: - mem = "32g", + mem = "64g", threads: 32 conda: From 88801f80a0b8c08bf544682240f6da084ac467f1 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 30 Sep 2021 16:35:37 -0600 Subject: [PATCH 040/133] quant_salmon: Add mem requirement --- src/ymp/rules/salmon.rules | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ymp/rules/salmon.rules b/src/ymp/rules/salmon.rules index 6624a392..8075f36c 100644 --- a/src/ymp/rules/salmon.rules +++ b/src/ymp/rules/salmon.rules @@ -51,7 +51,7 @@ with Stage("quant_salmon_sa") as S: conda: "salmon" threads: - 32 + 16 shell: "exec >{log} 2>&1;" "salmon quant" @@ -114,6 +114,8 @@ with Stage("quant_salmon") as S: "benchmarks/{:name:}/{:this:}/{target}.txt", log: "{:this:}/{target}.log", + resources: + mem = "16G", conda: "salmon" threads: From 17873713457d6879ac53fd064523008856c01413 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 30 Sep 2021 16:36:16 -0600 Subject: [PATCH 041/133] Restore path resource behavior w/o symlinks This isn't ideal, but needed for BLAST for now. 
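A rough sketch of what is being restored (a hedged illustration, not part of the change itself: the directory path and database name are hypothetical, and the rationale is inferred from the BLAST note above): with a path-type resource, the reference directory again points at the configured local directory in place, rather than at per-file symlinks under references/<name>/.
    # hypothetical ymp.yml resource: {type: path, url: /data/blast/nt, match: [...]}
    # after the one-line change below, the files are used where they are:
    #     self.reference.dir = self.local_path   # e.g. "/data/blast/nt"
    # presumably required because BLAST addresses its multi-file databases
    # by a shared prefix (nt.nin, nt.nsq, ... side by side), a layout that a
    # directory of per-file symlinks does not reproduce faithfully.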
--- src/ymp/stage/reference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ymp/stage/reference.py b/src/ymp/stage/reference.py index 887a794e..0b660b58 100644 --- a/src/ymp/stage/reference.py +++ b/src/ymp/stage/reference.py @@ -227,7 +227,7 @@ def __init__(self, *args): matchlist = self.cfg.get("match") if not isinstance(matchlist, Sequence) or isinstance(matchlist, str): raise YmpConfigError(self.cfg, "Path 'match' must be list", key="match") - + self.reference.dir = self.local_path try: filenames = os.listdir(self.local_path) except FileNotFoundError: From f8d64cf47fdd3551a1578287d7bd1362a99714c6 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 30 Sep 2021 16:37:28 -0600 Subject: [PATCH 042/133] quant_rsem: use local temp folder --- src/ymp/rules/rsem.rules | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/ymp/rules/rsem.rules b/src/ymp/rules/rsem.rules index e4678463..142e3569 100644 --- a/src/ymp/rules/rsem.rules +++ b/src/ymp/rules/rsem.rules @@ -43,9 +43,11 @@ with Stage("quant_rsem") as S: output: "{:this:}/{target}.genes.results", "{:this:}/{target}.isoforms.results", - "{:this:}/{target}.stats/{target}.cnt", - "{:this:}/{target}.stats/{target}.model", - "{:this:}/{target}.stats/{target}.theta", + "{:this:}/{target}.stat/{target}.cnt", + "{:this:}/{target}.stat/{target}.model", + "{:this:}/{target}.stat/{target}.theta", + benchmark: + "benchmarks/{:name:}/{:this:}/{target}.txt", log: "{:this:}/{target}.log", params: @@ -55,7 +57,7 @@ with Stage("quant_rsem") as S: resources: mem = "16G", threads: - 32 + 8 conda: "rsem" shell: @@ -68,6 +70,7 @@ with Stage("quant_rsem") as S: " --ci-memory $(({resources.mem_mb} / 16 * 10))" " --forward-prob {params.forward_prob}" " --paired-end" + " --temporary-folder {resources.tmpdir}/rsem.{wildcards.target}.$$/" " {input.bam}" " {params.index}" " {params.outprefix} " From 03d33d30f020d0ace4c03e34727aee41f65b4115 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 30 Sep 2021 16:38:01 -0600 Subject: [PATCH 043/133] qc_multiqc: generate data folder --- src/ymp/rules/multiqc.rules | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ymp/rules/multiqc.rules b/src/ymp/rules/multiqc.rules index 779fd701..8068be95 100644 --- a/src/ymp/rules/multiqc.rules +++ b/src/ymp/rules/multiqc.rules @@ -49,6 +49,7 @@ with Stage("qc_multiqc") as S: parts = "{:all_prevs:}/multiqc_config.yaml" output: report = "{:this:}/multiqc_report.html", + data = directory("{:this:}/multiqc_report_data"), stamp = touch("{:this:}/all_targets.stamp") params: dirs = lambda wc, input: [os.path.dirname(p) for p in input.parts] From 977610d18db7b156c9cd3fcfe193df3c3eccca7c Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Mon, 11 Oct 2021 20:29:59 -0600 Subject: [PATCH 044/133] slurm status cmd fixes --- src/ymp/cluster.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/ymp/cluster.py b/src/ymp/cluster.py index 972d3bc6..625fcb46 100644 --- a/src/ymp/cluster.py +++ b/src/ymp/cluster.py @@ -33,7 +33,8 @@ class Slurm(ClusterMS): 'RUNNING': 'running', # job has allocation and should be working 'RESIZING': 'running', # job is about to change size 'SUSPENDED': 'running', # job is paused - 'TIMEOUT': 'failed', # job reached time limit + 'TIMEOUT': 'failed', # job reached time limit + 'OUT_OF_MEMORY': 'failed', # job ran out of memory # questionable states: 'SPECIAL_EXIT': 'running', # job failed but flagged "special_exit" 'REVOKED': 'running' # job removed due to other cluster starting it @@ 
-63,7 +64,8 @@ def status(jobid): try: job = {key: line[header.index(key)] for key in ('JobID', 'State', 'ExitCode')} - job['snakestate'] = Slurm.states[job['State'].split(' ')[0]] + state = job['State'].split(' ')[0] + job['snakestate'] = Slurm.states.get(state, "failed") jobs.append(job) except ValueError as e: error(e) @@ -75,7 +77,7 @@ def status(jobid): elif 'failed' in snakestates: print('failed') else: # job doesn't exist... assuming success - print('success') + print('running') sys.exit(0) From 16c295c2deb208c0c119f45025f5584e7bdad16b Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Mon, 11 Oct 2021 20:30:26 -0600 Subject: [PATCH 045/133] Use system temp for star --- src/ymp/rules/star.rules | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/ymp/rules/star.rules b/src/ymp/rules/star.rules index abb0a0d3..aa1e27f6 100644 --- a/src/ymp/rules/star.rules +++ b/src/ymp/rules/star.rules @@ -58,7 +58,6 @@ with Stage("map_star") as S: outprefix = "{:this:}/{target}.star.", multimap_nmax = 10, quantmode = "TranscriptomeSAM", - tmpdir = "{:dir.tmp:}/star/{:this:}/{target}" resources: mem = "64g", threads: @@ -66,7 +65,6 @@ with Stage("map_star") as S: conda: "star" shell: """ - mkdir -p {params.tmpdir}; rmdir {params.tmpdir}; STAR \ --genomeDir {input.index} \ --genomeLoad NoSharedMemory \ @@ -77,7 +75,6 @@ with Stage("map_star") as S: --outSAMtype BAM Unsorted \ --outSAMunmapped Within \ --outFilterMultimapNmax {params.multimap_nmax} \ - --outTmpDir {params.tmpdir} \ --quantMode {params.quantmode} \ >{log.std} 2>&1 From 083b98c7f320ced11466327f8da5ab11a23a5dcf Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Mon, 11 Oct 2021 20:31:23 -0600 Subject: [PATCH 046/133] Add salmon index with decoy --- src/ymp/rules/salmon.rules | 41 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/ymp/rules/salmon.rules b/src/ymp/rules/salmon.rules index 8075f36c..839d9fbb 100644 --- a/src/ymp/rules/salmon.rules +++ b/src/ymp/rules/salmon.rules @@ -21,12 +21,53 @@ with Stage("index_salmon") as S: "salmon" threads: 32 + resources: + mem = "32G" shell: "exec >{log} 2>&1;" "salmon index" " --transcripts {input.txfa}" " --kmerLen {params.kmerlen}" " --index {output.index}" + " --threads {threads}" + " {params.gencode}" + + +with Stage("index_salmon_decoy") as S: + S.doc(""" + """) + S.add_param("G", typ="flag", name="gencode", value="--gencode") + + rule salmon_index_decoy: + message: "{:name:}: FIXME" + input: + txfa = "{:prev:}/{:target:}.tx.fasta.gz", + fa = "{:prev:}/{:target:}.fasta.gz", + output: + index = directory("{:this:}/{target}.salmon_index"), + benchmark: + "benchmarks/{:name:}/{:this:}/{target}.txt", + log: + "{:this:}/{target}.log", + params: + kmerlen = 31, + conda: + "salmon" + threads: + 64 + resources: + mem = "45G" + shadow: "shallow" + shell: + "exec >{log} 2>&1;" + "gzip -dc {input.fa} | sed -n '/>/ s/>\\([^ ]*\\).*/\\1/p' > decoy.txt;" + "cat {input.txfa} {input.fa} > gentrome.fa.gz;" + "salmon index" + " --transcripts gentrome.fa.gz" + " --kmerLen {params.kmerlen}" + " --index {output.index}" + " --threads {threads}" + " --decoys decoy.txt" " {params.gencode}" From d5e5cf35224eb0ed15de6ac4aacdcc9c59f1406e Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 27 Oct 2021 11:50:50 -0600 Subject: [PATCH 047/133] Update slurm cluster commands --- src/ymp/cluster.py | 80 ++++++++++++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 27 deletions(-) diff --git a/src/ymp/cluster.py b/src/ymp/cluster.py index 625fcb46..aeae1030 
100644 --- a/src/ymp/cluster.py +++ b/src/ymp/cluster.py @@ -7,6 +7,13 @@ import subprocess as sp import sys import re +import time +import logging + +log = logging.getLogger(__name__) # pylint: disable=invalid-name + +ATTEMPTS = 20 +DEFAULT_STATE = "running" def error(*args, **kwargs): print(*args, file=sys.stderr, **kwargs) @@ -44,7 +51,41 @@ class Slurm(ClusterMS): } @staticmethod - def status(jobid): + def run_sacct(jobid): + try: + res = sp.run(['sacct', '-pbj', jobid], stdout=sp.PIPE, check=True) + except sp.CalledProcessError as exc: + log.error("Failed to run sacct: %s", exc) + return {} + try: + lines = [line.strip().split("|") + for line in res.stdout.decode().splitlines()] + header = lines.pop(0) + state_idx = header.index("State") + jobid_idx = header.index("JobID") + return { + line[jobid_idx]: line[state_idx].split(" ", 1)[0] + for line in lines + } + except IndexError as exc: + log.error("Failed to parse sacct: %s", exc) + return {} + + @staticmethod + def run_scontrol(jobid): + try: + res = sp.run(['scontrol', '-o', 'show', 'job', jobid], stdout=sp.PIPE, check=True) + except sp.CalledProcessError as e: + log.error("Failed to run scontrol: %s", e) + return {} + try: + return {jobid: re.search(r"JobState=(\w+)", res.stdout.decode()).group(1)} + except (AttributeError, IndexError) as exc: + log.error("Failed to parse scontrol: %s", exc) + return {} + + @classmethod + def status(cls, jobid): """Print status of job @param jobid to stdout (as needed by snakemake) Anectotal benchmarking shows 200ms per invocation, half used @@ -52,32 +93,18 @@ def status(jobid): show job`` instead of ``sacct -pbs`` is faster by 80ms, but finished jobs are purged after unknown time window. """ - - header = None - res = sp.run(['sacct', '-pbj', jobid], stdout=sp.PIPE) - jobs = [] - for line in res.stdout.decode('ascii').splitlines(): - line = line.strip().split("|") - if header is None: - header = line - continue - try: - job = {key: line[header.index(key)] - for key in ('JobID', 'State', 'ExitCode')} - state = job['State'].split(' ')[0] - job['snakestate'] = Slurm.states.get(state, "failed") - jobs.append(job) - except ValueError as e: - error(e) - error(res.stdout) - sys.exit(1) - snakestates = [job['snakestate'] for job in jobs] - if 'running' in snakestates: - print('running') - elif 'failed' in snakestates: - print('failed') - else: # job doesn't exist...
assuming success - print('running') + for i in range(ATTEMPTS): + jobs = cls.run_sacct(jobid) + if jobid not in jobs: + jobs = cls.run_scontrol(jobid) + if jobid in jobs: + slurmstate = jobs[jobid] + snakestate = cls.states.get(slurmstate, DEFAULT_STATE) + print(snakestate) + sys.exit(0) + time.sleep(1) + log.error("Failed to obtain job info after %i attempts, claiming job %s", ATTEMPTS, DEFAULT_STATE) + print(DEFAULT_STATE) sys.exit(0) From 8aac53cef4bbde9c21329201ddc9dd3aa28d6105 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 27 Oct 2021 11:51:24 -0600 Subject: [PATCH 048/133] quant_rsem: use system temp --- src/ymp/rules/rsem.rules | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ymp/rules/rsem.rules b/src/ymp/rules/rsem.rules index 142e3569..59364ca7 100644 --- a/src/ymp/rules/rsem.rules +++ b/src/ymp/rules/rsem.rules @@ -54,13 +54,17 @@ with Stage("quant_rsem") as S: outprefix = "{:this:}/{target}", index = lambda wc, input: input.idx[0][:-len(RSEM_IDX[0])-1], forward_prob = 1.0, # P of having fwd read + this = "{:this:}", resources: mem = "16G", threads: 8 conda: "rsem" + shadow: + "shallow" shell: + "exec >{log} 2>&1;" "rsem-calculate-expression" " -p {threads}" " --bam " @@ -70,7 +74,7 @@ with Stage("quant_rsem") as S: " --ci-memory $(({resources.mem_mb} / 16 * 10))" " --forward-prob {params.forward_prob}" " --paired-end" - " --temporary-folder {resources.tmpdir}/rsem.{wildcards.target}.$$/" + " --temporary-folder {params.this}/{wildcards.target}.tmp" " {input.bam}" " {params.index}" " {params.outprefix} " From b7578a625aeee9d244d39f814a39b6d923b4ecf6 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 27 Oct 2021 11:51:53 -0600 Subject: [PATCH 049/133] quant_salmon: add resource requirements --- src/ymp/rules/salmon.rules | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ymp/rules/salmon.rules b/src/ymp/rules/salmon.rules index 839d9fbb..2e27e913 100644 --- a/src/ymp/rules/salmon.rules +++ b/src/ymp/rules/salmon.rules @@ -93,6 +93,8 @@ with Stage("quant_salmon_sa") as S: "salmon" threads: 16 + resources: + mem = "48G", shell: "exec >{log} 2>&1;" "salmon quant" @@ -155,12 +157,12 @@ with Stage("quant_salmon") as S: "benchmarks/{:name:}/{:this:}/{target}.txt", log: "{:this:}/{target}.log", - resources: - mem = "16G", conda: "salmon" threads: 32 + resources: + mem = "48G", shell: "exec >{log} 2>&1;" "salmon quant" From 54275c0febf9b2f666b4dec11884443f2335950b Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 27 Oct 2021 11:52:17 -0600 Subject: [PATCH 050/133] qc_multiqc: add resource requirements --- src/ymp/rules/multiqc.rules | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ymp/rules/multiqc.rules b/src/ymp/rules/multiqc.rules index 8068be95..9cea56cd 100644 --- a/src/ymp/rules/multiqc.rules +++ b/src/ymp/rules/multiqc.rules @@ -39,7 +39,6 @@ with Stage("qc_multiqc") as S: with open(output.conf, "w") as fd: yaml.dump(conf, fd) - localrules: multiqc_report rule multiqc_report: """Assemble report on all FQ files in a directory""" message: @@ -51,10 +50,14 @@ with Stage("qc_multiqc") as S: report = "{:this:}/multiqc_report.html", data = directory("{:this:}/multiqc_report_data"), stamp = touch("{:this:}/all_targets.stamp") + benchmark: + "benchmarks/{:name:}/{:this:}/all.txt", params: dirs = lambda wc, input: [os.path.dirname(p) for p in input.parts] log: "{:this:}/multiqc.log" + resources: + mem = "32g" threads: 1 conda: From 89f574ff8b5eeaa52e41e20a5aa27b842f451fad Mon Sep 17 00:00:00 2001 From: Elmar
Pruesse Date: Wed, 27 Oct 2021 11:54:13 -0600 Subject: [PATCH 051/133] Allow --pdb with submit --- src/ymp/cli/make.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ymp/cli/make.py b/src/ymp/cli/make.py index 82b65a5a..e63d7724 100644 --- a/src/ymp/cli/make.py +++ b/src/ymp/cli/make.py @@ -158,7 +158,7 @@ def decorated(*args, **kwargs): # pylint: disable=missing-docstring return decorated -def start_snakemake(kwargs): +def start_snakemake(kwargs, submit=False): """Execute Snakemake with given parameters and targets Fixes paths of kwargs['targets'] to be relative to YMP root. @@ -197,7 +197,7 @@ def start_snakemake(kwargs): # our debug flag sets a new excepthoook handler, to we use this # to decide whether snakemake should run in debug mode - if sys.excepthook.__module__ != "sys": + if sys.excepthook.__module__ != "sys" and not submit: log.warning( "Custom excepthook detected. Having Snakemake open stdin " "inside of run: blocks") @@ -398,6 +398,6 @@ def submit(profile, **kwargs): config.add_layer("", {param: cfg.expand(" ".join(cmd))}) - rval = start_snakemake(config) + rval = start_snakemake(config, submit=True) if not rval: sys.exit(1) From 7e11c81d770ad50a9f136bef8984a3d5ff763397 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 27 Oct 2021 11:55:51 -0600 Subject: [PATCH 052/133] Adjust to sourcecache usage in snakemake >= 6.9 --- src/ymp/__init__.py | 2 +- src/ymp/env.py | 4 ++-- src/ymp/snakemake.py | 10 +++++----- src/ymp/yaml.py | 12 ++++++++++-- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/ymp/__init__.py b/src/ymp/__init__.py index e3a811f6..b1c65775 100644 --- a/src/ymp/__init__.py +++ b/src/ymp/__init__.py @@ -49,7 +49,7 @@ #: List of versions this version of YMP has been verified to work with snakemake_versions = [ - '6.0.5', '6.1.0', '6.1.1', '6.2.1', '6.3.0' + '6.10.0', ] diff --git a/src/ymp/env.py b/src/ymp/env.py index eec978df..cfcf8b4e 100644 --- a/src/ymp/env.py +++ b/src/ymp/env.py @@ -429,10 +429,10 @@ def format(self, conda_env, *args, **kwargs): if not self._envs: self._envs = Env.get_registry() if conda_env in self._envs: - return self._envs[conda_env].file + return self._envs[conda_env].file.get_path_or_uri() for snakefile in reversed(self.workflow.included_stack): - basepath = op.dirname(snakefile) + basepath = op.dirname(snakefile.get_path_or_uri()) for _, relpath in sorted(self._search_paths.items()): searchpath = op.join(basepath, relpath) abspath = op.abspath(op.join(searchpath, conda_env)) diff --git a/src/ymp/snakemake.py b/src/ymp/snakemake.py index 6227ce81..81870496 100644 --- a/src/ymp/snakemake.py +++ b/src/ymp/snakemake.py @@ -16,6 +16,8 @@ from snakemake.io import Namedlist as _Namedlist # type: ignore from snakemake.rules import Rule # type: ignore from snakemake.workflow import RuleInfo, Workflow # type: ignore +from snakemake.sourcecache import infer_source_file # type: ignore + import ymp from ymp.common import ensure_list, flatten, is_container @@ -908,22 +910,20 @@ def __init__(self): super().__init__() self.ruleinfos = {} self.snakefiles = {} - self.linemaps = None def get_code_line(self, rule: Rule) -> str: """Returns the source line defining *rule*""" + cached_file = infer_source_file(rule.snakefile) # Load and cache Snakefile if rule.snakefile not in self.snakefiles: try: - with open(rule.snakefile, "r") as sf: + with self.workflow.sourcecache.open(cached_file, "r") as sf: self.snakefiles[rule.snakefile] = sf.readlines() except IOError: raise Exception("Can't parse ...") # 
`rule.lineno` refers to compiled code. Convert to source line number. - if self.linemaps is None: - self.linemaps = ExpandableWorkflow.global_workflow.linemaps - real_lineno = self.linemaps[rule.snakefile][rule.lineno] + real_lineno = self.workflow.linemaps[cached_file][rule.lineno] return self.snakefiles[rule.snakefile][real_lineno - 1] diff --git a/src/ymp/yaml.py b/src/ymp/yaml.py index 4f570f4d..e17ef8bc 100644 --- a/src/ymp/yaml.py +++ b/src/ymp/yaml.py @@ -36,10 +36,14 @@ def get_fileline(self): return self.obj return None, None + class Entry: def __init__(self, filename, yaml, index): self.filename = filename - self.lineno = yaml._yaml_line_col.data[index][0] + 1 + try: + self.lineno = yaml._yaml_line_col.data[index][0] + 1 + except AttributeError: + self.lineno = 0 class MixedTypeError(LayeredConfError): @@ -118,7 +122,11 @@ def get_fileline(self, key = None): if key: for fname, layer in self._maps: if key in layer: - return fname, layer._yaml_line_col.data[key][0] + 1 + try: + line = layer._yaml_line_col.data[key][0] + 1 + except AttributeError: + line = 0 + return fname, line return ";".join(self.get_files()), next(iter(self.get_linenos()), None) def to_yaml(self, show_source=False): From 0554fed081f16e071f369e790337c5985bc1f227 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 27 Oct 2021 11:56:15 -0600 Subject: [PATCH 053/133] Avoid setting default cores from CLI --- src/ymp/cli/make.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ymp/cli/make.py b/src/ymp/cli/make.py index e63d7724..e846c0da 100644 --- a/src/ymp/cli/make.py +++ b/src/ymp/cli/make.py @@ -337,7 +337,7 @@ def make(**kwargs): help="Limit the maximum number of cores used by jobs submitted at a time" ) @click.option( - "--cores", "-j", default=16, metavar="N", + "--cores", "-j", metavar="N", help="Number of local threads to use" ) @click.option( From 7c317affd93e1447addc9373ddacbd70874a0866 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Mon, 15 Nov 2021 16:35:57 -0700 Subject: [PATCH 054/133] Require .fastq or .fq for classifying col as file; fix error msg --- src/ymp/stage/project.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ymp/stage/project.py b/src/ymp/stage/project.py index 466ea71f..765c116f 100644 --- a/src/ymp/stage/project.py +++ b/src/ymp/stage/project.py @@ -285,7 +285,7 @@ class Project(ConfigStage): RE_REMOTE = re.compile(r"^(?:https?|ftp|sftp)://(?:.*)") RE_SRR = re.compile(r"^[SED]RR[0-9]+$") - RE_FILE = re.compile(r"^(?!http://).*(?:fq|fastq)(?:|\.gz)$") + RE_FILE = re.compile(r"^(?!http://).*\.(?:fq|fastq)(?:|\.gz)$") def __init__(self, name, cfg): super().__init__(name, cfg) @@ -486,7 +486,7 @@ def choose_fq_columns(self): (cols[0] == 'srr' and len(cols) > 1): log.error("Ambiguous data sources found in row %s.
" "You may need to constrain the columns allowed " - "to contain read data using '%'.", + "to contain read data using '%s'.", row[1], self.KEY_READCOLS) err = True elif len(cols) == 2: From a084f55f3b11e5cab1fb07866dbc81f42b829f56 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 23 Nov 2021 19:57:18 -0700 Subject: [PATCH 055/133] Use fasterq-dump instead of fastq-dump --- src/ymp/rules/00_import.rules | 49 +++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/src/ymp/rules/00_import.rules b/src/ymp/rules/00_import.rules index 786ce67b..c1fccda6 100644 --- a/src/ymp/rules/00_import.rules +++ b/src/ymp/rules/00_import.rules @@ -56,33 +56,38 @@ rule fastq_dump: output: "{:dir.scratch:}/SRR/{SRR}_1.fastq.gz", "{:dir.scratch:}/SRR/{SRR}_2.fastq.gz" + log: + "{:dir.scratch:}/SRR/{SRR}.log" wildcard_constraints: SRR = r"[EDS]RR[0-9]+", - params: - outdir = "{:ensuredir.scratch:}/SRR", - p = lambda wc, threads: int(threads/2+.5), resources: - mem = "200M", + mem = "10G", + threads: + 6 conda: "sratools.yml" - threads: - 4 - # FIXME - # the two cut processes use about 1 cpu each, fastqdump 1/4 and pgzip about 1 each. - # not ideal. not sure why cut needs so much time. - shell: """ - fastq-dump {wildcards.SRR} \ - --split-files \ - --readids \ - --dumpbase \ - --skip-technical \ - --clip \ - --read-filter pass \ - --stdout | \ - paste - - - - - - - - | \ - tee >(cut -f 1-4 | tr "\t" "\\n" | pigz -p {params.p} > {output[0]}) | \ - cut -f 5-8 | tr "\t" "\\n" | pigz -p {params.p} > {output[1]} - """ + shell: + "exec >{log} 2>&1;" + "TMPDIR=$(mktemp -d);" + "trap 'rm -rf $TMPDIR' EXIT;" + "fasterq-dump" + " {wildcards.SRR}" + " --details" + " --print-read-nr" + " --temp $TMPDIR" + " --outdir $TMPDIR" + " --threads {threads};" + "pigz " + " --stdout " + " --processes {threads}" + " $TMPDIR/{wildcards.SRR}_1.fastq" + " >{output[0]};" + "pigz " + " --stdout " + " --processes {threads}" + " $TMPDIR/{wildcards.SRR}_2.fastq" + " >{output[1]};" + with Stage("") as S: S.doc(""" From c080fd7764a8397a37b90e4cbf906c3cf9867224 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 23 Nov 2021 19:58:04 -0700 Subject: [PATCH 056/133] STAR: add twopass mode (T) and samstrandfield (Sf) --- src/ymp/rules/star.rules | 42 ++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/src/ymp/rules/star.rules b/src/ymp/rules/star.rules index aa1e27f6..205f3bf3 100644 --- a/src/ymp/rules/star.rules +++ b/src/ymp/rules/star.rules @@ -12,6 +12,8 @@ with Stage("index_star") as S: gdir = directory("{:this:}/{target}.staridx"), log: "{:this:}/{target}.log", + benchmark: + "benchmarks/{:name:}/{:this:}/{target}.txt", threads: 32 params: @@ -42,6 +44,9 @@ with Stage("map_star") as S: S.doc(""" Map RNA-Seq reads with STAR """) + S.add_param("T", typ="flag", name="twopass", value="--twopassMode Basic") + S.add_param("Sf", typ="flag", name="", value="--outSAMstrandField intronMotif") + rule star_map: message: "STAR: mapping {input.fq[0]} to {input.index}" @@ -54,6 +59,8 @@ with Stage("map_star") as S: log: std = "{:this:}/{target}.log", final = "{:this:}/{target}.star.Log.final.out", + benchmark: + "benchmarks/{:name:}/{:this:}/{target}.txt", params: outprefix = "{:this:}/{target}.star.", multimap_nmax = 10, @@ -64,23 +71,24 @@ with Stage("map_star") as S: 32 conda: "star" - shell: """ - STAR \ - --genomeDir {input.index} \ - --genomeLoad NoSharedMemory \ - --runThreadN {threads} \ - --readFilesIn {input.fq} \ - --readFilesCommand "gzip -dc" 
\ --outFileNamePrefix {params.outprefix} \ - --outSAMtype BAM Unsorted \ - --outSAMunmapped Within \ - --outFilterMultimapNmax {params.multimap_nmax} \ - --quantMode {params.quantmode} \ - >{log.std} 2>&1 - - mv {params.outprefix}Aligned.out.bam {output.bamgn} - mv {params.outprefix}Aligned.toTranscriptome.out.bam {output.bamtr} - """ + shell: + "exec >{log.std} 2>&1;" + "STAR" + " --genomeDir {input.index}" + " --genomeLoad NoSharedMemory" + " --runThreadN {threads}" + " --readFilesIn {input.fq}" + " --readFilesCommand 'gzip -dc'" + " --outFileNamePrefix {params.outprefix}" + " --outSAMtype BAM Unsorted" + " --outSAMunmapped Within" + " --outFilterMultimapNmax {params.multimap_nmax}" + " --quantMode {params.quantmode}" + " {params.samstrandfield}" + " {params.twopass};" + "mv {params.outprefix}Aligned.out.bam {output.bamgn};" + "mv {params.outprefix}Aligned.toTranscriptome.out.bam {output.bamtr};" + "sync {output}" # --outTmpDir ? localrules: star_map_multiqc_cfg rule star_map_multiqc_cfg: From bab7d5d01b193045fbbf0b223bbaac00168fe5f2 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 23 Nov 2021 20:00:20 -0700 Subject: [PATCH 057/133] Add sort_bam2 to sort both tx.bam and bam files --- src/ymp/rules/sambamba.rules | 44 ++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/ymp/rules/sambamba.rules b/src/ymp/rules/sambamba.rules index 157bd118..8b97d703 100644 --- a/src/ymp/rules/sambamba.rules +++ b/src/ymp/rules/sambamba.rules @@ -32,6 +32,23 @@ with Stage("sort_bam") as S: " >{log} 2>&1" +with Stage("sort_bam2") as S: + rule sambamba_sort_gn: # ymp: extends sambamba_sort + input: + bam = "{:prev:}/{target}.bam", + + rule sambamba_sort_tx: # ymp: extends sambamba_sort + input: + bam = "{:prev:}/{target}.tx.bam", + output: + bam = "{:this:}/{target}.sorted.tx.bam", + bai = "{:this:}/{target}.sorted.tx.bam.bai", + log: + "{:this:}/{target}.sorted.tx.bam.log" + benchmark: + "benchmarks/{:name:}/{:this:}/{target}.tx.txt" + + with Stage("markdup_sambamba") as S: S.add_param("RM", typ="flag", name = "remove_dups", value="--remove-duplicates") rule sambamba_markdup: @@ -48,6 +65,29 @@ with Stage("markdup_sambamba") as S: "benchmarks/{:name:}/{:this:}/{target}.txt" params: compress = 6, + + hash_table_size = 262144, + # From help: + # size of hash table for finding read pairs (default is 262144 reads); + # will be rounded down to the nearest power of two; + # should be > (average coverage) * (insert size) for good performance + + overflow_list_size = 600000, + # From help: + # size of the overflow list where reads, thrown from the hash table, + # get a second chance to meet their pairs (default is 200000 reads); + # increasing the size reduces the number of temporary files created + + sort_buffer_size = 4096, + # From help: + # total amount of memory (in *megabytes*) used for sorting purposes; + # the default is 2048, increasing it will reduce the number of created + # temporary files and the time spent in the main thread + + io_buffer_size = 128, + # From help: + # two buffers of BUFFER_SIZE *megabytes* each are used + # for reading and writing BAM during the second pass (default is 128) resources: mem = "32g", threads: @@ -58,6 +98,10 @@ with Stage("markdup_sambamba") as S: "exec >{log} 2>&1;" "sambamba markdup" " --compression-level={params.compress}" + " --hash-table-size={params.hash_table_size}" + " --overflow-list-size={params.overflow_list_size}" + " --sort-buffer-size={params.sort_buffer_size}" + " --io-buffer-size={params.io_buffer_size}" " --nthreads={threads}" "
{params.remove_dups}" " {input.bam}" From 3f7793363df35f1d8d9826a193bf42fd735d6b64 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 23 Nov 2021 20:01:30 -0700 Subject: [PATCH 058/133] Salmon: generate tx.bam --- src/ymp/rules/salmon.rules | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/ymp/rules/salmon.rules b/src/ymp/rules/salmon.rules index 2e27e913..39a08109 100644 --- a/src/ymp/rules/salmon.rules +++ b/src/ymp/rules/salmon.rules @@ -1,4 +1,7 @@ -Env(name="salmon", base="bioconda", packages=["salmon>1.5"]) +Env(name="salmon", base="bioconda", packages=[ + "salmon>1.5", + "samtools" +]) with Stage("index_salmon") as S: S.doc(""" @@ -85,6 +88,7 @@ with Stage("quant_salmon_sa") as S: output: quant = "{:this:}/{target}.salmon/quant.sf", unmapped = "{:this:}/{target}.salmon/aux_info/unmapped_names.txt", + bam = temp("{:this:}/{target}.tx.bam"), benchmark: "benchmarks/{:name:}/{:this:}/{target}.txt", log: @@ -107,6 +111,8 @@ with Stage("quant_salmon_sa") as S: " --mates1 {input.fq[0]}" " --mates2 {input.fq[1]}" " --output $(dirname {output.quant})" + " --writeMappings" + " | samtools view -b -o {output.bam} --threads 4 -" localrules: salmon_sa_quant_multiqc_cfg rule salmon_sa_quant_multiqc_cfg: From d8ec6f20a177492ed05ab567df52a8ce73d017da Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 23 Nov 2021 20:03:43 -0700 Subject: [PATCH 059/133] Add index_txfa - recompress and index tx.fasta.gz --- src/ymp/rules/samtools.rules | 37 ++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/ymp/rules/samtools.rules b/src/ymp/rules/samtools.rules index df1ca37a..84ccd969 100644 --- a/src/ymp/rules/samtools.rules +++ b/src/ymp/rules/samtools.rules @@ -149,3 +149,40 @@ with Stage("coverage_samtools") as S: ' -o {output}' ' >{log} 2>&1' + +with Stage("index_txfa") as S: + rule fai_index: + message: + "{:name:}: Re-compressing with bgzip and indexing {output.txfa}" + input: + txfa = "{:prev:}/{:target:}.tx.fasta.gz", + output: + txfa = "{:this:}/{target}.tx.fasta.gz", + gzi = "{:this:}/{target}.tx.fasta.gz.gzi", + fai = "{:this:}/{target}.tx.fasta.gz.fai", + log: + "{:this:}/{target}.log" + benchmark: + "benchmarks/{:name:}/{:this:}/{target}.txt" + conda: + "samtools" + resources: + mem = "4g", + threads: + 8 + shell: + 'exec >{log} 2>&1;' + 'bgzip' + ' --threads {threads}' + ' --decompress' + ' --stdout' + ' {input.txfa}' + '|' + 'bgzip' + ' --threads {threads}' + ' --index --index-name {output.gzi}' + ' --compress' + ' --stdout' + ' >{output.txfa};' + '' + 'samtools faidx {output.txfa}' From 3281cf596a58c3fac6479f328e157e9e94a53b14 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Sun, 27 Mar 2022 16:06:18 -0600 Subject: [PATCH 060/133] Fix escaping in star/multiqc --- src/ymp/rules/star.rules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ymp/rules/star.rules b/src/ymp/rules/star.rules index 205f3bf3..f68b00e1 100644 --- a/src/ymp/rules/star.rules +++ b/src/ymp/rules/star.rules @@ -111,7 +111,7 @@ with Stage("map_star") as S: "path_filters": [f"{params.this}/*.star.Log.final.out"] } }], - "sample_names_replace": {"(.*)\\\\.star": "\\\\1"}, + "sample_names_replace": {"(.*)\\.star": "\\1"}, } with open(output[0], "w") as out: yaml.dump(data, out) From 9176658575039965e0f96ba528dbd42f949d3190 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Sun, 27 Mar 2022 16:06:43 -0600 Subject: [PATCH 061/133] Add collate_txbam --- src/ymp/rules/samtools.rules | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 
insertions(+) diff --git a/src/ymp/rules/samtools.rules b/src/ymp/rules/samtools.rules index 84ccd969..d7d93353 100644 --- a/src/ymp/rules/samtools.rules +++ b/src/ymp/rules/samtools.rules @@ -186,3 +186,32 @@ with Stage("index_txfa") as S: ' >{output.txfa};' '' 'samtools faidx {output.txfa}' + + +with Stage("collate_txbam") as S: + rule samtools_collate: + message: + "{:name:}: Collating BAM file by read name" + input: + bam = "{:prev:}/{:target:}.sorted.tx.bam", + output: + bam = temp("{:this:}/{target}.tx.bam"), + log: + "{:this:}/{target}.log" + benchmark: + "benchmarks/{:name:}/{:this:}/{target}.txt" + conda: + "samtools" + params: + compression_level = 3, + resources: + mem = "4g", + threads: + 8 + shell: + 'exec >{log} 2>&1;' + 'samtools collate' + ' -o {output.bam}' + ' -l {params.compression_level}' + ' --threads {threads}' + ' {input.bam}' From 68522157077970d5d03899eb8a1f5a6dffb3cec6 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Sun, 27 Mar 2022 16:08:14 -0600 Subject: [PATCH 062/133] Add sort_txbam, sort_bam_name --- src/ymp/rules/sambamba.rules | 52 +++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/src/ymp/rules/sambamba.rules b/src/ymp/rules/sambamba.rules index 8b97d703..5c97fd88 100644 --- a/src/ymp/rules/sambamba.rules +++ b/src/ymp/rules/sambamba.rules @@ -11,11 +11,12 @@ with Stage("sort_bam") as S: bam = "{:this:}/{target}.sorted.bam", bai = "{:this:}/{target}.sorted.bam.bai", log: - "{:this:}/{target}.sorted.bam.log" + "{:this:}/{target}.log" benchmark: "benchmarks/{:name:}/{:this:}/{target}.txt" params: compress = 6, + order_by = "position", resources: mem = "32g", threads: @@ -23,30 +24,63 @@ with Stage("sort_bam") as S: conda: "sambamba" shell: + "exec >{log} 2>&1;" + "case {params.order_by} in" + " name) PARM=--natural-sort;;" + " position) PARM=;;" + "esac;" + "" "sambamba sort" " --memory-limit={resources.mem_mb}MB" " --compression-level={params.compress}" " --nthreads={threads}" " --out={output.bam}" + " $PARM" " {input.bam}" " >{log} 2>&1" -with Stage("sort_bam2") as S: - rule sambamba_sort_gn: # ymp: extends sambamba_sort - input: - bam = "{:prev:}/{target}.bam", - - rule sambamba_sort_tx: # ymp: extends sambamba_sort +with Stage("sort_txbam") as S: + rule sambamba_sort_txbam: # ymp: extends sambamba_sort input: bam = "{:prev:}/{target}.tx.bam", output: bam = "{:this:}/{target}.sorted.tx.bam", bai = "{:this:}/{target}.sorted.tx.bam.bai", log: - "{:this:}/{target}.sorted.tx.bam.log" + "{:this:}/{target}.log" benchmark: - "benchmarks/{:name:}/{:this:}/{target}.tx.txt" + "benchmarks/{:name:}/{:this:}/{target}.txt" + + +with Stage("sort_bam_name") as S: + rule sambamba_sort_bam_name: # ymp: extends sambamba_sort + input: + bam = "{:prev:}/{target}.sorted.bam" + output: + bam = "{:this:}/{target}.bam", + bai = [], + params: + order_by = "name" + log: + "{:this:}/{target}.log" + benchmark: + "benchmarks/{:name:}/{:this:}/{target}.txt" + + +with Stage("sort_txbam_name") as S: + rule sambamba_sort_txbam_name: # ymp: extends sambamba_sort + input: + bam = "{:prev:}/{target}.sorted.tx.bam", + output: + bam = "{:this:}/{target}.tx.bam", + bai = [], + params: + order_by = "name" + log: + "{:this:}/{target}.log" + benchmark: + "benchmarks/{:name:}/{:this:}/{target}.txt" with Stage("markdup_sambamba") as S: From 8b8c4e2f69aa882016a2092488aa14cf4e9e87ca Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Sun, 27 Mar 2022 16:08:39 -0600 Subject: [PATCH 063/133] Bump multiqc version --- src/ymp/rules/multiqc.rules | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ymp/rules/multiqc.rules b/src/ymp/rules/multiqc.rules index 9cea56cd..4d7913ca 100644 --- a/src/ymp/rules/multiqc.rules +++ b/src/ymp/rules/multiqc.rules @@ -1,5 +1,5 @@ Env(name="multiqc", base="bioconda", packages=[ - "multiqc >=1.11" + "multiqc >=1.12" ]) with Stage("qc_multiqc") as S: From 995571ec3b0089cfb5e0be8aedf9a3d158c8c414 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Sun, 27 Mar 2022 16:09:03 -0600 Subject: [PATCH 064/133] Rename abc.R1 samples to abc for fastqc multiqc --- src/ymp/rules/fastqc.rules | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ymp/rules/fastqc.rules b/src/ymp/rules/fastqc.rules index 84fb5c6f..c4fb2c74 100644 --- a/src/ymp/rules/fastqc.rules +++ b/src/ymp/rules/fastqc.rules @@ -58,7 +58,8 @@ with Stage("qc_fastqc") as S: "name": f"FastQC ({params.this})", "path_filters": [f"{params.this}/*_fastqc.zip"] } - }] + }], + "sample_names_replace": {"(.*)\\.R1": "\\1"}, } with open(output[0], "w") as out: From 9299735a874c4ed69669d3b1b4130d6cc0e277e8 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Sun, 27 Mar 2022 16:09:43 -0600 Subject: [PATCH 065/133] Fix some source cache to string conversions --- src/ymp/env.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ymp/env.py b/src/ymp/env.py index cfcf8b4e..09d3565a 100644 --- a/src/ymp/env.py +++ b/src/ymp/env.py @@ -343,11 +343,12 @@ def update(self): "Update conda environment" self.create() # call create to make sure environment exists log.warning("Updating environment '%s'", self.name) + log.warning(f"Running {self.frontend} env update --prune -p {self.path} -f {self.file} -v") return subprocess.run([ self.frontend, "env", "update", "--prune", - "-p", self.path, - "-f", self.file, + "-p", str(self.path), + "-f", str(self.file), "-v" ]).returncode From 13ba036da1c1ca2df2a7fbaa758413d85e2d03d7 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 2 Jun 2022 11:37:17 -0600 Subject: [PATCH 066/133] Print hostname when running fastqc - in case of fontconfig weirdness --- src/ymp/rules/fastqc.rules | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/ymp/rules/fastqc.rules b/src/ymp/rules/fastqc.rules index c4fb2c74..f9de2b7f 100644 --- a/src/ymp/rules/fastqc.rules +++ b/src/ymp/rules/fastqc.rules @@ -29,14 +29,15 @@ with Stage("qc_fastqc") as S: mem = "4g", conda: "fastqc" - shell: """ - fastqc \ - -t {threads} \ - -o $(dirname {output[0]}) \ - {input} \ - -k {params.k} \ - >{log} 2>&1 - """ + shell: + "exec >{log} 2>&1;" + "echo Launching fastqc on $HOSTNAME;" + "set -x;" + "fastqc" + " -t {threads}" + " -o $(dirname {output[0]})" + " {input}" + " -k {params.k}" localrules: fastqc_multiqc rule fastqc_multiqc: From 9801f41ba36983f2d146382e28ea08e6a2c47af1 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 2 Jun 2022 11:37:40 -0600 Subject: [PATCH 067/133] Allow salmon to pass if the sample was empty --- src/ymp/rules/salmon.rules | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/ymp/rules/salmon.rules b/src/ymp/rules/salmon.rules index 39a08109..47761dda 100644 --- a/src/ymp/rules/salmon.rules +++ b/src/ymp/rules/salmon.rules @@ -101,7 +101,9 @@ with Stage("quant_salmon_sa") as S: mem = "48G", shell: "exec >{log} 2>&1;" - "salmon quant" + "echo Launching salmon on $HOSTNAME;" + "set -x; " + "if ! 
salmon quant" " --libType {params.libtype}" " --threads {threads}" " --seqBias" @@ -112,7 +114,17 @@ with Stage("quant_salmon_sa") as S: " --mates2 {input.fq[1]}" " --output $(dirname {output.quant})" " --writeMappings" - " | samtools view -b -o {output.bam} --threads 4 -" + " | samtools view -b -o {output.bam} --threads 4 -; then" + " echo Salmon or Samtools failed;" + " if tail -n1 $(dirname {output.quant})/logs/salmon_quant.log |" + " grep -q 'salmon was only able to assign 0 fragments'; then" + " echo Salmon found no fragments. Faking output.;" + " touch {output.unmapped};" + " echo -e 'Name\tLength\tEffectiveLength\tTPM\tNumReads' > {output.quant};" + " exit 0;" + " fi;" + " exit 1;" + "fi;" localrules: salmon_sa_quant_multiqc_cfg rule salmon_sa_quant_multiqc_cfg: From 22529113b3ac1e38555285c404c9f5193d75cf71 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 2 Jun 2022 11:38:01 -0600 Subject: [PATCH 068/133] Allow counting genes instead of exons with htseq --- src/ymp/rules/htseq.rules | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ymp/rules/htseq.rules b/src/ymp/rules/htseq.rules index 29dd89d0..96554c86 100644 --- a/src/ymp/rules/htseq.rules +++ b/src/ymp/rules/htseq.rules @@ -1,6 +1,8 @@ Env(name="htseq", base="bioconda", packages="htseq>0.13") -with Stage("count_htseq"): +with Stage("count_htseq") as S: + S.add_param("T", typ="choice", name="typ", + value = ["exon", "gene"], default = "exon") rule htseq_count: message: "Counting per gene reads with htseq-count" @@ -30,7 +32,7 @@ with Stage("count_htseq"): " --max-reads-in-buffer={params.max_reads_in_buffer}" " --stranded={params.stranded}" " -a={params.minaqual}" - # --type=exon + " --type={params.typ}" # --idattr=gene_id " --mode={params.mode}" " --nonunique={params.nonunique}" From 8f59eccae1cb732f5322c627102d8d87d6dccafa Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 16 Jun 2022 13:44:50 -0600 Subject: [PATCH 069/133] Allow hard path in Stage.require --- src/ymp/stage/stage.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/ymp/stage/stage.py b/src/ymp/stage/stage.py index 300cfaf2..579ef769 100644 --- a/src/ymp/stage/stage.py +++ b/src/ymp/stage/stage.py @@ -130,9 +130,13 @@ def satisfy_inputs(self, other_stage, inputs) -> Dict[str, str]: keys = set() for key, input_alts in inputs.items(): for input_alt in input_alts: - have = other_stage.can_provide(set( - "/{{sample}}.{}".format(ext) for ext in input_alt - )) + formatted_alt = set() + for ext in input_alt: + if ext[0] == "/": + formatted_alt.add(ext) + else: + formatted_alt.add("/{{sample}}.{}".format(ext)) + have = other_stage.can_provide(set(formatted_alt)) if len(have) == len(input_alt): have_new = {output: path for output, path in have.items() From c73f2280e48ba0ac6a6bf531fdca954ab540df49 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Mon, 20 Jun 2022 17:57:28 -0600 Subject: [PATCH 070/133] Allow Salmon to "pass" with any (too low) number of assigned fragments --- src/ymp/rules/salmon.rules | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ymp/rules/salmon.rules b/src/ymp/rules/salmon.rules index 47761dda..9d8b7133 100644 --- a/src/ymp/rules/salmon.rules +++ b/src/ymp/rules/salmon.rules @@ -117,8 +117,8 @@ with Stage("quant_salmon_sa") as S: " | samtools view -b -o {output.bam} --threads 4 -; then" " echo Salmon or Samtools failed;" " if tail -n1 $(dirname {output.quant})/logs/salmon_quant.log |" - " grep -q 'salmon was only able to assign 0 fragments'; then" - " echo 
Salmon found no fragments. Faking output.;" + " grep -qE 'salmon was only able to assign [0-9]+ fragments'; then" + " echo Salmon found insufficient fragments. Faking output.;" " touch {output.unmapped};" " echo -e 'Name\tLength\tEffectiveLength\tTPM\tNumReads' > {output.quant};" " exit 0;" From 2787a92edbf16cd58448ba573fcd4bb6df689e44 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 23 Jun 2022 12:50:33 -0600 Subject: [PATCH 071/133] Add scan command --- src/ymp/cli/__init__.py | 2 + src/ymp/cli/scan.py | 114 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 src/ymp/cli/scan.py diff --git a/src/ymp/cli/__init__.py b/src/ymp/cli/__init__.py index 9a24ecba..64a89d7b 100644 --- a/src/ymp/cli/__init__.py +++ b/src/ymp/cli/__init__.py @@ -8,6 +8,7 @@ from ymp.cli.stage import stage from ymp.cli.show import show from ymp.cli.init import init +from ymp.cli.scan import scan click_completion.init() @@ -67,3 +68,4 @@ def main(**kwargs): main.add_command(stage) main.add_command(show) main.add_command(init) +main.add_command(scan) diff --git a/src/ymp/cli/scan.py b/src/ymp/cli/scan.py new file mode 100644 index 00000000..9f1e4622 --- /dev/null +++ b/src/ymp/cli/scan.py @@ -0,0 +1,114 @@ +import sys +import re +import os +import csv +import click + +all_headers = ["unit", "sample", "slot", "lane", "run", "pool", "fq1", "fq2"] + +class Scanner: + re_illumina = ( + "(?P<unit>{sample_pattern})" + "_S(?P<slot>\d+)" + "(_L(?P<lane>\d{{3}}))?" + "_R(?P<pair>[12])" + "_001.fastq.gz" + ) + + def __init__(self, folders): + self.folders = folders + self.sample_pattern = ".*" + self.folder_pattern = ".*" + self.units = {} + + def set_sample_pattern(self, pattern): + self.sample_pattern = pattern + + def set_folder_pattern(self, pattern): + self.folder_pattern = pattern + + def scan(self): + for folder in self.folders: + self.scan_folder(folder.rstrip("/")) + + def scan_folder(self, folder): + run = os.path.basename(folder) + for root, _dirs, files in os.walk(folder): + if re.search(self.folder_pattern, root): + self.scan_files(run, root, files) + + def get_regex(self): + regex = self.re_illumina.format(sample_pattern = self.sample_pattern) + return re.compile(regex) + + def scan_files(self, run, root, files): + regex = self.get_regex() + for fname in files: + match = regex.search(fname) + if match: + self.parse_match(run, root, fname, match) + + def parse_match(self, run, root, fname, match): + data = match.groupdict() + data["fq" + data["pair"]] = os.path.join(root, fname) + del data["pair"] + data["run"] = run + pool = os.path.basename(root) + if run != pool: + data["pool"] = pool + data = {key:value for key, value in data.items() if value} + for key in ("slot", "lane"): + try: + data[key] = int(data[key]) + except: + pass + data["unit"] = data["unit"].replace("-", "_") + + data["sample"] = data["unit"] + unit = self.find_unit(data) + unit.update(data) + + def find_unit(self, data, num=1): + unit_name = data["unit"] + if num > 1: + unit_name = f"{unit_name}_{num}" + data["unit"] = unit_name + unit = self.units.setdefault(unit_name, {}) + if unit and any(data[key] != unit[key] for key in data if key in unit): + if num > 20: + print("Too many units for one sample?!") + print(data) + print(unit) + sys.exit(1) + return self.find_unit(data, num+1) + return unit + + def write_csv(self, outfd): + keys = set() + for row in self.units.values(): + keys.update(set(row.keys())) + headers = [ + header for header in all_headers if header in keys + ] + writer = csv.DictWriter(outfd,
fieldnames=headers) + writer.writeheader() + writer.writerows(self.units[unit] for unit in sorted(self.units)) + + +@click.command() +@click.option("--out", type=click.File('w')) +@click.option("--sample-re", default=".*") +@click.option("--folder-re", default=".*") +@click.argument("folders", nargs=-1) +def scan(folders, out, sample_re, folder_re): + if (out is None): + raise click.UsageError("--out parameter required") + scanner = Scanner(folders) + scanner.set_sample_pattern(sample_re) + scanner.set_folder_pattern(folder_re) + scanner.scan() + scanner.write_csv(out) + + +if __name__ == "__main__": + scan() From 4a9fdeee00b540ac618e67207eb835d644447189 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 19 Jul 2022 13:22:59 -0600 Subject: [PATCH 072/133] Fix recursion if reference pipeline has same output as input --- src/ymp/stage/base.py | 2 +- src/ymp/stage/pipeline.py | 2 +- src/ymp/stage/reference.py | 12 +++++++++--- src/ymp/stage/stack.py | 2 +- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/ymp/stage/base.py b/src/ymp/stage/base.py index 83c75421..2eafc57c 100644 --- a/src/ymp/stage/base.py +++ b/src/ymp/stage/base.py @@ -105,7 +105,7 @@ def can_provide(self, inputs: Set[str], full_stack: bool = False) -> Dict[str, s for output in inputs.intersection(self.outputs) } - def get_path(self, stack: "StageStack") -> str: + def get_path(self, stack: "StageStack", typ = None, pipeline = None) -> str: # pylint: disable = no-self-use """On disk location for this stage given ``stack``. diff --git a/src/ymp/stage/pipeline.py b/src/ymp/stage/pipeline.py index 83d91342..80b7b3b3 100644 --- a/src/ymp/stage/pipeline.py +++ b/src/ymp/stage/pipeline.py @@ -110,7 +110,7 @@ def params(self): self._params = params return super().params - def get_path(self, stack, typ=None, pipeline=None): + def get_path(self, stack, typ=None, pipeline=None, caller=None): pipeline_parameters = self.parse(stack.stage_name) param_map = { key.format(**pipeline_parameters): value diff --git a/src/ymp/stage/reference.py b/src/ymp/stage/reference.py index 0b660b58..138441a4 100644 --- a/src/ymp/stage/reference.py +++ b/src/ymp/stage/reference.py @@ -376,10 +376,16 @@ def can_provide(self, inputs: Set[str], full_stack: bool = False) -> Dict[str, s } return res - def get_path(self, _stack=None, typ=None): + def get_path(self, stack=None, typ=None, pipeline = None, caller = None): + # Send request for a file to the pipeline stage providing it, + # taking care not to bounce requests from our own stages back + # to themselves. 
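+ # A sketch of the recursion guarded against (stage names hypothetical): + # a reference backed by a pipeline resolves a request for, say, "fasta.gz" + # to a path like "ref_foo.some_pipeline", redirecting the caller into the + # pipeline; when that very path is itself the caller asking for its own + # input, the bare "ref_foo" is returned instead so resolution terminates.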
if typ is None: - return self.dir - return self.name + self.outputs[typ] + return self.dir # references/ + path = self.name + self.outputs[typ] + if caller.name == path: + return self.name # ref_ + return path # potentially redirect to pipeline def get_all_targets(self, stack: "StageStack") -> List[str]: return [os.path.join(self.dir, fname) for fname in self.files] diff --git a/src/ymp/stage/stack.py b/src/ymp/stage/stack.py index 838d545d..f675356a 100644 --- a/src/ymp/stage/stack.py +++ b/src/ymp/stage/stack.py @@ -193,7 +193,7 @@ def _do_resolve_prevs(self, stage, inputs, exclude_self): provides = stage.satisfy_inputs(prev_stage, inputs) for typ, ppath in provides.items(): if ppath: - npath = prev_stage.get_path(prev_stack, typ) + npath = prev_stage.get_path(prev_stack, typ, caller=self) prevs[typ] = self.instance(npath) else: prevs[typ] = prev_stack From d2a65a6c3091a34e9c6b59f68bec05f7e4234a10 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 2 Aug 2022 17:02:26 -0600 Subject: [PATCH 073/133] Add bcftools stages --- src/ymp/rules/bcftools.rules | 133 +++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 src/ymp/rules/bcftools.rules diff --git a/src/ymp/rules/bcftools.rules b/src/ymp/rules/bcftools.rules new file mode 100644 index 00000000..c72e767a --- /dev/null +++ b/src/ymp/rules/bcftools.rules @@ -0,0 +1,133 @@ +Env(name="bcftools", base="bioconda", packages=["bcftools"]) + +with Stage("index_fasta") as S: + rule fasta_index: + message: "{:name:}: Recompressing and indexing fasta" + input: + fagz = "{:prev:}/{:target:}.fasta.gz", + output: + fagz = "{:this:}/{target}.fasta.gz", + fagzi = "{:this:}/{target}.fasta.gz.gzi", + log: + "{:this:}/{target}.log" + threads: + 8 + resources: + mem = "8G", + conda: + "bcftools" + shell: + "exec >{log} 2>&1;" + "gzip -dc {input.fagz} |" + " bgzip " + " --index --index-name {output.fagzi}" + " --threads {threads}" + " --stdout > {output.fagz}" + +with Stage("index_tx_fasta") as S: + rule tx_fasta_index: # ymp: extends fasta_index + input: + fagz = "{:prev:}/{:target:}.tx.fasta.gz", + output: + fagz = "{:this:}/{target}.tx.fasta.gz", + fagzi = "{:this:}/{target}.tx.fasta.gz.gzi", + + +with Stage("genotype_bcftools") as S: + S.add_param("Vo", typ = "flag", name = "variants_only", value = "--variants-only") + S.add_param("Si", typ = "flag", name = "skip_indels", value = "--skip-variants indels") + S.add_param("D", typ = "int", name = "max_depth", default = 250) + S.add_param("R", typ = "choice", name = "region", default = "", value = [ + "X", "Y" + ]) + + rule bcftools_call: + message: "{:name:} Genotyping {input.bam}" + input: + bam = "{:prev:}/{:target:}.sorted.bam", + ref = "{:prev:}/{:target:}.fasta.gz", + refi = "{:prev:}/{:target:}.fasta.gz.gzi", + output: + vcf = "{:this:}/{target}.vcf.gz", + tbi = "{:this:}/{target}.vcf.gz.tbi", + stats = "{:this:}/{target}.bcftools_stats.txt" + log: + "{:this:}/{target}.log" + benchmark: + "benchmarks/{:name:}/{:this:}/{target}.txt" + resources: + mem = "10g", + threads: + 2 + conda: + "bcftools" + shell: + "exec >{log} 2>&1;" + "set -x;" + "if [ x\"{params.region}\" != x ]; then" + " region_param=\"-r chr{params.region}\";" + "fi;" + "bcftools mpileup " + " --fasta-ref {input.ref}" + " --output-type u" + " --max-depth {params.max_depth}" + " ${{region_param:-}}" + " {input.bam}" + "|" + "bcftools call" + " --output-type z" + " --threads 2" # second thread for compression + " --multiallelic-caller" + " {params.variants_only}" + " --output {output.vcf};" + "tabix -p 
vcf {output.vcf};" + "bcftools stats {output.vcf} > {output.stats};" + + +with Stage("genotype_bcftools_tx") as S: + S.add_param("Vo", typ = "flag", name = "variants_only", value = "--variants-only") + S.add_param("Si", typ = "flag", name = "skip_indels", value = "--skip-variants indels") + S.add_param("D", typ = "int", name = "max_depth", default = 250) + S.add_param("R", typ = "choice", name = "region", default = "", value = [ + "X", "Y" + ]) + + rule bcftools_call_tx: # ymp: extends bcftools_call + input: + bam = "{:prev:}/{:target:}.sorted.tx.bam", + ref = "{:prev:}/{:target:}.tx.fasta.gz", + refi = "{:prev:}/{:target:}.tx.fasta.gz.gzi", + output: + vcf = "{:this:}/{target}.tx.vcf.gz", + tbi = "{:this:}/{target}.tx.vcf.gz.tbi", + stats = "{:this:}/{target}.tx.bcftools_stats.txt" + +with Stage("merge_vcf") as S: + rule bcftools_merge: + message: "{:name:} {output.vcf}" + input: + vcf = "{:prev:}/{:target:}.vcf.gz" + output: + vcf = "{:this:}/{target}.vcf.gz", + tbi = "{:this:}/{target}.vcf.gz.tbi", + stats = "{:this:}/{target}.bcftools_stats.txt" + log: + "{:this:}/{target}.log" + benchmark: + "benchmarks/{:name:}/{:this:}/{target}.txt" + resources: + mem = "10g", + threads: + 12 + conda: + "bcftools" + shell: + "exec >{log} 2>&1;" + "set -x;" + "bcftools merge" + " --output-type z" + " --threads {threads}" + " --output {output.vcf}" + " {input.vcf};" + "tabix -p vcf {output.vcf};" + "bcftools stats {output.vcf} > {output.stats}" From e4209c9c65e67e8aa49506e4780f42d49d861622 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 2 Aug 2022 17:03:34 -0600 Subject: [PATCH 074/133] Add --scheduler override for snakemake scheduler --- src/ymp/cli/make.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/ymp/cli/make.py b/src/ymp/cli/make.py index e846c0da..6b80adf3 100644 --- a/src/ymp/cli/make.py +++ b/src/ymp/cli/make.py @@ -183,6 +183,7 @@ def start_snakemake(kwargs, submit=False): 'scriptname': 'jobname', 'cluster_cores': 'nodes', 'snake_config': 'config', + 'scheduler': 'scheduler', 'drmaa': None, 'sync': None, 'sync_arg': None, @@ -350,6 +351,10 @@ def make(**kwargs): "--scriptname", metavar="NAME", help="Set the name template used for submitted jobs" ) +@click.option( + "--scheduler", + help="ILP or greedy" +) def submit(profile, **kwargs): """Build target(s) on cluster From 67edf842050dc30a61e35d97333b290792f5f65a Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 2 Aug 2022 17:04:01 -0600 Subject: [PATCH 075/133] Update salmon message --- src/ymp/rules/salmon.rules | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ymp/rules/salmon.rules b/src/ymp/rules/salmon.rules index 9d8b7133..7029d149 100644 --- a/src/ymp/rules/salmon.rules +++ b/src/ymp/rules/salmon.rules @@ -9,7 +9,7 @@ with Stage("index_salmon") as S: S.add_param("G", typ="flag", name="gencode", value="--gencode") rule salmon_index: - message: "{:name:}: FIXME" + message: "{:name:}: Creating Salmon Index from {input}" input: txfa = "{:prev:}/{:target:}.tx.fasta.gz", output: @@ -42,7 +42,7 @@ with Stage("index_salmon_decoy") as S: S.add_param("G", typ="flag", name="gencode", value="--gencode") rule salmon_index_decoy: - message: "{:name:}: FIXME" + message: "{:name:}: Creating Salmon Index w/ Decoy from {input}" input: txfa = "{:prev:}/{:target:}.tx.fasta.gz", fa = "{:prev:}/{:target:}.fasta.gz", From 00ae22b216794e051cdec42eb06e5a3a982e37ce Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 28 Sep 2022 18:25:30 -0600 Subject: [PATCH 076/133] Don't convert - to _ in ymp scan 
command; emit lane/slot on request only --- src/ymp/cli/scan.py | 82 +++++++++++++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 25 deletions(-) diff --git a/src/ymp/cli/scan.py b/src/ymp/cli/scan.py index 9f1e4622..1d524882 100644 --- a/src/ymp/cli/scan.py +++ b/src/ymp/cli/scan.py @@ -4,7 +4,6 @@ import sys import re import os import csv import click -all_headers = ["unit", "sample", "slot", "lane", "run", "pool", "fq1", "fq2"] class Scanner: re_illumina = ( "(?P<unit>{sample_pattern})" "_S(?P<slot>\d+)" "(_L(?P<lane>\d{{3}}))?" "_R(?P<pair>[12])" - "_001.fastq.gz" + "_001.fastq.gz$" ) + _re_compiled = None + header_order = ["unit", "sample", "slot", "lane", "run", "pool", "fq1", "fq2"] def __init__(self, folders): self.folders = folders self.sample_pattern = ".*" self.folder_pattern = ".*" self.units = {} + self.verbosity = 0 + self.extra_keys = [] + self.keys = ["unit", "sample", "run", "pool", "fq1", "fq2"] def set_sample_pattern(self, pattern): self.sample_pattern = pattern + self._re_compiled = None def set_folder_pattern(self, pattern): self.folder_pattern = pattern + def set_verbosity(self, verbosity): + self.verbosity = verbosity + + def set_extra_keys(self, extra_keys): + self.extra_keys = extra_keys + + def log(self, message): + if self.verbosity > 0: + print(message) + + def get_regex(self): + if self._re_compiled is None: + regex = self.re_illumina.format(sample_pattern = self.sample_pattern) + self.log(f"Regex: {regex}") + self._re_compiled = re.compile(regex) + return self._re_compiled + def scan(self): + "Iterate over configured folders, call scan_folder on each" for folder in self.folders: self.scan_folder(folder.rstrip("/")) def scan_folder(self, folder): + "Walk folder" run = os.path.basename(folder) + self.log(f"Scanning run {run}") for root, _dirs, files in os.walk(folder): if re.search(self.folder_pattern, root): self.scan_files(run, root, files) - def get_regex(self): - regex = self.re_illumina.format(sample_pattern = self.sample_pattern) - return re.compile(regex) - def scan_files(self, run, root, files): + "Detect files" regex = self.get_regex() for fname in files: match = regex.search(fname) if match: self.parse_match(run, root, fname, match) + def find_unit(self, data, num=1): + unit_name = data["unit"] + if num > 1: + unit_name = f"{unit_name}_{num}" + data["unit"] = unit_name + unit = self.units.setdefault(unit_name, {}) + if unit and any(data[key] != unit[key] for key in
From e4d2f84cf81fb584d8dbd290d615df96ed5b6bbd Mon Sep 17 00:00:00 2001
From: Elmar Pruesse
Date: Wed, 28 Sep 2022 18:50:18 -0600
Subject: [PATCH 077/133] Fix overriding job resources on jobs without
 resource section

---
 src/ymp/config.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/ymp/config.py b/src/ymp/config.py
index 461ba420..36b2864a 100644
--- a/src/ymp/config.py
+++ b/src/ymp/config.py
@@ -95,10 +95,13 @@ def expand(self, rule, ruleinfo, **kwargs):
                     key=attr_name,
                 )
             if isinstance(values, Mapping):
+                if attr is None:
+                    attr = ((), dict())
+                    setattr(ruleinfo, attr_name, attr)
                 for val_name, value in values.items():
                     log.debug(
                         "Overriding {}.{}={} in {} with {}".format(
-                            attr_name, val_name, attr[1][val_name], rule.name, value
+                            attr_name, val_name, attr[1].get(val_name, "not set"), rule.name, value
                         )
                     )
                     attr[1][val_name] = value

From 7eed34306862dfc451cab9997e05eb098cb26564 Mon Sep 17 00:00:00 2001
From: Elmar Pruesse
Date: Wed, 28 Sep 2022 18:51:27 -0600
Subject: [PATCH 078/133] Fix time format resources (walltime)

The "scale" parameter was misinterpreted: "scale: 1" would be read as
1 minute and converted to 60 seconds, leading to grossly overscaled
walltimes.
---
 src/ymp/config.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/src/ymp/config.py b/src/ymp/config.py
index 36b2864a..035d9800 100644
--- a/src/ymp/config.py
+++ b/src/ymp/config.py
@@ -145,6 +145,7 @@ def __init__(self, cfg: Optional[Mapping]) -> None:
         if not isinstance(cfg, Mapping):
             raise YmpConfigError(cfg, "Limits section must be a map (key: value)")
         self.limits = self.parse_config(cfg)
+        log.debug("Parsed Resource Limits: %s", str(self.limits))
 
     def parse_config(self, cfg):
         """Parses limits config"""
             )
             lconf["from"] = source
         for opt in params:
-            if opt in ("format", "unit", "from"):
-                continue
-            if opt not in ("default", "scale", "min", "max"):
+            if opt in ("default", "min", "max"):
+                try:
+                    lconf[opt] = lconf['parser'](params.get(opt))
+                except ValueError:
+                    raise YmpConfigError(
+                        params,
+                        f'Failed to parse "{params.get(opt)}"',
+                        key=opt
+                    ) from None
+            elif opt == "scale":
+                lconf[opt] = params.get(opt)
+            elif opt in ("format", "unit", "from"):
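+                # "format", "unit" and "from" were already consumed
+                # above when the value parser was chosen and the source
+                # limit linked, so there is nothing left to do here: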
pass + else: raise YmpConfigError( params, f'Unknown parameter "{opt}" in "{name}" resource_limits', opt ) - try: - lconf[opt] = lconf['parser'](params.get(opt)) - except ValueError: - raise YmpConfigError( - params, - f'Failed to parse "{params.get(opt)}"', - key=opt - ) from None limits[name] = lconf for key in list(limits.keys()): if limits[key].get("from"): From 20949dd5d620533ff3f3852f31dba314d509a7ac Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 28 Sep 2022 18:53:18 -0600 Subject: [PATCH 079/133] Fix bcftools skip_indels parameter not passed on --- src/ymp/rules/bcftools.rules | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ymp/rules/bcftools.rules b/src/ymp/rules/bcftools.rules index c72e767a..188a84bb 100644 --- a/src/ymp/rules/bcftools.rules +++ b/src/ymp/rules/bcftools.rules @@ -79,6 +79,7 @@ with Stage("genotype_bcftools") as S: " --threads 2" # second thread for compression " --multiallelic-caller" " {params.variants_only}" + " {params.skip_indels}" " --output {output.vcf};" "tabix -p vcf {output.vcf};" "bcftools stats {output.vcf} > {output.stats};" From 4d30c780467c8159fd82e0b176ed5de8ecaf27d2 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 6 Oct 2021 17:58:28 -0600 Subject: [PATCH 080/133] Log every conf file read in debug --- src/ymp/yaml.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ymp/yaml.py b/src/ymp/yaml.py index e17ef8bc..2ec1be02 100644 --- a/src/ymp/yaml.py +++ b/src/ymp/yaml.py @@ -462,6 +462,7 @@ def load_one(fname, stack): fname = resolve_installed_package(fname, stack) if any(fname == entry.filename for entry in stack): raise LayeredConfError((fname, None), "Recursion in includes", stack=stack) + log.debug("Loading YAML configuration from %s", fname) try: with open(fname, "r") as fdes: yaml = rt_yaml.load(fdes) From 2b7d86ce02f9b8eb1590e8f07a0ace36b08939ef Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 6 Oct 2021 17:58:42 -0600 Subject: [PATCH 081/133] Fix test --- tests/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index c17597b3..f065e228 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -296,7 +296,7 @@ def test_env_run(invoker, demo_dir, mock_conda, mock_downloader, capfd): res = invoker.call("env", "run", "bbmap", "true") assert res.exit_code == 0 cap = capfd.readouterr() - assert "No such file or directory" in cap.err + assert "Not a conda environment" in cap.err @pytest.mark.parametrize( From 24b51d2868f496e7b658ed89172a1512a6568baa Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 6 Oct 2021 18:14:44 -0600 Subject: [PATCH 082/133] Move cache to separate module --- src/ymp/cache.py | 258 ++++++++++++++++++++++++++++++++++++++++++++++ src/ymp/common.py | 244 ------------------------------------------- src/ymp/config.py | 3 +- 3 files changed, 260 insertions(+), 245 deletions(-) create mode 100644 src/ymp/cache.py diff --git a/src/ymp/cache.py b/src/ymp/cache.py new file mode 100644 index 00000000..eb8286ff --- /dev/null +++ b/src/ymp/cache.py @@ -0,0 +1,258 @@ +""" +Caching helpers to speed up shell commands and command completion +""" + + +import logging +import os +import sqlite3 + +import ymp +from ymp.common import AttrDict, ensure_list + +log = logging.getLogger(__name__) + + +class NoCache(object): + def __init__(self, root): + self.caches = {} + + def close(self): + pass # NoCache doesn't close anything + + def get_cache(self, name, clean=False, *args, **kwargs): + if name not in self.caches: + self.caches[name] = 
CacheDict(self, name, *args, **kwargs)
+        return self.caches[name]
+
+    def store(self, cache, key, obj):
+        pass  # NoCache doesn't store anything
+
+    def commit(self):
+        pass  # NoCache doesn't commit anything
+
+    def load(self, _cache, _key):
+        return None
+
+    def load_all(self, _cache):
+        return ()
+
+
+class Cache(object):
+    def __init__(self, root):
+        os.makedirs(os.path.join(root), exist_ok=True)
+        db_fname = os.path.join(root, "ymp.db")
+        log.debug("Opening database %s", db_fname)
+        self.conn = sqlite3.connect(db_fname, check_same_thread=False)
+
+        # Drop tables if the database has the wrong version number
+        # or if the user_version has not been set (defaults to 0)
+        version = self.conn.execute("PRAGMA user_version").fetchone()[0]
+        if version == ymp.__numeric_version__ and version != 0:
+            try:
+                curs = self.conn.execute("SELECT file, time from stamps")
+                update = any(os.path.getmtime(row[0]) > row[1] for row in curs)
+            except FileNotFoundError:
+                update = True
+            del curs
+            if update:
+                log.error("Dropping cache: files changed")
+                self.conn.executescript("""
+                DROP TABLE caches;
+                DROP TABLE stamps;
+                """)
+        else:
+            log.info("No cache, loading...")
+            update = True
+
+        if update:
+            self.conn.executescript("""
+            BEGIN EXCLUSIVE;
+            DROP TABLE IF EXISTS caches;
+            CREATE TABLE caches (
+                name TEXT,
+                key TEXT,
+                data,
+                PRIMARY KEY (name, key)
+            );
+            DROP TABLE IF EXISTS stamps;
+            CREATE TABLE stamps (
+                file TEXT PRIMARY KEY,
+                time INT
+            );
+
+            PRAGMA user_version={};
+            COMMIT;
+            """.format(ymp.__numeric_version__))
+
+        self.caches = {}
+        self.files = {}
+
+    def close(self):
+        self.conn.close()
+
+    def get_cache(self, name, clean=False, *args, **kwargs):
+        if name not in self.caches:
+            self.caches[name] = CacheDict(self, name, *args, **kwargs)
+        return self.caches[name]
+
+    def store(self, cache, key, obj):
+        import pickle
+
+        files = ensure_list(getattr(obj, "defined_in", None))
+        try:
+            stamps = [(fn, os.path.getmtime(fn))
+                      for fn in files
+                      if fn not in self.files]
+            self.conn.executemany(
+                "REPLACE INTO stamps VALUES (?,?)",
+                stamps)
+            self.files.update(dict(stamps))
+            self.conn.execute("""
+            REPLACE INTO caches
+            VALUES (?, ?, ?)
+            """, [cache, key, pickle.dumps(obj)]
+            )
+        except pickle.PicklingError:
+            log.error("Failed to pickle %s", obj)
+        except FileNotFoundError:
+            pass
+
+    def commit(self):
+        # sqlite3 is imported at module level; OperationalError carries
+        # its message in str(exc)
+        try:
+            self.conn.commit()
+        except sqlite3.OperationalError as exc:
+            log.warning("Cache write failed: %s", exc)
+
+    def load(self, cache, key):
+        import pickle
+        row = self.conn.execute("""
+        SELECT data FROM caches WHERE name=? AND key=?
+        """, [cache, key]).fetchone()
+        if row:
+            obj = pickle.loads(row[0])
+            try:
+                obj.load_from_pickle()
+            except AttributeError:
+                pass
+            return obj
+        else:
+            return None
+
+    def load_all(self, cache):
+        import pickle
+        rows = self.conn.execute("""
+        SELECT key, data FROM caches WHERE name=?
+ """, [cache]) + return ((row[0], pickle.loads(row[1])) + for row in rows) + + +class CacheDict(AttrDict): + def __init__(self, cache, name, *args, loadfunc=None, + itemloadfunc=None, itemdata=None, **kwargs): + self._cache = cache + self._name = name + self._loadfunc = loadfunc + self._itemloadfunc = itemloadfunc + self._itemdata = itemdata + self._args = args + self._kwargs = kwargs + self._loading = False + self._complete = False + + def _loaditem(self, key): + cached = self._cache.load(self._name, key) + if cached: + super().__setitem__(key, cached) + elif self._itemdata is not None: + if key in self._itemdata: + item = self._itemloadfunc(key, self._itemdata[key]) + self._cache.store(self._name, key, item) + self._cache.commit() + super().__setitem__(key, item) + elif self._itemloadfunc: + item = self._itemloadfunc(key) + self._cache.store(self._name, key, item) + self._cache.commit() + super().__setitem__(key, item) + else: + self._loadall() + + def _loadall(self): + if self._complete: + return + loaded = set() + for key, obj in self._cache.load_all(self._name): + loaded.add(key) + super().__setitem__(key, obj) + if self._itemloadfunc: + for key in self._itemdata: + if key not in loaded: + self._loaditem(key) + elif self._loadfunc and not self._loading and not loaded: + self._loadfunc(*self._args, **self._kwargs) + self._loadfunc = None + for key, item in super().items(): + self._cache.store(self._name, key, item) + self._cache.commit() + self._complete = True + + def __enter__(self): + self._loading = True + return self + + def __exit__(self, a, b, c): + self._loading = False + + def __contains__(self, key): + if self._itemdata: + return key in self._itemdata + self._loadall() + return super().__contains__(key) + + def __len__(self): + if self._itemdata: + return len(self._itemdata) + self._loadall() + return super().__len__() + + def __getitem__(self, key): + if not super().__contains__(key): + self._loaditem(key) + return super().__getitem__(key) + + def __setitem__(self, key, val): + super().__setitem__(key, val) + + def __delitem__(self, key): + raise NotImplementedError() + + def __iter__(self): + if self._itemdata: + return self._itemdata.__iter__() + self._loadall() + return super().__iter__() + + def __str__(self): + self._loadall() + return super().__str__() + + def get(self, key, default=None): + if not super().__contains__(key): + self._loaditem(key) + return super().get(key, default) + + def items(self): + self._loadall() + return super().items() + + def keys(self): + if self._itemdata: + return self._itemdata.keys() + return super().keys() + + def values(self): + self._loadall() + return super().values() diff --git a/src/ymp/common.py b/src/ymp/common.py index 7a5df4e5..e29341c9 100644 --- a/src/ymp/common.py +++ b/src/ymp/common.py @@ -144,247 +144,3 @@ def ensure_list(obj): return list(obj) -class NoCache(object): - def __init__(self, root): - self.caches = {} - - def close(self): - pass # NoCache doesn't close anything - - def get_cache(self, name, clean=False, *args, **kwargs): - if name not in self.caches: - self.caches[name] = CacheDict(self, name, *args, **kwargs) - return self.caches[name] - - def store(self, cache, key, obj): - pass # NoCache doesnt store anything - - def commit(self): - pass # NoCache doesnt commit anything - - def load(self, _cache, _key): - return None - - def load_all(self, _cache): - return () - - -class Cache(object): - def __init__(self, root): - import sqlite3 - os.makedirs(os.path.join(root), exist_ok=True) - db_fname = 
os.path.join(root, "ymp.db") - log.debug("Opening database %s", db_fname) - self.conn = sqlite3.connect(db_fname, check_same_thread=False) - - # Drop tables if the database has the wrong version number - # or if the user_version has not been set (defaults to 0) - version = self.conn.execute("PRAGMA user_version").fetchone()[0] - if version == ymp.__numeric_version__ and version != 0: - try: - curs = self.conn.execute("SELECT file, time from stamps") - update = any(os.path.getmtime(row[0]) > row[1] for row in curs) - except FileNotFoundError: - update = True - del curs - if update: - log.error("Dropping cache: files changed") - self.conn.executescript(""" - DROP TABLE caches; - DROP TABLE stamps; - """) - else: - log.info("No cache, loading...") - update = True - - if update: - self.conn.executescript(""" - BEGIN EXCLUSIVE; - DROP TABLE IF EXISTS caches; - CREATE TABLE caches ( - name TEXT, - key TEXT, - data, - PRIMARY KEY (name, key) - ); - DROP TABLE IF EXISTS stamps; - CREATE TABLE stamps ( - file TEXT PRIMARY KEY, - time INT - ); - - PRAGMA user_version={}; - COMMIT; - """.format(ymp.__numeric_version__)) - - self.caches = {} - self.files = {} - - def close(self): - self.conn.close() - - def get_cache(self, name, clean=False, *args, **kwargs): - if name not in self.caches: - self.caches[name] = CacheDict(self, name, *args, **kwargs) - return self.caches[name] - - def store(self, cache, key, obj): - import pickle - - files = ensure_list(getattr(obj, "defined_in", None)) - try: - stamps = [(fn, os.path.getmtime(fn)) - for fn in files - if fn not in self.files] - self.conn.executemany( - "REPLACE INTO stamps VALUES (?,?)", - stamps) - self.files.update(dict(stamps)) - self.conn.execute(""" - REPLACE INTO caches - VALUES (?, ?, ?) - """, [cache, key, pickle.dumps(obj)] - ) - except pickle.PicklingError: - log.error("Failed to pickle %s", obj) - except FileNotFoundError: - pass - - def commit(self): - import sqlite3 - try: - self.conn.commit() - except sqlite3.OperationalError as exc: - log.warning("Cache write failed: %s", exc.what()) - - def load(self, cache, key): - import pickle - row = self.conn.execute(""" - SELECT data FROM caches WHERE name=? AND key=? - """, [cache, key]).fetchone() - if row: - obj = pickle.loads(row[0]) - try: - obj.load_from_pickle() - except AttributeError: - pass - return obj - else: - return None - - def load_all(self, cache): - import pickle - rows = self.conn.execute(""" - SELECT key, data FROM caches WHERE name=? 
- """, [cache]) - return ((row[0], pickle.loads(row[1])) - for row in rows) - - -class CacheDict(AttrDict): - def __init__(self, cache, name, *args, loadfunc=None, - itemloadfunc=None, itemdata=None, **kwargs): - self._cache = cache - self._name = name - self._loadfunc = loadfunc - self._itemloadfunc = itemloadfunc - self._itemdata = itemdata - self._args = args - self._kwargs = kwargs - self._loading = False - self._complete = False - - def _loaditem(self, key): - cached = self._cache.load(self._name, key) - if cached: - super().__setitem__(key, cached) - elif self._itemdata is not None: - if key in self._itemdata: - item = self._itemloadfunc(key, self._itemdata[key]) - self._cache.store(self._name, key, item) - self._cache.commit() - super().__setitem__(key, item) - elif self._itemloadfunc: - item = self._itemloadfunc(key) - self._cache.store(self._name, key, item) - self._cache.commit() - super().__setitem__(key, item) - else: - self._loadall() - - def _loadall(self): - if self._complete: - return - loaded = set() - for key, obj in self._cache.load_all(self._name): - loaded.add(key) - super().__setitem__(key, obj) - if self._itemloadfunc: - for key in self._itemdata: - if key not in loaded: - self._loaditem(key) - elif self._loadfunc and not self._loading and not loaded: - self._loadfunc(*self._args, **self._kwargs) - self._loadfunc = None - for key, item in super().items(): - self._cache.store(self._name, key, item) - self._cache.commit() - self._complete = True - - def __enter__(self): - self._loading = True - return self - - def __exit__(self, a, b, c): - self._loading = False - - def __contains__(self, key): - if self._itemdata: - return key in self._itemdata - self._loadall() - return super().__contains__(key) - - def __len__(self): - if self._itemdata: - return len(self._itemdata) - self._loadall() - return super().__len__() - - def __getitem__(self, key): - if not super().__contains__(key): - self._loaditem(key) - return super().__getitem__(key) - - def __setitem__(self, key, val): - super().__setitem__(key, val) - - def __delitem__(self, key): - raise NotImplementedError() - - def __iter__(self): - if self._itemdata: - return self._itemdata.__iter__() - self._loadall() - return super().__iter__() - - def __str__(self): - self._loadall() - return super().__str__() - - def get(self, key, default=None): - if not super().__contains__(key): - self._loaditem(key) - return super().get(key, default) - - def items(self): - self._loadall() - return super().items() - - def keys(self): - if self._itemdata: - return self._itemdata.keys() - return super().keys() - - def values(self): - self._loadall() - return super().values() diff --git a/src/ymp/config.py b/src/ymp/config.py index 035d9800..a5014aca 100644 --- a/src/ymp/config.py +++ b/src/ymp/config.py @@ -9,7 +9,8 @@ from typing import Mapping, Sequence, Optional import ymp.yaml -from ymp.common import AttrDict, Cache, MkdirDict, parse_number, format_number, parse_time, format_time +from ymp.common import AttrDict, MkdirDict, parse_number, format_number, parse_time, format_time +from ymp.cache import Cache from ymp.env import CondaPathExpander from ymp.exceptions import YmpSystemError, YmpConfigError from ymp.stage import Pipeline, Project, Reference From 31cd75224ad4076c02d1e209f84093a0328880b3 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Mon, 17 Oct 2022 20:07:38 -0600 Subject: [PATCH 083/133] feat!: Move to Snakemake 7.15 - Snakemake added "name" as attribute to conda environment objects. 
This is used for referencing pre-existing named environments. Moving to use '_ymp_name' for our names for now. - The constructor for Env() also changed, adjusted that. - The path attribute has been renamed to address - The index type for workflow.linemaps changed back to the file name from the source cache object - Input, output, log and benchmark may now have a path_modifier object attached to them. Rewrote RecursiveExpander and InheritanceExpander to be able to handle this. - Not going by data type any more, but relying on our map of ruleinfo object attributes and their datatypes and uses. I.e. format=="argstuple" for (args,kwargs[,modifer]) type attributes like input and output. --- src/ymp/__init__.py | 2 +- src/ymp/cli/env.py | 6 +- src/ymp/env.py | 70 +++++++------ src/ymp/rules/00_download.rules | 7 ++ src/ymp/snakemake.py | 179 ++++++++++++++++++++++---------- 5 files changed, 172 insertions(+), 92 deletions(-) diff --git a/src/ymp/__init__.py b/src/ymp/__init__.py index b1c65775..7cb75bf6 100644 --- a/src/ymp/__init__.py +++ b/src/ymp/__init__.py @@ -49,7 +49,7 @@ #: List of versions this version of YMP has been verified to work with snakemake_versions = [ - '6.10.0', + '7.15.2', ] diff --git a/src/ymp/cli/env.py b/src/ymp/cli/env.py index d17f23aa..134a4444 100644 --- a/src/ymp/cli/env.py +++ b/src/ymp/cli/env.py @@ -204,7 +204,7 @@ def remove(envnames): log.warning(f"Removing {len(envs)} environments.") for env in get_envs(envnames).values(): if os.path.exists(env.path): - log.warning("Removing %s (%s)", env.name, env.path) + log.warning("Removing %s (%s)", env._ymp_name, env.path) shutil.rmtree(env.path) @@ -267,7 +267,7 @@ def export(envnames, dest, overwrite, create_missing, skip_missing, filetype): if missing: raise click.UsageError( f"Cannot export uninstalled environment(s): " - f"{', '.join(env.name for env in missing)}.\n" + f"{', '.join(env._ymp_name for env in missing)}.\n" f"Use '-s' to skip these or '-c' to create them prior to export." 
) @@ -328,7 +328,7 @@ def clean(param_all): if param_all: # remove up-to-date environments for env in ymp.env.by_name.values(): if os.path.exists(env.path): - log.warning("Removing %s (%s)", env.name, env.path) + log.warning("Removing %s (%s)", env._ymp_name, env.path) shutil.rmtree(env.path) # remove outdated environments diff --git a/src/ymp/env.py b/src/ymp/env.py index 09d3565a..846daac5 100644 --- a/src/ymp/env.py +++ b/src/ymp/env.py @@ -70,8 +70,9 @@ def __new__(cls, *args, **kwargs): def __init__( self, # Snakemake Params: - env_file: Optional[str] = None, workflow = None, + env_file: Optional[str] = None, + env_name: Optional[str] = None, env_dir = None, container_img=None, cleanup=None, @@ -100,13 +101,12 @@ def __init__( if env_file: if name: - import pdb; pdb.set_trace() raise YmpRuleError( self, "Env must not have both 'name' and 'env_file' parameters'" ) self.dynamic = False - self.name, _ = op.splitext(op.basename(env_file)) + self._ymp_name, _ = op.splitext(op.basename(env_file)) self.packages = None self.base = None self.channels = None @@ -116,7 +116,7 @@ def __init__( self.lineno = 1 elif name: self.dynamic = True - self.name = name + self._ymp_name = name self.packages = ensure_list(packages) + cfg.conda.defaults[base].dependencies self.channels = ensure_list(channels) + cfg.conda.defaults[base].channels env_file = op.join(cfg.ensuredir.dynamic_envs, f"{name}.yml") @@ -141,17 +141,19 @@ def __init__( }) super().__init__( - env_file, - workflow, - env_dir if env_dir else cfg.ensuredir.conda_prefix, - container_img, - cleanup) + workflow = workflow, + env_file = env_file, + env_dir = env_dir if env_dir else cfg.ensuredir.conda_prefix, + container_img = container_img, + cleanup = cleanup + ) + self.register() def _get_dynamic_contents(self): cfg = ymp.get_config() defaults = { - 'name': self.name, + 'name': self._ymp_name, 'dependencies': self.packages, 'channels': self.channels, } @@ -219,15 +221,15 @@ def create(self, dryrun=False, reinstall=False, nospec=False, noarchive=False): """ if self.installed: if reinstall: - log.info("Environment '%s' already exists. Removing...", self.name) + log.info("Environment '%s' already exists. 
Removing...", self._ymp_name) if not dryrun: - shutil.rmtree(self.path, ignore_errors = True) + shutil.rmtree(self.address, ignore_errors = True) else: - log.info("Environment '%s' already exists", self.name) - return self.path + log.info("Environment '%s' already exists", self._ymp_name) + return self.address - log.warning("Creating environment '%s'", self.name) - log.debug("Target dir is '%s'", self.path) + log.warning("Creating environment '%s'", self._ymp_name) + log.debug("Target dir is '%s'", self.address) if noarchive and self.archive_file: log.warning("Removing archived environment packages...") @@ -246,10 +248,10 @@ def create(self, dryrun=False, reinstall=False, nospec=False, noarchive=False): f.write("\n".join(files) + "\n") else: log.warning("Neither spec file nor package archive found for '%s'," - " falling back to native resolver", self.name) + " falling back to native resolver", self._ymp_name) res = super().create(dryrun) - log.info("Created env %s", self.name) + log.info("Created env %s", self._ymp_name) return res def _have_archive(self): @@ -267,7 +269,7 @@ def _have_archive(self): if missing_packages: log.warning( "Ignoring incomplete package archive for environment %s", - self.name) + self._ymp_name) log.debug( "Missing packages: %s", missing_packages) return False @@ -293,7 +295,7 @@ def _get_env_from_spec(self): spec_path = spec_path.replace("BUILTIN:", "") spec_path = op.join(ymp._env_dir, spec_path) for path in (op.join(spec_path, cfg.platform), spec_path): - spec_file = op.join(path, self.name + ".txt") + spec_file = op.join(path, self._ymp_name + ".txt") log.debug("Trying %s", spec_file) if op.exists(spec_file): log.info("Using %s", spec_file) @@ -324,17 +326,17 @@ def _download_files(self, urls, md5s): # remove partially download archive folder? # shutil.rmtree(self.archive_file, ignore_errors=True) raise YmpWorkflowError( - f"Unable to create environment {self.name}, " + f"Unable to create environment {self._ymp_name}, " f"because downloads failed. See log for details.") @property def installed(self): if self.is_containerized: return True # Not checking - if not op.exists(self.path): + if not op.exists(self.address): return False - start_stamp = op.join(self.path, "env_setup_start") - finish_stamp = op.join(self.path, "env_setup_done") + start_stamp = op.join(self.address, "env_setup_start") + finish_stamp = op.join(self.address, "env_setup_done") if op.exists(start_stamp) and not op.exists(finish_stamp): return False return True @@ -342,12 +344,12 @@ def installed(self): def update(self): "Update conda environment" self.create() # call create to make sure environment exists - log.warning("Updating environment '%s'", self.name) - log.warning(f"Running {self.frontend} env update --prune -p {self.path} -f {self.file} -v") + log.warning("Updating environment '%s'", self._ymp_name) + log.warning(f"Running {self.frontend} env update --prune -p {self.address} -f {self.file} -v") return subprocess.run([ self.frontend, "env", "update", "--prune", - "-p", str(self.path), + "-p", str(self.address), "-f", str(self.file), "-v" ]).returncode @@ -358,7 +360,7 @@ def run(self, command): Returns exit code of command run. 
""" command = " ".join(command) - command = snakemake_conda.Conda().shellcmd(self.path, command) + command = snakemake_conda.Conda().shellcmd(self.address, command) cfg = ymp.get_config() log.debug("Running: %s", command) return subprocess.run( @@ -369,30 +371,30 @@ def run(self, command): def export(self, stream, typ='yml'): """Freeze environment""" - log.warning("Exporting environment '%s'", self.name) + log.warning("Exporting environment '%s'", self._ymp_name) if typ == 'yml': res = subprocess.run([ "conda", "env", "export", - "-p", self.path, + "-p", self.address, ], stdout=subprocess.PIPE) yaml = YAML(typ='rt') yaml.default_flow_style = False env = yaml.load(res.stdout) - env['name'] = self.name + env['name'] = self._ymp_name if 'prefix' in env: del env['prefix'] yaml.dump(env, stream) elif typ == 'txt': res = subprocess.run([ "conda", "list", "--explicit", "--md5", - "-p", self.path, + "-p", self.address, ], stdout=stream) return res.returncode def __lt__(self, other): "Comparator for sorting" - return self.name < other.name + return self._ymp_name < other._ymp_name def __repr__(self): return f"{self.__class__.__name__}({self.__dict__!r})" @@ -440,6 +442,6 @@ def format(self, conda_env, *args, **kwargs): for ext in "", ".yml", ".yaml": env_file = abspath+ext if op.exists(env_file): - Env(env_file) + Env(env_file = env_file) return env_file return conda_env diff --git a/src/ymp/rules/00_download.rules b/src/ymp/rules/00_download.rules index a499b43d..38bea0ee 100644 --- a/src/ymp/rules/00_download.rules +++ b/src/ymp/rules/00_download.rules @@ -105,6 +105,13 @@ with Stage("references") as S: """ Template rule for unpacking references provisioned upstream as archive. """ + input: + tar = "dummy.in" + output: + files = "dummy.out" + params: + strip = 0, + prefix = "" message: "Unpacking {input.tar} into {params.prefix}" shell: """ diff --git a/src/ymp/snakemake.py b/src/ymp/snakemake.py index 81870496..4a8f603d 100644 --- a/src/ymp/snakemake.py +++ b/src/ymp/snakemake.py @@ -183,10 +183,12 @@ def update_tuple(self, totuple): 'format': 'argstuple', 'funcparams': ('wildcards',), 'apply_wildcards': True, + 'path_modifier': True, }, 'output': { 'format': 'argstuple', 'apply_wildcards': True, + 'path_modifier': True, }, 'threads': { 'format': 'int', @@ -214,6 +216,7 @@ def update_tuple(self, totuple): 'log': { 'format': 'argstuple', 'apply_wildcards': True, + 'path_modifier': True, }, 'message': { 'format': 'string', @@ -222,6 +225,7 @@ def update_tuple(self, totuple): 'benchmark': { 'format': 'string', 'apply_wildcards': True, + 'path_modifier': True, }, 'wrapper': { 'format': 'string', @@ -238,7 +242,8 @@ def update_tuple(self, totuple): }, 'shellcmd': { 'format': 'string', - 'format_wildcards': True + 'format_wildcards': True, + 'runner': True, }, 'docstring': { 'format': 'string', @@ -248,17 +253,68 @@ def update_tuple(self, totuple): }, 'func': { 'format': 'callable', + 'runner': True, }, 'script': { 'format': 'string', + 'runner': True, + }, + 'cache': { + # indicates whether or not output is cached across workflows + 'format': 'boolean' + }, + 'default_target': { + # whether or not the rule is the default target called when no + # targets specified + 'format': 'boolean' + }, + 'handover': { + # rule takes over entire local node + 'format': 'boolean' + }, + 'is_containerized': { + 'format': 'boolean' + }, + 'wrapper': { + 'format': 'string', # not sure it's really a string + 'runner': True, + }, + 'path_modifier': { + 'format': 'modifier', + }, + 'apply_modifier': { + 'format': 
'modifier', + }, + 'cwl': { + 'format': 'unknown' + }, + 'env_modules': { + 'format': 'string' + }, + 'group': { + 'format': 'string' + }, + 'name': { + 'format': 'string' + }, + 'notebook': { + 'format': 'string', + 'runner': True + }, + 'retries': { + 'format': 'int' + }, + 'template_engine': { + 'format': 'string', + 'runner': True } + # restart_times # env_modules # shadow_depth # group # notebook # cwl - # cache } @@ -426,7 +482,7 @@ def decorate(ruleinfo): # register rule with snakemake try: decorator(ruleinfo) # does not return anything - except AttributeError: + except (AttributeError, ValueError): print_ruleinfo(rule, ruleinfo, log.error) raise @@ -557,8 +613,7 @@ def expand(self, rule, item, expand_args=None, rec=-1, cb=False): elif isinstance(item, tuple): item = self.expand_tuple(rule, item, expand_args, rec, cb) else: - log.debug("Not expanding item '{}' of type {}".format( - repr(item), type(item))) + item = self.expand_unknown(rule, item, expand_args, rec, cb) if debug: log.debug("{}=> {} {}" @@ -566,6 +621,9 @@ def expand(self, rule, item, expand_args=None, rec=-1, cb=False): return item + def expand_unknown(self, rule, item, expand_args, rec, cb): + return item + def expand_ruleinfo(self, rule, item, expand_args, rec): self.current_rule = rule for field in filter(self.expands_field, ruleinfo_fields): @@ -766,26 +824,29 @@ def expand(self, rule, ruleinfo): """Recursively expand wildcards within :class:`RuleInfo` object""" fields = list(filter(lambda x: x is not None, filter(self.expands_field, ruleinfo_fields))) - # normalize field values and create namedlist dictionary + # Fetch original ruleinfo values into a dict of NamedList args = {} + orig_tuples = {} for field in fields: - attr = getattr(ruleinfo, field) - if isinstance(attr, tuple): - if len(attr) != 2: - raise Exception("Internal Error") - # flatten named lists - for key in attr[1]: - if is_container(attr[1][key]): - attr[1][key] = list(flatten(attr[1][key])) - # flatten unnamed and overwrite tuples - # also turn attr[0] into a list, making it mutable - attr = (list(flatten(attr[0])), attr[1]) - - setattr(ruleinfo, field, attr) - args[field] = NamedList(fromtuple=attr) + if getattr(ruleinfo, field, None) is None: + pass + elif ruleinfo_fields[field]["format"] == "argstuple": + unnamed, named, *_ = getattr(ruleinfo, field) + # flatten values + unnamed = list(flatten(unnamed)) + for key in named: + if is_container(named[key]): + named[key] = list(flatten(named[key])) + orig_tuples[field] = (unnamed, named) + args[field] = NamedList(fromtuple=(unnamed, named)) + elif ruleinfo_fields[field].get("path_modifier", False): + string, *_ = getattr(ruleinfo, field, ((), None)) + args[field] = NamedList() + args[field].append(string) else: + string = getattr(ruleinfo, field, None) args[field] = NamedList() - args[field].append(attr) + args[field].append(string) # build graph of expansion dependencies deps = networkx().DiGraph() @@ -862,14 +923,19 @@ def wrapper(wildcards, **kwargs): node, value, valnew)) # update ruleinfo - for name in fields: - attr = getattr(ruleinfo, name) - if isinstance(attr, tuple): - if len(attr) != 2: - raise Exception("Internal Error") - args[name].update_tuple(attr) + for field in fields: + attr = getattr(ruleinfo, field) + if attr is None: + pass + elif ruleinfo_fields[field]["format"] == "argstuple": + args[field].update_tuple(orig_tuples[field]) + unnamed, named = orig_tuples[field] + _, _, *extras = attr + setattr(ruleinfo, field, (unnamed, named, *extras)) + elif 
ruleinfo_fields[field].get("path_modifier", False): + setattr(ruleinfo, field, (args[field][0], attr[1])) else: - setattr(ruleinfo, name, args[name][0]) + setattr(ruleinfo, field, args[field][0]) class InheritanceExpander(BaseExpander): @@ -913,17 +979,18 @@ def __init__(self): def get_code_line(self, rule: Rule) -> str: """Returns the source line defining *rule*""" - cached_file = infer_source_file(rule.snakefile) + # Load and cache Snakefile if rule.snakefile not in self.snakefiles: try: + cached_file = infer_source_file(rule.snakefile) with self.workflow.sourcecache.open(cached_file, "r") as sf: self.snakefiles[rule.snakefile] = sf.readlines() except IOError: raise Exception("Can't parse ...") # `rule.lineno` refers to compiled code. Convert to source line number. - real_lineno = self.workflow.linemaps[cached_file][rule.lineno] + real_lineno = self.workflow.linemaps[rule.snakefile][rule.lineno] return self.snakefiles[rule.snakefile][real_lineno - 1] @@ -940,11 +1007,13 @@ def get_super(self, rule: Rule, ruleinfo: RuleInfo) -> Optional[RuleInfo]: """ self.ruleinfos[rule.name] = ruleinfo # stash original ruleinfos + # If the rule was created with make_rule and has a parent + # attribute set, fetch that. if hasattr(ruleinfo, 'parent'): return ruleinfo.parent.name, self.ruleinfos[ruleinfo.parent.name] + # Otherwise, check the rule definition line for the marker comment line = self.get_code_line(rule) - if "#" in line: comment = line.split("#")[1].strip() if comment.startswith(self.KEYWORD): @@ -960,33 +1029,35 @@ def expand(self, rule, ruleinfo): super_name, super_ruleinfo = self.get_super(rule, ruleinfo) if super_ruleinfo is None: return - for field in dir(ruleinfo): - if field.startswith("__") or field == "parent": + if field.startswith("__") or field in ("parent", "name"): continue - base_attr = getattr(super_ruleinfo, field) - if field not in ("path_modifier", "apply_modifier"): - base_attr = deepcopy(base_attr) override_attr = getattr(ruleinfo, field) + base_attr = getattr(super_ruleinfo, field) - if field in ("shellcmd", "wrapper", "script", "func"): - if not ruleinfo.norun: # child rule is runnable, clear out base - base_attr = None - elif not super_ruleinfo.norun: # base is runnable, child not, clear out child - override_attr = None - - if isinstance(override_attr, tuple): - if base_attr is None: - base_attr = ([], {}) - if override_attr[0]: - base_attr = (override_attr[0], base_attr[1]) - if override_attr[1]: - base_attr[1].update(override_attr[1]) - elif override_attr is not None: - base_attr = override_attr - - setattr(ruleinfo, field, base_attr) + if ruleinfo_fields[field].get("runner", False): + # If the child is not runnable, copy all runner + # attributed from base. 
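+                # Runner attributes are shellcmd, script, wrapper, func,
+                # notebook and template_engine: a norun child inherits
+                # how the base rule runs; a runnable child keeps its own.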
+ if ruleinfo.norun: + setattr(ruleinfo, field, base_attr) + elif override_attr is None: + # Attribute missing in child, take base + setattr(ruleinfo, field, base_attr) + elif base_attr is None: + # Attribute missing in base, do nothing + pass + elif ruleinfo_fields[field]["format"] == "argstuple": + unnamed_child, named_child, *extra_child = override_attr + unnamed_base, named_base, *extra_base = base_attr + unnamed = unnamed_child or unnamed_base + extra = extra_child or extra_base + named = deepcopy(named_base) + named.update(named_child) + setattr(ruleinfo, field, (unnamed, named, *extra)) + else: + # Both set, not argstuple, keep child intact + pass if not ruleinfo.norun or not super_ruleinfo.norun: ruleinfo.norun = False @@ -1054,7 +1125,7 @@ def register(self): cache = self.get_registry() names = [] - for attr in 'name', 'altname': + for attr in 'name', 'altname', '_ymp_name': if hasattr(self, attr): names += ensure_list(getattr(self, attr)) From ab8417317eded0b75d56c0cd86bc21b58b2b09f3 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Mon, 17 Oct 2022 20:16:27 -0600 Subject: [PATCH 084/133] feat: pass frontend to Snakemake so we really get mamba --- src/ymp/cli/make.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ymp/cli/make.py b/src/ymp/cli/make.py index 6b80adf3..44fbd10d 100644 --- a/src/ymp/cli/make.py +++ b/src/ymp/cli/make.py @@ -210,6 +210,7 @@ def start_snakemake(kwargs, submit=False): if log.getEffectiveLevel() < logging.WARNING: kwargs['verbose'] = True kwargs['use_conda'] = True + kwargs['conda_frontend'] = cfg.conda.frontend # expand stack paths stage_stack_failure = None From 3cd9d7317d61f526345243995a3736e21412f866 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Mon, 17 Oct 2022 20:17:18 -0600 Subject: [PATCH 085/133] fix: cannot use !workdir tag for data key in project config --- src/ymp/stage/project.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/ymp/stage/project.py b/src/ymp/stage/project.py index 765c116f..5de2b8c5 100644 --- a/src/ymp/stage/project.py +++ b/src/ymp/stage/project.py @@ -66,9 +66,7 @@ def load_data(self, cfg, key): if not (key in cfg or isinstance(cfg, Sequence)): raise YmpConfigError(cfg, f"Missing key '{key}' in project data config", key=key) value = cfg[key] - if isinstance(value, str): - return self._load_file(cfg, key) - if isinstance(value, Sequence): + if isinstance(value, Sequence) and not isinstance(value, str): return self._rowbind(cfg, key) if isinstance(value, Mapping): command = next(iter(value), None) @@ -80,7 +78,7 @@ def load_data(self, cfg, key): return self._paste(value["paste"]) if command == "table": return self._table(value["table"]) - raise YmpConfigError(cfg, "Unrecognized statement in data config", key=key) + return self._load_file(cfg, key) def _load_file(self, cfg, key): fname = cfg.get_path(key) From 64253f3811bf08f554b88d19d3e5f15e03704ca9 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Mon, 17 Oct 2022 20:18:25 -0600 Subject: [PATCH 086/133] fix: confusing fake-file error returned if reference file/dir mismatch --- src/ymp/stage/reference.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ymp/stage/reference.py b/src/ymp/stage/reference.py index 138441a4..3bb5811b 100644 --- a/src/ymp/stage/reference.py +++ b/src/ymp/stage/reference.py @@ -394,7 +394,9 @@ def get_file(self, filename, isdir=False): local_path = self.files.get(filename) if local_path: if os.path.isdir(local_path) != isdir: - return "YMP_THIS_FILE_MUST_NOT_EXIST" + return (f"YMP 
ERROR: File '{local_path}' should be" + f" {'directory' if isdir else 'file'}" + f" but is not") return local_path log.error(f"{self!r}: Failed to find {filename}") log.warning(f" Available: {self.files}") From 0d966f4f7e084bc50af3c56d78b33d7b920941b4 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 26 Oct 2022 15:57:29 -0600 Subject: [PATCH 087/133] fix: newest click does not like flag_value with multiple=True --- src/ymp/cli/scan.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/ymp/cli/scan.py b/src/ymp/cli/scan.py index 1d524882..593c71e4 100644 --- a/src/ymp/cli/scan.py +++ b/src/ymp/cli/scan.py @@ -126,17 +126,19 @@ def write_csv(self, outfd): @click.option("--out", type=click.File('w')) @click.option("--sample-re", default=".*") @click.option("--folder-re", default=".*") -@click.option("-s", "extra_keys", flag_value="slot", multiple=True) -@click.option("-l", "extra_keys", flag_value="lane", multiple=True) +@click.option("-s", "--export-slot", flag_value="slot") +@click.option("-l", "--export-lane", flag_value="lane") @click.option("-v", "--verbose", count=True) @click.argument("folders", nargs=-1) -def scan(folders, out, sample_re, folder_re, extra_keys, verbose): +def scan(folders, out, sample_re, folder_re, export_slot, export_lane, verbose): if (out is None): raise click.UsageError("--out parameter required") scanner = Scanner(folders) scanner.set_sample_pattern(sample_re) scanner.set_folder_pattern(folder_re) scanner.set_verbosity(verbose) + extra_keys = [export_slot, export_lane] + extra_keys = [key for key in extra_keys if key is not None] scanner.set_extra_keys(list(extra_keys)) scanner.scan() scanner.write_csv(out) From e5923401c685450fa80367bba63ad23e54155178 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 26 Oct 2022 17:08:19 -0600 Subject: [PATCH 088/133] fix: cleanup setting core and job limits --- src/ymp/cli/make.py | 23 ++++++++++++++--------- src/ymp/etc/defaults.yml | 5 +++-- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/ymp/cli/make.py b/src/ymp/cli/make.py index 44fbd10d..40f9fc84 100644 --- a/src/ymp/cli/make.py +++ b/src/ymp/cli/make.py @@ -176,12 +176,13 @@ def start_snakemake(kwargs, submit=False): raise YmpException("internal error - CWD moved out of YMP root?!") cur_path = cur_path[len(root_path)+1:] - # translate renamed arguments to snakemake synopsis + # translate renamed arguments to snakemake synopsis. entries + # mapping to None will be deleted, entries not in this map will be + # copied 1:1, entires with value will be renamed. arg_map = { 'immediate': 'immediate_submit', 'wrapper': 'jobscript', 'scriptname': 'jobname', - 'cluster_cores': 'nodes', 'snake_config': 'config', 'scheduler': 'scheduler', 'drmaa': None, @@ -191,9 +192,11 @@ def start_snakemake(kwargs, submit=False): 'args': None, 'nohup': None } - kwargs = {arg_map.get(key, key): value - for key, value in kwargs.items() - if arg_map.get(key, key) is not None} + kwargs = { + arg_map.get(key, key): value + for key, value in kwargs.items() + if arg_map.get(key, key) is not None + } kwargs['workdir'] = root_path # our debug flag sets a new excepthoook handler, to we use this @@ -260,7 +263,7 @@ def start_snakemake(kwargs, submit=False): @command() @snake_params @click.option( - "--cores", "-j", default=1, metavar="CORES", + "--cores", "-j", default=1, metavar="N", help="The number of parallel threads used for scheduling jobs" ) @click.option( @@ -335,11 +338,13 @@ def make(**kwargs): "60 seconds." 
) @click.option( - "--cluster-cores", "-J", type=int, metavar="N", - help="Limit the maximum number of cores used by jobs submitted at a time" + "--nodes", "-J", type=int, metavar="N", + help="Limit the maximum number of jobs submitted at a time. Note " + "that this does not imply a maximum core count or running job " + "count, but simply limits the number of queued jobs." ) @click.option( - "--cores", "-j", metavar="N", + "--local-cores", "-j", metavar="N", help="Number of local threads to use" ) @click.option( diff --git a/src/ymp/etc/defaults.yml b/src/ymp/etc/defaults.yml index 1f11a78c..836ce7b5 100644 --- a/src/ymp/etc/defaults.yml +++ b/src/ymp/etc/defaults.yml @@ -226,7 +226,8 @@ cluster: # - cluster.x (values from snakemake cluster config) # - rule (rule name) args: {} # arguments for job submission - cluster_cores: 1024 # max number of cores to use in parallel + nodes: 1024 # max jobs queued to cluster engine + local_cores: 4 # max threads used on submit host scriptname: "ymp.{rulename}.{jobid}.sh" command: @@ -234,7 +235,7 @@ cluster: dummy: command: "sh" # command for job submission sync_arg: "" # parameter for sync mode - cluster_cores: 2 + nodes: 2 # Profile for Torque engine torque: command: "qsub" From e147dca2a92224af345c31fc8fa229b1ccdbc8ed Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 26 Oct 2022 17:37:36 -0600 Subject: [PATCH 089/133] tests: fix cache now in module separate from common --- tests/{test_common.py => test_cache.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename tests/{test_common.py => test_cache.py} (97%) diff --git a/tests/test_common.py b/tests/test_cache.py similarity index 97% rename from tests/test_common.py rename to tests/test_cache.py index bb3b4fbd..e55dc252 100644 --- a/tests/test_common.py +++ b/tests/test_cache.py @@ -1,5 +1,5 @@ import ymp -from ymp.common import Cache +from ymp.cache import Cache class LoadFuncs(object): From 53237853deb3f21d4e632b2008a6ec5ac004fd9d Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 27 Oct 2022 12:12:18 -0600 Subject: [PATCH 090/133] tests: fix env list and snakemake_plain --- src/ymp/cli/env.py | 10 +++++----- src/ymp/env.py | 4 ++++ tests/data/snakemake_plain/rules/test.rules | 16 +++++----------- tests/test_cli.py | 2 +- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/ymp/cli/env.py b/src/ymp/cli/env.py index 134a4444..4ee28561 100644 --- a/src/ymp/cli/env.py +++ b/src/ymp/cli/env.py @@ -15,7 +15,7 @@ log = logging.getLogger(__name__) # pylint: disable=invalid-name -ENV_COLUMNS = ('name', 'hash', 'path', 'installed') +ENV_COLUMNS = ('label', 'hash', 'address', 'installed') def get_envs(patterns=None): @@ -327,9 +327,9 @@ def clean(param_all): "Remove unused conda environments" if param_all: # remove up-to-date environments for env in ymp.env.by_name.values(): - if os.path.exists(env.path): - log.warning("Removing %s (%s)", env._ymp_name, env.path) - shutil.rmtree(env.path) + if os.path.exists(env.address): + log.warning("Removing %s (%s)", env._ymp_name, env.address) + shutil.rmtree(env.address) # remove outdated environments for _, path in ymp.env.dead.items(): @@ -347,7 +347,7 @@ def activate(envname): $(ymp activate env [ENVNAME]) """ env = get_env(envname) - print("source activate {}".format(env.path)) + print("source activate {}".format(env.address)) @env.command() diff --git a/src/ymp/env.py b/src/ymp/env.py index 846daac5..2267e7d7 100644 --- a/src/ymp/env.py +++ b/src/ymp/env.py @@ -329,6 +329,10 @@ def _download_files(self, urls, md5s): f"Unable to 
create environment {self._ymp_name}, " f"because downloads failed. See log for details.") + @property + def label(self): + return self._ymp_name + @property def installed(self): if self.is_containerized: diff --git a/tests/data/snakemake_plain/rules/test.rules b/tests/data/snakemake_plain/rules/test.rules index 71062a8c..0e6b45f3 100644 --- a/tests/data/snakemake_plain/rules/test.rules +++ b/tests/data/snakemake_plain/rules/test.rules @@ -1,15 +1,9 @@ rule test: - wildcard_constraints: input: "ymp.yml" output: "{params[0]}.tmp" -# threads: -# resources: - params: "{log}", "{input}" -# priority: + params: + "{version}", + "{input}" version: "{params[1]}" - log: "{version}" -# message: "{version}" -# benchmark: - shell: "touch {output}" - - + log: "{output}.log" + shell: "touch {output} {log}" diff --git a/tests/test_cli.py b/tests/test_cli.py index f065e228..dc1b4449 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -118,7 +118,7 @@ def test_env_list(invoker): res = invoker.call("env", "list") lines = res.output.splitlines() assert len(lines) > 2 - assert lines[0].startswith("name"), "first row should start with name" + assert lines[0].startswith("label"), "first row should start with name" assert all(lines[i].upper() <= lines[i+1].upper() for i in range(2, len(lines)-1)), \ f"output should be sorted: {lines}" From 6841f193c604fededbe730daac43882ee90ae0b6 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 27 Oct 2022 13:03:03 -0600 Subject: [PATCH 091/133] fix: env.path usage in cli --- src/ymp/cli/env.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ymp/cli/env.py b/src/ymp/cli/env.py index 4ee28561..bd9c4823 100644 --- a/src/ymp/cli/env.py +++ b/src/ymp/cli/env.py @@ -48,7 +48,7 @@ def get_env(envname): "".format(envname, envs.keys())) env = next(iter(envs.values())) - if not os.path.exists(env.path): + if not os.path.exists(env.address): log.warning("Environment not yet installed") env.create() return env @@ -203,9 +203,9 @@ def remove(envnames): envs = get_envs(envnames) log.warning(f"Removing {len(envs)} environments.") for env in get_envs(envnames).values(): - if os.path.exists(env.path): - log.warning("Removing %s (%s)", env._ymp_name, env.path) - shutil.rmtree(env.path) + if os.path.exists(env.address): + log.warning("Removing %s (%s)", env._ymp_name, env.address) + shutil.rmtree(env.address) @env.command() From 86b53163cb6a9e0786bbc6c44ca5143659b8daac Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 27 Oct 2022 18:31:04 -0600 Subject: [PATCH 092/133] feat!: use click 8.x built-in complete instead of click-complete Requires reinstalling completion as the interface by click is different from click-complete. Change was necessitated by click-complete apparently not working with click 8.x any more. 
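
For reference, click >= 8 expects completions from a per-parameter
shell_complete() method returning CompletionItem objects instead of the
old global completion hook. A minimal sketch of the new interface,
independent of YMP (stage names below are made up):

    import click
    from click.shell_completion import CompletionItem

    class StageParam(click.ParamType):
        name = "stage"

        def shell_complete(self, ctx, param, incomplete):
            # offer every known stage starting with the typed prefix
            stages = ["trim_bbmap", "assemble_megahit", "map_bwa"]
            return [CompletionItem(s)
                    for s in stages if s.startswith(incomplete)]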
--- environment.yaml | 1 - src/ymp/__main__.py | 10 +++++++--- src/ymp/cli/__init__.py | 3 --- src/ymp/cli/make.py | 7 ++++--- tests/test_cli.py | 39 +++++++++++++++++++++++++++++++-------- 5 files changed, 42 insertions(+), 18 deletions(-) diff --git a/environment.yaml b/environment.yaml index 09f20389..ab7613dd 100644 --- a/environment.yaml +++ b/environment.yaml @@ -8,7 +8,6 @@ dependencies: - mamba - conda !=4.6.11 - click - - click-completion - ruamel.yaml >0.15 # new api - drmaa - pandas >=0.20 # need dtype support in python csv engine diff --git a/src/ymp/__main__.py b/src/ymp/__main__.py index f73608b3..19a1969b 100644 --- a/src/ymp/__main__.py +++ b/src/ymp/__main__.py @@ -1,7 +1,11 @@ -""" -This allows calling the YMP cli via ``python -m`` +"""This allows calling the YMP cli via ``python -m`` >>> python -m ymp.cli show references -v + +Note that we try to behave just like running ``ymp`` from the command +line, rewriting argv[0] and setting the click program name so that +shell expansion works. This is done mostly to assist unit tests. + """ import sys @@ -9,4 +13,4 @@ if __name__ == "__main__": sys.argv[0] = "ymp" - main() + sys.exit(main(prog_name="ymp")) diff --git a/src/ymp/cli/__init__.py b/src/ymp/cli/__init__.py index 64a89d7b..90c2cef3 100644 --- a/src/ymp/cli/__init__.py +++ b/src/ymp/cli/__init__.py @@ -1,5 +1,4 @@ import click -import click_completion import ymp from ymp.cli.env import env @@ -10,8 +9,6 @@ from ymp.cli.init import init from ymp.cli.scan import scan -click_completion.init() - def install_completion(ctx, attr, value): """Installs click_completion tab expansion into users shell""" diff --git a/src/ymp/cli/make.py b/src/ymp/cli/make.py index 40f9fc84..6baa8f92 100644 --- a/src/ymp/cli/make.py +++ b/src/ymp/cli/make.py @@ -7,6 +7,7 @@ import sys import click +from click.shell_completion import CompletionItem import ymp from ymp.cli.shared_options import command, nohup_option, Log @@ -39,14 +40,14 @@ def debug(msg, *args, **kwargs): class TargetParam(click.ParamType): """Handles tab expansion for build targets""" - @classmethod - def complete(cls, ctx, incomplete): + def shell_complete(self, ctx, _param, incomplete): """Try to complete incomplete command This is executed on tab or tab-tab from the shell Args: ctx: click context object + param: current parameter requesting completion incomplete: last word in command line up until cursor Returns: @@ -97,7 +98,7 @@ def complete(cls, ctx, incomplete): if not ext[-1] == "_") debug("res={}", result) - return result + return [CompletionItem(item) for item in result] def snake_params(func): diff --git a/tests/test_cli.py b/tests/test_cli.py index dc1b4449..521c1278 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -338,20 +338,43 @@ def test_completion( exp_len, # expected number of result options (or -1) exp_res # (subset of) expected result options ): + """This tests click completion by launching an external python + process and checking the output it would return to click's bash + code. If things change within click, this code will have to change + too. 
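+
+    A captured exchange looks roughly like this (protocol as of click 8;
+    the completed values are made up):
+
+        COMP_WORDS="ymp make toy." COMP_CWORD=2 _YMP_COMPLETE=bash_complete ymp
+        plain,toy.trim_bbmap
+        plain,toy.assemble_megahit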
+
+    """
+    import subprocess as sp
+    # Set an environment variable that will make expansion code blab
+    # to stderr for debugging:
     envvar('YMP_DEBUG_EXPAND', 'stderr')
-    envvar('_YMP_COMPLETE', 'complete-bash')
+    # Set the trigger variable that will initiate bash completion by
+    # click:
+    envvar('_YMP_COMPLETE', 'bash_complete')
+    # Pass the variables bash would set to request completion of the
+    # 2nd word after the command name, which in this case is the stage
+    # stack name.
     envvar('COMP_CWORD', '2')
     envvar('COMP_WORDS', comp_words)
+    # Run and capture:
     sp.run(["python", "-m", "ymp"])
     cap = capfd.readouterr()
-    result = set(cap.out.split())
-
-    if exp_len != -1:
-        assert len(result) == exp_len, \
-            f"Expected {exp_len} results for '{comp_words}' but got" \
-            f" {len(result)}:\n" \
-            f"{result}"
+    # Click sends one line per expansion in form $type,$value. If the
+    # type is "plain", the value is added as expansion option. If the
+    # type is dir or file, directory or filename expansion is enabled
+    # in case no values match, and $value is ignored. We wrap types
+    # other than plain in double underscore and otherwise keep the
+    # value to compare to expected test results.
+    result = set(
+        val if typ == "plain" else f"__{typ}__"
+        for typ, val in (line.split(",") for line in cap.out.split())
+    )
+
+    assert exp_len == -1 or len(result) == exp_len, \
+        f"Expected {exp_len} results for '{comp_words}' but got" \
+        f" {len(result)}:\n" \
+        f"{result}"
     assert exp_res.issubset(result), \
         f"Completion for '{comp_words}' is missing: {exp_res - result}"

From 774ad7ed2f7bf45f8b18100025afb451b9165818 Mon Sep 17 00:00:00 2001
From: Elmar Pruesse
Date: Thu, 27 Oct 2022 18:52:22 -0600
Subject: [PATCH 093/133] tests: make test_pipeline_hide robust against spam
 on console

---
 tests/test_pipeline.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 5bada3b5..d5322808 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -14,9 +14,19 @@ def test_pipeline_hide(invoker, demo_dir):
     """Checks that hiding of pipeline intermediary outputs works"""
     res = invoker.call("make", "toy.mypipeline", "--dag", "-qq")
-
-    # This line will segfault if there is any extra data in res!
-    dotgraph = pgv.AGraph(res.output)
+    # Graphviz is really fragile w.r.t. input graph format. We need to
+    # make sure it gets fed the graph and only the graph, otherwise it
+    # will segfault on us.
+    # The graph starts with "digraph". Make sure we have that.
+    assert "digraph" in res.output
+    # Cut off anything before it. Keeping snakemake quiet is just too
+    # fragile. Something always talks, so we just cut that off to make
+    # testing robust.
+    graphtext = res.output[res.output.index("digraph"):]
+    # The last line minus white space must comprise a "}" ending the graph.
+    assert graphtext.splitlines()[-1].strip() == "}"
+    # Fingers crossed...
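+    # (pgv.AGraph() parses the DOT text isolated above; pygraphviz still
+    #  crashes hard on any non-DOT input, hence the careful trimming.)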
+ dotgraph = pgv.AGraph(graphtext) graph = nx.DiGraph(dotgraph) nodemap = { From a6c55d3bd2fa56ce6d9177691409bf53a18d4d28 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 27 Oct 2022 20:11:05 -0600 Subject: [PATCH 094/133] feat!: disable caching to fix weird sqlite issues; slow tab complete now --- src/ymp/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ymp/config.py b/src/ymp/config.py index a5014aca..2fd2fba5 100644 --- a/src/ymp/config.py +++ b/src/ymp/config.py @@ -10,7 +10,7 @@ import ymp.yaml from ymp.common import AttrDict, MkdirDict, parse_number, format_number, parse_time, format_time -from ymp.cache import Cache +from ymp.cache import Cache, NoCache from ymp.env import CondaPathExpander from ymp.exceptions import YmpSystemError, YmpConfigError from ymp.stage import Pipeline, Project, Reference @@ -354,7 +354,7 @@ def __init__(self, root, conffiles): self.cachedir = os.path.join(XDG_CACHE_HOME, "ymp") self._config = ymp.yaml.load(conffiles, root) - self.cache = cache = Cache(self.cachedir) + self.cache = cache = NoCache(self.cachedir) # lazy filled by accessors self._snakefiles = None From f122bae55970c2adb99f17c7daad57fadda7b9b1 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 27 Oct 2022 20:12:50 -0600 Subject: [PATCH 095/133] tests: unload config before and after changing cwd --- tests/conftest.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index e5d789dd..9e08e594 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -87,8 +87,12 @@ def saved_tmpdir(request, tmpdir): @pytest.fixture() def saved_cwd(saved_tmpdir): + # unload everything that may have depended on previous location + ymp.get_config().unload() with saved_tmpdir.as_cwd(): yield saved_tmpdir + # do it after to be safe + ymp.get_config().unload() # Inject executables into PATH From d732e467afef675a7abf29372725b807797bb973 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 27 Oct 2022 20:13:23 -0600 Subject: [PATCH 096/133] tests: fix changed message for reference is not a dir --- tests/test_reference.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_reference.py b/tests/test_reference.py index b729ea5a..34978a3d 100644 --- a/tests/test_reference.py +++ b/tests/test_reference.py @@ -362,7 +362,8 @@ def test_duplicate_file(saved_cwd, check_show): def test_get_file(saved_cwd): ref = Reference("test", make_cfg("- type: fasta", " url: somewhere.fasta.gz")) assert ref.get_file("ALL.fasta.gz") == "somewhere.fasta.gz" - assert ref.get_file("ALL.fasta.gz", isdir=True) == "YMP_THIS_FILE_MUST_NOT_EXIST" + assert ref.get_file("ALL.fasta.gz", isdir=True) == \ + "YMP ERROR: File 'somewhere.fasta.gz' should be directory but is not" assert ref.get_file("blabla").startswith("YMP_FILE_NOT_FOUND") From 68b72bc2d44f9206c4ec5c66ac6c0f4ffe46738c Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 27 Oct 2022 20:15:38 -0600 Subject: [PATCH 097/133] style: fix type annotation --- src/ymp/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ymp/__init__.py b/src/ymp/__init__.py index 7cb75bf6..06298e0e 100644 --- a/src/ymp/__init__.py +++ b/src/ymp/__init__.py @@ -53,7 +53,7 @@ ] -def get_config() -> 'config.ConfigMgr': +def get_config() -> 'ymp.config.ConfigMgr': """Access the current YMP configuration object. 
This object might change once during normal execution: it is From b6ce9f15295574f240972f64edc1dfd4c2669eed Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 27 Oct 2022 20:48:21 -0600 Subject: [PATCH 098/133] refactor: rename item in expand_ruleinfo to ruleinfo --- src/ymp/stage/expander.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/ymp/stage/expander.py b/src/ymp/stage/expander.py index 3b70bb49..c4634bf5 100644 --- a/src/ymp/stage/expander.py +++ b/src/ymp/stage/expander.py @@ -17,23 +17,23 @@ class StageExpander(ColonExpander): - Registers rules with stages when they are created """ - def expand_ruleinfo(self, rule, item, expand_args, rec): + def expand_ruleinfo(self, rule, ruleinfo, expand_args, rec): stage = Stage.get_active() if not stage: - return item + return ruleinfo stage.add_rule(rule, self.workflow) - if not item.conda_env and stage.conda_env: - item.conda_env = stage.conda_env + if not ruleinfo.conda_env and getattr(stage, "conda_env", False): + ruleinfo.conda_env = stage.conda_env - if getattr(stage, "params", None): - if not item.params: - item.params = ((), {}) + if getattr(stage, "params", False): + if not ruleinfo.params: + ruleinfo.params = ((), {}) for param in stage.params: - item.params[1][param.name] = param.parse + ruleinfo.params[1][param.name] = param.parse - return super().expand_ruleinfo(rule, item, expand_args, rec) + return super().expand_ruleinfo(rule, ruleinfo, expand_args, rec) def expand_str(self, rule, item, expand_args, rec, cb): if cb: From fa2de5db75154681bcd98f289a200eb090b31f4b Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 27 Oct 2022 20:48:50 -0600 Subject: [PATCH 099/133] tests: fix cfg.unload() does not correctly unset active stage --- src/ymp/config.py | 2 +- src/ymp/stage/base.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/ymp/config.py b/src/ymp/config.py index 2fd2fba5..534fa16f 100644 --- a/src/ymp/config.py +++ b/src/ymp/config.py @@ -341,7 +341,7 @@ def unload(cls): cls.__instance = None from ymp.stage import Stage, StageStack StageStack.stacks = {} - Stage.active = None + Stage.set_active(None) def __init__(self, root, conffiles): log.debug("Inizializing ConfigMgr") diff --git a/src/ymp/stage/base.py b/src/ymp/stage/base.py index 2eafc57c..f01cedac 100644 --- a/src/ymp/stage/base.py +++ b/src/ymp/stage/base.py @@ -196,8 +196,10 @@ def has_checkpoint(self) -> bool: return False class Activateable: - """ - Mixin for Stages that can be filled with rules from Snakefiles. + """Mixin for Stages that can be filled with rules from Snakefiles. + + There can be only one active stage across all classes deriving + from this. 
""" #: Currently active stage ("entered") _active: Optional[BaseStage] = None From aaaafc8148c37e5cea59a7b2ba0704e328f6805b Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Thu, 27 Oct 2022 21:21:11 -0600 Subject: [PATCH 100/133] fix(config): !workdir and string cannot override one another --- src/ymp/yaml.py | 14 ++++++++++++-- tests/test_yaml.py | 9 +++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/ymp/yaml.py b/src/ymp/yaml.py index 2ec1be02..a57376e8 100644 --- a/src/ymp/yaml.py +++ b/src/ymp/yaml.py @@ -246,14 +246,24 @@ def _finditem(self, key): items = [(fn, m[key]) for fn, m in self._maps if key in m] if not items: raise KeyError(f"key '{key}' not found in any map") - typs = set(type(m[1]) for m in items if m[1]) + # Mappings, Sequences and Atomic types should not override one + # another, can only have one of those and None. + def get_type(obj): + if isinstance(obj, Mapping): + return "Mapping" + if isinstance(obj, str): + return "Scalar" + if isinstance(obj, Sequence): + return "Sequence" + return "Scalar" + typs = set(get_type(m[1]) for m in items if m[1]) if len(typs) > 1: stack = [Entry(fn, m, key) for fn, m in self._maps if key in m] raise MixedTypeError( self, f"Mixed data types for key '{key}'s in present in files", key = key, - stack=stack + stack = stack ) return items diff --git a/tests/test_yaml.py b/tests/test_yaml.py index 88da79e2..95dd80aa 100644 --- a/tests/test_yaml.py +++ b/tests/test_yaml.py @@ -19,6 +19,15 @@ def test_mixed_type(saved_tmpdir): excinfo.value.show() +def test_mixed_type_tag_workdir(saved_tmpdir): + with open(saved_tmpdir / "ymp.yml", "w") as fdes: + fdes.write("data: string") + with open(saved_tmpdir / "other.yml", "w") as fdes: + fdes.write("data: !workdir string") + config = yaml.load([saved_tmpdir / "ymp.yml", saved_tmpdir / "other.yml"]) + assert config.get_path("data") == "string" + + def test_recusion_in_includes(saved_tmpdir): with open(saved_tmpdir / "ymp.yml", "w") as fdes: fdes.write("include: other.yaml") From c6c8e14e1d237a86c4001bb2f041986559c1e5a1 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Fri, 28 Oct 2022 14:43:07 -0600 Subject: [PATCH 101/133] tests: make test_complete more verbose about issues --- tests/test_cli.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 521c1278..32d7d1b2 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -366,10 +366,15 @@ def test_completion( # in case no values match, and $value is ignored. We wrap types # other than plain in double underscore and otherwise keep the # value to compare to expected test results. 
-    result = set(
-        val if typ == "plain" else f"__{typ}__"
-        for typ, val in (line.split(",") for line in cap.out.split())
-    )
+    lines = cap.out.splitlines()
+    result = set()
+    for line in lines:
+        assert line.count(",") == 1, f"wrong field count in {line}"
+        typ, val = line.split(",")
+        if typ == "plain":
+            result.add(val)
+        else:
+            result.add(f"__{typ}__")
 
     assert exp_len == -1 or len(result) == exp_len, \
         f"Expected {exp_len} results for '{comp_words}' but got" \

From 1b54b89fe57eb6bf0f721634eda79811da2cfdf1 Mon Sep 17 00:00:00 2001
From: Elmar Pruesse
Date: Fri, 28 Oct 2022 17:13:32 -0600
Subject: [PATCH 102/133] feat: abort if snakemake too old, be helpful if
 newer than tested

---
 src/ymp/__init__.py     |   9 +-
 src/ymp/cli/make.py     |   4 +
 src/ymp/snakemake.py    | 502 ++++++++++++++++++++++------------------
 tests/test_snakemake.py |  64 ++++-
 4 files changed, 350 insertions(+), 229 deletions(-)

diff --git a/src/ymp/__init__.py b/src/ymp/__init__.py
index 06298e0e..a648ef06 100644
--- a/src/ymp/__init__.py
+++ b/src/ymp/__init__.py
@@ -47,10 +47,11 @@
 #: >>> ymp make broken -vvv
 print_rule = 0
 
-#: List of versions this version of YMP has been verified to work with
-snakemake_versions = [
-    '7.15.2',
-]
+#: Minimal version of snakemake required
+snakemake_minimum_version = "7.15"
+#: Latest version of snakemake that was tested (breaking changes for
+#: us can happen at patch level)
+snakemake_tested_version = "7.17"
 
 
 def get_config() -> 'ymp.config.ConfigMgr':
diff --git a/src/ymp/cli/make.py b/src/ymp/cli/make.py
index 6baa8f92..3a3eabd8 100644
--- a/src/ymp/cli/make.py
+++ b/src/ymp/cli/make.py
@@ -254,6 +254,10 @@ def start_snakemake(kwargs, submit=False):
         # snakemake.
         cfg.unload()
 
+    # Check snakemake version
+    from ymp.snakemake import check_snakemake
+    check_snakemake()
+
     import snakemake
     res = snakemake.snakemake(ymp._snakefile, **kwargs)
     if not res and stage_stack_failure:
diff --git a/src/ymp/snakemake.py b/src/ymp/snakemake.py
index 4a8f603d..64211ee6 100644
--- a/src/ymp/snakemake.py
+++ b/src/ymp/snakemake.py
@@ -10,18 +10,26 @@
 from inspect import Parameter, signature, stack
 from typing import Optional
 
-from snakemake.exceptions import CreateRuleException, RuleException  # type: ignore
-from snakemake.io import AnnotatedString, apply_wildcards, \
-    strip_wildcard_constraints  # type: ignore
+import snakemake
+from snakemake.exceptions import (
+    CreateRuleException,
+    RuleException,
+)  # type: ignore
+from snakemake.io import (
+    AnnotatedString,
+    apply_wildcards,
+    strip_wildcard_constraints,
+)  # type: ignore
 from snakemake.io import Namedlist as _Namedlist  # type: ignore
 from snakemake.rules import Rule  # type: ignore
 from snakemake.workflow import RuleInfo, Workflow  # type: ignore
 from snakemake.sourcecache import infer_source_file  # type: ignore
 
+from packaging import version
 
 import ymp
 from ymp.common import ensure_list, flatten, is_container
-from ymp.exceptions import YmpRuleError
+from ymp.exceptions import YmpRuleError, YmpPrettyException
 from ymp.string import ProductFormatter, make_formatter
 
 
@@ -31,23 +39,43 @@
 get_names = partial_formatter.get_names
 
 
+class IncompatibleVersionException(YmpPrettyException):
+    """Raised when required packages do not match version requirements"""
+
+
 def check_snakemake() -> bool:
     prev_result = getattr(check_snakemake, "result", None)
     if prev_result is not None:
         return prev_result
-    import snakemake
-    check_snakemake.result = snakemake.__version__ in ymp.snakemake_versions
-    if not check_snakemake.result:
-        log.fatal("YMP-%s was not verified to
work with Snakemake-%s", - ymp.__version__, snakemake.__version__) - return check_snakemake.result + + have_vers = version.parse(snakemake.__version__) + need_vers = version.parse(ymp.snakemake_minimum_version) + test_vers = version.parse(ymp.snakemake_tested_version) + if have_vers < need_vers: + raise IncompatibleVersionException( + f"Snakemake version {need_vers} required but {have_vers} installed" + ) + if have_vers > test_vers: + log.warning( + "Snakemake %s found is newer than the latest version (%s) verified to" + " work with YMP-%s. If you encounter unexpected errors, please" + " downgrade Snakemake or upgrade YMP.", + have_vers, + test_vers, + version.parse(ymp.__version__), + ) + check_snakemake.result = True + return True def networkx(): import networkx + if networkx.__version__[0] != "2": - log.fatal("Networkx version 2.* required by YMP but {} found" - "".format(networkx.__version__)) + log.fatal( + "Networkx version 2.* required by YMP but {} found" + "".format(networkx.__version__) + ) sys.exit(1) return networkx @@ -60,14 +88,11 @@ def print_ruleinfo(rule: Rule, ruleinfo: RuleInfo, func=log.debug): ruleinfo: Matching RuleInfo object to be printed func: Function used for printing (default is log.error) """ - func("rule {}".format({'n': rule.name, - 'l': rule.lineno, - 's': rule.snakefile})) + func("rule {}".format({"n": rule.name, "l": rule.lineno, "s": rule.snakefile})) for attr in dir(ruleinfo): if attr.startswith("__"): continue - func(" {}: {}".format(attr, - getattr(ruleinfo, attr, ""))) + func(" {}: {}".format(attr, getattr(ruleinfo, attr, ""))) func(ruleinfo.func.__code__) @@ -87,25 +112,28 @@ class ExpandLateException(Exception): class CircularReferenceException(YmpRuleError): """Exception raised if parameters in rule contain a circular reference""" + def __init__(self, deps, rule): nodes = [n[0] for n in networkx().find_cycle(deps)] message = "Circular reference in rule {}\n'{}'".format( - rule, " => ".join(nodes + [nodes[0]])) + rule, " => ".join(nodes + [nodes[0]]) + ) rule.filename = rule.snakefile super().__init__(rule, message) class InheritanceException(RuleException): """Exception raised for errors during rule inheritance""" - def __init__(self, msg, rule, parent, - include=None, lineno=None, snakefile=None): - message = "'{}' when deriving {} from {}".format( - msg, rule.name, parent) - super().__init__(message=message, - include=include, - lineno=lineno, - snakefile=snakefile, - rule=rule) + + def __init__(self, msg, rule, parent, include=None, lineno=None, snakefile=None): + message = "'{}' when deriving {} from {}".format(msg, rule.name, parent) + super().__init__( + message=message, + include=include, + lineno=lineno, + snakefile=snakefile, + rule=rule, + ) class NamedList(_Namedlist): @@ -122,6 +150,7 @@ class NamedList(_Namedlist): :class:`ruleinfo` structures. """ + def __init__(self, fromtuple=None, **kwargs): """""" # blank out docstring in super class w different formatting super().__init__(**kwargs) @@ -151,7 +180,6 @@ def get_names(self, *args, **kwargs): """Export ``get_names`` as public func""" return self._get_names(*args, *kwargs) - def update_tuple(self, totuple): """Update values in ``(args, kwargs)`` tuple. 
@@ -176,139 +204,120 @@ def update_tuple(self, totuple): #: describes attributes of :py:class:`snakemake.workflow.RuleInfo` ruleinfo_fields = { - 'wildcard_constraints': { - 'format': 'argstuple', # len(t[0]) must be == 0 + "wildcard_constraints": { + "format": "argstuple", # len(t[0]) must be == 0 }, - 'input': { - 'format': 'argstuple', - 'funcparams': ('wildcards',), - 'apply_wildcards': True, - 'path_modifier': True, + "input": { + "format": "argstuple", + "funcparams": ("wildcards",), + "apply_wildcards": True, + "path_modifier": True, }, - 'output': { - 'format': 'argstuple', - 'apply_wildcards': True, - 'path_modifier': True, + "output": { + "format": "argstuple", + "apply_wildcards": True, + "path_modifier": True, }, - 'threads': { - 'format': 'int', - 'funcparams': ('wildcards', 'input', 'attempt', 'threads') + "threads": { + "format": "int", + "funcparams": ("wildcards", "input", "attempt", "threads") # stored as resources._cores }, - 'resources': { - 'format': 'argstuple', # len(t[0]) must be == 0, t[1] must be ints - 'funcparams': ('wildcards', 'input', 'attempt', 'threads'), + "resources": { + "format": "argstuple", # len(t[0]) must be == 0, t[1] must be ints + "funcparams": ("wildcards", "input", "attempt", "threads"), }, - 'params': { - 'format': 'argstuple', - 'funcparams': ('wildcards', 'input', 'resources', 'output', 'threads'), - 'apply_wildcards': True, + "params": { + "format": "argstuple", + "funcparams": ("wildcards", "input", "resources", "output", "threads"), + "apply_wildcards": True, }, - 'shadow_depth': { - 'format': 'string_or_true', + "shadow_depth": { + "format": "string_or_true", }, - 'priority': { - 'format': 'numeric', + "priority": { + "format": "numeric", }, - 'version': { - 'format': 'object', + "version": { + "format": "object", }, - 'log': { - 'format': 'argstuple', - 'apply_wildcards': True, - 'path_modifier': True, + "log": { + "format": "argstuple", + "apply_wildcards": True, + "path_modifier": True, }, - 'message': { - 'format': 'string', - 'format_wildcards': True, + "message": { + "format": "string", + "format_wildcards": True, }, - 'benchmark': { - 'format': 'string', - 'apply_wildcards': True, - 'path_modifier': True, + "benchmark": { + "format": "string", + "apply_wildcards": True, + "path_modifier": True, }, - 'wrapper': { - 'format': 'string', + "wrapper": { + "format": "string", # sets conda_env }, - 'conda_env': { - 'format': 'string', # path, relative to cwd or abs - 'apply_wildcards': True, + "conda_env": { + "format": "string", # path, relative to cwd or abs + "apply_wildcards": True, # works only with shell/script/wrapper, not run }, - 'container_img': { - 'format': 'string', + "container_img": { + "format": "string", # works ony with shell/script/wrapper, not run }, - 'shellcmd': { - 'format': 'string', - 'format_wildcards': True, - 'runner': True, + "shellcmd": { + "format": "string", + "format_wildcards": True, + "runner": True, }, - 'docstring': { - 'format': 'string', + "docstring": { + "format": "string", }, - 'norun': { # does the rule have executable data? - 'format': 'bool', + "norun": { # does the rule have executable data? 
+ "format": "bool", }, - 'func': { - 'format': 'callable', - 'runner': True, + "func": { + "format": "callable", + "runner": True, }, - 'script': { - 'format': 'string', - 'runner': True, + "script": { + "format": "string", + "runner": True, }, - 'cache': { + "cache": { # indicates whether or not output is cached across workflows - 'format': 'boolean' + "format": "boolean" }, - 'default_target': { + "default_target": { # whether or not the rule is the default target called when no # targets specified - 'format': 'boolean' + "format": "boolean" }, - 'handover': { + "handover": { # rule takes over entire local node - 'format': 'boolean' + "format": "boolean" }, - 'is_containerized': { - 'format': 'boolean' + "is_containerized": {"format": "boolean"}, + "wrapper": { + "format": "string", # not sure it's really a string + "runner": True, }, - 'wrapper': { - 'format': 'string', # not sure it's really a string - 'runner': True, + "path_modifier": { + "format": "modifier", }, - 'path_modifier': { - 'format': 'modifier', + "apply_modifier": { + "format": "modifier", }, - 'apply_modifier': { - 'format': 'modifier', - }, - 'cwl': { - 'format': 'unknown' - }, - 'env_modules': { - 'format': 'string' - }, - 'group': { - 'format': 'string' - }, - 'name': { - 'format': 'string' - }, - 'notebook': { - 'format': 'string', - 'runner': True - }, - 'retries': { - 'format': 'int' - }, - 'template_engine': { - 'format': 'string', - 'runner': True - } - + "cwl": {"format": "unknown"}, + "env_modules": {"format": "string"}, + "group": {"format": "string"}, + "name": {"format": "string"}, + "notebook": {"format": "string", "runner": True}, + "retries": {"format": "int"}, + "template_engine": {"format": "string", "runner": True} # restart_times # env_modules # shadow_depth @@ -320,6 +329,7 @@ def update_tuple(self, totuple): class ExpandableWorkflow(Workflow): """Adds hook for additional rule expansion methods to Snakemake""" + global_workflow = None __expanders = [] @@ -348,6 +358,7 @@ def activate(cls): # Remove log stream handler installed by Snakemake from snakemake.logging import logger, ColorizingStreamHandler + for handler in logger.logger.handlers: if isinstance(handler, ColorizingStreamHandler): logger.logger.removeHandler(handler) @@ -408,15 +419,16 @@ def clear(cls): # make sure there is no workflow in snakemake either # (we try to load that in activate()) import snakemake.workflow + snakemake.workflow.workflow = None def add_rule( - self, - name=None, - lineno=None, - snakefile=None, - checkpoint=False, - allow_overwrite=False + self, + name=None, + lineno=None, + snakefile=None, + checkpoint=False, + allow_overwrite=False, ): """Add a rule. @@ -428,11 +440,7 @@ def add_rule( # super().add_rule() dynamically creates a name if `name` is None # stash the name so we can access it from `get_rule` self._last_rule_name = super().add_rule( - name, - lineno, - snakefile, - checkpoint, - allow_overwrite + name, lineno, snakefile, checkpoint, allow_overwrite ) return self._last_rule_name @@ -489,8 +497,7 @@ def decorate(ruleinfo): return decorate -def make_rule(name: str=None, lineno: int=None, snakefile: str=None, - **kwargs): +def make_rule(name: str = None, lineno: int = None, snakefile: str = None, **kwargs): log.debug("Synthesizing rule {}".format(name)) ruleinfo = RuleInfo(lambda: None) for arg in kwargs: @@ -525,8 +532,9 @@ def link_workflow(self, workflow): May be called multiple times if a new workflow object is created. 
""" - log.debug("Linking %s with %s", - self.__class__.__name__, workflow.__class__.__name__) + log.debug( + "Linking %s with %s", self.__class__.__name__, workflow.__class__.__name__ + ) self.workflow = workflow def format(self, item, *args, **kwargs): @@ -590,9 +598,17 @@ def expand(self, rule, item, expand_args=None, rec=-1, cb=False): expand_args = {} debug = ymp.print_rule or getattr(rule, "_ymp_print_rule", False) if debug: - log.debug("{}{}: ({}) {} in rule {} with args {}" - "".format(" "*rec*4, type(self).__name__, - type(item).__name__, item, rule, expand_args)) + log.debug( + "{}{}: ({}) {} in rule {} with args {}" + "".format( + " " * rec * 4, + type(self).__name__, + type(item).__name__, + item, + rule, + expand_args, + ) + ) if item is None: item = None elif isinstance(item, RuleInfo): @@ -602,7 +618,7 @@ def expand(self, rule, item, expand_args=None, rec=-1, cb=False): item = self.expand_str(rule, item, expand_args, rec, cb) except RemoveValue: item = None - elif hasattr(item, '__call__'): + elif hasattr(item, "__call__"): item = self.expand_func(rule, item, expand_args, rec, debug) elif isinstance(item, int) or isinstance(item, float): pass @@ -616,8 +632,9 @@ def expand(self, rule, item, expand_args=None, rec=-1, cb=False): item = self.expand_unknown(rule, item, expand_args, rec, cb) if debug: - log.debug("{}=> {} {}" - "".format(" "*(rec*4), type(item).__name__, item)) + log.debug( + "{}=> {} {}" "".format(" " * (rec * 4), type(item).__name__, item) + ) return item @@ -627,8 +644,8 @@ def expand_unknown(self, rule, item, expand_args, rec, cb): def expand_ruleinfo(self, rule, item, expand_args, rec): self.current_rule = rule for field in filter(self.expands_field, ruleinfo_fields): - expand_args['field'] = field - expand_args['ruleinfo'] = item + expand_args["field"] = field + expand_args["ruleinfo"] = item attr = getattr(item, field) value = self.expand(rule, attr, expand_args=expand_args, rec=rec) setattr(item, field, value) @@ -636,7 +653,7 @@ def expand_ruleinfo(self, rule, item, expand_args, rec): return item def expand_str(self, rule, item, expand_args, rec, cb): - expand_args['rule'] = rule + expand_args["rule"] = rule try: return self.format_annotated(item, expand_args) except (KeyError, TypeError, ExpandLateException): @@ -644,24 +661,32 @@ def expand_str(self, rule, item, expand_args, rec, cb): if cb: raise expand_args = expand_args.copy() + def item_wrapped(wc): - expand_args['wc'] = wc + expand_args["wc"] = wc return self.expand(rule, item, expand_args, cb=True) + return item_wrapped def expand_func(self, rule, item, expand_args, rec, debug): expand_args = expand_args.copy() + @functools.wraps(item) def late_expand(*args, **kwargs): if debug: - log.debug("{}{} late {} {} ".format( - " "*rec*4, type(self).__name__, args, kwargs)) - expand_args['wc'] = args[0] - res = self.expand(rule, item(*args, **kwargs), - expand_args, rec=rec, cb=True) + log.debug( + "{}{} late {} {} ".format( + " " * rec * 4, type(self).__name__, args, kwargs + ) + ) + expand_args["wc"] = args[0] + res = self.expand( + rule, item(*args, **kwargs), expand_args, rec=rec, cb=True + ) if debug: - log.debug("{}=> '{}'".format(" "*rec*4, res)) + log.debug("{}=> '{}'".format(" " * rec * 4, res)) return res + return late_expand def _make_list_wrapper(self, value): @@ -670,28 +695,32 @@ def wrapper(*args, **kwargs): for subitem in value: if callable(subitem): subparms = signature(subitem).parameters - extra_args = { - k: v - for k, v in kwargs.items() - if k in subparms - } + extra_args = {k: v for 
k, v in kwargs.items() if k in subparms} res.append(subitem(*args, **extra_args)) else: res.append(subitem) return res + # Gather the arguments - parms = tuple(set(flatten([ - list(signature(x).parameters.values()) - for x in value if callable(x) - ]))) + parms = tuple( + set( + flatten( + [ + list(signature(x).parameters.values()) + for x in value + if callable(x) + ] + ) + ) + ) # Rewrite signature wrapper.__signature__ = signature(wrapper).replace(parameters=parms) return wrapper def expand_dict(self, rule, item, expand_args, rec): - path = expand_args.get('path', list()) + path = expand_args.get("path", list()) for key, value in tuple(item.items()): - expand_args['path'] = path + [key] + expand_args["path"] = path + [key] value = self.expand(rule, value, expand_args=expand_args, rec=rec) # Snakemake can't have functions in lists in dictionaries. @@ -705,12 +734,13 @@ def expand_dict(self, rule, item, expand_args, rec): return item def expand_list(self, rule, item, expand_args, rec, cb): - path = expand_args.get('path', list()) + path = expand_args.get("path", list()) res = list() for n, subitem in enumerate(item): - expand_args['path'] = path + [str(n)] - newitem = self.expand(rule, subitem, expand_args=expand_args, - rec=rec, cb=cb) + expand_args["path"] = path + [str(n)] + newitem = self.expand( + rule, subitem, expand_args=expand_args, rec=rec, cb=cb + ) if newitem is not None: res.append(newitem) return res @@ -726,12 +756,13 @@ class SnakemakeExpander(BaseExpander): the functions provided themselves. Since we never want ``{input}`` to be in a string returned as a file, we expand those always. """ + def expands_field(self, field): - return field in ('input', 'output') + return field in ("input", "output") def format(self, item, *args, **kwargs): - if 'wc' in kwargs: - item = apply_wildcards(item, kwargs['wc']) + if "wc" in kwargs: + item = apply_wildcards(item, kwargs["wc"]) return item @@ -739,6 +770,7 @@ class FormatExpander(BaseExpander): """ Expander using a custom formatter object. """ + regex = re.compile( r""" \{ @@ -746,7 +778,9 @@ class FormatExpander(BaseExpander): (?P[^{}]+) ))\1 \} - """, re.VERBOSE) + """, + re.VERBOSE, + ) spec = "{{{}}}" @@ -769,22 +803,25 @@ def parse(self, format_string): start = 0 for match in self.expander.regex.finditer(format_string): - yield (format_string[start:match.start()], - match.group('name'), '', None) + yield ( + format_string[start : match.start()], + match.group("name"), + "", + None, + ) start = match.end() - yield (format_string[start:], - None, None, None) + yield (format_string[start:], None, None, None) def get_names(self, pattern): - return set(match.group('name') - for match in self.regex.finditer(pattern)) + return set(match.group("name") for match in self.regex.finditer(pattern)) class ColonExpander(FormatExpander): """ Expander using ``{:xyz:}`` formatted variables. """ + regex = re.compile( r""" \{: @@ -794,7 +831,9 @@ class ColonExpander(FormatExpander): \s* ))\1 :\} - """, re.VERBOSE) + """, + re.VERBOSE, + ) spec = "{{:{}:}}" @@ -804,6 +843,7 @@ def __init__(self): class RecursiveExpander(BaseExpander): """Recursively expands ``{xyz}`` wildcards in Snakemake rules.""" + def expands_field(self, field): """ Returns true for all fields but ``shell:``, ``message:`` and @@ -814,16 +854,13 @@ def expands_field(self, field): ``message:`` or ``shell:`` as these already have all wildcards applied just before job execution (by :meth:`format_wildcards`). 
""" - return field not in ( - 'shellcmd', - 'message', - 'wildcard_constraints' - ) + return field not in ("shellcmd", "message", "wildcard_constraints") def expand(self, rule, ruleinfo): """Recursively expand wildcards within :class:`RuleInfo` object""" - fields = list(filter(lambda x: x is not None, - filter(self.expands_field, ruleinfo_fields))) + fields = list( + filter(lambda x: x is not None, filter(self.expands_field, ruleinfo_fields)) + ) # Fetch original ruleinfo values into a dict of NamedList args = {} orig_tuples = {} @@ -858,9 +895,11 @@ def expand(self, rule, ruleinfo): # create node for value itself deps.add_node(s, core=True, name=field, idx=n) # node depends on wildcards contained in value - deps.add_edges_from((s, t) - for t in get_names(value) - if t.split(".")[0].split("[")[0] in fields) + deps.add_edges_from( + (s, t) + for t in get_names(value) + if t.split(".")[0].split("[")[0] in fields + ) # field node depends on all it's value nodes deps.add_edge(field, s) # create edges field.name -> field[n] @@ -868,23 +907,26 @@ def expand(self, rule, ruleinfo): s = "{}.{}".format(field, name) if j is None: j = i + 1 - deps.add_edges_from((s, "{}[{}]".format(field, n)) - for n in range(i, j)) + deps.add_edges_from((s, "{}[{}]".format(field, n)) for n in range(i, j)) # sort variables so that they can be expanded in order try: - nodes = list(reversed([ - node - for node in networkx().algorithms.dag.topological_sort(deps) - if deps.out_degree(node) > 0 and 'core' in deps.nodes[node] - ])) + nodes = list( + reversed( + [ + node + for node in networkx().algorithms.dag.topological_sort(deps) + if deps.out_degree(node) > 0 and "core" in deps.nodes[node] + ] + ) + ) except networkx().NetworkXUnfeasible: raise CircularReferenceException(deps, rule) from None # expand variables for node in nodes: - var_name = deps.nodes[node]['name'] - var_idx = deps.nodes[node]['idx'] + var_name = deps.nodes[node]["name"] + var_idx = deps.nodes[node]["idx"] value = args[var_name][var_idx] if not isinstance(value, str): continue @@ -893,9 +935,10 @@ def expand(self, rule, ruleinfo): valnew = partial_format(value, **args) # check if any remaining wilcards refer to rule fields - names = [re.split(r'\.|\[', name, maxsplit=1)[0] - for name in get_names(valnew)] - field_names = ruleinfo_fields[var_name].get('funcparams', []) + names = [ + re.split(r"\.|\[", name, maxsplit=1)[0] for name in get_names(valnew) + ] + field_names = ruleinfo_fields[var_name].get("funcparams", []) parm_names = [name for name in field_names if name in names] if parm_names: @@ -905,11 +948,15 @@ def late_recursion(val, fparms): def wrapper(wildcards, **kwargs): # no partial here, fail if anything left return strip_wildcard_constraints(val).format( - **kwargs, **wildcards) + **kwargs, **wildcards + ) + # adjust the signature so that snakemake will pass us # everything we need - parms = (Parameter(pname, Parameter.POSITIONAL_OR_KEYWORD) - for pname in fparms) + parms = ( + Parameter(pname, Parameter.POSITIONAL_OR_KEYWORD) + for pname in fparms + ) newsig = signature(wrapper).replace(parameters=parms) wrapper.__signature__ = newsig return wrapper @@ -919,8 +966,7 @@ def wrapper(wildcards, **kwargs): args[var_name][var_idx] = valnew if ymp.print_rule == 1: - log.debug("{}::{}: {} => {}".format(rule.name, - node, value, valnew)) + log.debug("{}::{}: {} => {}".format(rule.name, node, value, valnew)) # update ruleinfo for field in fields: @@ -966,6 +1012,7 @@ class InheritanceExpander(BaseExpander): specifying an unnamed value overrides all 
unnamed values in the parent attribute. """ + # FIXME: link to http://snakemake.readthedocs.io/en/latest/snakefiles/ # rules.html#handling-ambiguous-rules @@ -1009,7 +1056,7 @@ def get_super(self, rule: Rule, ruleinfo: RuleInfo) -> Optional[RuleInfo]: # If the rule was created with make_rule and has a parent # attribute set, fetch that. - if hasattr(ruleinfo, 'parent'): + if hasattr(ruleinfo, "parent"): return ruleinfo.parent.name, self.ruleinfos[ruleinfo.parent.name] # Otherwise, check the rule definition line for the marker comment @@ -1017,12 +1064,13 @@ def get_super(self, rule: Rule, ruleinfo: RuleInfo) -> Optional[RuleInfo]: if "#" in line: comment = line.split("#")[1].strip() if comment.startswith(self.KEYWORD): - superrule_name = comment[len(self.KEYWORD):].strip() + superrule_name = comment[len(self.KEYWORD) :].strip() try: return superrule_name, self.ruleinfos[superrule_name] except KeyError: - raise InheritanceException("Unable to find parent", - rule, superrule_name) + raise InheritanceException( + "Unable to find parent", rule, superrule_name + ) return None, None def expand(self, rule, ruleinfo): @@ -1074,6 +1122,7 @@ class DefaultExpander(InheritanceExpander): The implementation simply makes all rules inherit from a defaults rule. """ + def __init__(self, **kwargs): """ Creates DefaultExpander @@ -1103,6 +1152,7 @@ class WorkflowObject(object): within the Snakemake workflow object and provides an accessor method to this registry. """ + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1111,10 +1161,10 @@ def __init__(self, *args, **kwargs): # that is not a constructor call (i.e. not __init__) try: caller = next(fi for fi in stack() if fi.function != "__init__") - if not hasattr(self, 'filename'): + if not hasattr(self, "filename"): #: str: Name of file in which object was defined self.filename = caller.filename - if not hasattr(self, 'lineno'): + if not hasattr(self, "lineno"): #: int: Line number of object definition self.lineno = caller.lineno except IndexError: @@ -1125,20 +1175,24 @@ def register(self): cache = self.get_registry() names = [] - for attr in 'name', 'altname', '_ymp_name': + for attr in "name", "altname", "_ymp_name": if hasattr(self, attr): names += ensure_list(getattr(self, attr)) for name in names: - if (name in cache + if ( + name in cache and self != cache[name] - and (self.filename != cache[name].filename - or self.lineno != cache[name].lineno)): + and ( + self.filename != cache[name].filename + or self.lineno != cache[name].lineno + ) + ): other = cache[name] raise YmpRuleError( self, f"Failed to create {self.__class__.__name__} '{names[0]}':" - f" already defined in {other.filename}:{other.lineno}" + f" already defined in {other.filename}:{other.lineno}", ) for name in names: @@ -1158,8 +1212,10 @@ def get_registry(cls, clean=False): Return all objects of this class registered with current workflow """ import ymp + cfg = ymp.get_config() return cfg.cache.get_cache( cls.__name__, loadfunc=ExpandableWorkflow.ensure_global_workflow, - clean=clean) + clean=clean, + ) diff --git a/tests/test_snakemake.py b/tests/test_snakemake.py index ec7b108f..32711773 100644 --- a/tests/test_snakemake.py +++ b/tests/test_snakemake.py @@ -19,9 +19,68 @@ import pytest +import ymp +from ymp.snakemake import check_snakemake +from ymp.exceptions import YmpException +from packaging import version + + log = logging.getLogger(__name__) +def test_snakemake_version(): + assert check_snakemake(), "Snakemake version unsupported (too old)" + minvers = 
version.parse(ymp.snakemake_minimum_version) + testvers = version.parse(ymp.snakemake_tested_version) + assert ( + minvers <= testvers + ), "Minimum snakemake version must not be larger than tested version" + + +def test_snakemake_version_below_min_raises(monkeypatch): + with monkeypatch.context() as m: + m.setattr("ymp.snakemake_minimum_version", "99!1") + m.setattr("ymp.snakemake.check_snakemake.result", None) + with pytest.raises(YmpException): + check_snakemake() + assert check_snakemake(), "cached value not reset?" + + +def test_snakemake_version_above_tested_warns(monkeypatch, caplog): + with monkeypatch.context() as m: + m.setattr("ymp.snakemake_tested_version", "0") + m.setattr("ymp.snakemake.check_snakemake.result", None) + check_snakemake() + assert "newer than the latest version" in caplog.records[-1].message + assert check_snakemake(), "cached value not reset?" + + +def test_snakemake_version_above_tested_warns_once( + invoker, demo_dir, monkeypatch, caplog +): + with monkeypatch.context() as m: + m.setattr("ymp.snakemake_tested_version", "0") + m.setattr("ymp.snakemake.check_snakemake.result", None) + invoker.call("make", "-n", "toy") + msg_count = sum( + "newer than the latest version" in rec.message for rec in caplog.records + ) + assert msg_count == 1 + + +def test_snakemake_version_above_tested_quiet_with_q( + invoker, demo_dir, monkeypatch, caplog +): + with monkeypatch.context() as m: + m.setattr("ymp.snakemake_tested_version", "0") + m.setattr("ymp.snakemake.check_snakemake.result", None) + invoker.call("make", "-nq", "toy") + msg_count = sum( + "newer than the latest version" in rec.message for rec in caplog.records + ) + assert msg_count == 0 + + @pytest.mark.parametrize("project", ["snakemake_circle"], indirect=True) def test_snakemake_failure(project_dir, invoker): "These are expected to fail" @@ -30,8 +89,9 @@ def test_snakemake_failure(project_dir, invoker): assert "Circular reference in" in msg -@pytest.mark.parametrize("project", ["snakemake_plain", "snakemake_function"], - indirect=True) +@pytest.mark.parametrize( + "project", ["snakemake_plain", "snakemake_function"], indirect=True +) def test_snakemake(project_dir, invoker): "These should work" res = invoker.call("make", "test") From 5524b7078079d7fcd1fdf9af8987fc28e0ad6ba2 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 1 Nov 2022 12:54:22 -0600 Subject: [PATCH 103/133] tests: fix empty completion result --- tests/test_cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 32d7d1b2..55189a7b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -366,9 +366,10 @@ def test_completion( # in case no values match, and $value is ignored. We wrap types # other than plain in double underscore and otherwise keep the # value to compare to expected test results. 
- lines = cap.out.splitlines() result = set() - for line in lines: + for line in cap.out.splitlines(): + if exp_len == 0 and not line: + continue # empty line ok for empty result assert line.count(",") == 1, f"wrong field count in {line}" typ, val = line.split(",") if typ == "plain": From 6d30b5bf28f365bc6fc8f7812c15dbf803ea5b9d Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 1 Nov 2022 12:56:01 -0600 Subject: [PATCH 104/133] fix(install): require newer snakemake in environment --- environment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yaml b/environment.yaml index ab7613dd..1eca4f05 100644 --- a/environment.yaml +++ b/environment.yaml @@ -4,7 +4,7 @@ channels: - bioconda dependencies: - python >=3.7 - - snakemake-minimal >=6.0.5 + - snakemake-minimal >=7.15 - mamba - conda !=4.6.11 - click From 229ec8d5651a41f19271aec0318bf751d950fd8c Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 1 Nov 2022 13:26:50 -0600 Subject: [PATCH 105/133] tests: remove gh action caching and go to py3.10 --- .github/workflows/tests.yml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 71b6b559..2cbc55f8 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -15,7 +15,7 @@ jobs: matrix: os: ['ubuntu-latest', 'macos-latest'] section: ["Tools", "Core"] - python-version: ['3.7'] + python-version: ['3.10'] defaults: run: shell: bash -l {0} @@ -24,12 +24,6 @@ jobs: with: submodules: true fetch-depth: 0 # full history for setuptools_scm - - uses: actions/cache@v1 - env: - CACHE_VERS: 1 # bump to manually reset cache - with: - path: ~/conda_pkgs_dir - key: ${{runner.os}}-conda-${{env.CACHE_VERS}}-${{hashFiles('environment.yaml')}} - uses: conda-incubator/setup-miniconda@v2 with: # Don't update conda - performance: @@ -40,7 +34,6 @@ jobs: environment-file: environment.yaml activate-environment: ymp channel-priority: strict - use-only-tar-bz2: true # needed for caching mamba-version: "*" - name: Install run: | From 9398c488e662f83d07e511362f2b731838efd50c Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 1 Nov 2022 16:05:08 -0600 Subject: [PATCH 106/133] tests: increase log level for snakemake version warning to "error" --- src/ymp/snakemake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ymp/snakemake.py b/src/ymp/snakemake.py index 64211ee6..de03af5c 100644 --- a/src/ymp/snakemake.py +++ b/src/ymp/snakemake.py @@ -56,7 +56,7 @@ def check_snakemake() -> bool: f"Snakemake version {need_vers} required but {have_vers} installed" ) if have_vers > test_vers: - log.warning( + log.error( "Snakemake %s found is newer than the latest version (%s) verified to" " work with YMP-%s. 
If you encounter unexpected errors, please"
             " downgrade Snakemake or upgrade YMP.",

From bbc56483683084027ac99728bf3928d0026f23d1 Mon Sep 17 00:00:00 2001
From: Elmar Pruesse
Date: Tue, 1 Nov 2022 17:07:07 -0600
Subject: [PATCH 107/133] feat: add --fresh to ymp env prepare

---
 src/ymp/cli/env.py       | 26 ++++++++++++++++++++++--
 src/ymp/cli/make.py      |  5 +++--
 src/ymp/env.py           | 15 ++++++++++++++-
 src/ymp/etc/defaults.yml |  5 +++++
 4 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/src/ymp/cli/env.py b/src/ymp/cli/env.py
index bd9c4823..bea8e525 100644
--- a/src/ymp/cli/env.py
+++ b/src/ymp/cli/env.py
@@ -117,10 +117,32 @@ def ls(param_all, static, dynamic, sort_col, reverse, envnames):
 
 @env.command()
 @snake_params
-def prepare(**kwargs):
+@click.option(
+    "--reinstall", is_flag=True,
+    help="Delete existing environment and reinstall"
+)
+@click.option(
+    "--no-spec", is_flag=True,
+    help="Don't use conda env spec even if present"
+)
+@click.option(
+    "--no-archive", is_flag=True,
+    help="Delete existing archives before install"
+)
+@click.option(
+    "--fresh", is_flag=True,
+    help="Create fresh install. Implies reinstall, no-spec and no-archive"
+)
+def prepare(reinstall, no_spec, no_archive, fresh, **kwargs):
     "Create envs needed to build target"
     kwargs['conda_create_envs_only'] = True
-    rval = start_snakemake(kwargs)
+    cfg = ymp.get_config()
+    if fresh:
+        reinstall = no_spec = no_archive = True
+    cfg.conda.create.reinstall = reinstall
+    cfg.conda.create.nospec = no_spec
+    cfg.conda.create.noarchive = no_archive
+    rval = start_snakemake(kwargs, unload=False)
     if not rval:
         sys.exit(1)
 
diff --git a/src/ymp/cli/make.py b/src/ymp/cli/make.py
index 3a3eabd8..29932792 100644
--- a/src/ymp/cli/make.py
+++ b/src/ymp/cli/make.py
@@ -159,7 +159,7 @@ def decorated(*args, **kwargs):  # pylint: disable=missing-docstring
     return decorated
 
 
-def start_snakemake(kwargs, submit=False):
+def start_snakemake(kwargs, submit=False, unload=True):
     """Execute Snakemake with given parameters and targets
 
     Fixes paths of kwargs['targets'] to be relative to YMP root.
@@ -252,7 +252,8 @@ def start_snakemake(kwargs, submit=False):
         # A snakemake workflow was created above to resolve the
         # stage stack. Unload it so things run correctly from within
         # snakemake.
-        cfg.unload()
+        if unload:
+            cfg.unload()
 
     # Check snakemake version
     from ymp.snakemake import check_snakemake
diff --git a/src/ymp/env.py b/src/ymp/env.py
index 2267e7d7..6cb893a6 100644
--- a/src/ymp/env.py
+++ b/src/ymp/env.py
@@ -200,7 +200,7 @@ def _get_content(self):
     def set_prefix(self, prefix):
         self._env_dir = op.abspath(prefix)
 
-    def create(self, dryrun=False, reinstall=False, nospec=False, noarchive=False):
+    def create(self, dryrun=False, reinstall=None, nospec=None, noarchive=None):
        """Ensure the conda environment has been created
 
         Inherits from snakemake.deployment.conda.Env.create
 
@@ -218,7 +218,20 @@ def create(self, dryrun=False, reinstall=False, nospec=False, noarchive=False):
         the package binaries, we allow maintaining a copy of the
         package binary URLs, from which the archive folder is
         populated on demand. We just download those to self.archive
         and pass on.
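+
+        Any of ``reinstall``, ``nospec`` and ``noarchive`` left at
+        ``None`` falls back to the ``conda.create`` defaults set in
+        the configuration.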
+
+        Parameters:
+        - reinstall: force re-installing already installed envs
+        - noarchive: delete existing archives before installing, forcing re-download
+        - nospec: do not use stored spec ("lock", set of urls for env)
         """
+        cfg = ymp.get_config()
+        if nospec is None:
+            nospec = cfg.conda.create.nospec
+        if noarchive is None:
+            noarchive = cfg.conda.create.noarchive
+        if reinstall is None:
+            reinstall = cfg.conda.create.reinstall
+
         if self.installed:
             if reinstall:
                 log.info("Environment '%s' already exists. Removing...", self._ymp_name)
diff --git a/src/ymp/etc/defaults.yml b/src/ymp/etc/defaults.yml
index 836ce7b5..2fb18728 100644
--- a/src/ymp/etc/defaults.yml
+++ b/src/ymp/etc/defaults.yml
@@ -48,6 +48,11 @@ conda:
       - /conda-forge/conda-forge\/label\/broken/
       - /conda-forge/conda-forge\/label\/cf201901/  # gcc4
       - /conda-forge/conda-forge\/label\/old_feature_broken/  # gcc4
+  create:  # defaults for env creation
+    reinstall: false  # always install again, used by --fresh option
+    noarchive: false  # delete archive files before creating
+    nospec: false     # do not use spec, always calculate new package set
+
 # Default references
 references:
   # Human Genomes

From 19cddfe9042f889f042eaf53f3f0151b7814cf80 Mon Sep 17 00:00:00 2001
From: Elmar Pruesse
Date: Tue, 1 Nov 2022 17:14:48 -0600
Subject: [PATCH 108/133] tests: try fix failing log check on osx

---
 tests/test_snakemake.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/test_snakemake.py b/tests/test_snakemake.py
index 32711773..3e96004d 100644
--- a/tests/test_snakemake.py
+++ b/tests/test_snakemake.py
@@ -51,11 +51,14 @@ def test_snakemake_version_above_tested_warns(monkeypatch, caplog):
         m.setattr("ymp.snakemake_tested_version", "0")
         m.setattr("ymp.snakemake.check_snakemake.result", None)
         check_snakemake()
-    assert "newer than the latest version" in caplog.records[-1].message
+    msg_count = sum(
+        "newer than the latest version" in rec.message for rec in caplog.records
+    )
+    assert msg_count == 1
     assert check_snakemake(), "cached value not reset?"
 
 
-def test_snakemake_version_above_tested_warns_once(
+def test_snakemake_version_above_tested_warns_invoked(
     invoker, demo_dir, monkeypatch, caplog
 ):
     with monkeypatch.context() as m:

From 194cc7f125d12b07e1a66861d69eecc2755eaf13 Mon Sep 17 00:00:00 2001
From: Elmar Pruesse
Date: Tue, 1 Nov 2022 17:30:25 -0600
Subject: [PATCH 109/133] revert 9398c48 (version warning loglevel)

---
 src/ymp/snakemake.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ymp/snakemake.py b/src/ymp/snakemake.py
index de03af5c..64211ee6 100644
--- a/src/ymp/snakemake.py
+++ b/src/ymp/snakemake.py
@@ -56,7 +56,7 @@ def check_snakemake() -> bool:
             f"Snakemake version {need_vers} required but {have_vers} installed"
         )
     if have_vers > test_vers:
-        log.error(
+        log.warning(
             "Snakemake %s found is newer than the latest version (%s) verified to"
             " work with YMP-%s.
If you encounter unexpected errors, please"
             " downgrade Snakemake or upgrade YMP.",

From 89ed35f2b10365cdae8f4c15cfdc79791eed7f46 Mon Sep 17 00:00:00 2001
From: Elmar Pruesse
Date: Tue, 1 Nov 2022 17:42:57 -0600
Subject: [PATCH 110/133] tests: mark the test that won't work on osx as xfail

---
 tests/test_snakemake.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/test_snakemake.py b/tests/test_snakemake.py
index 3e96004d..a3a6b69a 100644
--- a/tests/test_snakemake.py
+++ b/tests/test_snakemake.py
@@ -16,6 +16,7 @@
 """
 
 import logging
+import sys
 
 import pytest
 
@@ -45,7 +46,10 @@ def test_snakemake_version_below_min_raises(monkeypatch):
         check_snakemake()
     assert check_snakemake(), "cached value not reset?"
 
-
+@pytest.mark.xfail(
+    sys.platform == "darwin",
+    "unclear why this is failing on osx, likely the test"
+)
 def test_snakemake_version_above_tested_warns(monkeypatch, caplog):
     with monkeypatch.context() as m:
         m.setattr("ymp.snakemake_tested_version", "0")

From 717228060ce0a1f37deab63569af712167f83f25 Mon Sep 17 00:00:00 2001
From: Elmar Pruesse
Date: Tue, 8 Nov 2022 11:37:29 -0700
Subject: [PATCH 111/133] tests: fix xfail reason must have arg name

---
 tests/test_snakemake.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_snakemake.py b/tests/test_snakemake.py
index a3a6b69a..cfae7319 100644
--- a/tests/test_snakemake.py
+++ b/tests/test_snakemake.py
@@ -48,7 +48,7 @@
 @pytest.mark.xfail(
     sys.platform == "darwin",
-    "unclear why this is failing on osx, likely the test"
+    reason="unclear why this is failing on osx, likely the test"
 )

From cd92ce0f605e41fa9087087f31a5b93d819b1d0b Mon Sep 17 00:00:00 2001
From: Elmar Pruesse
Date: Tue, 8 Nov 2022 13:18:02 -0700
Subject: [PATCH 112/133] tests: extend conda mock

---
 tests/conftest.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 9e08e594..28c1716a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,6 +2,7 @@
 import os
 import shlex
 import shutil
+import json
 
 import py
 
@@ -153,10 +154,11 @@ def mock_conda(bin_dir):
         'cmd=""',
         'while [ -n "$1" ]; do',
         '  case $1 in',
-        '    --version) echo conda 4.2; exit 0;;',
+        '    --version) echo conda 22.9.0; exit 0;;',
         '    --prefix|-p) shift; p="$1";;',
         '    --file|-f) shift; f="$1";;'
         '    --json) j=Y;;'
+        '    --get) shift; get="$1";;',
         '    *) cmd="$cmd $1";;',
         '  esac',
         '  shift',
@@ -167,8 +169,18 @@ def mock_conda(bin_dir):
         'if [ x"$cmd" = x" env export" -a -n "$p" ]; then',
         '  echo "dependencies: [one, two]"',
         'fi',
-        'if [ x"$cmd" = x" info" ]; then',
-        '  echo \'{{"conda_prefix": "{}"}}\''.format(base_dir),
+        'if [ x"$cmd" = x" info" ]; then',
+        '  echo \'{}\''.format(json.dumps({
+            "platform": "linux",
+            "conda_prefix": base_dir
+        })),
+        'fi',
+        'if [ x"$cmd" = x" config" -a x"$get" = x"channel_priority" -a -n "$j" ]; then',
+        '  echo \'{}\''.format(json.dumps({
+            "get": {
+                "channel_priority": "strict"
+            }
+        })),
         'fi',
     ]))

From 1520fec8b1e824d6a70bf733ef96445a99eff175 Mon Sep 17 00:00:00 2001
From: Elmar Pruesse
Date: Tue, 8 Nov 2022 13:18:12 -0700
Subject: [PATCH 113/133] tests: update env run expected error

---
 tests/test_cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index 55189a7b..fe8d1655 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@
-296,7 +296,7 @@ def test_env_run(invoker, demo_dir, mock_conda, mock_downloader, capfd): res = invoker.call("env", "run", "bbmap", "true") assert res.exit_code == 0 cap = capfd.readouterr() - assert "Not a conda environment" in cap.err + assert "bin/activate: No such file or directory" in cap.err @pytest.mark.parametrize( From 5301a14042f7ed9e0d91a720a0a1143583c632da Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 8 Nov 2022 13:18:39 -0700 Subject: [PATCH 114/133] tests: update mock_stack --- tests/test_pipeline.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index d5322808..75367753 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -136,7 +136,7 @@ def test_param_from_stage(saved_cwd): "stages: [trim_bbmap]" )) assert pipe.params - + def test_stage_with_curly(saved_cwd): pipe = Pipeline("test", make_cfg( "params:\n" @@ -158,14 +158,15 @@ def test_stage_not_parametrizable(saved_cwd): assert pipe.params == [] -class mock: - pass +class mock_stack: + def __init__(self, name): + self.name = f"stack.{name}" + self.stage_name = name + self.stage = f"stage.{name}" def test_pipeline_path(saved_cwd): - stack = mock() - stack.name = "stack.test_pipe" - stack.stage_name = "test_pipe" + stack = mock_stack("test_pipe") pipe = Pipeline("test_pipe", make_cfg( "stages:\n" " - trim_bbmap\n" @@ -177,9 +178,7 @@ def test_pipeline_path(saved_cwd): def test_pipeline_path_with_param(saved_cwd): - stack = mock() - stack.name = "stack.test_pipe" - stack.stage_name = "test_pipe" + stack = mock_stack("test_pipe") pipe = Pipeline("test_pipe", make_cfg( "stages:\n" " - trim_bbmapQ10\n" From e6b3aca0a7d7fcc835b698ac134d4c66fb2ff2fc Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 8 Nov 2022 13:33:46 -0700 Subject: [PATCH 115/133] tests: loosen test_env_run --- tests/test_cli.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index fe8d1655..e689f47a 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -296,7 +296,10 @@ def test_env_run(invoker, demo_dir, mock_conda, mock_downloader, capfd): res = invoker.call("env", "run", "bbmap", "true") assert res.exit_code == 0 cap = capfd.readouterr() - assert "bin/activate: No such file or directory" in cap.err + assert ( + "bin/activate: No such file or directory" in cap.err + or "Not a conda environment:" in cap.err + ) @pytest.mark.parametrize( From e1ed53d6d9074fd865abfa4b8d6e236d20c791ef Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 11 Oct 2023 08:42:15 -0600 Subject: [PATCH 116/133] fix: minor --- src/ymp/etc/defaults.yml | 2 +- src/ymp/rules/multiqc.rules | 7 ++----- src/ymp/rules/salmon.rules | 21 ++++++++++++++++----- src/ymp/yaml.py | 8 +++++--- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/src/ymp/etc/defaults.yml b/src/ymp/etc/defaults.yml index 2fb18728..8fe0145c 100644 --- a/src/ymp/etc/defaults.yml +++ b/src/ymp/etc/defaults.yml @@ -21,7 +21,7 @@ conda: # If set, use frozen environments from this set env_specs: - *conda_envs - - ../conda_envs/latest + #- ../conda_envs/latest # Search path for .yml files: env_path: diff --git a/src/ymp/rules/multiqc.rules b/src/ymp/rules/multiqc.rules index 4d7913ca..de8beca9 100644 --- a/src/ymp/rules/multiqc.rules +++ b/src/ymp/rules/multiqc.rules @@ -21,7 +21,7 @@ with Stage("qc_multiqc") as S: sp = {} module_order = [] sample_names_replace = {} - for conffile in input.conf: + for conffile in ensure_list(input.conf): 
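+            # every config fragment contributes run_modules, module_order
+            # and sample name replacements; these are merged across inputs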
with open(conffile, "r") as fd: data = yaml.load(fd) run_modules.extend(data.get("run_modules", [])) @@ -53,7 +53,7 @@ with Stage("qc_multiqc") as S: benchmark: "benchmarks/{:name:}/{:this:}/all.txt", params: - dirs = lambda wc, input: [os.path.dirname(p) for p in input.parts] + dirs = lambda wc, input: [os.path.dirname(p) for p in ensure_list(input.parts)] log: "{:this:}/multiqc.log" resources: @@ -70,6 +70,3 @@ with Stage("qc_multiqc") as S: " --config {input.conf}" " --filename {output.report}" " {params.dirs}" - - - diff --git a/src/ymp/rules/salmon.rules b/src/ymp/rules/salmon.rules index 7029d149..5d056794 100644 --- a/src/ymp/rules/salmon.rules +++ b/src/ymp/rules/salmon.rules @@ -77,7 +77,7 @@ with Stage("index_salmon_decoy") as S: with Stage("quant_salmon_sa") as S: S.doc(""" """) - S.add_param("L", typ="choice", name="libtype", default="A", + S.add_param("L", typ="choice", name="libtype", default="A", value=["A", "IU", "MU", "OU", "ISF", "ISR", "MSF", "MSR", "OSF", "OSR", "U", "SF", "SR"]) rule salmon_sa_quant: @@ -154,8 +154,8 @@ with Stage("quant_salmon_sa") as S: } with open(output[0], "w") as out: yaml.dump(data, out) - - + + with Stage("quant_salmon") as S: S.doc(""" """) @@ -183,7 +183,9 @@ with Stage("quant_salmon") as S: mem = "48G", shell: "exec >{log} 2>&1;" - "salmon quant" + "echo Launching salmon on $HOSTNAME;" + "set -x; " + "if ! salmon quant" " --libType {params.libtype}" " --threads {threads}" " --seqBias" @@ -193,7 +195,16 @@ with Stage("quant_salmon") as S: " --targets {input.txfa}" " --output $(dirname {output.quant})" " --minAssignedFrags 0" - " {params.gencode}" + " {params.gencode}; then" + " echo Salmon or Samtools failed;" + " if tail -n20 $(dirname {output.quant})/logs/salmon_quant.log |" + " grep -qE ' [0-9]+ fragments were mapped, but the number of burn-in fragments'; then" + " echo Salmon found insufficient fragments. 
Faking output.;" + " echo -e 'Name\tLength\tEffectiveLength\tTPM\tNumReads' > {output.quant};" + " exit 0;" + " fi;" + " exit 1;" + "fi;" localrules: salmon_quant_multiqc_cfg rule salmon_quant_multiqc_cfg: diff --git a/src/ymp/yaml.py b/src/ymp/yaml.py index a57376e8..fea61a2c 100644 --- a/src/ymp/yaml.py +++ b/src/ymp/yaml.py @@ -261,9 +261,11 @@ def get_type(obj): stack = [Entry(fn, m, key) for fn, m in self._maps if key in m] raise MixedTypeError( self, - f"Mixed data types for key '{key}'s in present in files", - key = key, - stack = stack + f"Mixed data types for key '{key}'s in present in files: {typs}", + key=key, + stack=stack, + typs=typs, + stack=stack ) return items From e9dceee076b6db40d372c370dda1b14bc9254d32 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 18 Oct 2023 11:00:43 -0600 Subject: [PATCH 117/133] fix: pandas deprecation warning --- src/ymp/stage/project.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ymp/stage/project.py b/src/ymp/stage/project.py index 5de2b8c5..1193e7b8 100644 --- a/src/ymp/stage/project.py +++ b/src/ymp/stage/project.py @@ -101,7 +101,7 @@ def _load_file(self, cfg, key): ) from exc # prefix fq files with name of config file's directory rdir = os.path.dirname(fname) - data = data.applymap( + data = data.map( lambda s: os.path.join(rdir, s) if is_fq(s) and os.path.exists(os.path.join(rdir, s)) else s @@ -360,7 +360,7 @@ def get_ids(self, stack, groups, match_groups=None, match_values=None): def do_get_ids(self, _stack, groups, match_groups=None, match_values=None): if match_values: match_values = match_values.split("__") - + return ["__".join(t) for t in self.data.fetch( groups, match_groups, From dd9a80bee114d4b3c76555bd19a60c283bcc5845 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 18 Oct 2023 11:01:38 -0600 Subject: [PATCH 118/133] feat: enable slurm job cancelling --- src/ymp/etc/defaults.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ymp/etc/defaults.yml b/src/ymp/etc/defaults.yml index 8fe0145c..78a53c77 100644 --- a/src/ymp/etc/defaults.yml +++ b/src/ymp/etc/defaults.yml @@ -268,6 +268,7 @@ cluster: memory: "--mem={resources.mem_mb}" walltime: "--time={resources.walltime}" cluster_status: "python -m ymp.cluster slurm status" + cluster_cancel: "scancel" lsf: command: "python -m ymp.cluster lsf submit" args: From c8f63cad3a80a11c12c6782d651ad8b8658b0c07 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 18 Oct 2023 11:01:57 -0600 Subject: [PATCH 119/133] fix(multiqc): python 3.12 not working yet --- src/ymp/rules/multiqc.rules | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ymp/rules/multiqc.rules b/src/ymp/rules/multiqc.rules index de8beca9..7e194cc7 100644 --- a/src/ymp/rules/multiqc.rules +++ b/src/ymp/rules/multiqc.rules @@ -1,5 +1,7 @@ Env(name="multiqc", base="bioconda", packages=[ - "multiqc >=1.12" + "multiqc >=1.12", + "Python <3.12" # multiqc uses lzstring which uses future which uses + # imp which was removed in 3.12 ]) with Stage("qc_multiqc") as S: From 24c5b04d6d67cced07a87de05c5df6f89edc3af3 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 18 Oct 2023 11:02:24 -0600 Subject: [PATCH 120/133] fix(blast): avoid split blast results in subdir created by other rule --- src/ymp/rules/blast.rules | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/src/ymp/rules/blast.rules b/src/ymp/rules/blast.rules index c5958645..d14ecf29 100644 --- a/src/ymp/rules/blast.rules +++ b/src/ymp/rules/blast.rules @@ 
-229,10 +229,12 @@ with Stage("annotate_blast") as S: def blastn_join_input(wildcards): cpt = checkpoints.blastn_split_query_fasta.get(**wildcards) - cpt_outdir = cpt.output.queries - indices = glob_wildcards(os.path.join(cpt_outdir, '{index}.fasta')) - return expand(os.path.join(cpt_outdir, '{index}.blast7.gz'), - index=indices.index) + fastadir = cpt.output.queries + blastdir = re.sub("_queries$", "_results", fastadir) + indices = glob_wildcards(os.path.join(fastadir, '{index}.fasta')) + res = expand(os.path.join(blastdir, '{index}.blast7.gz'), + index=indices.index) + return res localrules: blastn_join_result rule blastn_join_result: @@ -240,12 +242,20 @@ with Stage("annotate_blast") as S: message: "{:name:}: merging result {output}" input: - results = blastn_join_input, - folder = "{:this:}/{target}.split_queries" + results = blastn_join_input output: "{:this:}/{target}.blast7.gz" + log: + "{:this:}/{target}.log" shell: - "cat {input.results} > {output}" + "if [ -z \"{input.results}\" ]; then" + " echo YMP: making empty output >{log};" + " echo | gzip > {output};" + "else " + " echo YMP: concatenating files >{log};" + " echo \"{input.results}\" >> {log};" + " cat {input.results} > {output};" + "fi" rule blastn_query: """Runs BLAST""" @@ -257,9 +267,9 @@ with Stage("annotate_blast") as S: db = expand("{{:prev:}}/{{:target:}}.{ext}", ext=BLASTIDX_SUFFIXES) output: - "{:this:}/{target}.split_queries/{index}.blast7.gz" + "{:this:}/{target}.split_results/{index}.blast7.gz" log: - "{:this:}/{target}.split_queries.{index}.log" + "{:this:}/{target}.split_results.{index}.log" benchmark: "benchmarks/{:name:}/{:this:}/{target}.{index}.txt" params: @@ -294,7 +304,7 @@ with Stage("annotate_blast") as S: ';' 'mv $tmpout {output}' - rule blastn_query_SPLIT: # ymp: extends blastn_query + rule blastn_query_SPLITIDX: # ymp: extends blastn_query """Variant of `blastn_query` for multi-file blast indices""" input: db = expand("{{:prev:}}/{{:target:}}.{ext}", From e51f872406f313d3ff28627042263ef3b6248b77 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Wed, 18 Oct 2023 11:04:03 -0600 Subject: [PATCH 121/133] fix(snakemake): pass cores value again, snakemake change --- src/ymp/cli/make.py | 6 +++++- src/ymp/etc/defaults.yml | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/ymp/cli/make.py b/src/ymp/cli/make.py index 29932792..2f91d750 100644 --- a/src/ymp/cli/make.py +++ b/src/ymp/cli/make.py @@ -350,7 +350,11 @@ def make(**kwargs): "count, but simply limits the number of queued jobs." 
 )
 @click.option(
-    "--local-cores", "-j", metavar="N",
+    "--cores", "-c", type=int, metavar="N",
+    help="Maximum number of cluster cores to use"
+)
+@click.option(
+    "--local-cores", "-j", type=int, metavar="N",
     help="Number of local threads to use"
 )
 @click.option(
diff --git a/src/ymp/etc/defaults.yml b/src/ymp/etc/defaults.yml
index 78a53c77..663e982d 100644
--- a/src/ymp/etc/defaults.yml
+++ b/src/ymp/etc/defaults.yml
@@ -232,6 +232,7 @@ cluster:
   #  - rule (rule name)
   args: {}          # arguments for job submission
   nodes: 1024       # max jobs queued to cluster engine
+  cores: 1024       # max cores
   local_cores: 4    # max threads used on submit host
   scriptname: "ymp.{rulename}.{jobid}.sh"
   command:
@@ -284,4 +285,3 @@ pairnames:
   - R2
 
 shell: "/bin/bash"
-

From 0209e97fbd949987883c2a3455790e0fbe346bc0 Mon Sep 17 00:00:00 2001
From: Elmar Pruesse
Date: Wed, 18 Oct 2023 11:04:36 -0600
Subject: [PATCH 122/133] feat(yamlconfig): improve error message

---
 src/ymp/yaml.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/ymp/yaml.py b/src/ymp/yaml.py
index fea61a2c..889101f4 100644
--- a/src/ymp/yaml.py
+++ b/src/ymp/yaml.py
@@ -256,14 +256,15 @@ def get_type(obj):
             if isinstance(obj, Sequence):
                 return "Sequence"
             return "Scalar"
-        typs = set(get_type(m[1]) for m in items if m[1])
-        if len(typs) > 1:
+        typs = [get_type(m[1]) for m in items if m[1]]
+        if len(set(typs)) > 1:
             stack = [Entry(fn, m, key) for fn, m in self._maps if key in m]
             raise MixedTypeError(
                 self,
-                f"Mixed data types for key '{key}' present in files: {typs}",
+                f"Cannot merge contents of configuration key '{key}'"
+                f" due to mismatching content types.\n"
+                f"  types = {typs}",
                 key=key,
-                typs=typs,
                 stack=stack
             )
         return items

From 2709c9501b4918844a18b853f1ae3c6e2327418e Mon Sep 17 00:00:00 2001
From: Elmar Pruesse
Date: Wed, 18 Oct 2023 11:05:10 -0600
Subject: [PATCH 123/133] feat(snakemake): match version 7.32 (InOutput)

---
 src/ymp/__init__.py  |  2 +-
 src/ymp/snakemake.py | 75 +++++++++++++++++++++++++++++++++-----------
 2 files changed, 57 insertions(+), 20 deletions(-)

diff --git a/src/ymp/__init__.py b/src/ymp/__init__.py
index a648ef06..90f47d9d 100644
--- a/src/ymp/__init__.py
+++ b/src/ymp/__init__.py
@@ -51,7 +51,7 @@
 snakemake_minimum_version = "7.15"
 #: Lastest version of snakemake that was tested (breaking changes for
 #: us can happen at patch level)
-snakemake_tested_version = "7.17"
+snakemake_tested_version = "7.32.4"
 
 
 def get_config() -> 'ymp.config.ConfigMgr':
diff --git a/src/ymp/snakemake.py b/src/ymp/snakemake.py
index 64211ee6..73dbde2c 100644
--- a/src/ymp/snakemake.py
+++ b/src/ymp/snakemake.py
@@ -22,8 +22,9 @@
 ) # type: ignore
 from snakemake.io import Namedlist as _Namedlist  # type: ignore
 from snakemake.rules import Rule  # type: ignore
-from snakemake.workflow import RuleInfo, Workflow  # type: ignore
+from snakemake.workflow import Workflow  # type: ignore
 from snakemake.sourcecache import infer_source_file  # type: ignore
+from snakemake.ruleinfo import InOutput, RuleInfo  # type: ignore
 
 from packaging import version
 
@@ -71,9 +72,9 @@ def check_snakemake() -> bool:
 def networkx():
     import networkx
-    if networkx.__version__[0] != "2":
+    if networkx.__version__[0] not in ("2", "3"):
         log.fatal(
-            "Networkx version 2.* required by YMP but {} found"
+            "Networkx version 1.x not supported by YMP (found {})"
             "".format(networkx.__version__)
         )
         sys.exit(1)
@@ -208,15 +209,13 @@ def update_tuple(self, totuple):
         "format": "argstuple",  # len(t[0]) must be == 0
     },
     "input": {
-        "format": 
"argstuple", + "format": "inoutput", "funcparams": ("wildcards",), "apply_wildcards": True, - "path_modifier": True, }, "output": { - "format": "argstuple", + "format": "inoutput", "apply_wildcards": True, - "path_modifier": True, }, "threads": { "format": "int", @@ -242,18 +241,16 @@ def update_tuple(self, totuple): "format": "object", }, "log": { - "format": "argstuple", + "format": "inoutput", "apply_wildcards": True, - "path_modifier": True, }, "message": { "format": "string", "format_wildcards": True, }, "benchmark": { - "format": "string", + "format": "inoutput", "apply_wildcards": True, - "path_modifier": True, }, "wrapper": { "format": "string", @@ -317,7 +314,9 @@ def update_tuple(self, totuple): "name": {"format": "string"}, "notebook": {"format": "string", "runner": True}, "retries": {"format": "int"}, - "template_engine": {"format": "string", "runner": True} + "template_engine": {"format": "string", "runner": True}, + "localrule": {"format": "boolean"}, + "ref_attributes": {"format": "set"} # restart_times # env_modules # shadow_depth @@ -469,10 +468,20 @@ def apply_expanders(rule, ruleinfo): rule._ymp_print_rule = True for expander in reversed(self.__expanders): + rule_pre = copy(rule) + ruleinfo_pre = copy(ruleinfo) expander.expand(rule, ruleinfo) if ymp.print_rule == 1: log.error("### expanded with " + type(expander).__name__) print_ruleinfo(rule, ruleinfo, log.error) + # Check types: + for field_name,field in ruleinfo_fields.items(): + if field["format"] == "inoutput": + attr = getattr(ruleinfo, field_name) + if attr is not None and not isinstance(attr, InOutput): + raise TypeError( + f"Expected InOut object for '{field_name}'" + ) if ymp.print_rule: log.error("#### END expansion") @@ -499,11 +508,15 @@ def decorate(ruleinfo): def make_rule(name: str = None, lineno: int = None, snakefile: str = None, **kwargs): log.debug("Synthesizing rule {}".format(name)) + workflow = get_workflow() ruleinfo = RuleInfo(lambda: None) for arg in kwargs: + if ruleinfo_fields.get(arg, {}).get("format") == "inoutput": + if not isinstance(kwargs[arg], InOutput): + kwargs[arg] = InOutput(kwargs[arg][0], kwargs[arg][1], + workflow.modifier.path_modifier) setattr(ruleinfo, arg, kwargs[arg]) ruleinfo.norun = True - workflow = get_workflow() try: return workflow.rule(name, lineno, snakefile)(ruleinfo) except CreateRuleException: @@ -626,6 +639,8 @@ def expand(self, rule, item, expand_args=None, rec=-1, cb=False): item = self.expand_dict(rule, item, expand_args, rec) elif isinstance(item, list): item = self.expand_list(rule, item, expand_args, rec, cb) + elif isinstance(item, InOutput): + item = self.expand_inoutput(rule, item, expand_args, rec, cb) elif isinstance(item, tuple): item = self.expand_tuple(rule, item, expand_args, rec, cb) else: @@ -748,6 +763,11 @@ def expand_list(self, rule, item, expand_args, rec, cb): def expand_tuple(self, rule, item, expand_args, rec, cb): return tuple(self.expand_list(rule, item, expand_args, rec, cb)) + def expand_inoutput(self, rule, item, expand_args, rec, cb): + res = self.expand_tuple(rule, (item.paths, item.kwpaths), + expand_args, rec, cb) + return InOutput(res[0], res[1], item.modifier) + class SnakemakeExpander(BaseExpander): """Expand wildcards in strings returned from functions. 
@@ -876,10 +896,15 @@ def expand(self, rule, ruleinfo):
                     named[key] = list(flatten(named[key]))
                 orig_tuples[field] = (unnamed, named)
                 args[field] = NamedList(fromtuple=(unnamed, named))
-            elif ruleinfo_fields[field].get("path_modifier", False):
-                string, *_ = getattr(ruleinfo, field, ((), None))
-                args[field] = NamedList()
-                args[field].append(string)
+            elif ruleinfo_fields[field]["format"] == "inoutput":
+                inout = getattr(ruleinfo, field)
+                unnamed = list(flatten(inout.paths))
+                named = copy(inout.kwpaths)
+                for key in named:
+                    if is_container(named[key]):
+                        named[key] = list(flatten(named[key]))
+                orig_tuples[field] = (unnamed, named)
+                args[field] = NamedList(fromtuple=orig_tuples[field])
             else:
                 string = getattr(ruleinfo, field, None)
                 args[field] = NamedList()
@@ -978,8 +1003,14 @@ def wrapper(wildcards, **kwargs):
                 unnamed, named = orig_tuples[field]
                 _, _, *extras = attr
                 setattr(ruleinfo, field, (unnamed, named, *extras))
-            elif ruleinfo_fields[field].get("path_modifier", False):
-                setattr(ruleinfo, field, (args[field][0], attr[1]))
+            elif ruleinfo_fields[field]["format"] == "inoutput":
+                args[field].update_tuple(orig_tuples[field])
+                unnamed, named = orig_tuples[field]
+                if isinstance(attr.paths, str):
+                    unnamed = unnamed[0]
+                setattr(ruleinfo, field, InOutput(
+                    unnamed, named, attr.modifier
+                ))
             else:
                 setattr(ruleinfo, field, args[field][0])
 
@@ -1103,6 +1134,12 @@ def expand(self, rule, ruleinfo):
                 named = deepcopy(named_base)
                 named.update(named_child)
                 setattr(ruleinfo, field, (unnamed, named, *extra))
+            elif ruleinfo_fields[field]["format"] == "inoutput":
+                kwpaths = deepcopy(base_attr.kwpaths)
+                kwpaths.update(override_attr.kwpaths)
+                paths = override_attr.paths or base_attr.paths
+                modifier = override_attr.modifier or base_attr.modifier
+                setattr(ruleinfo, field, InOutput(paths, kwpaths, modifier))
             else:
                 # Both set, not argstuple, keep child intact
                 pass

From f34090ddc8adc3b04ff289a6a76e21a0a0e89204 Mon Sep 17 00:00:00 2001
From: Elmar Pruesse
Date: Mon, 6 Nov 2023 10:59:59 -0700
Subject: [PATCH 124/133] Add missing file

---
 src/cache.py | 252 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 252 insertions(+)
 create mode 100644 src/cache.py

diff --git a/src/cache.py b/src/cache.py
new file mode 100644
index 00000000..4290b952
--- /dev/null
+++ b/src/cache.py
@@ -0,0 +1,252 @@
+import logging
+import os
+import sqlite3
+
+import ymp
+from ymp.common import AttrDict, ensure_list
+
+log = logging.getLogger(__name__)
+
+class NoCache(object):
+    def __init__(self, root):
+        self.caches = {}
+
+    def close(self):
+        pass # NoCache doesn't close anything
+
+    def get_cache(self, name, clean=False, *args, **kwargs):
+        if name not in self.caches:
+            self.caches[name] = CacheDict(self, name, *args, **kwargs)
+        return self.caches[name]
+
+    def store(self, cache, key, obj):
+        pass # NoCache doesn't store anything
+
+    def commit(self):
+        pass # NoCache doesn't commit anything
+
+    def load(self, _cache, _key):
+        return None
+
+    def load_all(self, _cache):
+        return ()
+
+
+class Cache(object):
+    def __init__(self, root):
+        os.makedirs(os.path.join(root), exist_ok=True)
+        db_fname = os.path.join(root, "ymp.db")
+        log.debug("Opening database %s", db_fname)
+        self.conn = sqlite3.connect(db_fname, check_same_thread=False)
+
+        # Drop tables if the database has the wrong version number
+        # or if the user_version has not been set (defaults to 0)
+        version = self.conn.execute("PRAGMA user_version").fetchone()[0]
+        if version == ymp.__numeric_version__ and version != 0:
+            try:
+                curs = self.conn.execute("SELECT file, time from stamps")
+                update = any(os.path.getmtime(row[0]) > row[1]
+                             for row in curs)
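+                # A tracked rule file newer than its recorded stamp means the
+                # rules changed on disk; "update" drops and recreates the
+                # cache tables below.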
+            except FileNotFoundError:
+                update = True
+            del curs
+            if update:
+                log.error("Dropping cache: files changed")
+                self.conn.executescript("""
+                DROP TABLE caches;
+                DROP TABLE stamps;
+                """)
+        else:
+            log.info("No cache, loading...")
+            update = True
+
+        if update:
+            self.conn.executescript("""
+            BEGIN EXCLUSIVE;
+            DROP TABLE IF EXISTS caches;
+            CREATE TABLE caches (
+                name TEXT,
+                key TEXT,
+                data,
+                PRIMARY KEY (name, key)
+            );
+            DROP TABLE IF EXISTS stamps;
+            CREATE TABLE stamps (
+                file TEXT PRIMARY KEY,
+                time INT
+            );
+
+            PRAGMA user_version={};
+            COMMIT;
+            """.format(ymp.__numeric_version__))
+
+        self.caches = {}
+        self.files = {}
+
+    def close(self):
+        self.conn.close()
+
+    def get_cache(self, name, clean=False, *args, **kwargs):
+        if name not in self.caches:
+            self.caches[name] = CacheDict(self, name, *args, **kwargs)
+        return self.caches[name]
+
+    def store(self, cache, key, obj):
+        import pickle
+
+        files = ensure_list(getattr(obj, "defined_in", None))
+        try:
+            stamps = [(fn, os.path.getmtime(fn))
+                      for fn in files
+                      if fn not in self.files]
+            self.conn.executemany(
+                "REPLACE INTO stamps VALUES (?,?)",
+                stamps)
+            self.files.update(dict(stamps))
+            self.conn.execute("""
+            REPLACE INTO caches
+            VALUES (?, ?, ?)
+            """, [cache, key, pickle.dumps(obj)]
+            )
+        except pickle.PicklingError:
+            log.error("Failed to pickle %s", obj)
+        except FileNotFoundError:
+            pass
+
+    def commit(self):
+        import sqlite3
+        try:
+            self.conn.commit()
+        except sqlite3.OperationalError as exc:
+            log.warning("Cache write failed: %s", exc)
+
+    def load(self, cache, key):
+        import pickle
+        row = self.conn.execute("""
+        SELECT data FROM caches WHERE name=? AND key=?
+        """, [cache, key]).fetchone()
+        if row:
+            obj = pickle.loads(row[0])
+            try:
+                obj.load_from_pickle()
+            except AttributeError:
+                pass
+            return obj
+        else:
+            return None
+
+    def load_all(self, cache):
+        import pickle
+        rows = self.conn.execute("""
+        SELECT key, data FROM caches WHERE name=?
+ """, [cache]) + return ((row[0], pickle.loads(row[1])) + for row in rows) + + +class CacheDict(AttrDict): + def __init__(self, cache, name, *args, loadfunc=None, + itemloadfunc=None, itemdata=None, **kwargs): + self._cache = cache + self._name = name + self._loadfunc = loadfunc + self._itemloadfunc = itemloadfunc + self._itemdata = itemdata + self._args = args + self._kwargs = kwargs + self._loading = False + self._complete = False + + def _loaditem(self, key): + cached = self._cache.load(self._name, key) + if cached: + super().__setitem__(key, cached) + elif self._itemdata is not None: + if key in self._itemdata: + item = self._itemloadfunc(key, self._itemdata[key]) + self._cache.store(self._name, key, item) + self._cache.commit() + super().__setitem__(key, item) + elif self._itemloadfunc: + item = self._itemloadfunc(key) + self._cache.store(self._name, key, item) + self._cache.commit() + super().__setitem__(key, item) + else: + self._loadall() + + def _loadall(self): + if self._complete: + return + loaded = set() + for key, obj in self._cache.load_all(self._name): + loaded.add(key) + super().__setitem__(key, obj) + if self._itemloadfunc: + for key in self._itemdata: + if key not in loaded: + self._loaditem(key) + elif self._loadfunc and not self._loading and not loaded: + self._loadfunc(*self._args, **self._kwargs) + self._loadfunc = None + for key, item in super().items(): + self._cache.store(self._name, key, item) + self._cache.commit() + self._complete = True + + def __enter__(self): + self._loading = True + return self + + def __exit__(self, a, b, c): + self._loading = False + + def __contains__(self, key): + if self._itemdata: + return key in self._itemdata + self._loadall() + return super().__contains__(key) + + def __len__(self): + if self._itemdata: + return len(self._itemdata) + self._loadall() + return super().__len__() + + def __getitem__(self, key): + if not super().__contains__(key): + self._loaditem(key) + return super().__getitem__(key) + + def __setitem__(self, key, val): + super().__setitem__(key, val) + + def __delitem__(self, key): + raise NotImplementedError() + + def __iter__(self): + if self._itemdata: + return self._itemdata.__iter__() + self._loadall() + return super().__iter__() + + def __str__(self): + self._loadall() + return super().__str__() + + def get(self, key, default=None): + if not super().__contains__(key): + self._loaditem(key) + return super().get(key, default) + + def items(self): + self._loadall() + return super().items() + + def keys(self): + if self._itemdata: + return self._itemdata.keys() + return super().keys() + + def values(self): + self._loadall() + return super().values() From 3b6fbcb810a4ee1358a3940bd8d40a3442237e26 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Mon, 11 Dec 2023 16:13:54 -0700 Subject: [PATCH 125/133] chore(conda): adjust environment.yaml pkg versions --- environment.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yaml b/environment.yaml index 1eca4f05..bd37bc97 100644 --- a/environment.yaml +++ b/environment.yaml @@ -4,10 +4,10 @@ channels: - bioconda dependencies: - python >=3.7 - - snakemake-minimal >=7.15 + - snakemake-minimal >=7.32 - mamba - conda !=4.6.11 - - click + - click >8 - ruamel.yaml >0.15 # new api - drmaa - pandas >=0.20 # need dtype support in python csv engine From bac43df62dfd96d5e3293bd54777cb88c2077b13 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Mon, 11 Dec 2023 16:15:02 -0700 Subject: [PATCH 126/133] fix(util/check_input): handle file error --- 
 src/ymp/util.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/src/ymp/util.py b/src/ymp/util.py
index 7f025074..4cc4d1c0 100644
--- a/src/ymp/util.py
+++ b/src/ymp/util.py
@@ -129,16 +129,22 @@ def check_input_func(wildcards, input):
             openfunc = gzip.open
         else:
             openfunc = open
-        with openfunc(fname, "rb") as fd:
-            btes = fd.read(8192)
-            while btes:
-                nlines += btes.count(b"\n")
-                nbytes += len(btes)
-                if nbytes >= minbytes and nlines >= minlines:
-                    break
-        if nbytes < minbytes or nlines < minlines:
-            return False
+        try:
+            with openfunc(fname, "rb") as fd:
+                btes = fd.read(8192)
+                while btes:
+                    nlines += btes.count(b"\n")
+                    nbytes += len(btes)
+                    if nbytes >= minbytes and nlines >= minlines:
+                        break
+                    btes = fd.read(8192)
+            if nbytes < minbytes or nlines < minlines:
+                return False
+        except (IOError, EOFError):
+            raise YmpRuleError(
+                None, f"Failed to read file '{fname}'"
+            )
     elif any(files_exist):
         raise YmpRuleError(
             None,

From 49f07d1c4646ecc64a51dfbbb08fad754c6da051 Mon Sep 17 00:00:00 2001
From: Elmar Pruesse
Date: Mon, 11 Dec 2023 16:15:47 -0700
Subject: [PATCH 127/133] fix(blast): bug in new checkpoint usage

---
 src/ymp/rules/blast.rules | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/src/ymp/rules/blast.rules b/src/ymp/rules/blast.rules
index d14ecf29..b95deeeb 100644
--- a/src/ymp/rules/blast.rules
+++ b/src/ymp/rules/blast.rules
@@ -181,7 +181,6 @@ with Stage("annotate_blast") as S:
             echo {input.contigs} > {output.contig_list}
             """
 
-    localrules: blastn_split_query_fasta
     checkpoint blastn_split_query_fasta:
         """Split FASTA query file into chunks for individual BLAST runs"""
         message:
@@ -189,11 +188,12 @@ with Stage("annotate_blast") as S:
         input:
             contigs = "{:prev:}/{:target:}.fasta.gz",
             dbsize = "{:this:}/{target}.blast_db_size",
-            contig_list = "{:this:}/{target}.fasta_files"
+            contig_list = "{:this:}/{target}.fasta_files",
         output:
             queries = temp(directory(
                 "{:this:}/{target}.split_queries"
-            ))
+            )),
+            query_list = "{:this:}/{target}.split_fasta_files",
         params:
             nseq_max = 100000,
             nseq_min = 10
@@ -205,6 +205,7 @@ with Stage("annotate_blast") as S:
             with open(input.contig_list, "r") as fd:
                 contigs = fd.read().strip()
 
+            fnames = []
             os.makedirs(output.queries, exist_ok=True)
             import gzip
             template = os.path.join(output.queries,"{index}.fasta")
@@ -218,6 +219,7 @@ with Stage("annotate_blast") as S:
                         fname = template.format(index=file_count)
                         with open(fname, "wb") as out:
                             out.write(b"".join(lines))
+                        fnames.append(fname)
                         seq_count = 0
                         file_count += 1
                         lines = []
@@ -226,15 +228,22 @@ with Stage("annotate_blast") as S:
                 fname = template.format(index=file_count)
                 with open(fname, "wb") as out:
                     out.write(b"".join(lines))
+                fnames.append(fname)
+            with open(output.query_list, "w") as fd:
+                fd.writelines(fname + "\n" for fname in fnames)
 
     def blastn_join_input(wildcards):
         cpt = checkpoints.blastn_split_query_fasta.get(**wildcards)
-        fastadir = cpt.output.queries
-        blastdir = re.sub("_queries$", "_results", fastadir)
-        indices = glob_wildcards(os.path.join(fastadir, '{index}.fasta'))
-        res = expand(os.path.join(blastdir, '{index}.blast7.gz'),
-                     index=indices.index)
-        return res
+        with open(cpt.output.query_list) as fd:
+            fastafiles = fd.readlines()
+        return [
+            re.sub(
+                r".split_queries/(.*).fasta$",
+                r".split_results/\1.blast7.gz",
+                fname.rstrip("\n")
+            )
+            for fname in fastafiles
+        ]
 
     localrules: blastn_join_result
     rule blastn_join_result:
@@ -248,12 +257,14 @@ with Stage("annotate_blast") as S:
         log:
"{:this:}/{target}.log" shell: + "exec >{log} 2>&1;" + "echo YMP: concatenating files;" + "echo \"{input.results}\";" "if [ -z \"{input.results}\" ]; then" - " echo YMP: making empty output >{log};" + " echo YMP: no files - making empty output;" + " exit 1;" # not blast7 format, need to fix " echo | gzip > {output};" "else " - " echo YMP: concatenating files >{log};" - " echo \"{input.results}\" >> {log};" " cat {input.results} > {output};" "fi" From 2209bea7b1851dc745d646c2c1693ad3a39736bb Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 12 Dec 2023 11:30:08 -0700 Subject: [PATCH 128/133] feat(cli/env-list): allow filtering installed/not-installed --- src/ymp/cli/env.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/ymp/cli/env.py b/src/ymp/cli/env.py index bea8e525..eb1ca452 100644 --- a/src/ymp/cli/env.py +++ b/src/ymp/cli/env.py @@ -89,8 +89,12 @@ def env(): "--reverse", "-r", is_flag=True, help="Reverse sort order" ) +@click.option( + "--installed/--not-installed", default=None, is_flag=True, + help="List only installed/not installed environments" +) @click.argument("ENVNAMES", nargs=-1) -def ls(param_all, static, dynamic, sort_col, reverse, envnames): +def ls(param_all, static, dynamic, sort_col, reverse, envnames, installed): """List conda environments""" envs = get_envs(envnames) @@ -103,6 +107,11 @@ def ls(param_all, static, dynamic, sort_col, reverse, envnames): ] table_content.sort(key=lambda row: row[sort_col].upper(), reverse=reverse) + if installed is not None: + table_content = [ + row for row in table_content + if row['installed'] == str(installed) + ] table_header = [{col: col for col in ENV_COLUMNS}] table = table_header + table_content From c29ca13e79167395f0a6ee0b9623c27913eaf475 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 12 Dec 2023 11:32:21 -0700 Subject: [PATCH 129/133] fix(cli/env-list): remove unimplemented options --- src/ymp/cli/env.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/src/ymp/cli/env.py b/src/ymp/cli/env.py index eb1ca452..6b6726e3 100644 --- a/src/ymp/cli/env.py +++ b/src/ymp/cli/env.py @@ -68,18 +68,6 @@ def env(): @env.command(name="list") -@click.option( - "--static/--no-static", default=True, - help="List environments statically defined via env.yml files" -) -@click.option( - "--dynamic/--no-dynamic", default=True, - help="List environments defined inline from rule files" -) -@click.option( - "--all", "-a", "param_all", is_flag=True, - help="List all environments, including outdated ones." 
-) @click.option( "--sort", "-s", "sort_col", type=click.Choice(ENV_COLUMNS), default=ENV_COLUMNS[0], @@ -94,7 +82,7 @@ def env(): help="List only installed/not installed environments" ) @click.argument("ENVNAMES", nargs=-1) -def ls(param_all, static, dynamic, sort_col, reverse, envnames, installed): +def ls(sort_col, reverse, envnames, installed): """List conda environments""" envs = get_envs(envnames) From db6d66697e312eee932bb151b913a07dd47e0daa Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 12 Dec 2023 11:55:14 -0700 Subject: [PATCH 130/133] feat(cli/env-list): allow showing more fields; black --- src/ymp/cli/env.py | 234 +++++++++++++++++++++++++++------------------ 1 file changed, 140 insertions(+), 94 deletions(-) diff --git a/src/ymp/cli/env.py b/src/ymp/cli/env.py index 6b6726e3..03957d32 100644 --- a/src/ymp/cli/env.py +++ b/src/ymp/cli/env.py @@ -15,7 +15,21 @@ log = logging.getLogger(__name__) # pylint: disable=invalid-name -ENV_COLUMNS = ('label', 'hash', 'address', 'installed') +ENV_COLUMNS = ("label", "hash", "address", "installed") +ENV_COLUMNS_ALL = ( + "label", + "hash", + "content_hash", + "address", + "installed", + "content_deploy", + "content_pin", + "container_img_url", + "is_containerized", + "is_named", + "archive_file", + "content", +) def get_envs(patterns=None): @@ -25,11 +39,14 @@ def get_envs(patterns=None): envnames: list of strings to match """ from ymp.env import Env + envs = Env.get_registry() if patterns: - envs = {env: envs[env] for env in envs - if any(fnmatch(env, pat) - for pat in ensure_list(patterns))} + envs = { + env: envs[env] + for env in envs + if any(fnmatch(env, pat) for pat in ensure_list(patterns)) + } return envs @@ -44,8 +61,9 @@ def get_env(envname): raise click.UsageError("Environment {} unknown".format(envname)) if len(envs) > 1: - raise click.UsageError("Multiple environments match '{}': {}" - "".format(envname, envs.keys())) + raise click.UsageError( + "Multiple environments match '{}': {}" "".format(envname, envs.keys()) + ) env = next(iter(envs.values())) if not os.path.exists(env.address): @@ -69,72 +87,89 @@ def env(): @env.command(name="list") @click.option( - "--sort", "-s", "sort_col", - type=click.Choice(ENV_COLUMNS), default=ENV_COLUMNS[0], - help="Sort by column" + "--sort", + "-s", + "sort_col", + type=click.Choice(ENV_COLUMNS), + default=ENV_COLUMNS[0], + help="Sort by column", ) +@click.option("--reverse", "-r", is_flag=True, help="Reverse sort order") @click.option( - "--reverse", "-r", is_flag=True, - help="Reverse sort order" + "--installed/--not-installed", + default=None, + is_flag=True, + help="List only installed/not installed environments", ) @click.option( - "--installed/--not-installed", default=None, is_flag=True, - help="List only installed/not installed environments" + "--extra", + "-e", + "extra_fields", + type=str, + help="Show additional fields (all: everything)", ) @click.argument("ENVNAMES", nargs=-1) -def ls(sort_col, reverse, envnames, installed): +def ls(sort_col, reverse, envnames, installed, extra_fields): """List conda environments""" envs = get_envs(envnames) + if extra_fields is None: + fields = ENV_COLUMNS + else: + extra_fields = extra_fields.split(",") + if "all" in extra_fields: + extra_fields = ENV_COLUMNS_ALL + unknown = " ,".join( + field for field in extra_fields if field not in ENV_COLUMNS_ALL + ) + if unknown: + raise click.UsageError(f"Unknown fields requested: {unknown}") + fields = [ + field + for field in ENV_COLUMNS_ALL + if field in extra_fields or field in ENV_COLUMNS + 
] table_content = [ - { - key: str(getattr(env, key)) - for key in ENV_COLUMNS - } - for env in envs.values() + {key: str(getattr(env, key)) for key in fields} for env in envs.values() ] - table_content.sort(key=lambda row: row[sort_col].upper(), - reverse=reverse) + table_content.sort(key=lambda row: row[sort_col].upper(), reverse=reverse) if installed is not None: table_content = [ - row for row in table_content - if row['installed'] == str(installed) + row for row in table_content if row["installed"] == str(installed) ] - table_header = [{col: col for col in ENV_COLUMNS}] + table_header = [{col: col for col in fields}] table = table_header + table_content - widths = {col: max(len(row[col]) for row in table) - for col in ENV_COLUMNS} + widths = {col: max(len(row[col]) for row in table) for col in fields} - lines = [" ".join("{!s:<{}}".format(row[col], widths[col]) - for col in ENV_COLUMNS) - for row in table] + lines = [ + " ".join("{!s:<{}}".format(row[col], widths[col]) for col in fields) + for row in table + ] echo("\n".join(lines)) @env.command() @snake_params @click.option( - "--reinstall", is_flag=True, - help="Delete existing environment and reinstall" + "--reinstall", is_flag=True, help="Delete existing environment and reinstall" ) @click.option( - "--no-spec", is_flag=True, - help="Don't use conda env spec even if present" + "--no-spec", is_flag=True, help="Don't use conda env spec even if present" ) @click.option( - "--no-archive", is_flag=True, - help="Delete existing archives before install" + "--no-archive", is_flag=True, help="Delete existing archives before install" ) @click.option( - "--fresh", is_flag=True, - help="Create fresh install. Implies reinstall, no-spec and no-archve" + "--fresh", + is_flag=True, + help="Create fresh install. Implies reinstall, no-spec and no-archve", ) def prepare(reinstall, no_spec, no_archive, fresh, **kwargs): "Create envs needed to build target" - kwargs['conda_create_envs_only'] = True + kwargs["conda_create_envs_only"] = True cfg = ymp.get_config() - if (fresh): + if fresh: reinstall = no_spec = no_archive = True cfg.conda.create.reinstall = reinstall cfg.conda.create.nospec = no_spec @@ -145,44 +180,34 @@ def prepare(reinstall, no_spec, no_archive, fresh, **kwargs): @env.command() +@click.option("--conda-prefix", "-p", help="Override location for conda environments") +@click.option("--conda-env-spec", "-e", help="Override conda env specs settings") +@click.option("--dry-run", "-n", is_flag=True, help="Only show what would be done") @click.option( - "--conda-prefix", "-p", - help="Override location for conda environments" -) -@click.option( - "--conda-env-spec", "-e", - help="Override conda env specs settings" -) -@click.option( - "--dry-run", "-n", is_flag=True, - help="Only show what would be done" + "--reinstall", "-r", is_flag=True, help="Delete existing environment and reinstall" ) @click.option( - "--reinstall", "-r", is_flag=True, - help="Delete existing environment and reinstall" + "--no-spec", is_flag=True, help="Don't use conda env spec even if present" ) @click.option( - "--no-spec", is_flag=True, - help="Don't use conda env spec even if present" + "--no-archive", is_flag=True, help="Delete existing archives before install" ) @click.option( - "--no-archive", is_flag=True, - help="Delete existing archives before install" -) -@click.option( - "--fresh", is_flag=True, - help="Create fresh install. Implies reinstall, no-spec and no-archve" + "--fresh", + is_flag=True, + help="Create fresh install. 
Implies reinstall, no-spec and no-archve", ) @click.argument("ENVNAMES", nargs=-1) def install( - conda_prefix, - conda_env_spec, - dry_run, - reinstall, - no_spec, - no_archive, - fresh, - envnames): + conda_prefix, + conda_env_spec, + dry_run, + reinstall, + no_spec, + no_archive, + fresh, + envnames, +): "Install conda software environments" if conda_env_spec is not None: cfg = ymp.get_config() @@ -193,8 +218,11 @@ def install( envs = get_envs(envnames) need_install = len([env for env in envs.values() if not env.installed]) if not reinstall and len(envs) != need_install: - log.warning("Creating %i environments (%i already installed)", - need_install, len(envs)-need_install) + log.warning( + "Creating %i environments (%i already installed)", + need_install, + len(envs) - need_install, + ) else: log.warning(f"Creating {len(envs)} environments.") for env in envs.values(): @@ -228,19 +256,38 @@ def remove(envnames): @env.command() -@click.option("--dest", "-d", type=click.Path(), metavar="FILE", - help="Destination file or directory. If a directory, file names" - " will be derived from environment names and selected export " - "format. Default: print to standard output.") -@click.option("--overwrite", "-f", is_flag=True, default=False, - help="Overwrite existing files") -@click.option("--create-missing", "-c", is_flag=True, default=False, - help="Create environments not yet installed") -@click.option("--skip-missing", "-s", is_flag=True, default=False, - help="Skip environments not yet installed") -@click.option("--filetype", "-t", type=click.Choice(['yml', 'txt']), - help="Select export format. " - "Default: yml unless FILE ends in '.txt'") +@click.option( + "--dest", + "-d", + type=click.Path(), + metavar="FILE", + help="Destination file or directory. If a directory, file names" + " will be derived from environment names and selected export " + "format. Default: print to standard output.", +) +@click.option( + "--overwrite", "-f", is_flag=True, default=False, help="Overwrite existing files" +) +@click.option( + "--create-missing", + "-c", + is_flag=True, + default=False, + help="Create environments not yet installed", +) +@click.option( + "--skip-missing", + "-s", + is_flag=True, + default=False, + help="Skip environments not yet installed", +) +@click.option( + "--filetype", + "-t", + type=click.Choice(["yml", "txt"]), + help="Select export format. 
" "Default: yml unless FILE ends in '.txt'", +) @click.argument("ENVNAMES", nargs=-1) def export(envnames, dest, overwrite, create_missing, skip_missing, filetype): """Export conda environments @@ -267,12 +314,13 @@ def export(envnames, dest, overwrite, create_missing, skip_missing, filetype): if skip_missing and create_missing: raise click.UsageError( - "--skip-missing and --create-missing are mutually exclusive") + "--skip-missing and --create-missing are mutually exclusive" + ) - if dest and not filetype and dest.endswith('.txt'): - filetype = 'txt' + if dest and not filetype and dest.endswith(".txt"): + filetype = "txt" if not filetype: - filetype = 'yml' + filetype = "yml" missing = [env for env in envs.values() if not env.installed] if skip_missing: @@ -301,19 +349,18 @@ def export(envnames, dest, overwrite, create_missing, skip_missing, filetype): if dest: if os.path.isdir(dest): - file_names = [os.path.join(dest, ".".join((name, filetype))) - for name in envs.keys()] + file_names = [ + os.path.join(dest, ".".join((name, filetype))) for name in envs.keys() + ] else: file_names = [dest] for fname in file_names: if not overwrite and os.path.exists(fname): - raise click.UsageError( - f"File '{fname}' exists. Use '-f' to overwrite") + raise click.UsageError(f"File '{fname}' exists. Use '-f' to overwrite") with ExitStack() as stack: - files = [stack.enter_context(open(fname, "w")) - for fname in file_names] + files = [stack.enter_context(open(fname, "w")) for fname in file_names] files_stack = stack.pop_all() else: files = [sys.stdout] @@ -339,8 +386,7 @@ def export(envnames, dest, overwrite, create_missing, skip_missing, filetype): @env.command() -@click.option("--all", "-a", "param_all", is_flag=True, - help="Delete all environments") +@click.option("--all", "-a", "param_all", is_flag=True, help="Delete all environments") @click.argument("ENVNAMES", nargs=-1) def clean(param_all): "Remove unused conda environments" From ff24b0298d9d272843a6046257c65dda8b105718 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 12 Dec 2023 12:04:54 -0700 Subject: [PATCH 131/133] feat(cli/env-list): allow writing as CSV (machine readable) --- src/ymp/cli/env.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/ymp/cli/env.py b/src/ymp/cli/env.py index 03957d32..7a7d43c2 100644 --- a/src/ymp/cli/env.py +++ b/src/ymp/cli/env.py @@ -1,3 +1,4 @@ +import csv import logging import os import shutil @@ -108,8 +109,9 @@ def env(): type=str, help="Show additional fields (all: everything)", ) +@click.option("--csv", "write_csv", is_flag=True, help="Output as machine readable CSV") @click.argument("ENVNAMES", nargs=-1) -def ls(sort_col, reverse, envnames, installed, extra_fields): +def ls(sort_col, reverse, envnames, installed, extra_fields, write_csv): """List conda environments""" envs = get_envs(envnames) if extra_fields is None: @@ -138,15 +140,20 @@ def ls(sort_col, reverse, envnames, installed, extra_fields): row for row in table_content if row["installed"] == str(installed) ] - table_header = [{col: col for col in fields}] - table = table_header + table_content - widths = {col: max(len(row[col]) for row in table) for col in fields} + if write_csv: + writer = csv.DictWriter(sys.stdout, fields) + writer.writeheader() + writer.writerows(table_content) + else: + table_header = [{col: col for col in fields}] + table = table_header + table_content + widths = {col: max(len(row[col]) for row in table) for col in fields} - lines = [ - " 
".join("{!s:<{}}".format(row[col], widths[col]) for col in fields) - for row in table - ] - echo("\n".join(lines)) + lines = [ + " ".join("{!s:<{}}".format(row[col], widths[col]) for col in fields) + for row in table + ] + echo("\n".join(lines)) @env.command() From 8c57c79b33349ca23370600ceafd5ec0be275d27 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 16 Jul 2024 13:56:31 -0600 Subject: [PATCH 132/133] Update environment.yaml --- environment.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/environment.yaml b/environment.yaml index bd37bc97..9dc6ab6b 100644 --- a/environment.yaml +++ b/environment.yaml @@ -3,11 +3,12 @@ channels: - conda-forge - bioconda dependencies: - - python >=3.7 - - snakemake-minimal >=7.32 + - python >=3.10 + - snakemake-minimal >=7.34 - mamba - conda !=4.6.11 - click >8 + - shellingham # (needed for click) - ruamel.yaml >0.15 # new api - drmaa - pandas >=0.20 # need dtype support in python csv engine From 05f05f6a11af36732884a835959225af48f01380 Mon Sep 17 00:00:00 2001 From: Elmar Pruesse Date: Tue, 16 Jul 2024 15:10:54 -0600 Subject: [PATCH 133/133] Pin Snakemake --- environment.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environment.yaml b/environment.yaml index 9dc6ab6b..066b62fa 100644 --- a/environment.yaml +++ b/environment.yaml @@ -4,10 +4,11 @@ channels: - bioconda dependencies: - python >=3.10 - - snakemake-minimal >=7.34 + - snakemake-minimal =7.32.* - mamba - conda !=4.6.11 - click >8 + - click-completion - shellingham # (needed for click) - ruamel.yaml >0.15 # new api - drmaa