From 799e9f217dceae8a9ff1692bdbbb8294e302e9b6 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 22 Jul 2024 16:38:27 +0200 Subject: [PATCH 001/148] clean up annex-remote-compute --- datalad_compute/annexremotes/compute.py | 105 ++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 datalad_compute/annexremotes/compute.py diff --git a/datalad_compute/annexremotes/compute.py b/datalad_compute/annexremotes/compute.py new file mode 100644 index 0000000..3a45d7d --- /dev/null +++ b/datalad_compute/annexremotes/compute.py @@ -0,0 +1,105 @@ +from __future__ import annotations + +from base64 import urlsafe_b64decode +from pathlib import Path +from urllib.parse import urlparse + +from annexremote import Master +from datalad_next.annexremotes import ( + SpecialRemote, + super_main +) + +from datalad_compute.utils.compute import compute + + +class ComputeRemote(SpecialRemote): + def __init__(self, annex: Master): + super().__init__(annex) + + def __del__(self): + self.close() + + def close(self) -> None: + pass + + def _check_url(self, url: str) -> bool: + return url.startswith('URL--compute:') or url.startswith('compute:') + + def prepare(self): + self.annex.debug(f'PREPARE') + + def initremote(self): + self.annex.debug(f'INITREMOTE') + + def remove(self, key: str): + self.annex.debug(f'REMOVE {key!r}') + + def transfer_store(self, key: str, local_file: str): + self.annex.debug(f'TRANSFER STORE') + + def claimurl(self, url: str) -> bool: + self.annex.debug(f'CLAIMURL {url!r}') + return self._check_url(url) + + def checkurl(self, url: str) -> bool: + self.annex.debug(f'CHECKURL {url!r}') + return self._check_url(url) + + def getcost(self) -> int: + self.annex.debug(f'GETCOST') + return 100 + + def _compute(self, compute_info, file_name: str) -> None: + template = Path(self.annex.getgitdir()).parent / '.datalad' / 'compute' / 'methods' / compute_info['method'] + arguments = { + assignment.split('=')[0]: assignment.split('=')[1] + for assignment in compute_info['parameter'].split(';') + } + if compute_info.get('dependencies', 'none') != 'none': + dependencies = { + spec.split(':')[0]: spec.split(':')[1] + for spec in compute_info['dependencies'].split(';') + if spec + } + else: + dependencies = dict() + self.annex.debug(f'COMPUTE calling compute with: {template!r} {arguments!r} {file_name!r}') + compute(template, arguments, file_name) + + def transfer_retrieve(self, key: str, file_name: str) -> None: + self.annex.debug(f'TRANSFER RETRIEVE {key!r} {file_name!r}') + urls = self.annex.geturls(key, 'compute:') + self.annex.debug(f'TRANSFER RETRIEVE urls({key!r}, "compute"): {urls!r}') + + parsed_urls = [urlparse(url) for url in urls] + + # assure a single ID + ids = set(parts.netloc for parts in parsed_urls) + assert len(ids) == 1, f"Expected a single ID, got {ids}" + + # we need "method", "parameter", and "dependencies" data + categories = ('method', 'parameter', 'dependencies') + compute_info = { + category: urlsafe_b64decode(parts.path.split('/')[2]).strip().decode() + for category in categories + for parts in parsed_urls if parts.path.startswith(f'/{category}/') + } + assert tuple(compute_info.keys()) == categories, \ + f"Expected 'method', 'parameter', and 'dependencies', got {compute_info.keys()}" + + self.annex.debug(f'TRANSFER RETRIEVE {key!r}: compute_info: {compute_info!r}, file_name: {file_name!r}') + self._compute(compute_info, file_name) + + def checkpresent(self, key: str) -> bool: + # See if any compute: URL is present + return self.annex.geturls(key, 
'compute:') != [] + + +def main(): + """cmdline entry point""" + super_main( + cls=ComputeRemote, + remote_name='compute', + description="Access to computed data", + ) From e7bf64079380ea11633953ebcdadfad4931daadc Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 22 Jul 2024 17:40:03 +0200 Subject: [PATCH 002/148] add compute-command stub --- datalad_compute/__init__.py | 30 ++++++++++++++++++ datalad_compute/commands/compute_cmd.py | 42 +++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 datalad_compute/__init__.py create mode 100644 datalad_compute/commands/compute_cmd.py diff --git a/datalad_compute/__init__.py b/datalad_compute/__init__.py new file mode 100644 index 0000000..97ee045 --- /dev/null +++ b/datalad_compute/__init__.py @@ -0,0 +1,30 @@ +"""DataLad compute extension""" + +__docformat__ = 'restructuredtext' + +import logging +lgr = logging.getLogger('datalad.compute') + +# Defines a datalad command suite. +# This variable must be bound as a setuptools entrypoint +# to be found by datalad +command_suite = ( + # description of the command suite, displayed in cmdline help + "Demo DataLad command suite", + [ + # specification of a command, any number of commands can be defined + ( + # importable module that contains the command implementation + 'datalad_compute.commands.compute_cmd', + # name of the command class implementation in above module + 'Compute', + # optional name of the command in the cmdline API + 'compute', + # optional name of the command in the Python API + 'compute' + ), + ] +) + +from . import _version +__version__ = _version.get_versions()['version'] diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py new file mode 100644 index 0000000..5f98b91 --- /dev/null +++ b/datalad_compute/commands/compute_cmd.py @@ -0,0 +1,42 @@ +"""DataLad demo command""" + +__docformat__ = 'restructuredtext' + +import logging +from os.path import curdir +from os.path import abspath + +from datalad.distribution.dataset import datasetmethod +from datalad.interface.base import Interface +from datalad.interface.base import build_doc +from datalad.interface.base import eval_results +from datalad.interface.results import get_status_dict + + +lgr = logging.getLogger('datalad.compute') + + +# decoration auto-generates standard help +@build_doc +# all commands must be derived from Interface +class Compute(Interface): + # first docstring line is used a short description in the cmdline help + # the rest is put in the verbose help and manpage + """Specify a computation and optionally execute it + """ + + # parameters of the command, must be exhaustive + _params_ = dict() + + @staticmethod + @datasetmethod(name='compute') + @eval_results + # signature must match parameter list above + # additional generic arguments are added by decorators + def __call__(): + yield get_status_dict( + action='compute', + path=abspath(curdir), + status='ok', + message='compute command NOT YET IMPLEMENTED', + ) From a7160491fe4cf92c95baa6b5cf9b2578beccb77c Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 23 Jul 2024 05:11:49 +0200 Subject: [PATCH 003/148] add compute POC --- datalad_compute/commands/compute_cmd.py | 104 +++++++++++++++++++++--- 1 file changed, 91 insertions(+), 13 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 5f98b91..e935601 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -3,14 +3,23 @@ 
__docformat__ = 'restructuredtext' import logging +import time +from base64 import urlsafe_b64encode from os.path import curdir from os.path import abspath -from datalad.distribution.dataset import datasetmethod -from datalad.interface.base import Interface -from datalad.interface.base import build_doc -from datalad.interface.base import eval_results -from datalad.interface.results import get_status_dict +from datalad_next.commands import ( + EnsureCommandParameterization, + ValidatedInterface, + Parameter, + build_doc, + datasetmethod, + eval_results, + get_status_dict, +) +from datalad_next.constraints import EnsureDataset + +from datalad_compute.utils.compute import compute lgr = logging.getLogger('datalad.compute') @@ -19,24 +28,93 @@ # decoration auto-generates standard help @build_doc # all commands must be derived from Interface -class Compute(Interface): +class Compute(ValidatedInterface): # first docstring line is used a short description in the cmdline help # the rest is put in the verbose help and manpage """Specify a computation and optionally execute it """ + _validator_ = EnsureCommandParameterization(dict( + dataset=EnsureDataset(installed=True), + )) + # parameters of the command, must be exhaustive - _params_ = dict() + _params_ = dict( + dataset=Parameter( + args=('-d', '--dataset'), + doc="""Dataset to be used as a configuration source. Beyond + reading configuration items, this command does not interact with + the dataset."""), + url_only=Parameter( + args=('-u', '--url-only'), + action="store_true", + doc="""Don't perform the computation, register an URL-key + instead"""), + url_id=Parameter( + args=('-i', '--id'), + doc="""Use as URL-id for the computation URLs"""), + template=Parameter( + args=('template',), + doc="""Name of the computing template (template should be present + in $DATASET/.datalad/compute/methods)"""), + output=Parameter( + args=('output',), + doc="""name of the output file"""), + parameters=Parameter( + args=('parameters',), + doc="""parameters in the form =""", + nargs='*'), + ) @staticmethod @datasetmethod(name='compute') @eval_results # signature must match parameter list above # additional generic arguments are added by decorators - def __call__(): + def __call__(dataset, + url_only=False, + url_id=None, + template=None, + output=None, + parameters=None + ): + + dataset = dataset.ds + print(f'dataset={dataset}') + print(f'url_only={url_only}') + print(f'url_id={url_id}') + print(f'template={template}') + print(f'output={output}') + print(f'parameters={parameters}') + + if not url_id: + url_id = str(time.time()) + + if not url_only: + parameter_dict = { + parameter.split('=')[0]: parameter.split('=')[1] + for parameter in parameters + } + template_path = dataset.pathobj / '.datalad' / 'compute' / 'methods' / template + compute(template_path, parameter_dict, output) + dataset.save() + + relaxed = ['--relaxed'] if url_only else [] + urls = get_urls(url_id, template, parameters) + for url in urls: + dataset.repo.call_annex(['addurl', url, '--file', output] + relaxed) + yield get_status_dict( - action='compute', - path=abspath(curdir), - status='ok', - message='compute command NOT YET IMPLEMENTED', - ) + action='compute', + path=abspath(curdir), + status='ok', + message=f'added urls: {urls!r} to {output!r}', + ) + + +def get_urls(url_id, template_name: str, parameters: list[str]): + method_url = 'compute://' + url_id + '/method/' + urlsafe_b64encode(template_name.encode()).decode() + parameter_url = 'compute://' + url_id + '/parameter/' + 
urlsafe_b64encode(';'.join(parameters).encode()).decode() + dependencies_url = 'compute://' + url_id + '/dependencies/' + urlsafe_b64encode('none'.encode()).decode() + + return [method_url, parameter_url, dependencies_url] From 98a21c9b40295ad27b839124ce5c4f661d194cd2 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 23 Jul 2024 11:03:31 +0200 Subject: [PATCH 004/148] support `use_shell`, ignore `dependencies` in template --- datalad_compute/utils/compute.py | 76 ++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 datalad_compute/utils/compute.py diff --git a/datalad_compute/utils/compute.py b/datalad_compute/utils/compute.py new file mode 100644 index 0000000..736a721 --- /dev/null +++ b/datalad_compute/utils/compute.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +import subprocess +from pathlib import Path +from typing import Any + +import tomllib + + +def substitute_string(format_str: str, + replacements: dict[str, str], + ) -> str: + for variable_name, replacement in replacements.items(): + place_holder = '{' + variable_name + '}' + if place_holder in format_str: + format_str = format_str.replace(place_holder, replacement) + return format_str + + +def substitute_arguments(spec: dict[str, Any], + replacements: dict[str, str], + format_list_id: str, + ) -> list[str]: + + return [ + substitute_string(str(format_str), replacements) + for format_str in spec[format_list_id] + ] + + +def get_substitutions(template: dict[str, Any], + arguments: dict[str, str], + output_path: str, + ) -> dict[str, str]: + + # Check the user specified inputs + inputs = template['inputs'] + if len(inputs) != len(arguments.keys()): + raise ValueError('Template inputs and arguments have different lengths') + if not all(input_name in arguments for input_name in inputs): + raise ValueError('Template inputs and arguments have different names') + + output_name = template['output'] + all_variables = inputs + [output_name] + if len(all_variables) != len(set(all_variables)): + raise ValueError('Template inputs/output contain duplicates') + + return { + **{ + input_name: arguments[input_name] + for input_name in inputs + }, + output_name: output_path + } + + +def compute(template_path: Path, + compute_arguments: dict[str, str], + output_path: str, + ): + with template_path.open('rb') as f: + template = tomllib.load(f) + + substitutions = get_substitutions(template, compute_arguments, output_path) + + substituted_executable = substitute_string(template['executable'], substitutions) + substituted_arguments = substitute_arguments( + template, + substitutions, + 'arguments' + ) + + if template.get('use_shell', 'false') == 'true': + subprocess.run(' '.join([substituted_executable] + substituted_arguments), shell=True) + else: + subprocess.run([substituted_executable] + substituted_arguments) From 37e20046c020cfa9bcd8337d258f3a69a4b3ac84 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 23 Jul 2024 14:53:24 +0200 Subject: [PATCH 005/148] update README.md --- README.md | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/README.md b/README.md index 181b343..a8c04e2 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,104 @@ [![Hatch project](https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg)](https://github.com/pypa/hatch) +This is a naive datalad compute extension that serves as a playground for +the datalad remake-project. + +It contains an annex remote that can compute content on demand. 
It uses template +files that specify the operations and per-data file parameters that are encoded +in annex URL-keys. It also contains the new datalad command `compute` that +can trigger the computation of content and store the parameters that are +used for content creation in the git-annex branch, where they can be used by +the annex remote to repeat the computation. + + +## Example usage + +Install the extension, create a dataset, configure it to use `compute`-URLs + + +```bash +> datalad create compute-test-1 +> cd compute-test-1 +> git config annex.security.allowed-url-schemes compute +> git config annex.security.allowed-ip-addresses all +> git config annex.security.allow-unverified-downloads ACKTHPPT +``` + +Create the template directory and a template + +```bash +> mkdir -p .datalad/compute/methods +> cat > .datalad/compute/methods/params_to_text <", + "'{output_file}'", +] +EOF +> datalad save -m "add compute method" +``` + +Create a "compute" annex special remote: +```bash +> git annex initremote compute encryption=none type=external externaltype=compute +``` + +Execute a computation and save the result: +```bash +> datalad compute params_to_text text-1.txt points=101 label=einhunderteins +``` + +The previous command will create the file `text-1.txt`: +```bash +> cat text-1.txt +generated via compute: points: 101, label: einhunderteins +``` + +Drop the content of `text-1.txt`, verify it is gone, recreate it via +`datalad get`, which "fetches" is from the compute remote: + +```bash +> datalad drop text-1.txt +> cat text-1.txt +> datalad get text-1.txt +> cat text-1.txt +``` + +Generate a speculative computation, i.e. do not perform the computation but generate an +URL-KEY with associated URLs that describe the computation that should be performed. This +is done by giving the `-u` option (url-only) to `datalad compute`. + +```bash +> datalad compute params_to_text -u text-2.txt points=303 label=dreihunderdrei +> cat text-2.txt # this will fail, because the computation has not yet been performed +``` + +`ls -l text-2.txt` will show a link to a non-existent URL-KEY with a time-stamp identifier. +`git annex whereis text-2.txt` will show the associated URLs. No computation has been +performed yet, `datalad compute` just creates an URL-KEY and associates the method-, +parameters-, and dependencies-URLs with the URL-KEY. + +Use `datalad get` to perform the computation for the first time and receive the result:: +```bash +> datalad get text-2.txt +> cat text-2.txt +``` + + +# Contributing + +See [CONTRIBUTING.md](CONTRIBUTING.md) if you are interested in internals or +contributing to the project. + ## Acknowledgements This development was supported by European Union’s Horizon research and From 4c9faa52d6a0e3001845c796a3c88e59f596710a Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 23 Jul 2024 14:58:51 +0200 Subject: [PATCH 006/148] add a POC-disclaimer to README.md --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a8c04e2..caa61c6 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,11 @@ [![Hatch project](https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg)](https://github.com/pypa/hatch) +**This code is a POC, that means: code does not +thoroughly validate inputs, names might be inconsistent.** + This is a naive datalad compute extension that serves as a playground for -the datalad remake-project. +the datalad remake-project. It contains an annex remote that can compute content on demand. 
It uses template files that specify the operations and per-data file parameters that are encoded From f78846eb3c5af524957690ffa44511c18ea812ba Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 23 Jul 2024 15:05:27 +0200 Subject: [PATCH 007/148] update python version to 3.11 in actions --- .github/workflows/docbuild.yml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .github/workflows/docbuild.yml diff --git a/.github/workflows/docbuild.yml b/.github/workflows/docbuild.yml new file mode 100644 index 0000000..a7319f8 --- /dev/null +++ b/.github/workflows/docbuild.yml @@ -0,0 +1,27 @@ +name: docs + +on: [push, pull_request] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - name: Set up environment + run: | + git config --global user.email "test@github.land" + git config --global user.name "GitHub Almighty" + - uses: actions/checkout@v4 + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: 3.11 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-devel.txt + pip install . + - name: Build docs + run: | + make -C docs html From c293c60941abd9ed7fd9b9d634847632fa736fdc Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 23 Jul 2024 15:10:56 +0200 Subject: [PATCH 008/148] update docs --- datalad_compute/tests/test_register.py | 6 ++++++ docs/source/cli_reference.rst | 7 +++++++ docs/source/python_reference.rst | 8 ++++++++ 3 files changed, 21 insertions(+) create mode 100644 datalad_compute/tests/test_register.py create mode 100644 docs/source/cli_reference.rst create mode 100644 docs/source/python_reference.rst diff --git a/datalad_compute/tests/test_register.py b/datalad_compute/tests/test_register.py new file mode 100644 index 0000000..8e2b9f9 --- /dev/null +++ b/datalad_compute/tests/test_register.py @@ -0,0 +1,6 @@ +from datalad.tests.utils_pytest import assert_result_count + + +def test_register(): + import datalad.api as da + assert hasattr(da, 'compute') diff --git a/docs/source/cli_reference.rst b/docs/source/cli_reference.rst new file mode 100644 index 0000000..2fe6ba9 --- /dev/null +++ b/docs/source/cli_reference.rst @@ -0,0 +1,7 @@ +Command line reference +====================== + +.. toctree:: + :maxdepth: 1 + + generated/man/datalad-remake diff --git a/docs/source/python_reference.rst b/docs/source/python_reference.rst new file mode 100644 index 0000000..7655dfe --- /dev/null +++ b/docs/source/python_reference.rst @@ -0,0 +1,8 @@ +High-level API commands +======================= + +.. currentmodule:: datalad.api +.. 
autosummary:: + :toctree: generated + + compute From a2948cafd7fc2cfc532ce36ec6f29d25dff371e9 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 25 Jul 2024 22:28:38 +0200 Subject: [PATCH 009/148] encode execution parameters in one URL --- datalad_compute/annexremotes/compute.py | 49 ++++++++----------------- datalad_compute/commands/compute_cmd.py | 35 +++++------------- 2 files changed, 25 insertions(+), 59 deletions(-) diff --git a/datalad_compute/annexremotes/compute.py b/datalad_compute/annexremotes/compute.py index 3a45d7d..58674d4 100644 --- a/datalad_compute/annexremotes/compute.py +++ b/datalad_compute/annexremotes/compute.py @@ -1,8 +1,10 @@ from __future__ import annotations -from base64 import urlsafe_b64decode from pathlib import Path -from urllib.parse import urlparse +from urllib.parse import ( + unquote, + urlparse, +) from annexremote import Master from datalad_next.annexremotes import ( @@ -50,46 +52,25 @@ def getcost(self) -> int: self.annex.debug(f'GETCOST') return 100 - def _compute(self, compute_info, file_name: str) -> None: - template = Path(self.annex.getgitdir()).parent / '.datalad' / 'compute' / 'methods' / compute_info['method'] - arguments = { - assignment.split('=')[0]: assignment.split('=')[1] - for assignment in compute_info['parameter'].split(';') - } - if compute_info.get('dependencies', 'none') != 'none': - dependencies = { - spec.split(':')[0]: spec.split(':')[1] - for spec in compute_info['dependencies'].split(';') - if spec - } - else: - dependencies = dict() - self.annex.debug(f'COMPUTE calling compute with: {template!r} {arguments!r} {file_name!r}') - compute(template, arguments, file_name) - def transfer_retrieve(self, key: str, file_name: str) -> None: self.annex.debug(f'TRANSFER RETRIEVE {key!r} {file_name!r}') + urls = self.annex.geturls(key, 'compute:') self.annex.debug(f'TRANSFER RETRIEVE urls({key!r}, "compute"): {urls!r}') + assert len(urls) == 1 - parsed_urls = [urlparse(url) for url in urls] - - # assure a single ID - ids = set(parts.netloc for parts in parsed_urls) - assert len(ids) == 1, f"Expected a single ID, got {ids}" - - # we need "method", "parameter", and "dependencies" data - categories = ('method', 'parameter', 'dependencies') + dependencies, method, parameters = urlparse(urls[0]).query.split('&', 2) compute_info = { - category: urlsafe_b64decode(parts.path.split('/')[2]).strip().decode() - for category in categories - for parts in parsed_urls if parts.path.startswith(f'/{category}/') + 'dependencies': dependencies.split('=', 1)[1], + 'method': Path(self.annex.getgitdir()).parent / '.datalad' / 'compute' / 'methods' / method.split('=', 1)[1], + 'parameter': { + assignment.split('=')[0]: unquote(assignment.split('=')[1]) + for assignment in parameters.split('&') + } } - assert tuple(compute_info.keys()) == categories, \ - f"Expected 'method', 'parameter', and 'dependencies', got {compute_info.keys()}" - self.annex.debug(f'TRANSFER RETRIEVE {key!r}: compute_info: {compute_info!r}, file_name: {file_name!r}') - self._compute(compute_info, file_name) + compute(compute_info['method'], compute_info['parameter'], file_name) + def checkpresent(self, key: str) -> bool: # See if any compute: URL is present diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index e935601..19f21ed 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -3,10 +3,9 @@ __docformat__ = 'restructuredtext' import logging -import time -from base64 import 
urlsafe_b64encode from os.path import curdir from os.path import abspath +from urllib.parse import quote from datalad_next.commands import ( EnsureCommandParameterization, @@ -50,9 +49,6 @@ class Compute(ValidatedInterface): action="store_true", doc="""Don't perform the computation, register an URL-key instead"""), - url_id=Parameter( - args=('-i', '--id'), - doc="""Use as URL-id for the computation URLs"""), template=Parameter( args=('template',), doc="""Name of the computing template (template should be present @@ -73,22 +69,12 @@ class Compute(ValidatedInterface): # additional generic arguments are added by decorators def __call__(dataset, url_only=False, - url_id=None, template=None, output=None, parameters=None ): dataset = dataset.ds - print(f'dataset={dataset}') - print(f'url_only={url_only}') - print(f'url_id={url_id}') - print(f'template={template}') - print(f'output={output}') - print(f'parameters={parameters}') - - if not url_id: - url_id = str(time.time()) if not url_only: parameter_dict = { @@ -100,21 +86,20 @@ def __call__(dataset, dataset.save() relaxed = ['--relaxed'] if url_only else [] - urls = get_urls(url_id, template, parameters) - for url in urls: - dataset.repo.call_annex(['addurl', url, '--file', output] + relaxed) + url = get_url(template, parameters) + dataset.repo.call_annex(['addurl', url, '--file', output] + relaxed) yield get_status_dict( action='compute', path=abspath(curdir), status='ok', - message=f'added urls: {urls!r} to {output!r}', + message=f'added url: {url!r} to {output!r}', ) -def get_urls(url_id, template_name: str, parameters: list[str]): - method_url = 'compute://' + url_id + '/method/' + urlsafe_b64encode(template_name.encode()).decode() - parameter_url = 'compute://' + url_id + '/parameter/' + urlsafe_b64encode(';'.join(parameters).encode()).decode() - dependencies_url = 'compute://' + url_id + '/dependencies/' + urlsafe_b64encode('none'.encode()).decode() - - return [method_url, parameter_url, dependencies_url] +def get_url(template_name: str, parameters: list[str]) -> str: + url_params = '&'.join( + f'{assignment.split("=", 1)[0]}={quote(assignment.split("=", 1)[1])}' + for assignment in parameters + ) + return f'compute:///?dep=&method={template_name}&' + url_params From 331ff3351d7e0715e2baa31d6b5d5612c49a889a Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 26 Jul 2024 09:44:55 +0200 Subject: [PATCH 010/148] refactor compute-annex remote code --- datalad_compute/annexremotes/compute.py | 37 +++++++++++++++++-------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/datalad_compute/annexremotes/compute.py b/datalad_compute/annexremotes/compute.py index 58674d4..33ed1de 100644 --- a/datalad_compute/annexremotes/compute.py +++ b/datalad_compute/annexremotes/compute.py @@ -1,6 +1,7 @@ from __future__ import annotations from pathlib import Path +from typing import Any from urllib.parse import ( unquote, urlparse, @@ -16,6 +17,8 @@ class ComputeRemote(SpecialRemote): + template_path = Path('.datalad/compute/methods') + def __init__(self, annex: Master): super().__init__(annex) @@ -52,26 +55,38 @@ def getcost(self) -> int: self.annex.debug(f'GETCOST') return 100 - def transfer_retrieve(self, key: str, file_name: str) -> None: - self.annex.debug(f'TRANSFER RETRIEVE {key!r} {file_name!r}') + def get_url_encoded_info(self, url: str) -> tuple[str, str, str]: + parts = urlparse(url).query.split('&', 2) + return parts[0], parts[1], parts[2] + def get_url_for_key(self, key: str) -> str: urls = self.annex.geturls(key, 'compute:') 
self.annex.debug(f'TRANSFER RETRIEVE urls({key!r}, "compute"): {urls!r}') - assert len(urls) == 1 - - dependencies, method, parameters = urlparse(urls[0]).query.split('&', 2) - compute_info = { - 'dependencies': dependencies.split('=', 1)[1], - 'method': Path(self.annex.getgitdir()).parent / '.datalad' / 'compute' / 'methods' / method.split('=', 1)[1], + return urls[0] + + def get_compute_info(self, key: str) -> dict[str, Any]: + def get_assignment_value(assignment: str) -> str: + return assignment.split('=', 1)[1] + + dependencies, method, parameters = self.get_url_encoded_info( + self.get_url_for_key(key) + ) + return { + 'dependencies': get_assignment_value(dependencies), + 'method': Path(self.annex.getgitdir()).parent + / self.template_path + / get_assignment_value(method), 'parameter': { - assignment.split('=')[0]: unquote(assignment.split('=')[1]) - for assignment in parameters.split('&') + name: unquote(value) + for name, value in map(lambda s: s.split('=', 1), parameters.split('&')) } } + + def transfer_retrieve(self, key: str, file_name: str) -> None: + compute_info = self.get_compute_info(key) self.annex.debug(f'TRANSFER RETRIEVE {key!r}: compute_info: {compute_info!r}, file_name: {file_name!r}') compute(compute_info['method'], compute_info['parameter'], file_name) - def checkpresent(self, key: str) -> bool: # See if any compute: URL is present return self.annex.geturls(key, 'compute:') != [] From da753839697f4447dfc98a604b3d612d4b5614ac Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 26 Jul 2024 10:33:41 +0200 Subject: [PATCH 011/148] change scheme to `datalad-make` --- datalad_compute/__init__.py | 4 ++++ datalad_compute/annexremotes/compute.py | 17 ++++++++++------- datalad_compute/commands/compute_cmd.py | 12 ++++++++---- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/datalad_compute/__init__.py b/datalad_compute/__init__.py index 97ee045..3783291 100644 --- a/datalad_compute/__init__.py +++ b/datalad_compute/__init__.py @@ -28,3 +28,7 @@ from . 
import _version __version__ = _version.get_versions()['version'] + + +url_scheme = 'datalad-make' +template_dir = '.datalad/compute/methods' diff --git a/datalad_compute/annexremotes/compute.py b/datalad_compute/annexremotes/compute.py index 33ed1de..4ccbd85 100644 --- a/datalad_compute/annexremotes/compute.py +++ b/datalad_compute/annexremotes/compute.py @@ -13,11 +13,14 @@ super_main ) +from datalad_compute import ( + template_dir, + url_scheme, +) from datalad_compute.utils.compute import compute class ComputeRemote(SpecialRemote): - template_path = Path('.datalad/compute/methods') def __init__(self, annex: Master): super().__init__(annex) @@ -29,7 +32,7 @@ def close(self) -> None: pass def _check_url(self, url: str) -> bool: - return url.startswith('URL--compute:') or url.startswith('compute:') + return url.startswith(f'URL--{url_scheme}:') or url.startswith(f'{url_scheme}:') def prepare(self): self.annex.debug(f'PREPARE') @@ -60,8 +63,8 @@ def get_url_encoded_info(self, url: str) -> tuple[str, str, str]: return parts[0], parts[1], parts[2] def get_url_for_key(self, key: str) -> str: - urls = self.annex.geturls(key, 'compute:') - self.annex.debug(f'TRANSFER RETRIEVE urls({key!r}, "compute"): {urls!r}') + urls = self.annex.geturls(key, f'{url_scheme}:') + self.annex.debug(f'TRANSFER RETRIEVE urls({key!r}, "{url_scheme}"): {urls!r}') return urls[0] def get_compute_info(self, key: str) -> dict[str, Any]: @@ -74,7 +77,7 @@ def get_assignment_value(assignment: str) -> str: return { 'dependencies': get_assignment_value(dependencies), 'method': Path(self.annex.getgitdir()).parent - / self.template_path + / template_dir / get_assignment_value(method), 'parameter': { name: unquote(value) @@ -88,8 +91,8 @@ def transfer_retrieve(self, key: str, file_name: str) -> None: compute(compute_info['method'], compute_info['parameter'], file_name) def checkpresent(self, key: str) -> bool: - # See if any compute: URL is present - return self.annex.geturls(key, 'compute:') != [] + # See if at least one URL with the compute url-scheme is present + return self.annex.geturls(key, f'{url_scheme}:') != [] def main(): diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 19f21ed..601b5f5 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -18,6 +18,10 @@ ) from datalad_next.constraints import EnsureDataset +from datalad_compute import ( + template_dir, + url_scheme, +) from datalad_compute.utils.compute import compute @@ -81,7 +85,7 @@ def __call__(dataset, parameter.split('=')[0]: parameter.split('=')[1] for parameter in parameters } - template_path = dataset.pathobj / '.datalad' / 'compute' / 'methods' / template + template_path = dataset.pathobj / template_dir / template compute(template_path, parameter_dict, output) dataset.save() @@ -99,7 +103,7 @@ def __call__(dataset, def get_url(template_name: str, parameters: list[str]) -> str: url_params = '&'.join( - f'{assignment.split("=", 1)[0]}={quote(assignment.split("=", 1)[1])}' - for assignment in parameters + f'{name}={quote(value)}' + for name, value in map(lambda s: s.split('=', 1), parameters) ) - return f'compute:///?dep=&method={template_name}&' + url_params + return f'{url_scheme}:///?dep=&method={template_name}&' + url_params From 1c592c10cdfad201a2ba2cf1a8116e50083155dd Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 26 Jul 2024 10:47:43 +0200 Subject: [PATCH 012/148] adapt README.md to modified URL-scheme --- README.md | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index caa61c6..1ae4410 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Install the extension, create a dataset, configure it to use `compute`-URLs ```bash > datalad create compute-test-1 > cd compute-test-1 -> git config annex.security.allowed-url-schemes compute +> git config annex.security.allowed-url-schemes datalad-make > git config annex.security.allowed-ip-addresses all > git config annex.security.allow-unverified-downloads ACKTHPPT ``` @@ -90,7 +90,7 @@ is done by giving the `-u` option (url-only) to `datalad compute`. > cat text-2.txt # this will fail, because the computation has not yet been performed ``` -`ls -l text-2.txt` will show a link to a non-existent URL-KEY with a time-stamp identifier. +`ls -l text-2.txt` will show a link to a non-existent URL-KEY. `git annex whereis text-2.txt` will show the associated URLs. No computation has been performed yet, `datalad compute` just creates an URL-KEY and associates the method-, parameters-, and dependencies-URLs with the URL-KEY. From 858c3d82ee9d0a92e2a0c40ee2da53d6763d8997 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 26 Jul 2024 10:48:45 +0200 Subject: [PATCH 013/148] add a statement about tests to README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1ae4410..796bd3b 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,8 @@ **This code is a POC, that means: code does not -thoroughly validate inputs, names might be inconsistent.** +thoroughly validate inputs, names might be inconsistent. Tests do mostly +not exist.** This is a naive datalad compute extension that serves as a playground for the datalad remake-project. From 50cc8ac8bb2b78089690d7b2092c7fd6fde9b17f Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sat, 27 Jul 2024 21:35:19 +0200 Subject: [PATCH 014/148] improve README.md --- README.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 796bd3b..0e22094 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,9 @@ [![Hatch project](https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg)](https://github.com/pypa/hatch) -**This code is a POC, that means: code does not -thoroughly validate inputs, names might be inconsistent. Tests do mostly -not exist.** +**This code is a POC, that means: ncode does not +thoroughly validate inputs, names might be inconsistent, little docs. +Tests do mostly not exist.** This is a naive datalad compute extension that serves as a playground for the datalad remake-project. @@ -83,7 +83,8 @@ Drop the content of `text-1.txt`, verify it is gone, recreate it via ``` Generate a speculative computation, i.e. do not perform the computation but generate an -URL-KEY with associated URLs that describe the computation that should be performed. This +URL-KEY with an associated URL that describes the computation that should be performed (a +"computation description URL"). This is done by giving the `-u` option (url-only) to `datalad compute`. ```bash @@ -91,10 +92,10 @@ is done by giving the `-u` option (url-only) to `datalad compute`. > cat text-2.txt # this will fail, because the computation has not yet been performed ``` -`ls -l text-2.txt` will show a link to a non-existent URL-KEY. -`git annex whereis text-2.txt` will show the associated URLs. 
No computation has been -performed yet, `datalad compute` just creates an URL-KEY and associates the method-, -parameters-, and dependencies-URLs with the URL-KEY. +`ls -l text-2.txt` will show a link to a not-downloaded URL-KEY. +`git annex whereis text-2.txt` will show the associated computation description URL. +No computation has been performed yet, `datalad compute` just creates an URL-KEY and +associates a computation description URL with the URL-KEY. Use `datalad get` to perform the computation for the first time and receive the result:: ```bash From 406e232a4e5b15f2fa23070b8c3c2e4c9ef23abe Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 30 Aug 2024 17:06:15 +0200 Subject: [PATCH 015/148] fix a typo --- README.md | 2 +- datalad_compute/dataprovider/gitsource.py | 0 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 datalad_compute/dataprovider/gitsource.py diff --git a/README.md b/README.md index 0e22094..c0a076c 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ URL-KEY with an associated URL that describes the computation that should be per is done by giving the `-u` option (url-only) to `datalad compute`. ```bash -> datalad compute params_to_text -u text-2.txt points=303 label=dreihunderdrei +> datalad compute params_to_text -u text-2.txt points=303 label=dreihundertdrei > cat text-2.txt # this will fail, because the computation has not yet been performed ``` diff --git a/datalad_compute/dataprovider/gitsource.py b/datalad_compute/dataprovider/gitsource.py new file mode 100644 index 0000000..e69de29 From 61e842b7032d514c6962a7b80f40c488260ac998 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 30 Aug 2024 17:07:14 +0200 Subject: [PATCH 016/148] [experimental] start a git data provider --- datalad_compute/dataprovider/gitsource.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/datalad_compute/dataprovider/gitsource.py b/datalad_compute/dataprovider/gitsource.py index e69de29..6588f43 100644 --- a/datalad_compute/dataprovider/gitsource.py +++ b/datalad_compute/dataprovider/gitsource.py @@ -0,0 +1,19 @@ + + +from argparse import ArgumentParser + + +argument_parser = ArgumentParser() +argument_parser.add_argument('source', help='Source URL') +argument_parser.add_argument('-v', '--version', help='Version of the source (sha or tag)') +argument_parser.add_argument('-p', '--pattern', help='File pattern of files that should be provisioned') + + +def main(): + arguments = argument_parser.parse_args() + + + + +if __name__ == '__main__': + main() From e83b4a62a571072642606dacc21ba08dac2f130e Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 2 Sep 2024 15:34:31 +0200 Subject: [PATCH 017/148] start one-to-many support --- datalad_compute/annexremotes/compute.py | 9 +++---- datalad_compute/commands/compute_cmd.py | 31 ++++++++++++++----------- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/datalad_compute/annexremotes/compute.py b/datalad_compute/annexremotes/compute.py index 4ccbd85..7896d74 100644 --- a/datalad_compute/annexremotes/compute.py +++ b/datalad_compute/annexremotes/compute.py @@ -58,9 +58,9 @@ def getcost(self) -> int: self.annex.debug(f'GETCOST') return 100 - def get_url_encoded_info(self, url: str) -> tuple[str, str, str]: - parts = urlparse(url).query.split('&', 2) - return parts[0], parts[1], parts[2] + def get_url_encoded_info(self, url: str) -> tuple[str, str, str, str]: + parts = urlparse(url).query.split('&', 3) + return parts[0], parts[1], parts[2], parts[3] def get_url_for_key(self, 
key: str) -> str: urls = self.annex.geturls(key, f'{url_scheme}:') @@ -71,7 +71,7 @@ def get_compute_info(self, key: str) -> dict[str, Any]: def get_assignment_value(assignment: str) -> str: return assignment.split('=', 1)[1] - dependencies, method, parameters = self.get_url_encoded_info( + dependencies, method, output, parameters = self.get_url_encoded_info( self.get_url_for_key(key) ) return { @@ -79,6 +79,7 @@ def get_assignment_value(assignment: str) -> str: 'method': Path(self.annex.getgitdir()).parent / template_dir / get_assignment_value(method), + 'output': get_assignment_value(output), 'parameter': { name: unquote(value) for name, value in map(lambda s: s.split('=', 1), parameters.split('&')) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 601b5f5..839df89 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -52,18 +52,19 @@ class Compute(ValidatedInterface): args=('-u', '--url-only'), action="store_true", doc="""Don't perform the computation, register an URL-key - instead"""), + instead. A `git annex get ` will trigger the computation"""), template=Parameter( args=('template',), doc="""Name of the computing template (template should be present in $DATASET/.datalad/compute/methods)"""), - output=Parameter( - args=('output',), - doc="""name of the output file"""), + outputs=Parameter( + args=('--output',), + action='append', + doc="""Name of an output file"""), parameters=Parameter( - args=('parameters',), - doc="""parameters in the form =""", - nargs='*'), + args=('--parameter',), + action='append', + doc="""Input parameter in the form ="""), ) @staticmethod @@ -74,7 +75,7 @@ class Compute(ValidatedInterface): def __call__(dataset, url_only=False, template=None, - output=None, + outputs=None, parameters=None ): @@ -86,12 +87,13 @@ def __call__(dataset, for parameter in parameters } template_path = dataset.pathobj / template_dir / template - compute(template_path, parameter_dict, output) + compute(template_path, parameter_dict, outputs) dataset.save() relaxed = ['--relaxed'] if url_only else [] - url = get_url(template, parameters) - dataset.repo.call_annex(['addurl', url, '--file', output] + relaxed) + for output in outputs: + url = get_url(template, parameters, output) + dataset.repo.call_annex(['addurl', url, '--file', output] + relaxed) yield get_status_dict( action='compute', @@ -101,9 +103,12 @@ def __call__(dataset, ) -def get_url(template_name: str, parameters: list[str]) -> str: +def get_url(template_name: str, parameters: list[str], output: str) -> str: url_params = '&'.join( f'{name}={quote(value)}' for name, value in map(lambda s: s.split('=', 1), parameters) ) - return f'{url_scheme}:///?dep=&method={template_name}&' + url_params + return ( + f'{url_scheme}:///?dep=&method={template_name}&output={quote(output)}&' + + url_params + ) From 3ddfe1a1083a7cdb4f3cb8fe4bbed2e4366c42df Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 5 Sep 2024 14:44:19 +0200 Subject: [PATCH 018/148] add a simple git-worktree provider --- datalad_compute/annexremotes/compute.py | 17 ++++-- datalad_compute/commands/compute_cmd.py | 57 ++++++++++++------ datalad_compute/dataprovider/gitsource.py | 19 ------ datalad_compute/dataprovider/gitworktree.py | 66 +++++++++++++++++++++ datalad_compute/utils/compute.py | 17 ++---- 5 files changed, 121 insertions(+), 55 deletions(-) delete mode 100644 datalad_compute/dataprovider/gitsource.py create mode 100644 
datalad_compute/dataprovider/gitworktree.py diff --git a/datalad_compute/annexremotes/compute.py b/datalad_compute/annexremotes/compute.py index 7896d74..6ea843e 100644 --- a/datalad_compute/annexremotes/compute.py +++ b/datalad_compute/annexremotes/compute.py @@ -1,5 +1,8 @@ from __future__ import annotations +import contextlib +import shutil +import tempfile from pathlib import Path from typing import Any from urllib.parse import ( @@ -68,18 +71,18 @@ def get_url_for_key(self, key: str) -> str: return urls[0] def get_compute_info(self, key: str) -> dict[str, Any]: - def get_assignment_value(assignment: str) -> str: + def get_assigned_value(assignment: str) -> str: return assignment.split('=', 1)[1] dependencies, method, output, parameters = self.get_url_encoded_info( self.get_url_for_key(key) ) return { - 'dependencies': get_assignment_value(dependencies), + 'dependencies': get_assigned_value(dependencies), 'method': Path(self.annex.getgitdir()).parent / template_dir - / get_assignment_value(method), - 'output': get_assignment_value(output), + / get_assigned_value(method), + 'output': get_assigned_value(output), 'parameter': { name: unquote(value) for name, value in map(lambda s: s.split('=', 1), parameters.split('&')) @@ -89,7 +92,11 @@ def get_assignment_value(assignment: str) -> str: def transfer_retrieve(self, key: str, file_name: str) -> None: compute_info = self.get_compute_info(key) self.annex.debug(f'TRANSFER RETRIEVE {key!r}: compute_info: {compute_info!r}, file_name: {file_name!r}') - compute(compute_info['method'], compute_info['parameter'], file_name) + with tempfile.TemporaryDirectory() as tmpdir: + absolute_method = compute_info['method'].absolute() + with contextlib.chdir(tmpdir): + compute(absolute_method, compute_info['parameter']) + shutil.copy(Path(tmpdir) / compute_info['output'], file_name) def checkpresent(self, key: str) -> bool: # See if at least one URL with the compute url-scheme is present diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 839df89..63fd620 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -7,6 +7,23 @@ from os.path import abspath from urllib.parse import quote +from datalad_next.constraints import ( + AnyOf, + EnsureChoice, + EnsureDataset, + EnsureGeneratorFromFileLike, + EnsureIterableOf, + EnsureJSON, + EnsureListOf, + EnsureMapping, + EnsurePath, + EnsureStr, + EnsureURL, + EnsureValue, + WithDescription, +) + + from datalad_next.commands import ( EnsureCommandParameterization, ValidatedInterface, @@ -39,6 +56,8 @@ class Compute(ValidatedInterface): _validator_ = EnsureCommandParameterization(dict( dataset=EnsureDataset(installed=True), + output=EnsureListOf(EnsureStr(min_len=3)), + parameter=EnsureListOf(EnsureStr(min_len=3)), )) # parameters of the command, must be exhaustive @@ -57,12 +76,12 @@ class Compute(ValidatedInterface): args=('template',), doc="""Name of the computing template (template should be present in $DATASET/.datalad/compute/methods)"""), - outputs=Parameter( - args=('--output',), + output=Parameter( + args=('-o', '--output',), action='append', doc="""Name of an output file"""), - parameters=Parameter( - args=('--parameter',), + parameter=Parameter( + args=('-p', '--parameter',), action='append', doc="""Input parameter in the form ="""), ) @@ -75,32 +94,32 @@ class Compute(ValidatedInterface): def __call__(dataset, url_only=False, template=None, - outputs=None, - parameters=None + output=None, + parameter=None ): dataset 
= dataset.ds if not url_only: parameter_dict = { - parameter.split('=')[0]: parameter.split('=')[1] - for parameter in parameters + p.split('=')[0]: p.split('=')[1] + for p in parameter } template_path = dataset.pathobj / template_dir / template - compute(template_path, parameter_dict, outputs) + compute(template_path, parameter_dict) dataset.save() relaxed = ['--relaxed'] if url_only else [] - for output in outputs: - url = get_url(template, parameters, output) - dataset.repo.call_annex(['addurl', url, '--file', output] + relaxed) - - yield get_status_dict( - action='compute', - path=abspath(curdir), - status='ok', - message=f'added url: {url!r} to {output!r}', - ) + for out in output: + url = get_url(template, parameter, out) + dataset.repo.call_annex(['addurl', url, '--file', out] + relaxed) + + yield get_status_dict( + action='compute', + path=abspath(curdir), + status='ok', + message=f'added url: {url!r} to {out!r}', + ) def get_url(template_name: str, parameters: list[str], output: str) -> str: diff --git a/datalad_compute/dataprovider/gitsource.py b/datalad_compute/dataprovider/gitsource.py deleted file mode 100644 index 6588f43..0000000 --- a/datalad_compute/dataprovider/gitsource.py +++ /dev/null @@ -1,19 +0,0 @@ - - -from argparse import ArgumentParser - - -argument_parser = ArgumentParser() -argument_parser.add_argument('source', help='Source URL') -argument_parser.add_argument('-v', '--version', help='Version of the source (sha or tag)') -argument_parser.add_argument('-p', '--pattern', help='File pattern of files that should be provisioned') - - -def main(): - arguments = argument_parser.parse_args() - - - - -if __name__ == '__main__': - main() diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py new file mode 100644 index 0000000..e05def4 --- /dev/null +++ b/datalad_compute/dataprovider/gitworktree.py @@ -0,0 +1,66 @@ +""" +A data provisioner that works with local git repositories. +Data is provisioned in a temporary worktree +Currently there is no support for subdatasets +""" +from __future__ import annotations + +import subprocess +import tempfile +from argparse import ArgumentParser +from contextlib import chdir +from pathlib import Path + + +argument_parser = ArgumentParser() +argument_parser.add_argument('dataset', help='Path to source dataset') +argument_parser.add_argument( + '-v', '--version', + help='Version of the source (sha or tag). If not given the default branch ' + 'will be used', +) +argument_parser.add_argument( + '-p', '--pattern', + action='append', + help='File pattern of files that should be provisioned. 
If not given, the ' + 'complete repository will be provisioned', +) + + +def provide(dataset: str, + version: str | None, + pattern: list[str] | None, + ) -> Path: + + dataset = Path(dataset) + worktree_dir = temporary_worktree(dataset) + if version: + with chdir(worktree_dir): + subprocess.run(['git', 'checkout', version], check=True) + if pattern: + for p in pattern: + subprocess.run(['git', 'annex', 'get', p], check=True) + else: + subprocess.run(['git', 'annex', 'get'], check=True) + return worktree_dir + + +def temporary_worktree(dataset: Path) -> Path: + worktree_dir = tempfile.TemporaryDirectory().name + with chdir(dataset): + subprocess.run(['git', 'worktree', 'add', worktree_dir], check=True) + return Path(worktree_dir) + + +def main(): + arguments = argument_parser.parse_args() + provision_dir = provide( + arguments.dataset, + arguments.version, + arguments.pattern, + ) + print(provision_dir) + + +if __name__ == '__main__': + main() diff --git a/datalad_compute/utils/compute.py b/datalad_compute/utils/compute.py index 736a721..6eba07c 100644 --- a/datalad_compute/utils/compute.py +++ b/datalad_compute/utils/compute.py @@ -30,7 +30,6 @@ def substitute_arguments(spec: dict[str, Any], def get_substitutions(template: dict[str, Any], arguments: dict[str, str], - output_path: str, ) -> dict[str, str]: # Check the user specified inputs @@ -40,28 +39,22 @@ def get_substitutions(template: dict[str, Any], if not all(input_name in arguments for input_name in inputs): raise ValueError('Template inputs and arguments have different names') - output_name = template['output'] - all_variables = inputs + [output_name] - if len(all_variables) != len(set(all_variables)): - raise ValueError('Template inputs/output contain duplicates') + if len(inputs) != len(set(inputs)): + raise ValueError('Template inputs contain duplicates') return { - **{ - input_name: arguments[input_name] - for input_name in inputs - }, - output_name: output_path + input_name: arguments[input_name] + for input_name in inputs } def compute(template_path: Path, compute_arguments: dict[str, str], - output_path: str, ): with template_path.open('rb') as f: template = tomllib.load(f) - substitutions = get_substitutions(template, compute_arguments, output_path) + substitutions = get_substitutions(template, compute_arguments) substituted_executable = substitute_string(template['executable'], substitutions) substituted_arguments = substitute_arguments( From 9075a400286a4a7a16b8f323718eda48a6875d73 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 10 Sep 2024 17:27:54 +0200 Subject: [PATCH 019/148] add support for subdatasets in provider --- datalad_compute/dataprovider/gitworktree.py | 84 ++++++++++++++------- datalad_compute/executors/simple.py | 0 2 files changed, 56 insertions(+), 28 deletions(-) create mode 100644 datalad_compute/executors/simple.py diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index e05def4..d06488c 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -1,7 +1,7 @@ """ A data provisioner that works with local git repositories. -Data is provisioned in a temporary worktree -Currently there is no support for subdatasets +Data is provisioned in a temporary worktree. All subdatasets +are currently also provisioned. 
""" from __future__ import annotations @@ -11,53 +11,81 @@ from contextlib import chdir from pathlib import Path +from datalad.distribution.dataset import Dataset + argument_parser = ArgumentParser() -argument_parser.add_argument('dataset', help='Path to source dataset') argument_parser.add_argument( - '-v', '--version', - help='Version of the source (sha or tag). If not given the default branch ' - 'will be used', + 'dataset', + default='.', + help='Path to source dataset (default: current directory)', +) +argument_parser.add_argument( + '-b', '--branch', + help='Branch (name, sha, or tag) that should be used. If not given the ' + 'default branch will be used', ) argument_parser.add_argument( - '-p', '--pattern', + '-i', '--input', action='append', - help='File pattern of files that should be provisioned. If not given, the ' - 'complete repository will be provisioned', + help='Name of a file that should be provisioned (use multiple times to ' + 'define multiple inputs). If not provided, the complete dataset, ' + 'including all subdatasets, will be provisioned', ) def provide(dataset: str, - version: str | None, - pattern: list[str] | None, + branch: str | None = None, + input_files: list[str] | None = None, ) -> Path: - dataset = Path(dataset) - worktree_dir = temporary_worktree(dataset) - if version: - with chdir(worktree_dir): - subprocess.run(['git', 'checkout', version], check=True) - if pattern: - for p in pattern: - subprocess.run(['git', 'annex', 'get', p], check=True) - else: - subprocess.run(['git', 'annex', 'get'], check=True) + worktree_dir = Path(tempfile.TemporaryDirectory().name) + # Get all datasets including subdatasets into the worktree + provide_datasets( + Dataset(dataset), + worktree_dir=worktree_dir, + temp_branch=worktree_dir.name, + source_branch=branch, + ) + + # Fetch file content + with chdir(worktree_dir): + if input_files: + for p in input_files: + subprocess.run(['datalad', 'get', p], check=True) + else: + subprocess.run(['datalad', 'get', '-r'], check=True) return worktree_dir -def temporary_worktree(dataset: Path) -> Path: - worktree_dir = tempfile.TemporaryDirectory().name - with chdir(dataset): - subprocess.run(['git', 'worktree', 'add', worktree_dir], check=True) - return Path(worktree_dir) +def provide_datasets(dataset: Dataset, + worktree_dir: Path, + temp_branch: str, + source_branch: str | None = None, + ) -> None: + + with chdir(dataset.path): + args = ['git', 'worktree', 'add', '-b', temp_branch, str(worktree_dir)] + ( + [source_branch] if source_branch else [] + ) + + subprocess.run(args, check=True) + for subdataset in dataset.subdatasets(): + subdataset_path = Path(subdataset['path']).relative_to(dataset.pathobj) + dataset.install(path=subdataset_path) + provide_datasets( + Dataset(subdataset_path), + worktree_dir / subdataset_path, + temp_branch, + ) def main(): arguments = argument_parser.parse_args() provision_dir = provide( arguments.dataset, - arguments.version, - arguments.pattern, + arguments.branch, + arguments.input, ) print(provision_dir) diff --git a/datalad_compute/executors/simple.py b/datalad_compute/executors/simple.py new file mode 100644 index 0000000..e69de29 From c4d78de1e68c588a0d92854cbaaac150b0659bf7 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 10 Sep 2024 19:45:56 +0200 Subject: [PATCH 020/148] add delete functionality --- datalad_compute/dataprovider/gitworktree.py | 27 +++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/datalad_compute/dataprovider/gitworktree.py 
b/datalad_compute/dataprovider/gitworktree.py index d06488c..e1b12de 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -5,6 +5,7 @@ """ from __future__ import annotations +import shutil import subprocess import tempfile from argparse import ArgumentParser @@ -20,6 +21,11 @@ default='.', help='Path to source dataset (default: current directory)', ) +argument_parser.add_argument( + '-d', '--delete', + help='Delete the temporary worktree WORKTREE that belongs the the ' + 'dataset (cannot be used with `-b`, `--branch`, `-i`, or `--input`)', +) argument_parser.add_argument( '-b', '--branch', help='Branch (name, sha, or tag) that should be used. If not given the ' @@ -34,6 +40,22 @@ ) +def remove(dataset: str, + worktree: str + ) -> None: + + shutil.rmtree(worktree) + dataset = Dataset(dataset) + prune_worktrees(dataset) + + +def prune_worktrees(dataset: Dataset) -> None: + with chdir(dataset.path): + subprocess.run(['git', 'worktree', 'prune'], check=True) + for result in dataset.subdatasets(): + prune_worktrees(Dataset(result['path'])) + + def provide(dataset: str, branch: str | None = None, input_files: list[str] | None = None, @@ -82,6 +104,11 @@ def provide_datasets(dataset: Dataset, def main(): arguments = argument_parser.parse_args() + if arguments.delete: + if arguments.branch or arguments.input: + raise ValueError('Cannot use `-d`, `--delete` with `-b`, `--branch`, `-i`, `--input`') + remove(arguments.dataset, arguments.delete) + return provision_dir = provide( arguments.dataset, arguments.branch, From 50a9b457f8733bae18d3b023552993018ecdc52b Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 11 Sep 2024 11:32:45 +0200 Subject: [PATCH 021/148] disable result rendering --- datalad_compute/dataprovider/__init__.py | 0 datalad_compute/dataprovider/gitworktree.py | 21 ++++++++++--------- .../dataprovider/tests/__init__.py | 0 .../dataprovider/tests/test_gitworktree.py | 0 4 files changed, 11 insertions(+), 10 deletions(-) create mode 100644 datalad_compute/dataprovider/__init__.py create mode 100644 datalad_compute/dataprovider/tests/__init__.py create mode 100644 datalad_compute/dataprovider/tests/test_gitworktree.py diff --git a/datalad_compute/dataprovider/__init__.py b/datalad_compute/dataprovider/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index e1b12de..9dacd01 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -6,13 +6,13 @@ from __future__ import annotations import shutil -import subprocess import tempfile from argparse import ArgumentParser from contextlib import chdir from pathlib import Path -from datalad.distribution.dataset import Dataset +from datalad_next.datasets import Dataset +from datalad_next.runners import call_git_success argument_parser = ArgumentParser() @@ -51,8 +51,8 @@ def remove(dataset: str, def prune_worktrees(dataset: Dataset) -> None: with chdir(dataset.path): - subprocess.run(['git', 'worktree', 'prune'], check=True) - for result in dataset.subdatasets(): + call_git_success(['worktree', 'prune']) + for result in dataset.subdatasets(result_renderer='disabled'): prune_worktrees(Dataset(result['path'])) @@ -71,12 +71,13 @@ def provide(dataset: str, ) # Fetch file content + work_dataset = Dataset(worktree_dir) with chdir(worktree_dir): if input_files: for p in input_files: - subprocess.run(['datalad', 'get', p], check=True) + 
work_dataset.get(p, result_renderer='disabled') else: - subprocess.run(['datalad', 'get', '-r'], check=True) + work_dataset.get(recursive=True, result_renderer='disabled') return worktree_dir @@ -87,14 +88,14 @@ def provide_datasets(dataset: Dataset, ) -> None: with chdir(dataset.path): - args = ['git', 'worktree', 'add', '-b', temp_branch, str(worktree_dir)] + ( + args = ['worktree', 'add', '-b', temp_branch, str(worktree_dir)] + ( [source_branch] if source_branch else [] ) - subprocess.run(args, check=True) - for subdataset in dataset.subdatasets(): + call_git_success(args) + for subdataset in dataset.subdatasets(result_renderer='disabled'): subdataset_path = Path(subdataset['path']).relative_to(dataset.pathobj) - dataset.install(path=subdataset_path) + dataset.install(path=subdataset_path, result_renderer='disabled') provide_datasets( Dataset(subdataset_path), worktree_dir / subdataset_path, diff --git a/datalad_compute/dataprovider/tests/__init__.py b/datalad_compute/dataprovider/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datalad_compute/dataprovider/tests/test_gitworktree.py b/datalad_compute/dataprovider/tests/test_gitworktree.py new file mode 100644 index 0000000..e69de29 From d07100f495620a39b3ee94c8443a018fad2fea6a Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 11 Sep 2024 11:33:17 +0200 Subject: [PATCH 022/148] add tests for gitworktree-provider --- .../dataprovider/tests/test_gitworktree.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/datalad_compute/dataprovider/tests/test_gitworktree.py b/datalad_compute/dataprovider/tests/test_gitworktree.py index e69de29..542d568 100644 --- a/datalad_compute/dataprovider/tests/test_gitworktree.py +++ b/datalad_compute/dataprovider/tests/test_gitworktree.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +import tempfile +from contextlib import chdir + +from datalad_next.datasets import Dataset +from datalad_next.runners import call_git_lines + +from ..gitworktree import ( + provide, + remove, +) + + +def create_subdatasets(parent_dataset: Dataset, + subdataset_levels: int = 2, + level_id: int = 0, + ): + if subdataset_levels == 0: + return + + subdataset = Dataset(parent_dataset.pathobj / f'subds{level_id}') + subdataset.create(result_renderer='disabled') + create_subdatasets(subdataset, subdataset_levels - 1, level_id + 1) + (subdataset.pathobj / f'a{level_id}.txt').write_text(f'a{level_id}') + (subdataset.pathobj / f'b{level_id}.txt').write_text(f'b{level_id}') + subdataset.save(result_renderer='disabled') + + +def create_ds_hierarchy(subdataset_levels: int = 2): + dataset = Dataset(tempfile.TemporaryDirectory().name) + dataset.create(force=True, result_renderer='disabled') + create_subdatasets(dataset, subdataset_levels) + (dataset.pathobj / 'a.txt').write_text('a') + (dataset.pathobj / 'b.txt').write_text('b') + dataset.save(result_renderer='disabled') + return dataset + + +def test_worktree_basic(): + dataset = create_ds_hierarchy(3) + worktree = Dataset(provide(dataset.path)) + + r_orig = [r['gitmodule_url'] for r in dataset.subdatasets(recursive=True, result_renderer='disabled')] + r_worktree = [r['gitmodule_url'] for r in worktree.subdatasets(recursive=True, result_renderer='disabled')] + assert r_orig == r_worktree + + remove(dataset.path, worktree.path) + + def check_deleted_worktrees(ds: Dataset): + with chdir(ds.path): + for line in call_git_lines(['worktree', 'list']): + directory = line.split()[0] + assert directory == ds.path + for sub_ds in 
ds.subdatasets(result_renderer='disabled'): + check_deleted_worktrees(Dataset(sub_ds['path'])) + + check_deleted_worktrees(dataset) + dataset.drop(reckless='kill', result_renderer='disabled') From 1f33e2a7e85cabdf7252b54f9a9b174e769dcefb Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 12 Sep 2024 10:33:08 +0200 Subject: [PATCH 023/148] improve gitworktree-provisioning --- datalad_compute/dataprovider/gitworktree.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index 9dacd01..27d3eb6 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -66,7 +66,6 @@ def provide(dataset: str, provide_datasets( Dataset(dataset), worktree_dir=worktree_dir, - temp_branch=worktree_dir.name, source_branch=branch, ) @@ -83,10 +82,10 @@ def provide(dataset: str, def provide_datasets(dataset: Dataset, worktree_dir: Path, - temp_branch: str, source_branch: str | None = None, ) -> None: + temp_branch = worktree_dir.name with chdir(dataset.path): args = ['worktree', 'add', '-b', temp_branch, str(worktree_dir)] + ( [source_branch] if source_branch else [] @@ -99,7 +98,7 @@ def provide_datasets(dataset: Dataset, provide_datasets( Dataset(subdataset_path), worktree_dir / subdataset_path, - temp_branch, + None, # Use default branches for subdatasets ) From 44b9a6098ce244257d82ac27e1b5d4bf8f395947 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 12 Sep 2024 14:35:07 +0200 Subject: [PATCH 024/148] improve cli-docs --- datalad_compute/dataprovider/gitworktree.py | 24 +++++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index 27d3eb6..c444873 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -23,20 +23,23 @@ ) argument_parser.add_argument( '-d', '--delete', + metavar='WORKTREE', help='Delete the temporary worktree WORKTREE that belongs the the ' 'dataset (cannot be used with `-b`, `--branch`, `-i`, or `--input`)', ) argument_parser.add_argument( '-b', '--branch', - help='Branch (name, sha, or tag) that should be used. If not given the ' - 'default branch will be used', + help='Branch (name, sha, or tag) of `dataset` that should be provisioned. ' + 'If not given the default branch will be used', ) argument_parser.add_argument( '-i', '--input', action='append', - help='Name of a file that should be provisioned (use multiple times to ' - 'define multiple inputs). If not provided, the complete dataset, ' - 'including all subdatasets, will be provisioned', + metavar='PATH', + help='Path of a file that should be provisioned (relative from dataset ' + 'root). 
If not provided, the complete dataset, including all data in ' + 'all subdatasets, will be provisioned (use multiple times to define ' + 'multiple inputs)', ) @@ -88,7 +91,9 @@ def provide_datasets(dataset: Dataset, temp_branch = worktree_dir.name with chdir(dataset.path): args = ['worktree', 'add', '-b', temp_branch, str(worktree_dir)] + ( - [source_branch] if source_branch else [] + [source_branch] + if source_branch + else [] ) call_git_success(args) @@ -105,10 +110,15 @@ def provide_datasets(dataset: Dataset, def main(): arguments = argument_parser.parse_args() if arguments.delete: + if arguments.branch or arguments.input: - raise ValueError('Cannot use `-d`, `--delete` with `-b`, `--branch`, `-i`, `--input`') + raise ValueError( + 'Cannot use `-d`, `--delete` with `-b`, `--branch`,' + ' `-i`, or `--input`') + remove(arguments.dataset, arguments.delete) return + provision_dir = provide( arguments.dataset, arguments.branch, From 8c06941de148191aa99b9b28612287233feed477 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 16 Sep 2024 11:12:19 +0200 Subject: [PATCH 025/148] enforce input specification for provision --- datalad_compute/dataprovider/gitworktree.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index c444873..a09473f 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -72,14 +72,11 @@ def provide(dataset: str, source_branch=branch, ) - # Fetch file content + # Fetch file content in the worktree work_dataset = Dataset(worktree_dir) with chdir(worktree_dir): - if input_files: - for p in input_files: - work_dataset.get(p, result_renderer='disabled') - else: - work_dataset.get(recursive=True, result_renderer='disabled') + for p in input_files: + work_dataset.get(p, result_renderer='disabled') return worktree_dir @@ -119,6 +116,9 @@ def main(): remove(arguments.dataset, arguments.delete) return + if not arguments.input: + raise ValueError('At least one input file must be provided') + provision_dir = provide( arguments.dataset, arguments.branch, From 77cce97a58e5ff52fb79e93e5787ebe6bbaa6562 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 16 Sep 2024 11:57:45 +0200 Subject: [PATCH 026/148] add root id and default version to URLs --- datalad_compute/commands/compute_cmd.py | 133 +++++++++++++++--------- 1 file changed, 85 insertions(+), 48 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 63fd620..dabc46f 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -1,29 +1,15 @@ """DataLad demo command""" +from __future__ import annotations + __docformat__ = 'restructuredtext' +import json import logging -from os.path import curdir from os.path import abspath +from pathlib import Path from urllib.parse import quote -from datalad_next.constraints import ( - AnyOf, - EnsureChoice, - EnsureDataset, - EnsureGeneratorFromFileLike, - EnsureIterableOf, - EnsureJSON, - EnsureListOf, - EnsureMapping, - EnsurePath, - EnsureStr, - EnsureURL, - EnsureValue, - WithDescription, -) - - from datalad_next.commands import ( EnsureCommandParameterization, ValidatedInterface, @@ -33,7 +19,13 @@ eval_results, get_status_dict, ) -from datalad_next.constraints import EnsureDataset +from datalad_next.constraints import ( + EnsureDataset, + EnsureListOf, + EnsureStr, +) +from datalad_next.datasets import Dataset 
+from datalad_next.runners import call_git_oneline, call_git_success from datalad_compute import ( template_dir, @@ -56,7 +48,8 @@ class Compute(ValidatedInterface): _validator_ = EnsureCommandParameterization(dict( dataset=EnsureDataset(installed=True), - output=EnsureListOf(EnsureStr(min_len=3)), + input=EnsureListOf(EnsureStr(min_len=1), min_len=1), + output=EnsureListOf(EnsureStr(min_len=1), min_len=1), parameter=EnsureListOf(EnsureStr(min_len=3)), )) @@ -64,70 +57,114 @@ class Compute(ValidatedInterface): _params_ = dict( dataset=Parameter( args=('-d', '--dataset'), - doc="""Dataset to be used as a configuration source. Beyond - reading configuration items, this command does not interact with - the dataset."""), + doc="Dataset to be used as a configuration source. Beyond " + "reading configuration items, this command does not interact with " + "the dataset."), url_only=Parameter( args=('-u', '--url-only'), action="store_true", - doc="""Don't perform the computation, register an URL-key - instead. A `git annex get ` will trigger the computation"""), + doc="Don't perform the computation, register an URL-key " + "instead. A `git annex get ` will trigger the computation"), template=Parameter( args=('template',), - doc="""Name of the computing template (template should be present - in $DATASET/.datalad/compute/methods)"""), + doc="Name of the computing template (template should be present " + "in $DATASET/.datalad/compute/methods)"), + branch=Parameter( + args=('-b', '--branch',), + doc="Branch (or commit) that should be used for computation, if " + "not specified HEAD will be used"), + input=Parameter( + args=('-i', '--input',), + action='append', + doc="Name of an input file (repeat for multiple inputs)"), output=Parameter( args=('-o', '--output',), action='append', - doc="""Name of an output file"""), + doc="Name of an output file (repeat for multiple outputs)"), parameter=Parameter( args=('-p', '--parameter',), action='append', - doc="""Input parameter in the form ="""), + doc="Input parameter in the form = (repeat for " + "multiple parameters)"), ) + @staticmethod @datasetmethod(name='compute') @eval_results - # signature must match parameter list above - # additional generic arguments are added by decorators def __call__(dataset, url_only=False, template=None, + branch=None, + input=None, output=None, - parameter=None + parameter=None, ): - dataset = dataset.ds + root_dataset : Dataset = dataset.ds if not url_only: + template_path = root_dataset.pathobj / template_dir / template parameter_dict = { - p.split('=')[0]: p.split('=')[1] + p.split('=', 1)[0]: p.split('=', 1)[1] for p in parameter } - template_path = dataset.pathobj / template_dir / template - compute(template_path, parameter_dict) - dataset.save() - + compute( + root_dataset, + branch, + template_path, + [Path(i) for i in input], + [Path(o) for o in output], + parameter_dict + ) + root_dataset.save(recursive=True) + + url = get_url(dataset, branch, template, parameter, input, output) relaxed = ['--relaxed'] if url_only else [] for out in output: - url = get_url(template, parameter, out) - dataset.repo.call_annex(['addurl', url, '--file', out] + relaxed) + file_dataset_path = get_file_dataset(Path(out)) + call_git_success( + [ + '-C', str(file_dataset_path), 'annex', + 'addurl', url, '--file', out + ] + + relaxed + ) yield get_status_dict( action='compute', - path=abspath(curdir), + path=abspath(out), status='ok', message=f'added url: {url!r} to {out!r}', ) -def get_url(template_name: str, parameters: list[str], output: 
str) -> str: - url_params = '&'.join( - f'{name}={quote(value)}' - for name, value in map(lambda s: s.split('=', 1), parameters) - ) +def get_url(dataset: Dataset, + branch: str | None, + template_name: str, + parameters: dict[str, str], + input_files: list[str], + output_files: list[str], + ) -> str: + + branch = dataset.repo.get_hexsha() if branch is None else branch return ( - f'{url_scheme}:///?dep=&method={template_name}&output={quote(output)}&' - + url_params + f'{url_scheme}:///' + + f'?root_id={quote(dataset.id)}' + + f'&default_root_version={quote(branch)}' + + f'&method={quote(template_name)}' + + f'&input={quote(json.dumps(input_files))}' + + f'&output={quote(json.dumps(output_files))}' + + f'¶ms={quote(json.dumps(parameters))}' + ) + + +def get_file_dataset(file: Path) -> Path: + """ Get dataset of file and path from that dataset to root dataset + + Determine the dataset that contains the file and the relative path from + this dataset to root dataset.""" + top_level = call_git_oneline( + ['-C', str(file.parent), 'rev-parse', '--show-toplevel'] ) + return Path(top_level) From 6b45ee7fac1f36fd93efca3ca486f80f306cc1f0 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 16 Sep 2024 15:06:05 +0200 Subject: [PATCH 027/148] improve gitworktree provider cli-doc --- datalad_compute/dataprovider/gitworktree.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index a09473f..21900da 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -37,9 +37,8 @@ action='append', metavar='PATH', help='Path of a file that should be provisioned (relative from dataset ' - 'root). If not provided, the complete dataset, including all data in ' - 'all subdatasets, will be provisioned (use multiple times to define ' - 'multiple inputs)', + 'root), at least one input has tp be provided (use multiple times to ' + 'define multiple inputs)', ) From 67cadb053c4141b1f531991e6b5c45de4bc4e949 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 17 Sep 2024 08:18:35 +0200 Subject: [PATCH 028/148] use real random branch names in provision worktree --- datalad_compute/dataprovider/gitworktree.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index 21900da..1cc09fb 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -5,6 +5,7 @@ """ from __future__ import annotations +import random import shutil import tempfile from argparse import ArgumentParser @@ -84,15 +85,18 @@ def provide_datasets(dataset: Dataset, source_branch: str | None = None, ) -> None: - temp_branch = worktree_dir.name + temp_branch = 'tmp_' + ''.join( + random.choices('abcdefghijklmnopqrstuvwxyz', k=10) + ) with chdir(dataset.path): + args = ['worktree', 'add', '-b', temp_branch, str(worktree_dir)] + ( [source_branch] if source_branch else [] ) - call_git_success(args) + for subdataset in dataset.subdatasets(result_renderer='disabled'): subdataset_path = Path(subdataset['path']).relative_to(dataset.pathobj) dataset.install(path=subdataset_path, result_renderer='disabled') From a147e6e741cd94240efc5a900243c391621681c9 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 17 Sep 2024 16:34:32 +0200 Subject: [PATCH 029/148] add a first complete version --- TODO.txt | 3 + 
datalad_compute/commands/compute_cmd.py | 164 +++++++++++++----- .../dataprovider/tests/test_gitworktree.py | 40 ++++- datalad_compute/executors/simple.py | 83 +++++++++ datalad_compute/utils/compute.py | 18 +- examples/one-to-many | 9 + setup.cfg | 0 7 files changed, 271 insertions(+), 46 deletions(-) create mode 100644 TODO.txt create mode 100644 examples/one-to-many create mode 100644 setup.cfg diff --git a/TODO.txt b/TODO.txt new file mode 100644 index 0000000..774d4ea --- /dev/null +++ b/TODO.txt @@ -0,0 +1,3 @@ + +1. Associate URLs with output files. This allows to register URLs when + the file was copied to the dataset and saved. diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index dabc46f..d02ad4f 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -4,10 +4,14 @@ __docformat__ = 'restructuredtext' +import contextlib import json import logging -from os.path import abspath +import shutil +import subprocess +from itertools import chain from pathlib import Path +from tempfile import template from urllib.parse import quote from datalad_next.commands import ( @@ -25,16 +29,21 @@ EnsureStr, ) from datalad_next.datasets import Dataset -from datalad_next.runners import call_git_oneline, call_git_success - +from datalad_next.runners import call_git_oneline, call_git_success, iter_subproc from datalad_compute import ( template_dir, url_scheme, ) from datalad_compute.utils.compute import compute +from datasalad.runners import iter_subproc +from datasalad.itertools import ( + itemize, + load_json, +) +from more_itertools import intersperse -lgr = logging.getLogger('datalad.compute') +lgr = logging.getLogger('datalad.compute.compute_cmd') # decoration auto-generates standard help @@ -92,7 +101,7 @@ class Compute(ValidatedInterface): @staticmethod @datasetmethod(name='compute') @eval_results - def __call__(dataset, + def __call__(dataset=None, url_only=False, template=None, branch=None, @@ -101,42 +110,32 @@ def __call__(dataset, parameter=None, ): - root_dataset : Dataset = dataset.ds + dataset : Dataset = dataset.ds if dataset else Dataset() if not url_only: - template_path = root_dataset.pathobj / template_dir / template - parameter_dict = { - p.split('=', 1)[0]: p.split('=', 1)[1] - for p in parameter - } - compute( - root_dataset, - branch, - template_path, - [Path(i) for i in input], - [Path(o) for o in output], - parameter_dict - ) - root_dataset.save(recursive=True) - - url = get_url(dataset, branch, template, parameter, input, output) + worktree = provide(dataset, branch, input) + execute(worktree, template, parameter, output) + collect(worktree, dataset, output) + un_provide(dataset, worktree) + + url_base = get_url(dataset, branch, template, parameter, input, output) relaxed = ['--relaxed'] if url_only else [] + for out in output: - file_dataset_path = get_file_dataset(Path(out)) - call_git_success( - [ - '-C', str(file_dataset_path), 'annex', - 'addurl', url, '--file', out - ] - + relaxed - ) + + # Build the file-specific URL and store it in the annex + url = url_base + f'&this={quote(out)}' + file_dataset_path, file_path = get_file_dataset(dataset.pathobj / out) + lgr.debug('addurl: -C:%s file_path:%s url:%s', str(file_dataset_path), str(file_path), str(url)) + call_git_success([ + '-C', str(file_dataset_path), 'annex', + 'addurl', url, '--file', file_path] + relaxed) yield get_status_dict( action='compute', - path=abspath(out), + path=dataset.pathobj / out, status='ok', - 
message=f'added url: {url!r} to {out!r}', - ) + message=f'added url: {url!r} to {out!r} in {dataset.pathobj}',) def get_url(dataset: Dataset, @@ -159,12 +158,99 @@ def get_url(dataset: Dataset, ) -def get_file_dataset(file: Path) -> Path: - """ Get dataset of file and path from that dataset to root dataset +def get_file_dataset(file: Path) -> [Path, Path]: + """ Get dataset of file and relative path of file from the dataset - Determine the dataset that contains the file and the relative path from - this dataset to root dataset.""" - top_level = call_git_oneline( + Determine the dataset that contains the file and the relative path of the + file in this dataset.""" + top_level = Path(call_git_oneline( ['-C', str(file.parent), 'rev-parse', '--show-toplevel'] + )) + return ( + Path(top_level), + file.absolute().relative_to(top_level)) + + +def provide(dataset: Dataset, + branch: str | None, + input: list[str], + ) -> Path: + + lgr.debug('provide: %s %s %s', dataset, branch, input) + + args = ['provide-gitworktree', dataset.path, ] + ( + ['--branch', branch] if branch else [] ) - return Path(top_level) + args.extend(chain(*[('--input', i) for i in input])) + stdout = subprocess.run(args, stdout=subprocess.PIPE, check=True).stdout + return Path(stdout.splitlines()[-1].decode()) + + +def execute(worktree: Path, + template_name: str, + parameter: list[str], + output: list[str], + ) -> None: + + lgr.debug( + 'execute: %s %s %s %s', str(worktree), + template_name, repr(parameter), repr(output)) + + assert_annexed(worktree, output) + + # Unlock output files in the worktree-directory + for o in output: + call_git_success(['-C', str(worktree), 'annex', 'unlock', o]) + + # Run the computation in the worktree-directory + template_path = worktree / template_dir / template_name + parameter_dict = { + p.split('=', 1)[0]: p.split('=', 1)[1] + for p in parameter + } + compute(worktree, template_path, parameter_dict) + + +def assert_annexed(worktree: Path, + files: list[str] + ) -> None: + + present_files = list(filter(lambda f: Path(f).exists(), files)) + with contextlib.chdir(worktree): + with iter_subproc(['git', 'annex', 'info', '--json', '--batch', '-z'], + inputs=(file.encode() + b'\x00' for file in present_files), + bufsize=0) as results: + not_annexed = tuple(filter( + lambda r: r['success'] == False, + load_json(itemize(results, sep=b'\n')))) + if not_annexed: + raise ValueError( + f'Output files are not annexed: ' + ', '.join( + map(lambda na: na['file'], not_annexed))) + + +def collect(worktree: Path, + dataset: Dataset, + output: list[str], + ) -> None: + + lgr.debug('collect: %s %s %s', str(worktree), dataset, repr(output)) + + # Unlock output files in the dataset-directory and copy the result + for o in output: + dest = dataset.pathobj / o + call_git_success(['-C', dataset.path, 'annex', 'unlock', str(dest)]) + shutil.copyfile(worktree / o, dest) + + # Save the dataset + dataset.save() + + +def un_provide(dataset: Dataset, + worktree: Path, + ) -> None: + + lgr.debug('un_provide: %s %s', dataset, str(worktree)) + + args = ['provide-gitworktree', dataset.path, '--delete', str(worktree)] + subprocess.run(args, check=True) diff --git a/datalad_compute/dataprovider/tests/test_gitworktree.py b/datalad_compute/dataprovider/tests/test_gitworktree.py index 542d568..5078a6c 100644 --- a/datalad_compute/dataprovider/tests/test_gitworktree.py +++ b/datalad_compute/dataprovider/tests/test_gitworktree.py @@ -39,7 +39,14 @@ def create_ds_hierarchy(subdataset_levels: int = 2): def test_worktree_basic(): dataset 
= create_ds_hierarchy(3) - worktree = Dataset(provide(dataset.path)) + worktree = Dataset(provide( + dataset.path, + input_files=[ + 'a.txt', 'b.txt', + 'subds0/a0.txt', 'subds0/b0.txt', + 'subds0/subds1/a1.txt', 'subds0/subds1/b1.txt' + ], + )) r_orig = [r['gitmodule_url'] for r in dataset.subdatasets(recursive=True, result_renderer='disabled')] r_worktree = [r['gitmodule_url'] for r in worktree.subdatasets(recursive=True, result_renderer='disabled')] @@ -57,3 +64,34 @@ def check_deleted_worktrees(ds: Dataset): check_deleted_worktrees(dataset) dataset.drop(reckless='kill', result_renderer='disabled') + + +def test_worktree_example(): + dataset = Dataset('/home/cristian/tmp/compute/compute-test-4') + worktree = Dataset(provide( + dataset.path, + input_files=[ + 'b.txt', + 'subds1/c.txt', + 'subds1/a/d.txt', + 'd/subds2/a/e.txt' + ], + )) + + r_orig = [r['gitmodule_url'] for r in dataset.subdatasets(recursive=True, result_renderer='disabled')] + r_worktree = [r['gitmodule_url'] for r in worktree.subdatasets(recursive=True, result_renderer='disabled')] + assert r_orig == r_worktree + + + remove(dataset.path, worktree.path) + + def check_deleted_worktrees(ds: Dataset): + with chdir(ds.path): + for line in call_git_lines(['worktree', 'list']): + directory = line.split()[0] + assert directory == ds.path + for sub_ds in ds.subdatasets(result_renderer='disabled'): + check_deleted_worktrees(Dataset(sub_ds['path'])) + + check_deleted_worktrees(dataset) + dataset.drop(reckless='kill', result_renderer='disabled') diff --git a/datalad_compute/executors/simple.py b/datalad_compute/executors/simple.py index e69de29..5cc1bf5 100644 --- a/datalad_compute/executors/simple.py +++ b/datalad_compute/executors/simple.py @@ -0,0 +1,83 @@ +""" +A data provisioner that works with local git repositories. 
+Data is provisioned in a temporary worktree +Currently there is no support for subdatasets +""" +from __future__ import annotations + +import contextlib +import subprocess +import tempfile +from argparse import ArgumentParser +from contextlib import chdir +from pathlib import Path + +from datalad_compute import template_dir +from datalad_compute.utils.compute import compute + + +argument_parser = ArgumentParser() +argument_parser.add_argument( + 'dataset', + help='Path to provisioned dataset' +) +argument_parser.add_argument( + 'template', + help='Name of the computing template (template should be present ' + 'in $DATASET/.datalad/compute/methods)' +) +argument_parser.add_argument( + '-p', '--parameter', + action='append', + help='Parameter for the execution in the form = (repeat for ' + 'multiple parameters)', +) +argument_parser.add_argument( + '-o', '--output', + action='append', + help='Files that will be written or modified by the template execution', +) + + +def unlock(dataset: Path, outputs: list[str] | None) -> None: + with contextlib.chdir(dataset): + for output in outputs: + if Path(output).exists(): + subprocess.run(['git', 'annex', 'unlock', output], check=True) + + +def execute(dataset: Path, + method: str | None, + parameters: list[str] | None, + outputs: list[str] | None, + ) -> None: + + unlock(dataset, outputs) + compute( + template_path=dataset / template_dir / method, + compute_arguments={ + parameter.split('=', 1)[0]: parameter.split('=', 1)[1] + for parameter in parameters + }, + ) + + +def temporary_worktree(dataset: Path) -> Path: + worktree_dir = tempfile.TemporaryDirectory().name + with chdir(dataset): + subprocess.run(['git', 'worktree', 'add', worktree_dir], check=True) + return Path(worktree_dir) + + +def main(): + arguments = argument_parser.parse_args() + execute( + Path(arguments.dataset), + arguments.template, + arguments.parameter, + arguments.output, + ) + + +if __name__ == '__main__': + main() diff --git a/datalad_compute/utils/compute.py b/datalad_compute/utils/compute.py index 6eba07c..141c7c5 100644 --- a/datalad_compute/utils/compute.py +++ b/datalad_compute/utils/compute.py @@ -1,11 +1,14 @@ from __future__ import annotations +import contextlib import subprocess from pathlib import Path from typing import Any import tomllib +from datalad_next.datasets import Dataset + def substitute_string(format_str: str, replacements: dict[str, str], @@ -48,9 +51,11 @@ def get_substitutions(template: dict[str, Any], } -def compute(template_path: Path, +def compute(root_directory: Path, + template_path: Path, compute_arguments: dict[str, str], - ): + ) -> None: + with template_path.open('rb') as f: template = tomllib.load(f) @@ -63,7 +68,8 @@ def compute(template_path: Path, 'arguments' ) - if template.get('use_shell', 'false') == 'true': - subprocess.run(' '.join([substituted_executable] + substituted_arguments), shell=True) - else: - subprocess.run([substituted_executable] + substituted_arguments) + with contextlib.chdir(root_directory): + if template.get('use_shell', 'false') == 'true': + subprocess.run(' '.join([substituted_executable] + substituted_arguments), shell=True) + else: + subprocess.run([substituted_executable] + substituted_arguments) diff --git a/examples/one-to-many b/examples/one-to-many new file mode 100644 index 0000000..2f0dc26 --- /dev/null +++ b/examples/one-to-many @@ -0,0 +1,9 @@ +inputs = ['first', 'second', 'output'] + +use_shell = 'true' +executable = "echo" + +arguments = [ + "content: {first} > '{output}-1.txt';", + "echo content: 
{second} > '{output}-2.txt'", +] diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..e69de29 From 619efb9dc7fd9b228f8ca21cab403a87a935d962 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 17 Sep 2024 21:20:02 +0200 Subject: [PATCH 030/148] refactor compute_cmd a little --- datalad_compute/commands/compute_cmd.py | 30 ++++++++++++++++--------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index d02ad4f..5482217 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -119,18 +119,9 @@ def __call__(dataset=None, un_provide(dataset, worktree) url_base = get_url(dataset, branch, template, parameter, input, output) - relaxed = ['--relaxed'] if url_only else [] for out in output: - - # Build the file-specific URL and store it in the annex - url = url_base + f'&this={quote(out)}' - file_dataset_path, file_path = get_file_dataset(dataset.pathobj / out) - lgr.debug('addurl: -C:%s file_path:%s url:%s', str(file_dataset_path), str(file_path), str(url)) - call_git_success([ - '-C', str(file_dataset_path), 'annex', - 'addurl', url, '--file', file_path] + relaxed) - + url = add_url(dataset, out, url_base, url_only) yield get_status_dict( action='compute', path=dataset.pathobj / out, @@ -158,6 +149,25 @@ def get_url(dataset: Dataset, ) +def add_url(dataset: Dataset, + file_path: str, + url_base: str, + url_only: bool + ) -> str: + + lgr.debug( + 'add_url: %s %s %s %s', + str(dataset), str(file_path), url_base, repr(url_only)) + + # Build the file-specific URL and store it in the annex + url = url_base + f'&this={quote(file_path)}' + file_dataset_path, file_path = get_file_dataset(dataset.pathobj / file_path) + call_git_success( + ['-C', str(file_dataset_path), 'annex', 'addurl', url, '--file', file_path] + + (['--relaxed'] if url_only else [])) + return url + + def get_file_dataset(file: Path) -> [Path, Path]: """ Get dataset of file and relative path of file from the dataset From b8677f1623c0ed7e7d37ac707d03ecc3aeeae90c Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 18 Sep 2024 12:59:06 +0200 Subject: [PATCH 031/148] updates remote to new encodings --- TODO.txt | 4 + datalad_compute/annexremotes/compute.py | 97 ++++++++++++++++++------- datalad_compute/commands/compute_cmd.py | 8 +- 3 files changed, 79 insertions(+), 30 deletions(-) diff --git a/TODO.txt b/TODO.txt index 774d4ea..5c98886 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,3 +1,7 @@ 1. Associate URLs with output files. This allows to register URLs when the file was copied to the dataset and saved. + +2. Consolidate compute-URL construction and splitting code + +3. 
Use dataclass for compute_info in annex remote diff --git a/datalad_compute/annexremotes/compute.py b/datalad_compute/annexremotes/compute.py index 6ea843e..9670f94 100644 --- a/datalad_compute/annexremotes/compute.py +++ b/datalad_compute/annexremotes/compute.py @@ -1,8 +1,8 @@ from __future__ import annotations -import contextlib +import json import shutil -import tempfile +import subprocess from pathlib import Path from typing import Any from urllib.parse import ( @@ -11,16 +11,19 @@ ) from annexremote import Master +from datalad.customremotes import RemoteError from datalad_next.annexremotes import ( SpecialRemote, super_main ) -from datalad_compute import ( - template_dir, - url_scheme, +from datalad_next.datasets import Dataset +from datalad_compute import url_scheme +from datalad_compute.commands.compute_cmd import ( + execute, + provide, + un_provide ) -from datalad_compute.utils.compute import compute class ComputeRemote(SpecialRemote): @@ -61,47 +64,85 @@ def getcost(self) -> int: self.annex.debug(f'GETCOST') return 100 - def get_url_encoded_info(self, url: str) -> tuple[str, str, str, str]: - parts = urlparse(url).query.split('&', 3) - return parts[0], parts[1], parts[2], parts[3] + def get_url_encoded_info(self, url: str) -> list[str]: + parts = urlparse(url).query.split('&', 6) + self.annex.debug(f'get_url_encoded_info: url: {url!r}, parts: {parts!r}') + return parts def get_url_for_key(self, key: str) -> str: urls = self.annex.geturls(key, f'{url_scheme}:') - self.annex.debug(f'TRANSFER RETRIEVE urls({key!r}, "{url_scheme}"): {urls!r}') + self.annex.debug(f'get_url_for_key: key: {key!r}, urls: {urls!r}') return urls[0] def get_compute_info(self, key: str) -> dict[str, Any]: def get_assigned_value(assignment: str) -> str: return assignment.split('=', 1)[1] - dependencies, method, output, parameters = self.get_url_encoded_info( - self.get_url_for_key(key) - ) + root_id, root_version, method, inputs, outputs, parameters, this \ + = self.get_url_encoded_info(self.get_url_for_key(key)) + return { - 'dependencies': get_assigned_value(dependencies), - 'method': Path(self.annex.getgitdir()).parent - / template_dir - / get_assigned_value(method), - 'output': get_assigned_value(output), - 'parameter': { - name: unquote(value) - for name, value in map(lambda s: s.split('=', 1), parameters.split('&')) - } + 'root_id': unquote(get_assigned_value(root_id)), + 'root_version': unquote(get_assigned_value(root_version)), + 'method': unquote(get_assigned_value(method)), + 'input': json.loads(unquote(get_assigned_value(inputs))), + 'output': json.loads(unquote(get_assigned_value(outputs))), + 'parameter': json.loads(unquote(get_assigned_value(parameters))), + 'this': unquote(get_assigned_value(this)), } def transfer_retrieve(self, key: str, file_name: str) -> None: + self.annex.debug(f'TRANSFER RETRIEVE key: {key!r}, file_name: {file_name!r}') + compute_info = self.get_compute_info(key) - self.annex.debug(f'TRANSFER RETRIEVE {key!r}: compute_info: {compute_info!r}, file_name: {file_name!r}') - with tempfile.TemporaryDirectory() as tmpdir: - absolute_method = compute_info['method'].absolute() - with contextlib.chdir(tmpdir): - compute(absolute_method, compute_info['parameter']) - shutil.copy(Path(tmpdir) / compute_info['output'], file_name) + self.annex.debug(f'TRANSFER RETRIEVE compute_info: {compute_info!r}') + + # TODO: get version override from configuration + dataset = self._find_dataset(compute_info['root_id']) + + # Perform the computation, and collect the results + worktree = 
provide(dataset, compute_info['root_version'], compute_info['input']) + execute(worktree, compute_info['method'], compute_info['parameter'], compute_info['output']) + self._collect(worktree, dataset, compute_info['output'], compute_info['this'], file_name) + un_provide(dataset, worktree) def checkpresent(self, key: str) -> bool: # See if at least one URL with the compute url-scheme is present return self.annex.geturls(key, f'{url_scheme}:') != [] + def _find_dataset(self, + root_id: str + ) -> Dataset: + """Find the first enclosing dataset with the given root_id""" + current_dir = Path(self.annex.getgitdir()) / '..' + + while current_dir != Path('/'): + result = subprocess.run( + [ + 'git', 'config', '-f', + str(current_dir/ '.datalad' / 'config'), + '--get', 'datalad.dataset.id' + ], + stdout=subprocess.PIPE) + if result.returncode != 0: + continue + if result.stdout.decode().strip() == root_id: + return Dataset(current_dir) + current_dir = current_dir / '..' + raise RemoteError(f'Could not find dataset {root_id!r}') + + def _collect(self, + worktree: Path, + dataset: Dataset, + outputs: list[str], + this: str, + this_destination: str, + ) -> None: + """Collect computation results for `this` (and all other outputs) """ + + # TODO: reap all other output files that are known to the annex + shutil.copyfile(worktree / this, this_destination) + def main(): """cmdline entry point""" diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 5482217..a0cee82 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -210,7 +210,9 @@ def execute(worktree: Path, # Unlock output files in the worktree-directory for o in output: - call_git_success(['-C', str(worktree), 'annex', 'unlock', o]) + call_git_success( + ['-C', str(worktree), 'annex', 'unlock', o], + capture_output=True) # Run the computation in the worktree-directory template_path = worktree / template_dir / template_name @@ -249,7 +251,9 @@ def collect(worktree: Path, # Unlock output files in the dataset-directory and copy the result for o in output: dest = dataset.pathobj / o - call_git_success(['-C', dataset.path, 'annex', 'unlock', str(dest)]) + call_git_success( + ['-C', dataset.path, 'annex', 'unlock', str(dest)], + capture_output=True) shutil.copyfile(worktree / o, dest) # Save the dataset From e73ee3e2c3965ac095ea944302998388181372a0 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 18 Sep 2024 15:17:57 +0200 Subject: [PATCH 032/148] update README.md to new architecture --- README.md | 80 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index c0a076c..8e2bdc2 100644 --- a/README.md +++ b/README.md @@ -6,17 +6,22 @@ [![Hatch project](https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg)](https://github.com/pypa/hatch) -**This code is a POC, that means: ncode does not -thoroughly validate inputs, names might be inconsistent, little docs. -Tests do mostly not exist.** +**This code is a POC**, that means currently: +- code does not thoroughly validate inputs +- names might be inconsistent +- few tests +- fewer docs +- no support for locking This is a naive datalad compute extension that serves as a playground for the datalad remake-project. It contains an annex remote that can compute content on demand. It uses template -files that specify the operations and per-data file parameters that are encoded -in annex URL-keys. 
It also contains the new datalad command `compute` that -can trigger the computation of content and store the parameters that are +files that specify the operations. It encodes computation parameters in URLs +that are associated with annex keys, that allows to compute dropped content +instead of fetching it from some storage system. It also contains the new +datalad command `compute` that +can trigger the computation of content and stores the parameters that are used for content creation in the git-annex branch, where they can be used by the annex remote to repeat the computation. @@ -38,22 +43,17 @@ Create the template directory and a template ```bash > mkdir -p .datalad/compute/methods -> cat > .datalad/compute/methods/params_to_text < cat > .datalad/compute/methods/one-to-many <", - "'{output_file}'", + "content: {first} > '{output}-1.txt';", + "echo content: {second} > '{output}-2.txt'", ] EOF -> datalad save -m "add compute method" +> datalad save -m "add `one-to-many` compute method" ``` Create a "compute" annex special remote: @@ -63,44 +63,52 @@ Create a "compute" annex special remote: Execute a computation and save the result: ```bash -> datalad compute params_to_text text-1.txt points=101 label=einhunderteins +> datalad compute -p first=bob -p second=alice -p output=name -o name-1.txt \ +-o name-2.txt one-to-many ``` +The method `one-to-many` will create two files with the names -1.txt +and `-2.txt`. That is why those two files are listed as outputs in the +command above. -The previous command will create the file `text-1.txt`: ```bash -> cat text-1.txt -generated via compute: points: 101, label: einhunderteins +> cat name-1.txt +bob +> cat name-2.txt +alice ``` -Drop the content of `text-1.txt`, verify it is gone, recreate it via +Drop the content of `name-1.txt`, verify it is gone, recreate it via `datalad get`, which "fetches" is from the compute remote: ```bash -> datalad drop text-1.txt -> cat text-1.txt -> datalad get text-1.txt -> cat text-1.txt +> datalad drop name-1.txt +> cat name-1.txt +> datalad get name-1.txt +> cat name-1.txt ``` -Generate a speculative computation, i.e. do not perform the computation but generate an -URL-KEY with an associated URL that describes the computation that should be performed (a -"computation description URL"). This -is done by giving the `-u` option (url-only) to `datalad compute`. +The command `datalad compute` does also support to just record the parameters +that would lead to a certain computation, without actually performing the +computation. We refer to this as *speculative computation*. + +Generate a speculative computation, this is done by providing the `-u` option +(url-only) to `datalad compute`. ```bash -> datalad compute params_to_text -u text-2.txt points=303 label=dreihundertdrei -> cat text-2.txt # this will fail, because the computation has not yet been performed +> datalad compute -p first=john -p second=susan -p output=person \ +-o person-1.txt -o person-2.txt -u one-to-many +> cat person-1.txt # this will fail, because the computation has not yet been performed ``` -`ls -l text-2.txt` will show a link to a not-downloaded URL-KEY. -`git annex whereis text-2.txt` will show the associated computation description URL. +`ls -l person-1.txt` will show a link to a not-downloaded URL-KEY. +`git annex whereis person-1.txt` will show the associated computation description URL. No computation has been performed yet, `datalad compute` just creates an URL-KEY and associates a computation description URL with the URL-KEY. 
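For orientation, a computation-description URL as constructed by `get_url()` and `add_url()` in this patch series has roughly the following shape; the angle-bracket values are illustrative placeholders, and `input`, `output`, and `params` carry URL-encoded JSON:

```
compute:///?root_id=<dataset-id>&default_root_version=<commit-sha>&method=one-to-many&input=<url-encoded JSON list>&output=<url-encoded JSON list>&params=<url-encoded JSON dict>&this=person-1.txt
```

The `this` field names the particular output file that the URL-KEY belongs to, which is how one recorded computation can back several annexed files.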
Use `datalad get` to perform the computation for the first time and receive the result:: ```bash -> datalad get text-2.txt -> cat text-2.txt +> datalad get person-1.txt +> cat person-1.txt ``` From 98929de3f67ec12cd18bbd2a9bb91d5990cc5474 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 18 Sep 2024 15:18:21 +0200 Subject: [PATCH 033/148] update example method --- examples/one-to-many | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/examples/one-to-many b/examples/one-to-many index 2f0dc26..61bb74e 100644 --- a/examples/one-to-many +++ b/examples/one-to-many @@ -1,8 +1,28 @@ +# This is a computing template that demonstrates a computation with +# multiple output files. +# +# Templates are addressed by their name. They should be stored in +# `$DATASET_ROOT/.datalad/compute/methods` + +# An invocation of `datalad compute` has to provide a parameter argument for +# each input variable. In this case the invocation could look like this: +# `datalad compute -p first=bob -p second=alice -p output=name ... one-to-many` +# inputs = ['first', 'second', 'output'] +# Use a shell to interpret `arguments`. By default, `use_shell` is 'false'. +# use_shell = 'true' -executable = "echo" +# The name of the executable. This will be the prepended to the argument list +# given in `arguments`. +# +executable = 'echo' + +# Arguments to the executable. The curly braces are placeholders for the +# input variables that were defined above. They will be replaced with the +# values provided in the parameter arguments of `datalad compute`. +# arguments = [ "content: {first} > '{output}-1.txt';", "echo content: {second} > '{output}-2.txt'", From 144fdfddf09e68564f54994e898e66dbae7c83c4 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 18 Sep 2024 15:19:11 +0200 Subject: [PATCH 034/148] support no input files in git worktree provision --- datalad_compute/dataprovider/gitworktree.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index 1cc09fb..fc65b5a 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -75,7 +75,7 @@ def provide(dataset: str, # Fetch file content in the worktree work_dataset = Dataset(worktree_dir) with chdir(worktree_dir): - for p in input_files: + for p in input_files or []: work_dataset.get(p, result_renderer='disabled') return worktree_dir @@ -119,9 +119,6 @@ def main(): remove(arguments.dataset, arguments.delete) return - if not arguments.input: - raise ValueError('At least one input file must be provided') - provision_dir = provide( arguments.dataset, arguments.branch, From d993ffaff9d94d45c2b81ed7d0313f033acb1413 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 18 Sep 2024 15:20:09 +0200 Subject: [PATCH 035/148] update requirements-devel.txt --- requirements-devel.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 requirements-devel.txt diff --git a/requirements-devel.txt b/requirements-devel.txt new file mode 100644 index 0000000..db4986b --- /dev/null +++ b/requirements-devel.txt @@ -0,0 +1,12 @@ +# requirements for a development environment +coverage +datalad +datalad-next +datasalad +pytest +pytest-cov + +# requirements for a document building +sphinx +sphinx_rtd_theme +sphinx_copybutton From b0b7f94e54eb3cf64dee8b81d95ec7c01f8ffedb Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 18 Sep 2024 15:23:19 +0200 Subject: [PATCH 036/148] 
support no input files in compute-command --- datalad_compute/commands/compute_cmd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index a0cee82..108e273 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -110,7 +110,7 @@ def __call__(dataset=None, parameter=None, ): - dataset : Dataset = dataset.ds if dataset else Dataset() + dataset : Dataset = dataset.ds if dataset else Dataset('.') if not url_only: worktree = provide(dataset, branch, input) @@ -191,7 +191,7 @@ def provide(dataset: Dataset, args = ['provide-gitworktree', dataset.path, ] + ( ['--branch', branch] if branch else [] ) - args.extend(chain(*[('--input', i) for i in input])) + args.extend(chain(*[('--input', i) for i in (input or [])])) stdout = subprocess.run(args, stdout=subprocess.PIPE, check=True).stdout return Path(stdout.splitlines()[-1].decode()) From ba9569329eda3939334ca403714c8d7b70dbe4b8 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 18 Sep 2024 15:25:42 +0200 Subject: [PATCH 037/148] update TODO --- TODO.txt | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/TODO.txt b/TODO.txt index 5c98886..9ad2e4c 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,7 +1,13 @@ -1. Associate URLs with output files. This allows to register URLs when - the file was copied to the dataset and saved. +- Provide a way to overwrite the root dataset version during `datalad get`. + This could be done via a config variable or an option that is used when + creating the annex remote. -2. Consolidate compute-URL construction and splitting code +- Implement opportunistic collection of all results of a single computation + that are not yet present (via `git annex reinject`). -3. Use dataclass for compute_info in annex remote +- Consolidate compute-URL construction and splitting code + +- Use dataclass for compute_info in annex remote + +- Implement locking for result collection From 3b875fe354f2b45d62bf2946380954870243b1ba Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 18 Sep 2024 15:46:37 +0200 Subject: [PATCH 038/148] use packages and adapt imports --- datalad_compute/annexremotes/__init__.py | 0 datalad_compute/annexremotes/compute.py | 6 +- datalad_compute/commands/__init__.py | 0 datalad_compute/commands/compute_cmd.py | 22 ++--- .../dataprovider/tests/test_gitworktree.py | 31 ------- datalad_compute/executors/simple.py | 83 ------------------- datalad_compute/utils/__init__.py | 0 datalad_compute/utils/compute.py | 5 +- 8 files changed, 15 insertions(+), 132 deletions(-) create mode 100644 datalad_compute/annexremotes/__init__.py create mode 100644 datalad_compute/commands/__init__.py delete mode 100644 datalad_compute/executors/simple.py create mode 100644 datalad_compute/utils/__init__.py diff --git a/datalad_compute/annexremotes/__init__.py b/datalad_compute/annexremotes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datalad_compute/annexremotes/compute.py b/datalad_compute/annexremotes/compute.py index 9670f94..f036c7c 100644 --- a/datalad_compute/annexremotes/compute.py +++ b/datalad_compute/annexremotes/compute.py @@ -16,10 +16,10 @@ SpecialRemote, super_main ) - from datalad_next.datasets import Dataset -from datalad_compute import url_scheme -from datalad_compute.commands.compute_cmd import ( + +from .. 
import url_scheme +from ..commands.compute_cmd import ( execute, provide, un_provide diff --git a/datalad_compute/commands/__init__.py b/datalad_compute/commands/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 108e273..d375df6 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -1,9 +1,7 @@ -"""DataLad demo command""" +"""DataLad compute command""" from __future__ import annotations -__docformat__ = 'restructuredtext' - import contextlib import json import logging @@ -11,7 +9,6 @@ import subprocess from itertools import chain from pathlib import Path -from tempfile import template from urllib.parse import quote from datalad_next.commands import ( @@ -30,17 +27,20 @@ ) from datalad_next.datasets import Dataset from datalad_next.runners import call_git_oneline, call_git_success, iter_subproc -from datalad_compute import ( - template_dir, - url_scheme, -) -from datalad_compute.utils.compute import compute from datasalad.runners import iter_subproc from datasalad.itertools import ( itemize, load_json, ) -from more_itertools import intersperse + +from .. import ( + template_dir, + url_scheme, +) +from ..utils.compute import compute + + +__docformat__ = 'restructuredtext' lgr = logging.getLogger('datalad.compute.compute_cmd') @@ -57,7 +57,7 @@ class Compute(ValidatedInterface): _validator_ = EnsureCommandParameterization(dict( dataset=EnsureDataset(installed=True), - input=EnsureListOf(EnsureStr(min_len=1), min_len=1), + input=EnsureListOf(EnsureStr(min_len=1)), output=EnsureListOf(EnsureStr(min_len=1), min_len=1), parameter=EnsureListOf(EnsureStr(min_len=3)), )) diff --git a/datalad_compute/dataprovider/tests/test_gitworktree.py b/datalad_compute/dataprovider/tests/test_gitworktree.py index 5078a6c..78e0545 100644 --- a/datalad_compute/dataprovider/tests/test_gitworktree.py +++ b/datalad_compute/dataprovider/tests/test_gitworktree.py @@ -64,34 +64,3 @@ def check_deleted_worktrees(ds: Dataset): check_deleted_worktrees(dataset) dataset.drop(reckless='kill', result_renderer='disabled') - - -def test_worktree_example(): - dataset = Dataset('/home/cristian/tmp/compute/compute-test-4') - worktree = Dataset(provide( - dataset.path, - input_files=[ - 'b.txt', - 'subds1/c.txt', - 'subds1/a/d.txt', - 'd/subds2/a/e.txt' - ], - )) - - r_orig = [r['gitmodule_url'] for r in dataset.subdatasets(recursive=True, result_renderer='disabled')] - r_worktree = [r['gitmodule_url'] for r in worktree.subdatasets(recursive=True, result_renderer='disabled')] - assert r_orig == r_worktree - - - remove(dataset.path, worktree.path) - - def check_deleted_worktrees(ds: Dataset): - with chdir(ds.path): - for line in call_git_lines(['worktree', 'list']): - directory = line.split()[0] - assert directory == ds.path - for sub_ds in ds.subdatasets(result_renderer='disabled'): - check_deleted_worktrees(Dataset(sub_ds['path'])) - - check_deleted_worktrees(dataset) - dataset.drop(reckless='kill', result_renderer='disabled') diff --git a/datalad_compute/executors/simple.py b/datalad_compute/executors/simple.py deleted file mode 100644 index 5cc1bf5..0000000 --- a/datalad_compute/executors/simple.py +++ /dev/null @@ -1,83 +0,0 @@ -""" -A data provisioner that works with local git repositories. 
-Data is provisioned in a temporary worktree -Currently there is no support for subdatasets -""" -from __future__ import annotations - -import contextlib -import subprocess -import tempfile -from argparse import ArgumentParser -from contextlib import chdir -from pathlib import Path - -from datalad_compute import template_dir -from datalad_compute.utils.compute import compute - - -argument_parser = ArgumentParser() -argument_parser.add_argument( - 'dataset', - help='Path to provisioned dataset' -) -argument_parser.add_argument( - 'template', - help='Name of the computing template (template should be present ' - 'in $DATASET/.datalad/compute/methods)' -) -argument_parser.add_argument( - '-p', '--parameter', - action='append', - help='Parameter for the execution in the form = (repeat for ' - 'multiple parameters)', -) -argument_parser.add_argument( - '-o', '--output', - action='append', - help='Files that will be written or modified by the template execution', -) - - -def unlock(dataset: Path, outputs: list[str] | None) -> None: - with contextlib.chdir(dataset): - for output in outputs: - if Path(output).exists(): - subprocess.run(['git', 'annex', 'unlock', output], check=True) - - -def execute(dataset: Path, - method: str | None, - parameters: list[str] | None, - outputs: list[str] | None, - ) -> None: - - unlock(dataset, outputs) - compute( - template_path=dataset / template_dir / method, - compute_arguments={ - parameter.split('=', 1)[0]: parameter.split('=', 1)[1] - for parameter in parameters - }, - ) - - -def temporary_worktree(dataset: Path) -> Path: - worktree_dir = tempfile.TemporaryDirectory().name - with chdir(dataset): - subprocess.run(['git', 'worktree', 'add', worktree_dir], check=True) - return Path(worktree_dir) - - -def main(): - arguments = argument_parser.parse_args() - execute( - Path(arguments.dataset), - arguments.template, - arguments.parameter, - arguments.output, - ) - - -if __name__ == '__main__': - main() diff --git a/datalad_compute/utils/__init__.py b/datalad_compute/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datalad_compute/utils/compute.py b/datalad_compute/utils/compute.py index 141c7c5..ef324cd 100644 --- a/datalad_compute/utils/compute.py +++ b/datalad_compute/utils/compute.py @@ -2,13 +2,10 @@ import contextlib import subprocess +import tomllib from pathlib import Path from typing import Any -import tomllib - -from datalad_next.datasets import Dataset - def substitute_string(format_str: str, replacements: dict[str, str], From 19a359c4aa2709dbb5913f88d2748a58695fc3dc Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 18 Sep 2024 15:57:58 +0200 Subject: [PATCH 039/148] add installation instructions to README.md --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 8e2bdc2..ccbf1ce 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,18 @@ can trigger the computation of content and stores the parameters that are used for content creation in the git-annex branch, where they can be used by the annex remote to repeat the computation. +## Installation + +There is no pypi-package yet. To install the extension, clone the repository +and install it via `pip` (preferably in a virtual environment): + +```bash +git clone https://github.com/christian-monch/datalad-compute.git +cd datalad-compute +pip install -r requirements-devel.txt +pip install . 
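# The `compute` command shells out to the `provide-gitworktree` helper
# (see provide() in compute_cmd.py), so the install above is assumed to
# put that console script on PATH, e.g. inside the active virtualenv.
datalad compute --help   # quick check that the new command is available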
+``` + ## Example usage From 1a3354c7ef9ecb8ce00f73f67d3e4c1f76e390d5 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 18 Sep 2024 20:23:00 +0200 Subject: [PATCH 040/148] update README.md --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index ccbf1ce..84da43e 100644 --- a/README.md +++ b/README.md @@ -18,12 +18,12 @@ the datalad remake-project. It contains an annex remote that can compute content on demand. It uses template files that specify the operations. It encodes computation parameters in URLs -that are associated with annex keys, that allows to compute dropped content +that are associated with annex keys, which allows to compute dropped content instead of fetching it from some storage system. It also contains the new datalad command `compute` that -can trigger the computation of content and stores the parameters that are -used for content creation in the git-annex branch, where they can be used by -the annex remote to repeat the computation. +can trigger the computation of content, generate the parameterized URLs, and +associate this URL with the respective annex key. This information can then +be used by the annex remote to repeat the computation. ## Installation @@ -78,9 +78,9 @@ Execute a computation and save the result: > datalad compute -p first=bob -p second=alice -p output=name -o name-1.txt \ -o name-2.txt one-to-many ``` -The method `one-to-many` will create two files with the names -1.txt -and `-2.txt`. That is why those two files are listed as outputs in the -command above. +The method `one-to-many` will create two files with the names `-1.txt` +and `-2.txt`. That is why the two files `name-1.txt` and `name-2.txt` +are listed as outputs in the command above. ```bash > cat name-1.txt From 67e99b7b181a86627c231d5d78500c4ab40fd6a3 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 18 Sep 2024 20:26:45 +0200 Subject: [PATCH 041/148] improve description of `datalad compute` in README.md --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 84da43e..d377ffc 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,12 @@ The method `one-to-many` will create two files with the names `-1.txt` and `-2.txt`. That is why the two files `name-1.txt` and `name-2.txt` are listed as outputs in the command above. +Note that only output files that are defined by the `-o/--output` option will +be available in the dataset after `datalad compute`. Similarly, only the files +defined by `-i/--input` will be available as inputs to the computation (the +computation is performed in a "scratch" directory, so the input files must be +copied there and the output files must be copied back). + ```bash > cat name-1.txt bob From 8ccbb75428be20f396d3dbcb1b57319aa4d0febc Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 20 Sep 2024 09:06:23 +0200 Subject: [PATCH 042/148] use reinject collection in compute special remote --- datalad_compute/annexremotes/compute.py | 19 +++++++++++++++++-- datalad_compute/commands/compute_cmd.py | 6 +++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/datalad_compute/annexremotes/compute.py b/datalad_compute/annexremotes/compute.py index f036c7c..0f13355 100644 --- a/datalad_compute/annexremotes/compute.py +++ b/datalad_compute/annexremotes/compute.py @@ -17,12 +17,14 @@ super_main ) from datalad_next.datasets import Dataset +from datalad_next.runners import call_git_success from .. 
import url_scheme from ..commands.compute_cmd import ( execute, + get_file_dataset, provide, - un_provide + un_provide, ) @@ -140,7 +142,20 @@ def _collect(self, ) -> None: """Collect computation results for `this` (and all other outputs) """ - # TODO: reap all other output files that are known to the annex + # Collect all output files that have been created while creating + # `this` file. + for output in outputs: + if output == this: + continue + dataset_path, file_path = get_file_dataset(dataset.pathobj / output) + call_git_success([ + '-C', str(dataset_path), + 'annex', 'reinject', + str(worktree / output), + str(file_path)], + capture_output=True) + + # Collect `this` file shutil.copyfile(worktree / this, this_destination) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index d375df6..3016d76 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -168,11 +168,11 @@ def add_url(dataset: Dataset, return url -def get_file_dataset(file: Path) -> [Path, Path]: +def get_file_dataset(file: Path) -> tuple[Path, Path]: """ Get dataset of file and relative path of file from the dataset - Determine the dataset that contains the file and the relative path of the - file in this dataset.""" + Determine the path of the dataset that contains the file and the relative + path of the file in this dataset.""" top_level = Path(call_git_oneline( ['-C', str(file.parent), 'rev-parse', '--show-toplevel'] )) From 84b3331829d459e8e22387f31ab809b005720710 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 20 Sep 2024 12:50:34 +0200 Subject: [PATCH 043/148] [temp] add tests with dataset hierarchies --- .../dataprovider/tests/test_gitworktree.py | 36 +++++-------------- examples/one-to-many | 12 +++++++ 2 files changed, 20 insertions(+), 28 deletions(-) diff --git a/datalad_compute/dataprovider/tests/test_gitworktree.py b/datalad_compute/dataprovider/tests/test_gitworktree.py index 78e0545..28e2f08 100644 --- a/datalad_compute/dataprovider/tests/test_gitworktree.py +++ b/datalad_compute/dataprovider/tests/test_gitworktree.py @@ -10,35 +10,11 @@ provide, remove, ) +from ...test_utils.create_datasets import create_ds_hierarchy -def create_subdatasets(parent_dataset: Dataset, - subdataset_levels: int = 2, - level_id: int = 0, - ): - if subdataset_levels == 0: - return - - subdataset = Dataset(parent_dataset.pathobj / f'subds{level_id}') - subdataset.create(result_renderer='disabled') - create_subdatasets(subdataset, subdataset_levels - 1, level_id + 1) - (subdataset.pathobj / f'a{level_id}.txt').write_text(f'a{level_id}') - (subdataset.pathobj / f'b{level_id}.txt').write_text(f'b{level_id}') - subdataset.save(result_renderer='disabled') - - -def create_ds_hierarchy(subdataset_levels: int = 2): - dataset = Dataset(tempfile.TemporaryDirectory().name) - dataset.create(force=True, result_renderer='disabled') - create_subdatasets(dataset, subdataset_levels) - (dataset.pathobj / 'a.txt').write_text('a') - (dataset.pathobj / 'b.txt').write_text('b') - dataset.save(result_renderer='disabled') - return dataset - - -def test_worktree_basic(): - dataset = create_ds_hierarchy(3) +def test_worktree_basic(tmp_path): + dataset = create_ds_hierarchy(str(tmp_path), 3)[0][0] worktree = Dataset(provide( dataset.path, input_files=[ @@ -63,4 +39,8 @@ def check_deleted_worktrees(ds: Dataset): check_deleted_worktrees(Dataset(sub_ds['path'])) check_deleted_worktrees(dataset) - dataset.drop(reckless='kill', result_renderer='disabled') + 
dataset.drop( + what='all', + reckless='kill', + recursive=True, + result_renderer='disabled') diff --git a/examples/one-to-many b/examples/one-to-many index 61bb74e..82c1139 100644 --- a/examples/one-to-many +++ b/examples/one-to-many @@ -3,6 +3,18 @@ # # Templates are addressed by their name. They should be stored in # `$DATASET_ROOT/.datalad/compute/methods` +# +# Each template must define the following variables: +# - `inputs`: a list of strings that define the input variables +# - `use_shell`: a boolean that defines whether to use a shell to interpret executable and arguments +# - `executable`: the name of the executable +# - `arguments`: a list of strings that define the arguments to the executable +# +# During execution `subprocess.run([executable] + arguments, shell=use_shell, ...)` +# will be invoked. +# Variable placeholders, i.e `{}` in `arguments` will be +# replaced with the values provided in the parameter arguments of +# `datalad compute`. # An invocation of `datalad compute` has to provide a parameter argument for # each input variable. In this case the invocation could look like this: From d5e260c98e1d18c703bf3afdf92991095d4b8d36 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 20 Sep 2024 16:02:47 +0200 Subject: [PATCH 044/148] fix collect mechanism in compute-command --- datalad_compute/commands/compute_cmd.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 3016d76..8477a87 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -162,9 +162,10 @@ def add_url(dataset: Dataset, # Build the file-specific URL and store it in the annex url = url_base + f'&this={quote(file_path)}' file_dataset_path, file_path = get_file_dataset(dataset.pathobj / file_path) - call_git_success( + success = call_git_success( ['-C', str(file_dataset_path), 'annex', 'addurl', url, '--file', file_path] + (['--relaxed'] if url_only else [])) + assert success, f'\naddurl failed:\nfile_dataset_path: {file_dataset_path}\nurl: {url!r}\nfile_path: {file_path!r}' return url @@ -251,13 +252,12 @@ def collect(worktree: Path, # Unlock output files in the dataset-directory and copy the result for o in output: dest = dataset.pathobj / o - call_git_success( - ['-C', dataset.path, 'annex', 'unlock', str(dest)], - capture_output=True) + if dest.exists(): + dataset.unlock(str(dest)) shutil.copyfile(worktree / o, dest) # Save the dataset - dataset.save() + dataset.save(recursive=True) def un_provide(dataset: Dataset, From fac1e501434bd6b93a1859d4c0c648c53c0be400 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 20 Sep 2024 16:28:07 +0200 Subject: [PATCH 045/148] fix unlocking in provisioning in compute-command --- datalad_compute/commands/compute_cmd.py | 27 ++++--------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 8477a87..0a266f5 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -207,13 +207,12 @@ def execute(worktree: Path, 'execute: %s %s %s %s', str(worktree), template_name, repr(parameter), repr(output)) - assert_annexed(worktree, output) - # Unlock output files in the worktree-directory + temp_dataset = Dataset(worktree) for o in output: - call_git_success( - ['-C', str(worktree), 'annex', 'unlock', o], - capture_output=True) + file = temp_dataset.pathobj / o + if 
file.exists(): + temp_dataset.unlock(file) # Run the computation in the worktree-directory template_path = worktree / template_dir / template_name @@ -224,24 +223,6 @@ def execute(worktree: Path, compute(worktree, template_path, parameter_dict) -def assert_annexed(worktree: Path, - files: list[str] - ) -> None: - - present_files = list(filter(lambda f: Path(f).exists(), files)) - with contextlib.chdir(worktree): - with iter_subproc(['git', 'annex', 'info', '--json', '--batch', '-z'], - inputs=(file.encode() + b'\x00' for file in present_files), - bufsize=0) as results: - not_annexed = tuple(filter( - lambda r: r['success'] == False, - load_json(itemize(results, sep=b'\n')))) - if not_annexed: - raise ValueError( - f'Output files are not annexed: ' + ', '.join( - map(lambda na: na['file'], not_annexed))) - - def collect(worktree: Path, dataset: Dataset, output: list[str], From 5954baef7e86badb1da1463791347d953de36890 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 20 Sep 2024 16:29:20 +0200 Subject: [PATCH 046/148] clean up imports in compute-command --- datalad_compute/commands/compute_cmd.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 0a266f5..744b6c5 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -2,7 +2,6 @@ from __future__ import annotations -import contextlib import json import logging import shutil @@ -26,11 +25,9 @@ EnsureStr, ) from datalad_next.datasets import Dataset -from datalad_next.runners import call_git_oneline, call_git_success, iter_subproc -from datasalad.runners import iter_subproc -from datasalad.itertools import ( - itemize, - load_json, +from datalad_next.runners import ( + call_git_oneline, + call_git_success, ) from .. 
import ( From d96472fb33bdd92a04bf13c35582655bae4efe8d Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 20 Sep 2024 16:54:56 +0200 Subject: [PATCH 047/148] mimic `annex unlock` behavior for dangling links --- datalad_compute/commands/compute_cmd.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 744b6c5..64a125d 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -4,6 +4,7 @@ import json import logging +import os import shutil import subprocess from itertools import chain @@ -210,6 +211,12 @@ def execute(worktree: Path, file = temp_dataset.pathobj / o if file.exists(): temp_dataset.unlock(file) + elif file.is_symlink(): + # `datalad unlock` does not unlock dangling symlinks, so we mimic + # the behavior of `git annex unlock` here: + link = os.readlink(file) + file.unlink() + file.write('/annex/objects/' + link.split('/')[-1]) # Run the computation in the worktree-directory template_path = worktree / template_dir / template_name From 64e5c8f74ce05e27550c5ac2a9d1ac5bbce1fa35 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 20 Sep 2024 17:03:51 +0200 Subject: [PATCH 048/148] add a test for computed data in subdatasets --- .../annexremotes/tests/__init__.py | 0 .../annexremotes/tests/test_hierarchies.py | 89 +++++++++++++++++++ datalad_compute/test_utils/__init__.py | 0 datalad_compute/test_utils/create_datasets.py | 56 ++++++++++++ 4 files changed, 145 insertions(+) create mode 100644 datalad_compute/annexremotes/tests/__init__.py create mode 100644 datalad_compute/annexremotes/tests/test_hierarchies.py create mode 100644 datalad_compute/test_utils/__init__.py create mode 100644 datalad_compute/test_utils/create_datasets.py diff --git a/datalad_compute/annexremotes/tests/__init__.py b/datalad_compute/annexremotes/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datalad_compute/annexremotes/tests/test_hierarchies.py b/datalad_compute/annexremotes/tests/test_hierarchies.py new file mode 100644 index 0000000..303cd38 --- /dev/null +++ b/datalad_compute/annexremotes/tests/test_hierarchies.py @@ -0,0 +1,89 @@ +import contextlib + +from datalad_next.runners import call_git_success +from datalad_next.tests.fixtures import datalad_cfg + +from ... 
import ( + template_dir, + url_scheme, +) +from ...test_utils.create_datasets import create_ds_hierarchy + + +test_method = """ +inputs = ['first', 'second', 'third'] +use_shell = 'true' +executable = 'echo' +arguments = [ + "content: {first} > 'a.txt';", + "echo content: {second} > 'b.txt';", + "echo content: {third} > 'new.txt';", + "echo content: {first} > 'subds0/a0.txt';", + "echo content: {second} > 'subds0/b0.txt';", + "echo content: {third} > 'subds0/new.txt';", + "echo content: {first} > 'subds0/subds1/a1.txt';", + "echo content: {second} > 'subds0/subds1/b1.txt';", + "echo content: {third} > 'subds0/subds1/new.txt';", + "echo content: {first} > 'subds0/subds1/subds2/a2.txt';", + "echo content: {second} > 'subds0/subds1/subds2/b2.txt';", + "echo content: {third} > 'subds0/subds1/subds2/new.txt';", +] +""" + + +output = [ + 'a.txt', 'b.txt', 'new.txt', + 'subds0/a0.txt', 'subds0/b0.txt', 'subds0/new.txt', + 'subds0/subds1/a1.txt', 'subds0/subds1/b1.txt', 'subds0/subds1/new.txt', + 'subds0/subds1/subds2/a2.txt', 'subds0/subds1/subds2/b2.txt', 'subds0/subds1/subds2/new.txt', +] + + +def test_end_to_end(tmp_path, datalad_cfg): + + datasets = create_ds_hierarchy(str(tmp_path), 3) + root_dataset = datasets[0][0] + + # add method template + template_path = root_dataset.pathobj / template_dir + template_path.mkdir(parents=True) + (template_path / 'test_method').write_text(test_method) + root_dataset.save(result_renderer='disabled') + + # set annex security related variables to allow compute-URLs + datalad_cfg.set('annex.security.allowed-url-schemes', url_scheme, scope='global') + datalad_cfg.set('annex.security.allowed-ip-addresses', 'all', scope='global') + datalad_cfg.set('annex.security.allow-unverified-downloads', 'ACKTHPPT', scope='global') + + # add a compute remotes to all datasets + for _, dataset_path, _ in datasets: + call_git_success([ + '-C', str(dataset_path), + 'annex', 'initremote', 'compute', + 'type=external', 'externaltype=compute', + 'encryption=none']) + + # run compute command + root_dataset.compute( + template='test_method', + parameter=[ + 'first=first', + 'second=second', + 'third=third', + ], + output=output) + + # check computation success + for file, content in zip(output, ['first', 'second', 'third'] * 4): + assert (root_dataset.pathobj / file).read_text() == f'content: {content}\n' + + # Drop all computed content + for file in output: + root_dataset.drop(file) + + # Go to the subdataset `subds0/subds1` and fetch the content of `a1.txt` + # from a compute remote. + with contextlib.chdir(root_dataset.pathobj / 'subds0' / 'subds1'): + root_dataset.get('a1.txt') + + print(datasets) diff --git a/datalad_compute/test_utils/__init__.py b/datalad_compute/test_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datalad_compute/test_utils/create_datasets.py b/datalad_compute/test_utils/create_datasets.py new file mode 100644 index 0000000..662aa09 --- /dev/null +++ b/datalad_compute/test_utils/create_datasets.py @@ -0,0 +1,56 @@ +import tempfile +from pathlib import Path + +from datalad_next.datasets import Dataset + + +def create_subdatasets(parent_dataset: Dataset, + subdataset_levels: int = 2, + level_id: int = 0, + top_level_path: Path | None = None + ) -> list[tuple[Dataset, Path]]: + """Create a hierarchy of subdatasets in the dataset `parent_dataset`. + + The subdatasets are created in the directories `subds{level_id}`, where + `level_id` is an integer counter starting at `0`. 
Each subdataset has two + annexed files `a{level_id}.txt` and `b{level_id}.txt`. + + `subdataset_levels` determines the depth of the hierarchy. If, for example, + `subdataset_levels` is 3, the following subdatasets are created: + + - parent_dataset/subds0 + - parent_dataset/subds0/subds1 + - parent_dataset/subds0/subds1/subds2 + """ + if subdataset_levels == 0: + return [] + + if top_level_path is None: + top_level_path = parent_dataset.pathobj + + subdataset = Dataset(parent_dataset.pathobj / f'subds{level_id}') + subdataset.create(result_renderer='disabled') + child_datasets = create_subdatasets( + subdataset, + subdataset_levels - 1, + level_id + 1, + top_level_path) + (subdataset.pathobj / f'a{level_id}.txt').write_text(f'a{level_id}\n') + (subdataset.pathobj / f'b{level_id}.txt').write_text(f'b{level_id}\n') + subdataset.save(result_renderer='disabled') + return [( + subdataset, + subdataset.pathobj, + subdataset.pathobj.relative_to(top_level_path))] + child_datasets + + +def create_ds_hierarchy(directory_name: str, + subdataset_levels: int = 2 + ) -> list[tuple[Dataset, Path]]: + dataset = Dataset(directory_name) + dataset.create(force=True, result_renderer='disabled') + subdatasets = create_subdatasets(dataset, subdataset_levels) + (dataset.pathobj / 'a.txt').write_text('a\n') + (dataset.pathobj / 'b.txt').write_text('b\n') + dataset.save(result_renderer='disabled') + return [(dataset, dataset.pathobj, Path('.'))] + subdatasets From b879906aeb14f6c68c7f28f56b965f80a12bb080 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 20 Sep 2024 17:48:16 +0200 Subject: [PATCH 049/148] fix unlocking for `execute` and `collect` --- datalad_compute/commands/compute_cmd.py | 34 ++++++++++++++----------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 64a125d..9d0b374 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -206,17 +206,7 @@ def execute(worktree: Path, template_name, repr(parameter), repr(output)) # Unlock output files in the worktree-directory - temp_dataset = Dataset(worktree) - for o in output: - file = temp_dataset.pathobj / o - if file.exists(): - temp_dataset.unlock(file) - elif file.is_symlink(): - # `datalad unlock` does not unlock dangling symlinks, so we mimic - # the behavior of `git annex unlock` here: - link = os.readlink(file) - file.unlink() - file.write('/annex/objects/' + link.split('/')[-1]) + unlock_files(Dataset(worktree), output) # Run the computation in the worktree-directory template_path = worktree / template_dir / template_name @@ -235,16 +225,30 @@ def collect(worktree: Path, lgr.debug('collect: %s %s %s', str(worktree), dataset, repr(output)) # Unlock output files in the dataset-directory and copy the result + unlock_files(dataset, output) for o in output: - dest = dataset.pathobj / o - if dest.exists(): - dataset.unlock(str(dest)) - shutil.copyfile(worktree / o, dest) + shutil.copyfile(worktree / o, dataset.pathobj / o) # Save the dataset dataset.save(recursive=True) +def unlock_files(dataset: Dataset, + files: list[str] + ) -> None: + """Use datalad to resolve subdatasets and unlock files in the dataset.""" + for f in files: + file = dataset.pathobj / f + if not file.exists() and file.is_symlink(): + # `datalad unlock` does not unlock dangling symlinks, so we + # mimic the behavior of `git annex unlock` here: + link = os.readlink(file) + file.unlink() + file.write_text('/annex/objects/' + 
link.split('/')[-1] + '\n') + elif file.is_symlink(): + dataset.unlock(file) + + def un_provide(dataset: Dataset, worktree: Path, ) -> None: From f6aa84419f6abc3d08848758bdc8a97e03f9b46d Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sat, 21 Sep 2024 07:38:50 +0200 Subject: [PATCH 050/148] fix getting of `a1.txt` in hierarchy test --- .../annexremotes/tests/test_hierarchies.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/datalad_compute/annexremotes/tests/test_hierarchies.py b/datalad_compute/annexremotes/tests/test_hierarchies.py index 303cd38..c205556 100644 --- a/datalad_compute/annexremotes/tests/test_hierarchies.py +++ b/datalad_compute/annexremotes/tests/test_hierarchies.py @@ -1,5 +1,5 @@ -import contextlib +from datalad.api import get as datalad_get from datalad_next.runners import call_git_success from datalad_next.tests.fixtures import datalad_cfg @@ -39,7 +39,7 @@ ] -def test_end_to_end(tmp_path, datalad_cfg): +def test_end_to_end(tmp_path, datalad_cfg, monkeypatch): datasets = create_ds_hierarchy(str(tmp_path), 3) root_dataset = datasets[0][0] @@ -81,9 +81,18 @@ def test_end_to_end(tmp_path, datalad_cfg): for file in output: root_dataset.drop(file) + # check that all files are dropped + for file in output: + assert not (root_dataset.pathobj / file).exists() + # Go to the subdataset `subds0/subds1` and fetch the content of `a1.txt` # from a compute remote. - with contextlib.chdir(root_dataset.pathobj / 'subds0' / 'subds1'): - root_dataset.get('a1.txt') + monkeypatch.chdir(root_dataset.pathobj / 'subds0' / 'subds1') + datalad_get('a1.txt') + + # check that all files are calculated + for file, content in zip(output, ['first', 'second', 'third'] * 4): + assert (root_dataset.pathobj / file).read_text() == f'content: {content}\n' - print(datasets) + # TODO: check `datalad get subds0/subds1/a1.txt``from top level directory + return From 7449352b91f2d76ad4d9168c5b51d11ca47478f8 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sat, 21 Sep 2024 08:00:20 +0200 Subject: [PATCH 051/148] add a test for get in subsubdataset --- .../annexremotes/tests/test_hierarchies.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/datalad_compute/annexremotes/tests/test_hierarchies.py b/datalad_compute/annexremotes/tests/test_hierarchies.py index c205556..9f1cf15 100644 --- a/datalad_compute/annexremotes/tests/test_hierarchies.py +++ b/datalad_compute/annexremotes/tests/test_hierarchies.py @@ -94,5 +94,13 @@ def test_end_to_end(tmp_path, datalad_cfg, monkeypatch): for file, content in zip(output, ['first', 'second', 'third'] * 4): assert (root_dataset.pathobj / file).read_text() == f'content: {content}\n' - # TODO: check `datalad get subds0/subds1/a1.txt``from top level directory - return + # Drop all computed content + for file in output: + root_dataset.drop(file) + + monkeypatch.chdir(root_dataset.pathobj) + datalad_get('subds0/subds1/a1.txt') + + # check that all files are calculated + for file, content in zip(output, ['first', 'second', 'third'] * 4): + assert (root_dataset.pathobj / file).read_text() == f'content: {content}\n' From e1c6697ecb9a890e1b0fd8c9ad8ac36f7d016c07 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sat, 21 Sep 2024 18:11:15 +0200 Subject: [PATCH 052/148] improve end-to-end test for compute-remote --- .../annexremotes/tests/test_hierarchies.py | 46 ++++++++++++------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/datalad_compute/annexremotes/tests/test_hierarchies.py 
b/datalad_compute/annexremotes/tests/test_hierarchies.py index 9f1cf15..fba00bc 100644 --- a/datalad_compute/annexremotes/tests/test_hierarchies.py +++ b/datalad_compute/annexremotes/tests/test_hierarchies.py @@ -1,5 +1,7 @@ +from collections.abc import Iterable from datalad.api import get as datalad_get +from datalad_next.datasets import Dataset from datalad_next.runners import call_git_success from datalad_next.tests.fixtures import datalad_cfg @@ -38,6 +40,26 @@ 'subds0/subds1/subds2/a2.txt', 'subds0/subds1/subds2/b2.txt', 'subds0/subds1/subds2/new.txt', ] +test_file_content = [ + (file, content) + for file, content in + zip(output, ['content: first\n', 'content: second\n', 'content: third\n'] * 4) +] + + +def _drop_files(dataset: Dataset, + files: Iterable[str]): + for file in files: + dataset.drop(file) + assert not (dataset.pathobj / file).exists() + + +def _check_content(dataset, + file_content: Iterable[tuple[str, str]] + ): + for file, content in file_content: + assert (dataset.pathobj / file).read_text() == content + def test_end_to_end(tmp_path, datalad_cfg, monkeypatch): @@ -74,33 +96,23 @@ def test_end_to_end(tmp_path, datalad_cfg, monkeypatch): output=output) # check computation success - for file, content in zip(output, ['first', 'second', 'third'] * 4): - assert (root_dataset.pathobj / file).read_text() == f'content: {content}\n' + _check_content(root_dataset, test_file_content) # Drop all computed content - for file in output: - root_dataset.drop(file) - - # check that all files are dropped - for file in output: - assert not (root_dataset.pathobj / file).exists() + _drop_files(root_dataset, output) # Go to the subdataset `subds0/subds1` and fetch the content of `a1.txt` # from a compute remote. monkeypatch.chdir(root_dataset.pathobj / 'subds0' / 'subds1') datalad_get('a1.txt') - # check that all files are calculated - for file, content in zip(output, ['first', 'second', 'third'] * 4): - assert (root_dataset.pathobj / file).read_text() == f'content: {content}\n' + # check that all files are computed + _check_content(root_dataset, test_file_content) - # Drop all computed content - for file in output: - root_dataset.drop(file) + _drop_files(root_dataset, output) + # check get in subdatasets monkeypatch.chdir(root_dataset.pathobj) datalad_get('subds0/subds1/a1.txt') - # check that all files are calculated - for file, content in zip(output, ['first', 'second', 'third'] * 4): - assert (root_dataset.pathobj / file).read_text() == f'content: {content}\n' + _check_content(root_dataset, test_file_content) From 5e97b7bcc77eba17dda9316861c95ea3e857fb58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20M=C3=B6nch?= Date: Sun, 22 Sep 2024 14:24:27 +0200 Subject: [PATCH 053/148] add input_list and output_list parameter to compute-command --- datalad_compute/commands/compute_cmd.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 9d0b374..921c86a 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -56,7 +56,9 @@ class Compute(ValidatedInterface): _validator_ = EnsureCommandParameterization(dict( dataset=EnsureDataset(installed=True), input=EnsureListOf(EnsureStr(min_len=1)), + input_list=EnsureStr(min_len=1), output=EnsureListOf(EnsureStr(min_len=1), min_len=1), + output_list=EnsureStr(min_len=1), parameter=EnsureListOf(EnsureStr(min_len=3)), )) @@ -84,10 +86,20 @@ class Compute(ValidatedInterface): args=('-i', '--input',), 
action='append', doc="Name of an input file (repeat for multiple inputs)"), + input_list=Parameter( + args=('-I', '--input-list',), + doc="Name of a file that contains a list of input files. Format is " + "one file per line, relative path from `dataset`. This is " + "useful if a large number of input files should be provided."), output=Parameter( args=('-o', '--output',), action='append', doc="Name of an output file (repeat for multiple outputs)"), + output_list=Parameter( + args=('-O', '--output-list',), + doc="Name of a file that contains a list of output files. Format " + "is one file per line, relative path from `dataset`. This is " + "useful if a large number of output files should be provided."), parameter=Parameter( args=('-p', '--parameter',), action='append', @@ -104,12 +116,17 @@ def __call__(dataset=None, template=None, branch=None, input=None, + input_list=None, output=None, + output_list=None, parameter=None, ): dataset : Dataset = dataset.ds if dataset else Dataset('.') + input = (input or []) + read_files(input_list) + output = (output or []) + read_files(output_list) + if not url_only: worktree = provide(dataset, branch, input) execute(worktree, template, parameter, output) @@ -127,6 +144,12 @@ def __call__(dataset=None, message=f'added url: {url!r} to {out!r} in {dataset.pathobj}',) +def read_files(list_file: str | None) -> list[str]: + if list_file is None: + return [] + return Path(list_file).read_text().splitlines() + + def get_url(dataset: Dataset, branch: str | None, template_name: str, From acfd2b2d04e7491af8e6213b3d0a3729b983c8d3 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 23 Sep 2024 10:35:22 +0200 Subject: [PATCH 054/148] add tests --- datalad_compute/commands/compute_cmd.py | 30 ++++++++++++++----- .../commands/tests/test_listhandling.py | 25 ++++++++++++++++ requirements-devel.txt | 1 + 3 files changed, 49 insertions(+), 7 deletions(-) create mode 100644 datalad_compute/commands/tests/test_listhandling.py diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 921c86a..2c54409 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -60,6 +60,7 @@ class Compute(ValidatedInterface): output=EnsureListOf(EnsureStr(min_len=1), min_len=1), output_list=EnsureStr(min_len=1), parameter=EnsureListOf(EnsureStr(min_len=3)), + parameter_list=EnsureStr(min_len=1), )) # parameters of the command, must be exhaustive @@ -89,7 +90,8 @@ class Compute(ValidatedInterface): input_list=Parameter( args=('-I', '--input-list',), doc="Name of a file that contains a list of input files. Format is " - "one file per line, relative path from `dataset`. This is " + "one file per line, relative path from `dataset`. Empty lines, " + "i.e. lines that contain only newlines, arg ignored. This is " "useful if a large number of input files should be provided."), output=Parameter( args=('-o', '--output',), @@ -98,13 +100,22 @@ class Compute(ValidatedInterface): output_list=Parameter( args=('-O', '--output-list',), doc="Name of a file that contains a list of output files. Format " - "is one file per line, relative path from `dataset`. This is " - "useful if a large number of output files should be provided."), + "is one file per line, relative path from `dataset`. Empty " + "lines, i.e. lines that contain only newlines, arg ignored. 
" + "This is useful if a large number of output files should be " + "provided."), parameter=Parameter( args=('-p', '--parameter',), action='append', doc="Input parameter in the form = (repeat for " "multiple parameters)"), + parameter_list=Parameter( + args=('-P', '--parameter-list',), + action='append', + doc="Name of a file that contains a list of parameters. Format " + "is one `=` string per line. Empty lines, " + "i.e. lines that contain only newlines, arg ignored. This is " + "useful if a large number of parameters should be provided."), ) @@ -120,12 +131,14 @@ def __call__(dataset=None, output=None, output_list=None, parameter=None, + parameter_list=None, ): dataset : Dataset = dataset.ds if dataset else Dataset('.') - input = (input or []) + read_files(input_list) - output = (output or []) + read_files(output_list) + input = (input or []) + read_list(input_list) + output = (output or []) + read_list(output_list) + parameter = (parameter or []) + read_list(parameter_list) if not url_only: worktree = provide(dataset, branch, input) @@ -144,10 +157,13 @@ def __call__(dataset=None, message=f'added url: {url!r} to {out!r} in {dataset.pathobj}',) -def read_files(list_file: str | None) -> list[str]: +def read_list(list_file: str | Path | None) -> list[str]: if list_file is None: return [] - return Path(list_file).read_text().splitlines() + return list( + filter( + lambda s: s != '', + Path(list_file).read_text().splitlines(keepends=False))) def get_url(dataset: Dataset, diff --git a/datalad_compute/commands/tests/test_listhandling.py b/datalad_compute/commands/tests/test_listhandling.py new file mode 100644 index 0000000..9c4b251 --- /dev/null +++ b/datalad_compute/commands/tests/test_listhandling.py @@ -0,0 +1,25 @@ +import tempfile +from pathlib import Path +from datalad_compute.commands.compute_cmd import read_list + +from hypothesis import given +from hypothesis.strategies import lists, text + + +def test_empty_list_reading(): + assert read_list(None) == [] + + +@given(lists(text('abcdefghijklmnopqrstuvwxyz _', min_size=1))) +def test_list_reading(word_list): + with tempfile.TemporaryDirectory() as temp_dir: + _test_wordlist(Path(temp_dir), word_list) + + +def _test_wordlist(tmp_path: Path, + word_list: list[str], + ) -> None: + list_file = tmp_path / 'list.txt' + list_file.write_text('\n'.join(word_list)) + assert read_list(str(list_file)) == word_list + assert read_list(list_file) == word_list diff --git a/requirements-devel.txt b/requirements-devel.txt index db4986b..9ddee5e 100644 --- a/requirements-devel.txt +++ b/requirements-devel.txt @@ -3,6 +3,7 @@ coverage datalad datalad-next datasalad +hypothesis pytest pytest-cov From 5a355e6fd3fef9cbe27f3448a780fa526fe1eeca Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 23 Sep 2024 12:55:53 +0200 Subject: [PATCH 055/148] circumvent a problem with `Dataset.unlock` --- datalad_compute/commands/compute_cmd.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 2c54409..1e4d548 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -2,6 +2,7 @@ from __future__ import annotations +import contextlib import json import logging import os @@ -276,16 +277,20 @@ def unlock_files(dataset: Dataset, files: list[str] ) -> None: """Use datalad to resolve subdatasets and unlock files in the dataset.""" - for f in files: - file = dataset.pathobj / f - if not file.exists() 
and file.is_symlink(): - # `datalad unlock` does not unlock dangling symlinks, so we - # mimic the behavior of `git annex unlock` here: - link = os.readlink(file) - file.unlink() - file.write_text('/annex/objects/' + link.split('/')[-1] + '\n') - elif file.is_symlink(): - dataset.unlock(file) + # TODO: for some reason `dataset unlock` does not operate in the + # context of `dataset.pathobj`, so we need to change the working + # directory manually here. + with contextlib.chdir(dataset.pathobj): + for f in files: + file = dataset.pathobj / f + if not file.exists() and file.is_symlink(): + # `datalad unlock` does not unlock dangling symlinks, so we + # mimic the behavior of `git annex unlock` here: + link = os.readlink(file) + file.unlink() + file.write_text('/annex/objects/' + link.split('/')[-1] + '\n') + elif file.is_symlink(): + dataset.unlock(file) def un_provide(dataset: Dataset, From a62627de572a6cf98fefbd5f4be985403a3b93be Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 23 Sep 2024 14:45:22 +0200 Subject: [PATCH 056/148] fix parameter list argument spec --- datalad_compute/commands/compute_cmd.py | 1 - 1 file changed, 1 deletion(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 1e4d548..37a7441 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -112,7 +112,6 @@ class Compute(ValidatedInterface): "multiple parameters)"), parameter_list=Parameter( args=('-P', '--parameter-list',), - action='append', doc="Name of a file that contains a list of parameters. Format " "is one `=` string per line. Empty lines, " "i.e. lines that contain only newlines, arg ignored. This is " From 11d6ebd1e1fd7fffa8e844fe2ef6941e3b680fc9 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 24 Sep 2024 09:38:15 +0200 Subject: [PATCH 057/148] add {root_directory} placeholder to template resolution --- datalad_compute/utils/compute.py | 14 ++++++-- .../utils/tests/test_substitution.py | 32 +++++++++++++++++++ 2 files changed, 43 insertions(+), 3 deletions(-) create mode 100644 datalad_compute/utils/tests/test_substitution.py diff --git a/datalad_compute/utils/compute.py b/datalad_compute/utils/compute.py index ef324cd..745ffd7 100644 --- a/datalad_compute/utils/compute.py +++ b/datalad_compute/utils/compute.py @@ -1,12 +1,17 @@ from __future__ import annotations import contextlib +import logging import subprocess import tomllib + from pathlib import Path from typing import Any +lgr = logging.getLogger('datalad.compute') + + def substitute_string(format_str: str, replacements: dict[str, str], ) -> str: @@ -37,7 +42,7 @@ def get_substitutions(template: dict[str, Any], if len(inputs) != len(arguments.keys()): raise ValueError('Template inputs and arguments have different lengths') if not all(input_name in arguments for input_name in inputs): - raise ValueError('Template inputs and arguments have different names') + raise ValueError(f'Template inputs and arguments have different names: inputs: {inputs}, arguments: {arguments}') if len(inputs) != len(set(inputs)): raise ValueError('Template inputs contain duplicates') @@ -57,6 +62,7 @@ def compute(root_directory: Path, template = tomllib.load(f) substitutions = get_substitutions(template, compute_arguments) + substitutions['root_directory'] = str(root_directory) substituted_executable = substitute_string(template['executable'], substitutions) substituted_arguments = substitute_arguments( @@ -67,6 +73,8 @@ def compute(root_directory: Path, with 
contextlib.chdir(root_directory): if template.get('use_shell', 'false') == 'true': - subprocess.run(' '.join([substituted_executable] + substituted_arguments), shell=True) + lgr.debug(f'compute(): RUNNING: with shell=True: {" ".join([substituted_executable] + substituted_arguments)}') + subprocess.run(' '.join([substituted_executable] + substituted_arguments), shell=True, check=True) else: - subprocess.run([substituted_executable] + substituted_arguments) + lgr.debug(f'compute(): RUNNING: {[substituted_executable] + substituted_arguments}') + subprocess.run([substituted_executable] + substituted_arguments, check=True) diff --git a/datalad_compute/utils/tests/test_substitution.py b/datalad_compute/utils/tests/test_substitution.py new file mode 100644 index 0000000..2ee2480 --- /dev/null +++ b/datalad_compute/utils/tests/test_substitution.py @@ -0,0 +1,32 @@ + + +from ..compute import ( + substitute_arguments, + substitute_string, +) + + +def test_multiple_substitutions(): + assert substitute_string( + 'This is a {test} with {multiple} substitutions', + {'test': 'string', 'multiple': 'multiple'}, + ) == 'This is a string with multiple substitutions' + + +def test_argument_substitution(): + arguments = [ + '{root_directory}/{input_dir}', + '{root_directory}/{output_dir}', + ] + s = substitute_arguments( + {'arguments': arguments}, + {'root_directory': '/path/to/root', + 'input_dir': 'input', + 'output_dir': 'output', + }, + 'arguments' + ) + assert s == [ + '/path/to/root/input', + '/path/to/root/output', + ] From a8410105add697e430fe5ec5ece53f7e10c8b27c Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 24 Sep 2024 11:19:20 +0200 Subject: [PATCH 058/148] add an argument for temp-directory to gitworktree --- datalad_compute/dataprovider/gitworktree.py | 28 +++++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index fc65b5a..de2e709 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -5,6 +5,7 @@ """ from __future__ import annotations +import os import random import shutil import tempfile @@ -41,6 +42,13 @@ 'root), at least one input has tp be provided (use multiple times to ' 'define multiple inputs)', ) +argument_parser.add_argument( + '-t', '--temp-dir', + metavar='PATH', + default=os.getenv('TMP', '/tmp'), + help='Path of the directory where temporary worktrees should be created. 
' + 'The default is `$TMP` if set, otherwise `/tmp`.', +) def remove(dataset: str, @@ -60,15 +68,21 @@ def prune_worktrees(dataset: Dataset) -> None: def provide(dataset: str, + temp_dir: str, branch: str | None = None, input_files: list[str] | None = None, ) -> Path: - worktree_dir = Path(tempfile.TemporaryDirectory().name) + worktree_name = random_name() + worktree_dir = Path(temp_dir) / worktree_name + if not worktree_dir.exists(): + worktree_dir.mkdir(parents=True, exist_ok=True) + # Get all datasets including subdatasets into the worktree provide_datasets( Dataset(dataset), worktree_dir=worktree_dir, + branch_name=worktree_name, source_branch=branch, ) @@ -82,15 +96,13 @@ def provide(dataset: str, def provide_datasets(dataset: Dataset, worktree_dir: Path, + branch_name: str, source_branch: str | None = None, ) -> None: - temp_branch = 'tmp_' + ''.join( - random.choices('abcdefghijklmnopqrstuvwxyz', k=10) - ) with chdir(dataset.path): - args = ['worktree', 'add', '-b', temp_branch, str(worktree_dir)] + ( + args = ['worktree', 'add', '-b', branch_name, str(worktree_dir)] + ( [source_branch] if source_branch else [] @@ -107,6 +119,11 @@ def provide_datasets(dataset: Dataset, ) +def random_name() -> str: + return 'tmp_' + ''.join( + random.choices('abcdefghijklmnopqrstuvwxyz', k=10)) + + def main(): arguments = argument_parser.parse_args() if arguments.delete: @@ -121,6 +138,7 @@ def main(): provision_dir = provide( arguments.dataset, + arguments.temp_dir, arguments.branch, arguments.input, ) From ead61ed3df9164532f48be146bec803682390629 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 24 Sep 2024 11:50:17 +0200 Subject: [PATCH 059/148] fix: give branch name in recursive calls --- datalad_compute/dataprovider/gitworktree.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index de2e709..e2edfa1 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -8,7 +8,6 @@ import os import random import shutil -import tempfile from argparse import ArgumentParser from contextlib import chdir from pathlib import Path @@ -96,17 +95,21 @@ def provide(dataset: str, def provide_datasets(dataset: Dataset, worktree_dir: Path, - branch_name: str, + branch_name: str | None, source_branch: str | None = None, ) -> None: with chdir(dataset.path): - args = ['worktree', 'add', '-b', branch_name, str(worktree_dir)] + ( - [source_branch] - if source_branch + args = ['worktree', 'add'] + ( + ['-b', branch_name] + if branch_name else [] - ) + ) + [str(worktree_dir)] + ( + [source_branch] + if source_branch + else [] + ) call_git_success(args) for subdataset in dataset.subdatasets(result_renderer='disabled'): @@ -115,7 +118,8 @@ def provide_datasets(dataset: Dataset, provide_datasets( Dataset(subdataset_path), worktree_dir / subdataset_path, - None, # Use default branches for subdatasets + branch_name, + None, # Use default commit-ish for subdatasets ) From d7d9ac2849cdb6c2e2cef9bfba85231d9a29aae3 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 25 Sep 2024 21:28:40 +0200 Subject: [PATCH 060/148] improve collect This commit fixes various issues in the collect-code that are related to non-existing output files or non existing directories. 
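To illustrate the failure mode this addresses: copying a computed result back
into the dataset breaks when the output path lies in a directory that does not
exist there yet. A minimal sketch of the collection step with that handled
(names below are illustrative only, not the actual implementation):

    import shutil
    from pathlib import Path

    def collect_output(worktree: Path, dataset_root: Path, output: str) -> None:
        destination = dataset_root / output
        # the output may sit in a directory that does not exist in the dataset yet
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(worktree / output, destination)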
--- datalad_compute/commands/compute_cmd.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 37a7441..4059d22 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -12,6 +12,7 @@ from pathlib import Path from urllib.parse import quote +from datalad.support.exceptions import IncompleteResultsError from datalad_next.commands import ( EnsureCommandParameterization, ValidatedInterface, @@ -244,6 +245,10 @@ def execute(worktree: Path, 'execute: %s %s %s %s', str(worktree), template_name, repr(parameter), repr(output)) + # Get the subdatasets, directories, and files that are part of the output + # space. + create_output_space(Dataset(worktree), output) + # Unlock output files in the worktree-directory unlock_files(Dataset(worktree), output) @@ -266,7 +271,9 @@ def collect(worktree: Path, # Unlock output files in the dataset-directory and copy the result unlock_files(dataset, output) for o in output: - shutil.copyfile(worktree / o, dataset.pathobj / o) + destination = dataset.pathobj / o + destination.parent.mkdir(parents=True, exist_ok=True) + shutil.copyfile(worktree / o, destination) # Save the dataset dataset.save(recursive=True) @@ -283,7 +290,7 @@ def unlock_files(dataset: Dataset, for f in files: file = dataset.pathobj / f if not file.exists() and file.is_symlink(): - # `datalad unlock` does not unlock dangling symlinks, so we + # `datalad unlock` does not "unlock" dangling symlinks, so we # mimic the behavior of `git annex unlock` here: link = os.readlink(file) file.unlink() @@ -292,6 +299,19 @@ def unlock_files(dataset: Dataset, dataset.unlock(file) +def create_output_space(dataset: Dataset, + files: list[str] + ) -> None: + """Get all files that are part of the output space.""" + for f in files: + try: + dataset.get(f) + except IncompleteResultsError: + # The file does not yet exist. The computation should create it. + # We create the directory here. + (dataset.pathobj / f).parent.mkdir(parents=True, exist_ok=True) + + def un_provide(dataset: Dataset, worktree: Path, ) -> None: From abd9d70d516ab19baa724fecac4ea97172245e9d Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 25 Sep 2024 16:56:50 +0200 Subject: [PATCH 061/148] improve gitworktree provisioning This commit adds various improvemnts to the user interface and the list reading code. 
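For illustration, the intended list-file handling after this change (a minimal
sketch; the file name `inputs.txt` is made up): `read_list` strips surrounding
whitespace and ignores empty lines as well as lines starting with `#`.

    from pathlib import Path
    from datalad_compute.commands.compute_cmd import read_list

    Path('inputs.txt').write_text('# provisioned inputs\na.txt\n\n  subds0/a0.txt\n')
    assert read_list('inputs.txt') == ['a.txt', 'subds0/a0.txt']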
--- datalad_compute/annexremotes/compute.py | 12 ++ datalad_compute/commands/compute_cmd.py | 10 +- .../commands/tests/test_listhandling.py | 25 ++++- datalad_compute/dataprovider/gitworktree.py | 105 ++++++++++-------- .../tests}/create_datasets.py | 48 ++++++-- .../dataprovider/tests/test_gitworktree.py | 6 +- datalad_compute/test_utils/__init__.py | 0 7 files changed, 140 insertions(+), 66 deletions(-) rename datalad_compute/{test_utils => dataprovider/tests}/create_datasets.py (59%) delete mode 100644 datalad_compute/test_utils/__init__.py diff --git a/datalad_compute/annexremotes/compute.py b/datalad_compute/annexremotes/compute.py index 0f13355..7c7a4a8 100644 --- a/datalad_compute/annexremotes/compute.py +++ b/datalad_compute/annexremotes/compute.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +import logging import shutil import subprocess from pathlib import Path @@ -28,6 +29,9 @@ ) +lgr = logging.getLogger('datalad.compute.annexremotes.compute') + + class ComputeRemote(SpecialRemote): def __init__(self, annex: Master): @@ -103,9 +107,17 @@ def transfer_retrieve(self, key: str, file_name: str) -> None: dataset = self._find_dataset(compute_info['root_id']) # Perform the computation, and collect the results + lgr.debug('Starting provision') + self.annex.debug('Starting provision') worktree = provide(dataset, compute_info['root_version'], compute_info['input']) + lgr.debug('Starting execution') + self.annex.debug('Starting execution') execute(worktree, compute_info['method'], compute_info['parameter'], compute_info['output']) + lgr.debug('Starting collection') + self.annex.debug('Starting collection') self._collect(worktree, dataset, compute_info['output'], compute_info['this'], file_name) + lgr.debug('Starting unprovision') + self.annex.debug('Starting unprovision') un_provide(dataset, worktree) def checkpresent(self, key: str) -> bool: diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 4059d22..de27db3 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -161,10 +161,12 @@ def __call__(dataset=None, def read_list(list_file: str | Path | None) -> list[str]: if list_file is None: return [] - return list( - filter( - lambda s: s != '', - Path(list_file).read_text().splitlines(keepends=False))) + return list(filter( + lambda s: s != '' and not s.startswith('#'), + [ + line.strip() + for line in Path(list_file).read_text().splitlines(keepends=False) + ])) def get_url(dataset: Dataset, diff --git a/datalad_compute/commands/tests/test_listhandling.py b/datalad_compute/commands/tests/test_listhandling.py index 9c4b251..3c9e53d 100644 --- a/datalad_compute/commands/tests/test_listhandling.py +++ b/datalad_compute/commands/tests/test_listhandling.py @@ -10,16 +10,33 @@ def test_empty_list_reading(): assert read_list(None) == [] -@given(lists(text('abcdefghijklmnopqrstuvwxyz _', min_size=1))) -def test_list_reading(word_list): +@given(lists(text('abcdefghijklmnopqrstuvwxyz_', min_size=1))) +def test_list_reading_basic(word_list): with tempfile.TemporaryDirectory() as temp_dir: _test_wordlist(Path(temp_dir), word_list) +def test_list_reading_comments(tmp_path: Path): + list_file = _write_list(tmp_path, ['# a', 'a', ' # b']) + assert read_list(str(list_file)) == ['a'] + + +def test_list_reading_strip(tmp_path: Path): + list_file = _write_list(tmp_path, [' a', 'b ', ' c ']) + assert read_list(str(list_file)) == ['a', 'b', 'c'] + + def _test_wordlist(tmp_path: Path, word_list: 
list[str], ) -> None: - list_file = tmp_path / 'list.txt' - list_file.write_text('\n'.join(word_list)) + list_file = _write_list(tmp_path, word_list) assert read_list(str(list_file)) == word_list assert read_list(list_file) == word_list + + +def _write_list(tmp_path: Path, + word_list: list[str], + ) -> Path: + list_file = tmp_path / 'list.txt' + list_file.write_text('\n'.join(word_list)) + return list_file diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index e2edfa1..7cfaf02 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -5,16 +5,22 @@ """ from __future__ import annotations +import logging import os import random import shutil from argparse import ArgumentParser from contextlib import chdir from pathlib import Path +from urllib.parse import urlparse from datalad_next.datasets import Dataset from datalad_next.runners import call_git_success +from ..commands.compute_cmd import read_list + + +lgr = logging.getLogger('datalad.compute.dataprovider.gitworktree') argument_parser = ArgumentParser() argument_parser.add_argument( @@ -41,6 +47,12 @@ 'root), at least one input has tp be provided (use multiple times to ' 'define multiple inputs)', ) +argument_parser.add_argument( + '-I', '--input-list', + metavar='PATH', + default=None, + help='Path of a file that contains a list of input paths', +) argument_parser.add_argument( '-t', '--temp-dir', metavar='PATH', @@ -53,12 +65,23 @@ def remove(dataset: str, worktree: str ) -> None: - + remove_subdatasets(worktree) shutil.rmtree(worktree) dataset = Dataset(dataset) prune_worktrees(dataset) +def remove_subdatasets(worktree: str): + dataset = Dataset(worktree) + for subdataset_info in dataset.subdatasets(result_renderer='disabled'): + dataset.drop( + subdataset_info['path'], + recursive=True, + reckless='kill', + what='all', + result_renderer='disabled') + + def prune_worktrees(dataset: Dataset) -> None: with chdir(dataset.path): call_git_success(['worktree', 'prune']) @@ -66,61 +89,51 @@ def prune_worktrees(dataset: Dataset) -> None: prune_worktrees(Dataset(result['path'])) -def provide(dataset: str, +def ensure_absolute_gitmodule_urls(original_dataset: Dataset, + dataset: Dataset + ) -> None: + sub_datasets = dataset.subdatasets(result_renderer='disabled') + for subdataset in sub_datasets: + name, location_spec = subdataset['gitmodule_name'], subdataset['gitmodule_url'] + parse_result = urlparse(location_spec) + if parse_result.scheme == '': + if not Path(location_spec).is_absolute(): + args = ['submodule', 'set-url', name, original_dataset.path] + call_git_success(args, cwd=dataset.path) + dataset.save() + + +def provide(dataset_dir: str, temp_dir: str, - branch: str | None = None, + source_branch: str | None = None, input_files: list[str] | None = None, ) -> Path: + lgr.debug('Provisioning dataset %s', dataset_dir) worktree_name = random_name() worktree_dir = Path(temp_dir) / worktree_name if not worktree_dir.exists(): worktree_dir.mkdir(parents=True, exist_ok=True) - # Get all datasets including subdatasets into the worktree - provide_datasets( - Dataset(dataset), - worktree_dir=worktree_dir, - branch_name=worktree_name, - source_branch=branch, - ) - - # Fetch file content in the worktree - work_dataset = Dataset(worktree_dir) - with chdir(worktree_dir): - for p in input_files or []: - work_dataset.get(p, result_renderer='disabled') - return worktree_dir - - -def provide_datasets(dataset: Dataset, - worktree_dir: Path, - 
branch_name: str | None, - source_branch: str | None = None, - ) -> None: - - with chdir(dataset.path): - - args = ['worktree', 'add'] + ( - ['-b', branch_name] - if branch_name + # Create a worktree + with chdir(dataset_dir): + args = ['worktree', 'add', '-b', worktree_name] + [str(worktree_dir)] + ( + [source_branch] + if source_branch else [] - ) + [str(worktree_dir)] + ( - [source_branch] - if source_branch - else [] - ) + ) call_git_success(args) - for subdataset in dataset.subdatasets(result_renderer='disabled'): - subdataset_path = Path(subdataset['path']).relative_to(dataset.pathobj) - dataset.install(path=subdataset_path, result_renderer='disabled') - provide_datasets( - Dataset(subdataset_path), - worktree_dir / subdataset_path, - branch_name, - None, # Use default commit-ish for subdatasets - ) + worktree_dataset = Dataset(worktree_dir) + # Ensure that all subdatasets have absolute URLs + ensure_absolute_gitmodule_urls(Dataset(dataset_dir), worktree_dataset) + # Get all input files in the worktree + with chdir(worktree_dataset.path): + for file in input_files or []: + lgr.debug('Provisioning file %s', file) + worktree_dataset.get(file, result_renderer='disabled') + + return worktree_dir def random_name() -> str: @@ -140,11 +153,13 @@ def main(): remove(arguments.dataset, arguments.delete) return + inputs = arguments.input or [] + read_list(arguments.input_list) + provision_dir = provide( arguments.dataset, arguments.temp_dir, arguments.branch, - arguments.input, + inputs, ) print(provision_dir) diff --git a/datalad_compute/test_utils/create_datasets.py b/datalad_compute/dataprovider/tests/create_datasets.py similarity index 59% rename from datalad_compute/test_utils/create_datasets.py rename to datalad_compute/dataprovider/tests/create_datasets.py index 662aa09..afe5135 100644 --- a/datalad_compute/test_utils/create_datasets.py +++ b/datalad_compute/dataprovider/tests/create_datasets.py @@ -1,16 +1,22 @@ -import tempfile +from __future__ import annotations + from pathlib import Path from datalad_next.datasets import Dataset -def create_subdatasets(parent_dataset: Dataset, +def create_subdatasets(tmp_path: Path, + parent_dataset: Dataset, subdataset_levels: int = 2, level_id: int = 0, - top_level_path: Path | None = None + top_level_path: Path | None = None, + relative_subdataset_path: Path = None, ) -> list[tuple[Dataset, Path]]: """Create a hierarchy of subdatasets in the dataset `parent_dataset`. + Individual datasets are created in the temporary directory `tmp_path` and + installed in the parent_dataset. + The subdatasets are created in the directories `subds{level_id}`, where `level_id` is an integer counter starting at `0`. Each subdataset has two annexed files `a{level_id}.txt` and `b{level_id}.txt`. 
@@ -25,31 +31,53 @@ def create_subdatasets(parent_dataset: Dataset, if subdataset_levels == 0: return [] + if relative_subdataset_path is None: + relative_subdataset_path = Path(f'subds{level_id}') + else: + relative_subdataset_path /= Path(f'subds{level_id}') + if top_level_path is None: top_level_path = parent_dataset.pathobj - subdataset = Dataset(parent_dataset.pathobj / f'subds{level_id}') + # Create a dataset in the tempaorary directory + subdataset = Dataset(tmp_path / f'subds{level_id}') subdataset.create(result_renderer='disabled') + (subdataset.pathobj / f'a{level_id}.txt').write_text(f'a{level_id}\n') + (subdataset.pathobj / f'b{level_id}.txt').write_text(f'b{level_id}\n') + child_datasets = create_subdatasets( + tmp_path, subdataset, subdataset_levels - 1, level_id + 1, - top_level_path) - (subdataset.pathobj / f'a{level_id}.txt').write_text(f'a{level_id}\n') - (subdataset.pathobj / f'b{level_id}.txt').write_text(f'b{level_id}\n') + top_level_path, + relative_subdataset_path) + subdataset.save(result_renderer='disabled') + + # Install the dataset in the parent dataset + parent_dataset.install( + path=f'subds{level_id}', + source='file://' + subdataset.path, + + )#result_renderer='disabled') + + parent_dataset.save(result_renderer='disabled') + + return [( subdataset, subdataset.pathobj, - subdataset.pathobj.relative_to(top_level_path))] + child_datasets + relative_subdataset_path)] + child_datasets -def create_ds_hierarchy(directory_name: str, +def create_ds_hierarchy(tmp_path: Path, + directory_name: str, subdataset_levels: int = 2 ) -> list[tuple[Dataset, Path]]: dataset = Dataset(directory_name) dataset.create(force=True, result_renderer='disabled') - subdatasets = create_subdatasets(dataset, subdataset_levels) + subdatasets = create_subdatasets(tmp_path, dataset, subdataset_levels) (dataset.pathobj / 'a.txt').write_text('a\n') (dataset.pathobj / 'b.txt').write_text('b\n') dataset.save(result_renderer='disabled') diff --git a/datalad_compute/dataprovider/tests/test_gitworktree.py b/datalad_compute/dataprovider/tests/test_gitworktree.py index 28e2f08..a6c003b 100644 --- a/datalad_compute/dataprovider/tests/test_gitworktree.py +++ b/datalad_compute/dataprovider/tests/test_gitworktree.py @@ -1,6 +1,5 @@ from __future__ import annotations -import tempfile from contextlib import chdir from datalad_next.datasets import Dataset @@ -10,13 +9,14 @@ provide, remove, ) -from ...test_utils.create_datasets import create_ds_hierarchy +from .create_datasets import create_ds_hierarchy def test_worktree_basic(tmp_path): - dataset = create_ds_hierarchy(str(tmp_path), 3)[0][0] + dataset = create_ds_hierarchy(str(tmp_path / 'root_dataset'), 3)[0][0] worktree = Dataset(provide( dataset.path, + str(tmp_path), input_files=[ 'a.txt', 'b.txt', 'subds0/a0.txt', 'subds0/b0.txt', diff --git a/datalad_compute/test_utils/__init__.py b/datalad_compute/test_utils/__init__.py deleted file mode 100644 index e69de29..0000000 From 906e1f81823df04fbd4d69aef53aee572b8ca319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20M=C3=B6nch?= Date: Thu, 26 Sep 2024 17:42:46 +0200 Subject: [PATCH 062/148] improve test code This commit improves the dataset hierarchy creation code and the end-to-end tests for compute-remotes. 
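For reference, the dataset layout the updated end-to-end test works on, as
created by `create_ds_hierarchy(tmp_path, 'd2', 3)` (a sketch derived from the
paths used in the test; only the annexed files are shown):

    <root dataset>
        a.txt, b.txt
        d2_subds0/
            a0.txt, b0.txt
            d2_subds1/
                a1.txt, b1.txt
                d2_subds2/
                    a2.txt, b2.txt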
--- .../annexremotes/tests/test_hierarchies.py | 50 +++---- .../dataprovider/tests/create_datasets.py | 132 ++++++++---------- .../dataprovider/tests/test_gitworktree.py | 6 +- 3 files changed, 90 insertions(+), 98 deletions(-) diff --git a/datalad_compute/annexremotes/tests/test_hierarchies.py b/datalad_compute/annexremotes/tests/test_hierarchies.py index fba00bc..bf6ab6e 100644 --- a/datalad_compute/annexremotes/tests/test_hierarchies.py +++ b/datalad_compute/annexremotes/tests/test_hierarchies.py @@ -9,7 +9,7 @@ template_dir, url_scheme, ) -from ...test_utils.create_datasets import create_ds_hierarchy +from datalad_compute.dataprovider.tests.create_datasets import create_ds_hierarchy test_method = """ @@ -20,24 +20,24 @@ "content: {first} > 'a.txt';", "echo content: {second} > 'b.txt';", "echo content: {third} > 'new.txt';", - "echo content: {first} > 'subds0/a0.txt';", - "echo content: {second} > 'subds0/b0.txt';", - "echo content: {third} > 'subds0/new.txt';", - "echo content: {first} > 'subds0/subds1/a1.txt';", - "echo content: {second} > 'subds0/subds1/b1.txt';", - "echo content: {third} > 'subds0/subds1/new.txt';", - "echo content: {first} > 'subds0/subds1/subds2/a2.txt';", - "echo content: {second} > 'subds0/subds1/subds2/b2.txt';", - "echo content: {third} > 'subds0/subds1/subds2/new.txt';", + "echo content: {first} > 'd2_subds0/a0.txt';", + "echo content: {second} > 'd2_subds0/b0.txt';", + "echo content: {third} > 'd2_subds0/new.txt';", + "echo content: {first} > 'd2_subds0/d2_subds1/a1.txt';", + "echo content: {second} > 'd2_subds0/d2_subds1/b1.txt';", + "echo content: {third} > 'd2_subds0/d2_subds1/new.txt';", + "echo content: {first} > 'd2_subds0/d2_subds1/d2_subds2/a2.txt';", + "echo content: {second} > 'd2_subds0/d2_subds1/d2_subds2/b2.txt';", + "echo content: {third} > 'd2_subds0/d2_subds1/d2_subds2/new.txt';", ] """ output = [ 'a.txt', 'b.txt', 'new.txt', - 'subds0/a0.txt', 'subds0/b0.txt', 'subds0/new.txt', - 'subds0/subds1/a1.txt', 'subds0/subds1/b1.txt', 'subds0/subds1/new.txt', - 'subds0/subds1/subds2/a2.txt', 'subds0/subds1/subds2/b2.txt', 'subds0/subds1/subds2/new.txt', + 'd2_subds0/a0.txt', 'd2_subds0/b0.txt', 'd2_subds0/new.txt', + 'd2_subds0/d2_subds1/a1.txt', 'd2_subds0/d2_subds1/b1.txt', 'd2_subds0/d2_subds1/new.txt', + 'd2_subds0/d2_subds1/d2_subds2/a2.txt', 'd2_subds0/d2_subds1/d2_subds2/b2.txt', 'd2_subds0/d2_subds1/d2_subds2/new.txt', ] test_file_content = [ @@ -63,8 +63,8 @@ def _check_content(dataset, def test_end_to_end(tmp_path, datalad_cfg, monkeypatch): - datasets = create_ds_hierarchy(str(tmp_path), 3) - root_dataset = datasets[0][0] + datasets = create_ds_hierarchy(tmp_path, 'd2', 3) + root_dataset = datasets[0][2] # add method template template_path = root_dataset.pathobj / template_dir @@ -77,13 +77,13 @@ def test_end_to_end(tmp_path, datalad_cfg, monkeypatch): datalad_cfg.set('annex.security.allowed-ip-addresses', 'all', scope='global') datalad_cfg.set('annex.security.allow-unverified-downloads', 'ACKTHPPT', scope='global') - # add a compute remotes to all datasets - for _, dataset_path, _ in datasets: - call_git_success([ - '-C', str(dataset_path), - 'annex', 'initremote', 'compute', - 'type=external', 'externaltype=compute', - 'encryption=none']) + ## add a compute remotes to all datasets + #for _, dataset_path, _ in datasets: + # call_git_success([ + # '-C', str(dataset_path), + # 'annex', 'initremote', 'compute', + # 'type=external', 'externaltype=compute', + # 'encryption=none']) # run compute command root_dataset.compute( @@ -101,9 +101,9 
@@ def test_end_to_end(tmp_path, datalad_cfg, monkeypatch): # Drop all computed content _drop_files(root_dataset, output) - # Go to the subdataset `subds0/subds1` and fetch the content of `a1.txt` + # Go to the subdataset `d2_subds0/d2_subds1` and fetch the content of `a1.txt` # from a compute remote. - monkeypatch.chdir(root_dataset.pathobj / 'subds0' / 'subds1') + monkeypatch.chdir(root_dataset.pathobj / 'd2_subds0' / 'd2_subds1') datalad_get('a1.txt') # check that all files are computed @@ -113,6 +113,6 @@ def test_end_to_end(tmp_path, datalad_cfg, monkeypatch): # check get in subdatasets monkeypatch.chdir(root_dataset.pathobj) - datalad_get('subds0/subds1/a1.txt') + datalad_get('d2_subds0/d2_subds1/a1.txt') _check_content(root_dataset, test_file_content) diff --git a/datalad_compute/dataprovider/tests/create_datasets.py b/datalad_compute/dataprovider/tests/create_datasets.py index afe5135..f0c6ef2 100644 --- a/datalad_compute/dataprovider/tests/create_datasets.py +++ b/datalad_compute/dataprovider/tests/create_datasets.py @@ -3,82 +3,74 @@ from pathlib import Path from datalad_next.datasets import Dataset +from datalad_next.runners import call_git_success +from datalad_compute import url_scheme -def create_subdatasets(tmp_path: Path, - parent_dataset: Dataset, - subdataset_levels: int = 2, - level_id: int = 0, - top_level_path: Path | None = None, - relative_subdataset_path: Path = None, - ) -> list[tuple[Dataset, Path]]: - """Create a hierarchy of subdatasets in the dataset `parent_dataset`. - Individual datasets are created in the temporary directory `tmp_path` and - installed in the parent_dataset. +def update_config_for_compute(dataset: Dataset): + # set annex security related variables to allow compute-URLs + dataset.configuration( + action='set', + scope='local', + recursive=True, + spec=[ + ('annex.security.allowed-url-schemes', url_scheme), + ('annex.security.allowed-ip-addresses', 'all'), + ('annex.security.allow-unverified-downloads', 'ACKTHPPT')]) - The subdatasets are created in the directories `subds{level_id}`, where - `level_id` is an integer counter starting at `0`. Each subdataset has two - annexed files `a{level_id}.txt` and `b{level_id}.txt`. - `subdataset_levels` determines the depth of the hierarchy. 
If, for example, - `subdataset_levels` is 3, the following subdatasets are created: - - - parent_dataset/subds0 - - parent_dataset/subds0/subds1 - - parent_dataset/subds0/subds1/subds2 - """ - if subdataset_levels == 0: - return [] - - if relative_subdataset_path is None: - relative_subdataset_path = Path(f'subds{level_id}') - else: - relative_subdataset_path /= Path(f'subds{level_id}') - - if top_level_path is None: - top_level_path = parent_dataset.pathobj - - # Create a dataset in the tempaorary directory - subdataset = Dataset(tmp_path / f'subds{level_id}') - subdataset.create(result_renderer='disabled') - (subdataset.pathobj / f'a{level_id}.txt').write_text(f'a{level_id}\n') - (subdataset.pathobj / f'b{level_id}.txt').write_text(f'b{level_id}\n') - - child_datasets = create_subdatasets( - tmp_path, - subdataset, - subdataset_levels - 1, - level_id + 1, - top_level_path, - relative_subdataset_path) - - subdataset.save(result_renderer='disabled') - - # Install the dataset in the parent dataset - parent_dataset.install( - path=f'subds{level_id}', - source='file://' + subdataset.path, - - )#result_renderer='disabled') - - parent_dataset.save(result_renderer='disabled') - - - return [( - subdataset, - subdataset.pathobj, - relative_subdataset_path)] + child_datasets +def add_compute_remote(dataset: Dataset): + call_git_success([ + '-C', dataset.path, + 'annex', 'initremote', 'compute', + 'type=external', 'externaltype=compute', + 'encryption=none']) def create_ds_hierarchy(tmp_path: Path, - directory_name: str, + name: str, subdataset_levels: int = 2 - ) -> list[tuple[Dataset, Path]]: - dataset = Dataset(directory_name) - dataset.create(force=True, result_renderer='disabled') - subdatasets = create_subdatasets(tmp_path, dataset, subdataset_levels) - (dataset.pathobj / 'a.txt').write_text('a\n') - (dataset.pathobj / 'b.txt').write_text('b\n') - dataset.save(result_renderer='disabled') - return [(dataset, dataset.pathobj, Path('.'))] + subdatasets + ) -> list[tuple[str, Path, Dataset]]: + + # Create root dataset + root_dataset = Dataset(tmp_path / name) + root_dataset.create(force=True, result_renderer='disabled') + (root_dataset.pathobj / 'a.txt').write_text('a\n') + (root_dataset.pathobj / 'b.txt').write_text('b\n') + root_dataset.save() + datasets = [(name, tmp_path / name, root_dataset)] + + # Create subdatasets + for level in range(subdataset_levels): + subdataset_path = tmp_path / f'{name}_subds{level}' + subdataset = Dataset(subdataset_path) + subdataset.create(force=True, result_renderer='disabled') + (subdataset.pathobj / f'a{level}.txt').write_text(f'a{level}\n') + (subdataset.pathobj / f'b{level}.txt').write_text(f'b{level}\n') + subdataset.save() + datasets.append((f'{name}_subds{level}', subdataset_path, subdataset)) + + # Link the datasets + for index in range(len(datasets) - 2, -1, -1): + dataset, subdataset = datasets[index:index+2] + print(index) + print(dataset) + print(subdataset) + dataset[2].install( + path=subdataset[0], + source='file://' + subdataset[2].path, + ) + dataset[2].save() + + root_dataset.get(recursive=True) + update_config_for_compute(root_dataset) + + # Add compute remotes to the root dataset and all subdatasets + add_compute_remote(root_dataset) + subdataset_path = Path() + for index in range(subdataset_levels): + subdataset_path /= f'{name}_subds{index}' + add_compute_remote(Dataset(root_dataset.pathobj / subdataset_path)) + + return datasets diff --git a/datalad_compute/dataprovider/tests/test_gitworktree.py 
b/datalad_compute/dataprovider/tests/test_gitworktree.py index a6c003b..bc8c4ca 100644 --- a/datalad_compute/dataprovider/tests/test_gitworktree.py +++ b/datalad_compute/dataprovider/tests/test_gitworktree.py @@ -13,14 +13,14 @@ def test_worktree_basic(tmp_path): - dataset = create_ds_hierarchy(str(tmp_path / 'root_dataset'), 3)[0][0] + dataset = create_ds_hierarchy(tmp_path, 'ds1', 3)[0][2] worktree = Dataset(provide( dataset.path, str(tmp_path), input_files=[ 'a.txt', 'b.txt', - 'subds0/a0.txt', 'subds0/b0.txt', - 'subds0/subds1/a1.txt', 'subds0/subds1/b1.txt' + 'ds1_subds0/a0.txt', 'ds1_subds0/b0.txt', + 'ds1_subds0/ds1_subds1/a1.txt', 'ds1_subds0/ds1_subds1/b1.txt' ], )) From 8576907bb4c86266e2e758a58dd2d01a1ef6465f Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 27 Sep 2024 13:14:28 +0200 Subject: [PATCH 063/148] capture output of git-calls --- .../annexremotes/tests/test_hierarchies.py | 14 +++----------- datalad_compute/commands/compute_cmd.py | 10 ++++++---- datalad_compute/dataprovider/gitworktree.py | 11 +++++++---- .../dataprovider/tests/create_datasets.py | 18 +++++++++--------- 4 files changed, 25 insertions(+), 28 deletions(-) diff --git a/datalad_compute/annexremotes/tests/test_hierarchies.py b/datalad_compute/annexremotes/tests/test_hierarchies.py index bf6ab6e..7b184a1 100644 --- a/datalad_compute/annexremotes/tests/test_hierarchies.py +++ b/datalad_compute/annexremotes/tests/test_hierarchies.py @@ -2,7 +2,6 @@ from datalad.api import get as datalad_get from datalad_next.datasets import Dataset -from datalad_next.runners import call_git_success from datalad_next.tests.fixtures import datalad_cfg from ... import ( @@ -50,7 +49,7 @@ def _drop_files(dataset: Dataset, files: Iterable[str]): for file in files: - dataset.drop(file) + dataset.drop(file, result_renderer='disabled') assert not (dataset.pathobj / file).exists() @@ -77,14 +76,6 @@ def test_end_to_end(tmp_path, datalad_cfg, monkeypatch): datalad_cfg.set('annex.security.allowed-ip-addresses', 'all', scope='global') datalad_cfg.set('annex.security.allow-unverified-downloads', 'ACKTHPPT', scope='global') - ## add a compute remotes to all datasets - #for _, dataset_path, _ in datasets: - # call_git_success([ - # '-C', str(dataset_path), - # 'annex', 'initremote', 'compute', - # 'type=external', 'externaltype=compute', - # 'encryption=none']) - # run compute command root_dataset.compute( template='test_method', @@ -93,7 +84,8 @@ def test_end_to_end(tmp_path, datalad_cfg, monkeypatch): 'second=second', 'third=third', ], - output=output) + output=output, + result_renderer='disabled') # check computation success _check_content(root_dataset, test_file_content) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index de27db3..1da6d3c 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -204,7 +204,9 @@ def add_url(dataset: Dataset, file_dataset_path, file_path = get_file_dataset(dataset.pathobj / file_path) success = call_git_success( ['-C', str(file_dataset_path), 'annex', 'addurl', url, '--file', file_path] - + (['--relaxed'] if url_only else [])) + + (['--relaxed'] if url_only else []), + capture_output=True,) + assert success, f'\naddurl failed:\nfile_dataset_path: {file_dataset_path}\nurl: {url!r}\nfile_path: {file_path!r}' return url @@ -278,7 +280,7 @@ def collect(worktree: Path, shutil.copyfile(worktree / o, destination) # Save the dataset - dataset.save(recursive=True) + dataset.save(recursive=True, 
result_renderer='disabled') def unlock_files(dataset: Dataset, @@ -298,7 +300,7 @@ def unlock_files(dataset: Dataset, file.unlink() file.write_text('/annex/objects/' + link.split('/')[-1] + '\n') elif file.is_symlink(): - dataset.unlock(file) + dataset.unlock(file, result_renderer='disabled') def create_output_space(dataset: Dataset, @@ -307,7 +309,7 @@ def create_output_space(dataset: Dataset, """Get all files that are part of the output space.""" for f in files: try: - dataset.get(f) + dataset.get(f, result_renderer='disabled') except IncompleteResultsError: # The file does not yet exist. The computation should create it. # We create the directory here. diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index 7cfaf02..c844aa0 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -84,7 +84,10 @@ def remove_subdatasets(worktree: str): def prune_worktrees(dataset: Dataset) -> None: with chdir(dataset.path): - call_git_success(['worktree', 'prune']) + call_git_success( + ['worktree', 'prune'], + capture_output=True) + for result in dataset.subdatasets(result_renderer='disabled'): prune_worktrees(Dataset(result['path'])) @@ -99,8 +102,8 @@ def ensure_absolute_gitmodule_urls(original_dataset: Dataset, if parse_result.scheme == '': if not Path(location_spec).is_absolute(): args = ['submodule', 'set-url', name, original_dataset.path] - call_git_success(args, cwd=dataset.path) - dataset.save() + call_git_success(args, cwd=dataset.path, capture_output=True) + dataset.save(result_renderer='disabled') def provide(dataset_dir: str, @@ -122,7 +125,7 @@ def provide(dataset_dir: str, if source_branch else [] ) - call_git_success(args) + call_git_success(args, capture_output=True) worktree_dataset = Dataset(worktree_dir) # Ensure that all subdatasets have absolute URLs diff --git a/datalad_compute/dataprovider/tests/create_datasets.py b/datalad_compute/dataprovider/tests/create_datasets.py index f0c6ef2..b3c6bce 100644 --- a/datalad_compute/dataprovider/tests/create_datasets.py +++ b/datalad_compute/dataprovider/tests/create_datasets.py @@ -17,7 +17,8 @@ def update_config_for_compute(dataset: Dataset): spec=[ ('annex.security.allowed-url-schemes', url_scheme), ('annex.security.allowed-ip-addresses', 'all'), - ('annex.security.allow-unverified-downloads', 'ACKTHPPT')]) + ('annex.security.allow-unverified-downloads', 'ACKTHPPT')], + result_renderer='disabled') def add_compute_remote(dataset: Dataset): @@ -25,7 +26,8 @@ def add_compute_remote(dataset: Dataset): '-C', dataset.path, 'annex', 'initremote', 'compute', 'type=external', 'externaltype=compute', - 'encryption=none']) + 'encryption=none'], + capture_output=True) def create_ds_hierarchy(tmp_path: Path, @@ -38,7 +40,7 @@ def create_ds_hierarchy(tmp_path: Path, root_dataset.create(force=True, result_renderer='disabled') (root_dataset.pathobj / 'a.txt').write_text('a\n') (root_dataset.pathobj / 'b.txt').write_text('b\n') - root_dataset.save() + root_dataset.save(result_renderer='disabled') datasets = [(name, tmp_path / name, root_dataset)] # Create subdatasets @@ -48,22 +50,20 @@ def create_ds_hierarchy(tmp_path: Path, subdataset.create(force=True, result_renderer='disabled') (subdataset.pathobj / f'a{level}.txt').write_text(f'a{level}\n') (subdataset.pathobj / f'b{level}.txt').write_text(f'b{level}\n') - subdataset.save() + subdataset.save(result_renderer='disabled') datasets.append((f'{name}_subds{level}', subdataset_path, subdataset)) # Link 
the datasets for index in range(len(datasets) - 2, -1, -1): dataset, subdataset = datasets[index:index+2] - print(index) - print(dataset) - print(subdataset) dataset[2].install( path=subdataset[0], source='file://' + subdataset[2].path, + result_renderer='disabled', ) - dataset[2].save() + dataset[2].save(result_renderer='disabled') - root_dataset.get(recursive=True) + root_dataset.get(recursive=True, result_renderer='disabled') update_config_for_compute(root_dataset) # Add compute remotes to the root dataset and all subdatasets From 2a68d447999459968894c4b450edb55d99a85bb4 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 27 Sep 2024 15:19:15 +0200 Subject: [PATCH 064/148] start globing support for input/output --- datalad_compute/commands/compute_cmd.py | 82 +++++++++++++-------- datalad_compute/dataprovider/gitworktree.py | 17 ++++- 2 files changed, 66 insertions(+), 33 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 1da6d3c..9c3a8f0 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -3,6 +3,7 @@ from __future__ import annotations import contextlib +import glob import json import logging import os @@ -10,6 +11,7 @@ import subprocess from itertools import chain from pathlib import Path +from typing import Iterable from urllib.parse import quote from datalad.support.exceptions import IncompleteResultsError @@ -32,6 +34,7 @@ call_git_oneline, call_git_success, ) +from hypothesis.strategies import recursive from .. import ( template_dir, @@ -88,24 +91,28 @@ class Compute(ValidatedInterface): input=Parameter( args=('-i', '--input',), action='append', - doc="Name of an input file (repeat for multiple inputs)"), + doc="Name of an input file pattern (repeat for multiple inputs), " + "file pattern support python globbing"), input_list=Parameter( args=('-I', '--input-list',), - doc="Name of a file that contains a list of input files. Format is " - "one file per line, relative path from `dataset`. Empty lines, " - "i.e. lines that contain only newlines, arg ignored. This is " - "useful if a large number of input files should be provided."), + doc="Name of a file that contains a list of input file patterns. " + "Format is one file per line, relative path from `dataset`. " + "Empty lines, i.e. lines that contain only newlines, and lines " + "that start with '#' are ignored. Line content is stripped " + "before used. This is useful if a large number of input file " + "patterns should be provided."), output=Parameter( args=('-o', '--output',), action='append', doc="Name of an output file (repeat for multiple outputs)"), output_list=Parameter( args=('-O', '--output-list',), - doc="Name of a file that contains a list of output files. Format " - "is one file per line, relative path from `dataset`. Empty " - "lines, i.e. lines that contain only newlines, arg ignored. " - "This is useful if a large number of output files should be " - "provided."), + doc="Name of a file that contains a list of output file patterns. " + "Format is one file per line, relative path from `dataset`. " + "Empty lines, i.e. lines that contain only newlines, and lines " + "that start with '#' are ignored. Line content is stripped " + "before used. 
This is useful if a large number of output file " + "patterns should be provided."), parameter=Parameter( args=('-p', '--parameter',), action='append', @@ -114,9 +121,11 @@ class Compute(ValidatedInterface): parameter_list=Parameter( args=('-P', '--parameter-list',), doc="Name of a file that contains a list of parameters. Format " - "is one `=` string per line. Empty lines, " - "i.e. lines that contain only newlines, arg ignored. This is " - "useful if a large number of parameters should be provided."), + "is one `=` string per line. " + "Empty lines, i.e. lines that contain only newlines, and lines " + "that start with '#' are ignored. Line content is stripped " + "before used. This is useful if a large number of parameters " + "should be provided."), ) @@ -137,19 +146,25 @@ def __call__(dataset=None, dataset : Dataset = dataset.ds if dataset else Dataset('.') - input = (input or []) + read_list(input_list) - output = (output or []) + read_list(output_list) + input_pattern = (input or []) + read_list(input_list) + output_pattern = (output or []) + read_list(output_list) parameter = (parameter or []) + read_list(parameter_list) if not url_only: - worktree = provide(dataset, branch, input) - execute(worktree, template, parameter, output) - collect(worktree, dataset, output) + worktree = provide(dataset, branch, input_pattern) + execute(worktree, template, parameter, output_pattern) + output_files = collect(worktree, dataset, output_pattern) un_provide(dataset, worktree) - url_base = get_url(dataset, branch, template, parameter, input, output) + url_base = get_url( + dataset, + branch, + template, + parameter, + input_pattern, + output_pattern) - for out in output: + for out in (output_pattern if url_only else output_files): url = add_url(dataset, out, url_base, url_only) yield get_status_dict( action='compute', @@ -226,15 +241,15 @@ def get_file_dataset(file: Path) -> tuple[Path, Path]: def provide(dataset: Dataset, branch: str | None, - input: list[str], + input_patterns: list[str], ) -> Path: - lgr.debug('provide: %s %s %s', dataset, branch, input) + lgr.debug('provide: %s %s %s', dataset, branch, input_patterns) args = ['provide-gitworktree', dataset.path, ] + ( ['--branch', branch] if branch else [] ) - args.extend(chain(*[('--input', i) for i in (input or [])])) + args.extend(chain(*[('--input', i) for i in (input_patterns or [])])) stdout = subprocess.run(args, stdout=subprocess.PIPE, check=True).stdout return Path(stdout.splitlines()[-1].decode()) @@ -267,24 +282,33 @@ def execute(worktree: Path, def collect(worktree: Path, dataset: Dataset, - output: list[str], - ) -> None: + output_patterns: list[str], + ) -> Iterable[str]: + + lgr.debug( + 'collect: %s %s %s', + str(worktree), dataset, repr(output_patterns)) - lgr.debug('collect: %s %s %s', str(worktree), dataset, repr(output)) + # Get the list of created output files based on the output patterns + output_files = set( + chain.from_iterable( + glob.glob(pattern, root_dir=worktree, recursive=True) + for pattern in output_patterns)) # Unlock output files in the dataset-directory and copy the result - unlock_files(dataset, output) - for o in output: + unlock_files(dataset, output_files) + for o in output_files: destination = dataset.pathobj / o destination.parent.mkdir(parents=True, exist_ok=True) shutil.copyfile(worktree / o, destination) # Save the dataset dataset.save(recursive=True, result_renderer='disabled') + return output_files def unlock_files(dataset: Dataset, - files: list[str] + files: Iterable[str] ) -> None: """Use 
datalad to resolve subdatasets and unlock files in the dataset.""" # TODO: for some reason `dataset unlock` does not operate in the diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index c844aa0..54cccf9 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -5,13 +5,16 @@ """ from __future__ import annotations +import glob import logging import os import random import shutil from argparse import ArgumentParser from contextlib import chdir +from itertools import chain from pathlib import Path +from typing import Iterable from urllib.parse import urlparse from datalad_next.datasets import Dataset @@ -43,15 +46,15 @@ '-i', '--input', action='append', metavar='PATH', - help='Path of a file that should be provisioned (relative from dataset ' - 'root), at least one input has tp be provided (use multiple times to ' + help='File pattern that should be provisioned (relative from dataset ' + 'root), at least one input has to be provided (use multiple times to ' 'define multiple inputs)', ) argument_parser.add_argument( '-I', '--input-list', metavar='PATH', default=None, - help='Path of a file that contains a list of input paths', + help='Path of a file that contains a list of input file patterns', ) argument_parser.add_argument( '-t', '--temp-dir', @@ -109,7 +112,7 @@ def ensure_absolute_gitmodule_urls(original_dataset: Dataset, def provide(dataset_dir: str, temp_dir: str, source_branch: str | None = None, - input_files: list[str] | None = None, + input_patterns: Iterable[str] | None = None, ) -> Path: lgr.debug('Provisioning dataset %s', dataset_dir) @@ -118,6 +121,12 @@ def provide(dataset_dir: str, if not worktree_dir.exists(): worktree_dir.mkdir(parents=True, exist_ok=True) + # Resolve input file patterns in the original dataset + input_files = set( + chain.from_iterable( + glob.glob(pattern, root_dir=dataset_dir, recursive=True) + for pattern in input_patterns)) + # Create a worktree with chdir(dataset_dir): args = ['worktree', 'add', '-b', worktree_name] + [str(worktree_dir)] + ( From 2fce20f9e3056610952322ee295b10d096a81249 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sun, 29 Sep 2024 09:07:25 +0200 Subject: [PATCH 065/148] remove submodule url modification This commit removes code that converted relative file paths of submodules into file-URLs. This code is no longer necessary, it was related to an old version of dataset hierarchy creation for tests, which has since been changed. 
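With the globbing support added above, inputs are no longer taken as literal paths: each pattern is expanded against the root of the source dataset, and only the matching files are then fetched into the provisioned worktree. The sketch below illustrates that two-step flow; `fetch_matching_inputs` is an illustrative name and not a function from the patch (note that `glob(..., root_dir=...)` requires Python 3.10 or later).

```python
from __future__ import annotations

from glob import glob
from itertools import chain
from pathlib import Path
from typing import Iterable

from datalad_next.datasets import Dataset


def fetch_matching_inputs(source_root: Path,
                          worktree: Path,
                          patterns: Iterable[str]) -> set[str]:
    # Step 1: expand each pattern relative to the source dataset root;
    # duplicates across patterns collapse in the set.
    matches = set(chain.from_iterable(
        glob(pattern, root_dir=str(source_root)) for pattern in patterns))
    # Step 2: retrieve only the matched files in the provisioned worktree.
    worktree_ds = Dataset(worktree)
    for relative_path in matches:
        worktree_ds.get(relative_path, result_renderer='disabled')
    return matches
```

For example, the patterns `['*.txt', 'ds1_subds0/a*.txt']` would fetch every top-level text file plus the matching files of the first-level subdataset, while leaving all other annexed content unfetched.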
--- datalad_compute/dataprovider/gitworktree.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index 54cccf9..160ec6e 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -15,7 +15,6 @@ from itertools import chain from pathlib import Path from typing import Iterable -from urllib.parse import urlparse from datalad_next.datasets import Dataset from datalad_next.runners import call_git_success @@ -95,20 +94,6 @@ def prune_worktrees(dataset: Dataset) -> None: prune_worktrees(Dataset(result['path'])) -def ensure_absolute_gitmodule_urls(original_dataset: Dataset, - dataset: Dataset - ) -> None: - sub_datasets = dataset.subdatasets(result_renderer='disabled') - for subdataset in sub_datasets: - name, location_spec = subdataset['gitmodule_name'], subdataset['gitmodule_url'] - parse_result = urlparse(location_spec) - if parse_result.scheme == '': - if not Path(location_spec).is_absolute(): - args = ['submodule', 'set-url', name, original_dataset.path] - call_git_success(args, cwd=dataset.path, capture_output=True) - dataset.save(result_renderer='disabled') - - def provide(dataset_dir: str, temp_dir: str, source_branch: str | None = None, @@ -137,8 +122,6 @@ def provide(dataset_dir: str, call_git_success(args, capture_output=True) worktree_dataset = Dataset(worktree_dir) - # Ensure that all subdatasets have absolute URLs - ensure_absolute_gitmodule_urls(Dataset(dataset_dir), worktree_dataset) # Get all input files in the worktree with chdir(worktree_dataset.path): for file in input_files or []: From ed55f62ecadd5c32172b97a8dfe7efef2b724c1e Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sun, 29 Sep 2024 09:08:06 +0200 Subject: [PATCH 066/148] add globbing for input files in provide This commit adds globbing support for input files in worktree provisioning. Only content of matching files is retrieved. Note that links to non-present annexed files and non-annexed files may still be available in the worktree. --- datalad_compute/dataprovider/gitworktree.py | 7 +- .../dataprovider/tests/test_gitworktree.py | 91 ++++++++++++++++++- 2 files changed, 94 insertions(+), 4 deletions(-) diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index 160ec6e..1ec3ca6 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -47,7 +47,8 @@ metavar='PATH', help='File pattern that should be provisioned (relative from dataset ' 'root), at least one input has to be provided (use multiple times to ' - 'define multiple inputs)', + 'define multiple inputs). Patterns are resolved by Python\'s globbing ' + 'rules. 
They are resolved in the source dataset.', ) argument_parser.add_argument( '-I', '--input-list', @@ -109,7 +110,7 @@ def provide(dataset_dir: str, # Resolve input file patterns in the original dataset input_files = set( chain.from_iterable( - glob.glob(pattern, root_dir=dataset_dir, recursive=True) + glob.glob(pattern, root_dir=dataset_dir) for pattern in input_patterns)) # Create a worktree @@ -124,7 +125,7 @@ def provide(dataset_dir: str, worktree_dataset = Dataset(worktree_dir) # Get all input files in the worktree with chdir(worktree_dataset.path): - for file in input_files or []: + for file in input_files: lgr.debug('Provisioning file %s', file) worktree_dataset.get(file, result_renderer='disabled') diff --git a/datalad_compute/dataprovider/tests/test_gitworktree.py b/datalad_compute/dataprovider/tests/test_gitworktree.py index bc8c4ca..a0a2bc4 100644 --- a/datalad_compute/dataprovider/tests/test_gitworktree.py +++ b/datalad_compute/dataprovider/tests/test_gitworktree.py @@ -1,10 +1,13 @@ from __future__ import annotations from contextlib import chdir +from pathlib import Path +from typing import Iterable from datalad_next.datasets import Dataset from datalad_next.runners import call_git_lines + from ..gitworktree import ( provide, remove, @@ -12,12 +15,39 @@ from .create_datasets import create_ds_hierarchy +file_path_templates = [ + '{file}.txt', + '{{ds_name}}_subds0/{file}0.txt', + '{{ds_name}}_subds0/{{ds_name}}_subds1/{file}1.txt', + '{{ds_name}}_subds0/{{ds_name}}_subds1/{{ds_name}}_subds2/{file}2.txt', +] + + +all_paths = [ + template.format(file=f) + for template in file_path_templates + for f in ['a', 'b'] +] + +a_paths = [ + path.format(file='a') + for path in file_path_templates +] + +b_paths = [ + path.format(file='b') + for path in file_path_templates +] + +all_paths = a_paths + b_paths + + def test_worktree_basic(tmp_path): dataset = create_ds_hierarchy(tmp_path, 'ds1', 3)[0][2] worktree = Dataset(provide( dataset.path, str(tmp_path), - input_files=[ + input_patterns=[ 'a.txt', 'b.txt', 'ds1_subds0/a0.txt', 'ds1_subds0/b0.txt', 'ds1_subds0/ds1_subds1/a1.txt', 'ds1_subds0/ds1_subds1/b1.txt' @@ -44,3 +74,62 @@ def check_deleted_worktrees(ds: Dataset): reckless='kill', recursive=True, result_renderer='disabled') + + +def test_worktree_globbing(tmp_path): + dataset = create_ds_hierarchy(tmp_path, 'ds1', 3)[0][2] + worktree = Dataset(provide( + dataset.path, + str(tmp_path), + input_patterns=[ + '*.txt', + '*_subds0/*.txt', + '*_subds0/*_subds1/*.txt', + '*_subds0/*_subds1/*_subds2/*.txt', + ], + )) + + worktree_set = set(get_file_list(worktree.pathobj)) + assert worktree_set == set( + path.format(ds_name='ds1') + for path in all_paths + ) + remove(dataset.path, worktree.path) + + worktree = Dataset(provide( + dataset.path, + str(tmp_path), + input_patterns=[ + 'b*txt', + '*_subds0/b*txt', + '*_subds0/*_subds1/b*txt', + '*_subds0/*_subds1/*_subds2/b*txt', + ], + )) + + worktree_set = set(get_file_list(worktree.pathobj)) + assert set( + path.format(ds_name='ds1') + for path in b_paths + ).issubset(worktree_set) + remove(dataset.path, worktree.path) + + dataset.drop( + what='all', + reckless='kill', + recursive=True, + result_renderer='disabled') + + +def get_file_list(root: Path, + path: Path|None = None, + prefix: Path|None = None + ) -> Iterable[str]: + prefix = prefix or Path('') + path = path or root + for child in path.iterdir(): + if not child.name.startswith('.'): + if child.is_dir(): + yield from get_file_list(root, child, prefix=prefix / child) + else: + yield 
str((prefix / child).relative_to(root)) From 134eaa42ba9ace583b78c402ef3248a11bb5ec95 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sun, 29 Sep 2024 14:59:38 +0200 Subject: [PATCH 067/148] factor out resolve_patterns --- datalad_compute/dataprovider/gitworktree.py | 8 ++------ datalad_compute/utils/glob.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 6 deletions(-) create mode 100644 datalad_compute/utils/glob.py diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index 1ec3ca6..cdcc75b 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -5,20 +5,19 @@ """ from __future__ import annotations -import glob import logging import os import random import shutil from argparse import ArgumentParser from contextlib import chdir -from itertools import chain from pathlib import Path from typing import Iterable from datalad_next.datasets import Dataset from datalad_next.runners import call_git_success +from datalad_compute.utils.glob import resolve_patterns from ..commands.compute_cmd import read_list @@ -108,10 +107,7 @@ def provide(dataset_dir: str, worktree_dir.mkdir(parents=True, exist_ok=True) # Resolve input file patterns in the original dataset - input_files = set( - chain.from_iterable( - glob.glob(pattern, root_dir=dataset_dir) - for pattern in input_patterns)) + input_files = resolve_patterns(dataset_dir, input_patterns) # Create a worktree with chdir(dataset_dir): diff --git a/datalad_compute/utils/glob.py b/datalad_compute/utils/glob.py new file mode 100644 index 0000000..f11a68b --- /dev/null +++ b/datalad_compute/utils/glob.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from glob import glob +from itertools import chain +from pathlib import Path +from typing import Iterable + + +# Resolve input file patterns in the original dataset +def resolve_patterns(root_dir: str | Path, + patterns: Iterable[str] + ) -> set[str]: + return set( + chain.from_iterable( + glob(pattern, root_dir=str(root_dir)) + for pattern in patterns)) From d64f7c2f556c7df937ccc92d194d2e460f968c56 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sun, 29 Sep 2024 17:01:51 +0200 Subject: [PATCH 068/148] fix directory creation in test_method --- datalad_compute/annexremotes/tests/test_hierarchies.py | 1 + 1 file changed, 1 insertion(+) diff --git a/datalad_compute/annexremotes/tests/test_hierarchies.py b/datalad_compute/annexremotes/tests/test_hierarchies.py index 7b184a1..a16e21d 100644 --- a/datalad_compute/annexremotes/tests/test_hierarchies.py +++ b/datalad_compute/annexremotes/tests/test_hierarchies.py @@ -17,6 +17,7 @@ executable = 'echo' arguments = [ "content: {first} > 'a.txt';", + "mkdir -p 'd2_subds0/d2_subds1/d2_subds2';", "echo content: {second} > 'b.txt';", "echo content: {third} > 'new.txt';", "echo content: {first} > 'd2_subds0/a0.txt';", From 9c1564f355ab408d8f31f52c61644771bdbff7ab Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sun, 29 Sep 2024 17:06:35 +0200 Subject: [PATCH 069/148] remove output-globbing from compute_cmd --- datalad_compute/commands/compute_cmd.py | 63 ++++++++++--------------- 1 file changed, 25 insertions(+), 38 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 9c3a8f0..e820f06 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -3,7 +3,6 @@ from __future__ import annotations import contextlib -import glob import json import 
logging import os @@ -34,7 +33,6 @@ call_git_oneline, call_git_success, ) -from hypothesis.strategies import recursive from .. import ( template_dir, @@ -45,7 +43,6 @@ __docformat__ = 'restructuredtext' - lgr = logging.getLogger('datalad.compute.compute_cmd') @@ -91,8 +88,9 @@ class Compute(ValidatedInterface): input=Parameter( args=('-i', '--input',), action='append', - doc="Name of an input file pattern (repeat for multiple inputs), " - "file pattern support python globbing"), + doc="An input file pattern (repeat for multiple inputs, " + "file pattern support python globbing, globbing is expanded " + "in the source dataset"), input_list=Parameter( args=('-I', '--input-list',), doc="Name of a file that contains a list of input file patterns. " @@ -107,12 +105,11 @@ class Compute(ValidatedInterface): doc="Name of an output file (repeat for multiple outputs)"), output_list=Parameter( args=('-O', '--output-list',), - doc="Name of a file that contains a list of output file patterns. " - "Format is one file per line, relative path from `dataset`. " - "Empty lines, i.e. lines that contain only newlines, and lines " - "that start with '#' are ignored. Line content is stripped " - "before used. This is useful if a large number of output file " - "patterns should be provided."), + doc="Name of a file that contains a list of output files. Format " + "is one file per line, relative path from `dataset`. Empty " + "lines, i.e. lines that contain only newlines, arg ignored. " + "This is useful if a large number of output files should be " + "provided."), parameter=Parameter( args=('-p', '--parameter',), action='append', @@ -147,13 +144,13 @@ def __call__(dataset=None, dataset : Dataset = dataset.ds if dataset else Dataset('.') input_pattern = (input or []) + read_list(input_list) - output_pattern = (output or []) + read_list(output_list) + output = (output or []) + read_list(output_list) parameter = (parameter or []) + read_list(parameter_list) if not url_only: worktree = provide(dataset, branch, input_pattern) - execute(worktree, template, parameter, output_pattern) - output_files = collect(worktree, dataset, output_pattern) + execute(worktree, template, parameter, output) + collect(worktree, dataset, output) un_provide(dataset, worktree) url_base = get_url( @@ -162,9 +159,9 @@ def __call__(dataset=None, template, parameter, input_pattern, - output_pattern) + output) - for out in (output_pattern if url_only else output_files): + for out in output: url = add_url(dataset, out, url_base, url_only) yield get_status_dict( action='compute', @@ -264,12 +261,13 @@ def execute(worktree: Path, 'execute: %s %s %s %s', str(worktree), template_name, repr(parameter), repr(output)) + worktree_ds = Dataset(worktree) # Get the subdatasets, directories, and files that are part of the output # space. 
- create_output_space(Dataset(worktree), output) + create_output_space(worktree_ds, output) - # Unlock output files in the worktree-directory - unlock_files(Dataset(worktree), output) + # Unlock output files in the output space (worktree-directory) + unlock_files(worktree_ds, output) # Run the computation in the worktree-directory template_path = worktree / template_dir / template_name @@ -282,29 +280,19 @@ def execute(worktree: Path, def collect(worktree: Path, dataset: Dataset, - output_patterns: list[str], - ) -> Iterable[str]: - - lgr.debug( - 'collect: %s %s %s', - str(worktree), dataset, repr(output_patterns)) - - # Get the list of created output files based on the output patterns - output_files = set( - chain.from_iterable( - glob.glob(pattern, root_dir=worktree, recursive=True) - for pattern in output_patterns)) + output: Iterable[str], + ) -> None: # Unlock output files in the dataset-directory and copy the result - unlock_files(dataset, output_files) - for o in output_files: + unlock_files(dataset, output) + for o in output: + lgr.debug('collect: collecting %s', o) destination = dataset.pathobj / o destination.parent.mkdir(parents=True, exist_ok=True) shutil.copyfile(worktree / o, destination) # Save the dataset dataset.save(recursive=True, result_renderer='disabled') - return output_files def unlock_files(dataset: Dataset, @@ -328,16 +316,15 @@ def unlock_files(dataset: Dataset, def create_output_space(dataset: Dataset, - files: list[str] + files: Iterable[str] ) -> None: """Get all files that are part of the output space.""" for f in files: try: dataset.get(f, result_renderer='disabled') except IncompleteResultsError: - # The file does not yet exist. The computation should create it. - # We create the directory here. - (dataset.pathobj / f).parent.mkdir(parents=True, exist_ok=True) + # Ignore non-existing files + pass def un_provide(dataset: Dataset, From 9d3151515ff030000e46204ef299abc71046e534 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sun, 29 Sep 2024 17:09:05 +0200 Subject: [PATCH 070/148] remove whitespace --- datalad_compute/commands/compute_cmd.py | 1 - 1 file changed, 1 deletion(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index e820f06..9353519 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -125,7 +125,6 @@ class Compute(ValidatedInterface): "should be provided."), ) - @staticmethod @datasetmethod(name='compute') @eval_results From bfddeed075c2ca7b308948cba45cb0490f077de1 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 30 Sep 2024 06:39:26 +0200 Subject: [PATCH 071/148] improve README.md This commit changes README.md to reflect that `git config annex.security.allow-unverified-downloads ACKTHPPT` is only necessary if speculative computetions are used. 
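In practice, a speculative computation is recorded from Python much like a regular one, only with `url_only=True` and with the unverified-downloads setting from the README in place, because the recorded URL cannot be verified against content that does not exist yet. The snippet below is a hedged illustration: the template name and the input/output paths are placeholders, and the template is assumed to already exist under `.datalad/compute/methods`.

```python
from datalad_next.datasets import Dataset

ds = Dataset('.')

# Prerequisite from the README: allow registering URLs whose content cannot
# be verified yet (the outputs do not exist at this point).
ds.configuration(
    action='set',
    scope='local',
    spec=[('annex.security.allow-unverified-downloads', 'ACKTHPPT')],
    result_renderer='disabled')

# Record the compute-URL only; nothing is executed now. Retrieving an output
# later with `datalad get` triggers the actual computation.
ds.compute(
    template='my_template',             # placeholder template name
    parameter=['first=one', 'second=two'],
    input=['inputs/raw.txt'],           # placeholder paths
    output=['results/processed.txt'],
    url_only=True,
    result_renderer='disabled',
)
```

The `-u`/`--url-only` command-line switch mentioned in the README corresponds to the `url_only` keyword used here.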
--- README.md | 9 +++++++-- datalad_compute/dataprovider/tests/create_datasets.py | 5 +++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index d377ffc..ad1beea 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,6 @@ Install the extension, create a dataset, configure it to use `compute`-URLs > cd compute-test-1 > git config annex.security.allowed-url-schemes datalad-make > git config annex.security.allowed-ip-addresses all -> git config annex.security.allow-unverified-downloads ACKTHPPT ``` Create the template directory and a template @@ -109,7 +108,13 @@ The command `datalad compute` does also support to just record the parameters that would lead to a certain computation, without actually performing the computation. We refer to this as *speculative computation*. -Generate a speculative computation, this is done by providing the `-u` option +To use this feature, the following configuration value has to be set: + +```bash +> git config annex.security.allow-unverified-downloads ACKTHPPT +``` + +Afterward, a speculative computation can be recorded by providing the `-u` option (url-only) to `datalad compute`. ```bash diff --git a/datalad_compute/dataprovider/tests/create_datasets.py b/datalad_compute/dataprovider/tests/create_datasets.py index b3c6bce..d123781 100644 --- a/datalad_compute/dataprovider/tests/create_datasets.py +++ b/datalad_compute/dataprovider/tests/create_datasets.py @@ -15,9 +15,10 @@ def update_config_for_compute(dataset: Dataset): scope='local', recursive=True, spec=[ - ('annex.security.allowed-url-schemes', url_scheme), + #('annex.security.allowed-url-schemes', url_scheme), ('annex.security.allowed-ip-addresses', 'all'), - ('annex.security.allow-unverified-downloads', 'ACKTHPPT')], + #('annex.security.allow-unverified-downloads', 'ACKTHPPT'), + ], result_renderer='disabled') From 0dff2156f376439111d9e2f8fc014e600ae942a6 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 30 Sep 2024 09:58:40 +0200 Subject: [PATCH 072/148] add an example for fmriprep-docker execution --- examples/fmriprep docker/fmriprep-docker | 38 +++++++++++ examples/fmriprep docker/input.txt | 12 ++++ examples/fmriprep docker/output.txt | 87 ++++++++++++++++++++++++ examples/fmriprep docker/parameter.txt | 4 ++ examples/fmriprep docker/readme.md | 16 +++++ 5 files changed, 157 insertions(+) create mode 100644 examples/fmriprep docker/fmriprep-docker create mode 100644 examples/fmriprep docker/input.txt create mode 100644 examples/fmriprep docker/output.txt create mode 100644 examples/fmriprep docker/parameter.txt create mode 100644 examples/fmriprep docker/readme.md diff --git a/examples/fmriprep docker/fmriprep-docker b/examples/fmriprep docker/fmriprep-docker new file mode 100644 index 0000000..2406687 --- /dev/null +++ b/examples/fmriprep docker/fmriprep-docker @@ -0,0 +1,38 @@ +# This is a very simple template to run fmriprep-docker on a single subject +# of a BIDS dataset. +# It needs an environment with `fmriprep-docker` installed (e.g. a virtualenv +# in which `pip install fmriprep-docker` has been executed). +# +# The template takes the following inputs: +# - input_dir: the path to the BIDS dataset +# - output_dir: the path to the output directory, typically a directory called +# `derivatives` in `{input_dir}`. +# - participant_label: the label of the participant to be processed, e.g. `01`. +# - license_file: the path to the FreeSurfer license file. 
+# +# The template assumes that the BIDS dataset referenced in `input_dir` is +# a subdataset of the dataset in which the computation is started, as outlined +ä in the fairly-big-follow-up document. +# +# Input files, output files, and parameter for the computation are defined in +# the lists: `input.txt`, `output.txt`, and `parameter.txt` to keep the command +# line short. +# +# `datalad compute -I input.txt -O output.txt -P parameter.txt fmriprep_template` + +inputs = ['input_dir', 'output_dir', 'participant_label', 'license_file'] + +use_shell = 'false' +executable = 'fmriprep-docker' + +# Note: `{root_directory}` resolves to the directory of the dataset in which the +# computation was started with `datalad compute`. +arguments = [ + '{root_directory}/{input_dir}', + '{root_directory}/{output_dir}', + 'participant', + '--participant-label', '{participant_label}', + '--fs-license-file', '{license_file}', + '--skip-bids-validation', + '--ignore', 'slicetiming', +] diff --git a/examples/fmriprep docker/input.txt b/examples/fmriprep docker/input.txt new file mode 100644 index 0000000..9693bad --- /dev/null +++ b/examples/fmriprep docker/input.txt @@ -0,0 +1,12 @@ +# Paths are relative to the dataset in which `datalad compute` was executed +datasets/ds000102/dataset_description.json +datasets/ds000102/participants.tsv +datasets/ds000102/T1w.json +datasets/ds000102/task-flanker_bold.json + +datasets/ds000102/sub-01/anat/sub-01_T1w.nii.gz + +datasets/ds000102/sub-01/func/sub-01_task-flanker_run-1_bold.nii.gz +datasets/ds000102/sub-01/func/sub-01_task-flanker_run-1_events.tsv +datasets/ds000102/sub-01/func/sub-01_task-flanker_run-2_bold.nii.gz +datasets/ds000102/sub-01/func/sub-01_task-flanker_run-2_events.tsv diff --git a/examples/fmriprep docker/output.txt b/examples/fmriprep docker/output.txt new file mode 100644 index 0000000..b401f0a --- /dev/null +++ b/examples/fmriprep docker/output.txt @@ -0,0 +1,87 @@ +# Paths are relative to the dataset in which `datalad compute` was executed +datasets/ds000102/derivatives/logs/CITATION.md +datasets/ds000102/derivatives/logs/CITATION.bib +datasets/ds000102/derivatives/logs/CITATION.html +datasets/ds000102/derivatives/logs/CITATION.tex + +datasets/ds000102/derivatives/sub-01/log/20240920-082920_76a9b76a-421a-43c9-9b54-354d2ab772fc/fmriprep.toml + +datasets/ds000102/derivatives/sub-01/figures/sub-01_desc-about_T1w.html +datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-1_desc-validation_bold.html +datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-2_desc-validation_bold.html +datasets/ds000102/derivatives/sub-01/figures/sub-01_desc-summary_T1w.html +datasets/ds000102/derivatives/sub-01/figures/sub-01_desc-conform_T1w.html +datasets/ds000102/derivatives/sub-01/figures/sub-01_dseg.svg +datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-1_desc-summary_bold.html +datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-2_desc-summary_bold.html +datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-1_desc-coreg_bold.svg +datasets/ds000102/derivatives/sub-01/figures/sub-01_space-MNI152NLin2009cAsym_T1w.svg +datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-2_desc-coreg_bold.svg +datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-1_desc-rois_bold.svg +datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-1_desc-compcorvar_bold.svg +datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-1_desc-confoundcorr_bold.svg 
+datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-2_desc-compcorvar_bold.svg +datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-2_desc-rois_bold.svg +datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-2_desc-confoundcorr_bold.svg +datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-1_desc-carpetplot_bold.svg +datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-2_desc-carpetplot_bold.svg + +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_desc-hmc_boldref.nii.gz +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_desc-hmc_boldref.json +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_desc-hmc_boldref.nii.gz +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_desc-hmc_boldref.json +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_from-orig_to-boldref_mode-image_desc-hmc_xfm.txt +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_from-orig_to-boldref_mode-image_desc-hmc_xfm.json +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_from-orig_to-boldref_mode-image_desc-hmc_xfm.txt +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_from-orig_to-boldref_mode-image_desc-hmc_xfm.json +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_desc-coreg_boldref.nii.gz +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_desc-coreg_boldref.json +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_desc-coreg_boldref.nii.gz +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_desc-coreg_boldref.json +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_desc-brain_mask.nii.gz +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_desc-brain_mask.json +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_desc-brain_mask.nii.gz +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_desc-brain_mask.json +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_from-boldref_to-T1w_mode-image_desc-coreg_xfm.txt +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_from-boldref_to-T1w_mode-image_desc-coreg_xfm.json +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_from-boldref_to-T1w_mode-image_desc-coreg_xfm.txt +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_from-boldref_to-T1w_mode-image_desc-coreg_xfm.json +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_space-MNI152NLin2009cAsym_boldref.nii.gz +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_space-MNI152NLin2009cAsym_boldref.json +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_space-MNI152NLin2009cAsym_desc-brain_mask.nii.gz +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_space-MNI152NLin2009cAsym_desc-brain_mask.json +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_space-MNI152NLin2009cAsym_boldref.nii.gz +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_space-MNI152NLin2009cAsym_boldref.json +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_space-MNI152NLin2009cAsym_desc-brain_mask.nii.gz +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_space-MNI152NLin2009cAsym_desc-brain_mask.json 
+datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_space-MNI152NLin2009cAsym_desc-preproc_bold.nii.gz +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_space-MNI152NLin2009cAsym_desc-preproc_bold.json +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_space-MNI152NLin2009cAsym_desc-preproc_bold.nii.gz +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_space-MNI152NLin2009cAsym_desc-preproc_bold.json +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_desc-confounds_timeseries.tsv +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_desc-confounds_timeseries.json +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_desc-confounds_timeseries.tsv +datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_desc-confounds_timeseries.json + +datasets/ds000102/derivatives/sub-01/anat/sub-01_desc-preproc_T1w.nii.gz +datasets/ds000102/derivatives/sub-01/anat/sub-01_desc-preproc_T1w.json +datasets/ds000102/derivatives/sub-01/anat/sub-01_desc-brain_mask.nii.gz +datasets/ds000102/derivatives/sub-01/anat/sub-01_desc-brain_mask.json +datasets/ds000102/derivatives/sub-01/anat/sub-01_dseg.nii.gz +datasets/ds000102/derivatives/sub-01/anat/sub-01_label-GM_probseg.nii.gz +datasets/ds000102/derivatives/sub-01/anat/sub-01_label-WM_probseg.nii.gz +datasets/ds000102/derivatives/sub-01/anat/sub-01_label-CSF_probseg.nii.gz +datasets/ds000102/derivatives/sub-01/anat/sub-01_from-T1w_to-MNI152NLin2009cAsym_mode-image_xfm.h5 +datasets/ds000102/derivatives/sub-01/anat/sub-01_from-MNI152NLin2009cAsym_to-T1w_mode-image_xfm.h5 +datasets/ds000102/derivatives/sub-01/anat/sub-01_space-MNI152NLin2009cAsym_desc-brain_mask.nii.gz +datasets/ds000102/derivatives/sub-01/anat/sub-01_space-MNI152NLin2009cAsym_desc-brain_mask.json +datasets/ds000102/derivatives/sub-01/anat/sub-01_space-MNI152NLin2009cAsym_desc-preproc_T1w.nii.gz +datasets/ds000102/derivatives/sub-01/anat/sub-01_space-MNI152NLin2009cAsym_desc-preproc_T1w.json +datasets/ds000102/derivatives/sub-01/anat/sub-01_space-MNI152NLin2009cAsym_dseg.nii.gz +datasets/ds000102/derivatives/sub-01/anat/sub-01_space-MNI152NLin2009cAsym_label-GM_probseg.nii.gz +datasets/ds000102/derivatives/sub-01/anat/sub-01_space-MNI152NLin2009cAsym_label-WM_probseg.nii.gz +datasets/ds000102/derivatives/sub-01/anat/sub-01_space-MNI152NLin2009cAsym_label-CSF_probseg.nii.gz + +datasets/ds000102/derivatives/sub-01.html +datasets/ds000102/derivatives/dataset_description.json +datasets/ds000102/derivatives/.bidsignore diff --git a/examples/fmriprep docker/parameter.txt b/examples/fmriprep docker/parameter.txt new file mode 100644 index 0000000..7398855 --- /dev/null +++ b/examples/fmriprep docker/parameter.txt @@ -0,0 +1,4 @@ +input_dir=datasets/ds000102 +output_dir=datasets/ds000102/derivatives +participant_label=01 +license_file=license.txt diff --git a/examples/fmriprep docker/readme.md b/examples/fmriprep docker/readme.md new file mode 100644 index 0000000..4c35ec6 --- /dev/null +++ b/examples/fmriprep docker/readme.md @@ -0,0 +1,16 @@ +This directory contains a simple example for running `fmriprep-docker` on a single subject of a BIDS dataset. The template is `fmriprep-docker`, input, output, and parameter files are defined in `input.txt`, `output.txt`, and `parameter.txt`, respectively. 
+ +The example assumes that the BIDS dataset referenced in `input_dir` is a subdataset of the dataset in which the computation is started (the root-dataset), as outlined in the fairly-big-follow-up document (https://hackmd.io/7oRB8qwuRtCm6BkV44Ubww). + +Executing the computation requires installation of this extension (see https://github.com/christian-monch/datalad-compute/tree/main/README.md), and the installation of the python package `fmriprep-docker`. The template, i.e. `fmriprep-docker` has to be placed in the folder `.datalad/compute/methods` of the root-dataset (and the dataset has to be saved). + +To keep the command line short, input files, output files, and parameter for the computation are defined in the lists: +- `input.txt` +- `output.txt` +- `parameter.txt` + +The computation can be executed with the following command: + +```bash +> datalad compute -I input.txt -O output.txt -P parameter.txt fmriprep_template +``` From d0428652ded4c5ecd11fca12aa25183f11960e73 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 1 Oct 2024 08:35:15 +0200 Subject: [PATCH 073/148] set all recommended annex configs in tests --- datalad_compute/dataprovider/tests/create_datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datalad_compute/dataprovider/tests/create_datasets.py b/datalad_compute/dataprovider/tests/create_datasets.py index d123781..703b1e6 100644 --- a/datalad_compute/dataprovider/tests/create_datasets.py +++ b/datalad_compute/dataprovider/tests/create_datasets.py @@ -15,9 +15,9 @@ def update_config_for_compute(dataset: Dataset): scope='local', recursive=True, spec=[ - #('annex.security.allowed-url-schemes', url_scheme), + ('annex.security.allowed-url-schemes', url_scheme), ('annex.security.allowed-ip-addresses', 'all'), - #('annex.security.allow-unverified-downloads', 'ACKTHPPT'), + ('annex.security.allow-unverified-downloads', 'ACKTHPPT'), ], result_renderer='disabled') From 1534b44416b1d2ed2ed45f2dae21f54bbf532913 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 1 Oct 2024 14:09:36 +0200 Subject: [PATCH 074/148] ensure that local subdatasets are provisioned --- datalad_compute/dataprovider/gitworktree.py | 23 ++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index cdcc75b..164ab99 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -118,16 +118,37 @@ def provide(dataset_dir: str, ) call_git_success(args, capture_output=True) - worktree_dataset = Dataset(worktree_dir) + source_dataset = Dataset(dataset_dir) + # get candidate environment variables for each subdataset + env_vars = get_candidate_env_vars(source_dataset) + + stored_environ = dict(os.environ) # Get all input files in the worktree + os.environ.update(env_vars) + worktree_dataset = Dataset(worktree_dir) with chdir(worktree_dataset.path): for file in input_files: lgr.debug('Provisioning file %s', file) worktree_dataset.get(file, result_renderer='disabled') + os.environ.clear() + os.environ.update(stored_environ) return worktree_dir +def get_candidate_env_vars(dataset: Dataset, counter: int = 1) -> dict[str, str]: + env_vars = {} + for result in dataset.subdatasets(result_renderer='disabled'): + env_vars[f'DATALAD_GET_SUBDATASET__SOURCE__CANDIDATE__100_{counter}'] = result['path'] + counter += 1 + subdataset = Dataset(result['path']) + env_vars = { + **env_vars, + **get_candidate_env_vars(subdataset, 
counter) + } + return env_vars + + def random_name() -> str: return 'tmp_' + ''.join( random.choices('abcdefghijklmnopqrstuvwxyz', k=10)) From b111f6c01b8c036286e2c365d9556253a9bbe669 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 1 Oct 2024 14:55:40 +0200 Subject: [PATCH 075/148] refactor gitworktree.py --- datalad_compute/dataprovider/gitworktree.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index 164ab99..e14a237 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -119,19 +119,20 @@ def provide(dataset_dir: str, call_git_success(args, capture_output=True) source_dataset = Dataset(dataset_dir) + # get candidate environment variables for each subdataset env_vars = get_candidate_env_vars(source_dataset) - stored_environ = dict(os.environ) # Get all input files in the worktree - os.environ.update(env_vars) worktree_dataset = Dataset(worktree_dir) with chdir(worktree_dataset.path): + stored_environ = dict(os.environ) + os.environ.update(env_vars) for file in input_files: lgr.debug('Provisioning file %s', file) worktree_dataset.get(file, result_renderer='disabled') - os.environ.clear() - os.environ.update(stored_environ) + os.environ.clear() + os.environ.update(stored_environ) return worktree_dir From 0b816cc4f7a06579bd2365333e5f4cfea35e28d1 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 1 Oct 2024 17:09:35 +0200 Subject: [PATCH 076/148] fix errors in fmriprep docker example --- datalad_compute/dataprovider/gitworktree.py | 2 +- examples/fmriprep docker/fmriprep-docker | 2 +- examples/fmriprep docker/output.txt | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index e14a237..591bc23 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -140,7 +140,7 @@ def provide(dataset_dir: str, def get_candidate_env_vars(dataset: Dataset, counter: int = 1) -> dict[str, str]: env_vars = {} for result in dataset.subdatasets(result_renderer='disabled'): - env_vars[f'DATALAD_GET_SUBDATASET__SOURCE__CANDIDATE__100_{counter}'] = result['path'] + env_vars[f'DATALAD_GET_SUBDATASET__SOURCE__CANDIDATE__100__{counter}'] = result['path'] counter += 1 subdataset = Dataset(result['path']) env_vars = { diff --git a/examples/fmriprep docker/fmriprep-docker b/examples/fmriprep docker/fmriprep-docker index 2406687..c889700 100644 --- a/examples/fmriprep docker/fmriprep-docker +++ b/examples/fmriprep docker/fmriprep-docker @@ -12,7 +12,7 @@ # # The template assumes that the BIDS dataset referenced in `input_dir` is # a subdataset of the dataset in which the computation is started, as outlined -ä in the fairly-big-follow-up document. +# in the fairly-big-follow-up document. 
# # Input files, output files, and parameter for the computation are defined in # the lists: `input.txt`, `output.txt`, and `parameter.txt` to keep the command diff --git a/examples/fmriprep docker/output.txt b/examples/fmriprep docker/output.txt index b401f0a..14ac0f4 100644 --- a/examples/fmriprep docker/output.txt +++ b/examples/fmriprep docker/output.txt @@ -4,8 +4,6 @@ datasets/ds000102/derivatives/logs/CITATION.bib datasets/ds000102/derivatives/logs/CITATION.html datasets/ds000102/derivatives/logs/CITATION.tex -datasets/ds000102/derivatives/sub-01/log/20240920-082920_76a9b76a-421a-43c9-9b54-354d2ab772fc/fmriprep.toml - datasets/ds000102/derivatives/sub-01/figures/sub-01_desc-about_T1w.html datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-1_desc-validation_bold.html datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-2_desc-validation_bold.html From a2b9d80bdac3213d0325e7aec3a8d20a02f4773a Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 1 Oct 2024 12:24:40 +0200 Subject: [PATCH 077/148] promote provision to a proper datalad command --- datalad_compute/__init__.py | 10 ++ datalad_compute/commands/provision_cmd.py | 186 ++++++++++++++++++++ datalad_compute/dataprovider/gitworktree.py | 182 ------------------- 3 files changed, 196 insertions(+), 182 deletions(-) create mode 100644 datalad_compute/commands/provision_cmd.py diff --git a/datalad_compute/__init__.py b/datalad_compute/__init__.py index 3783291..acd9b56 100644 --- a/datalad_compute/__init__.py +++ b/datalad_compute/__init__.py @@ -23,6 +23,16 @@ # optional name of the command in the Python API 'compute' ), + ( + # importable module that contains the command implementation + 'datalad_compute.commands.provision_cmd', + # name of the command class implementation in above module + 'Provision', + # optional name of the command in the cmdline API + 'provision', + # optional name of the command in the Python API + 'provision' + ), ] ) diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py new file mode 100644 index 0000000..3e1e7cf --- /dev/null +++ b/datalad_compute/commands/provision_cmd.py @@ -0,0 +1,186 @@ +""" +A data provisioner that works with local git repositories. +Data is provisioned in a temporary worktree. All subdatasets +are currently also provisioned. 
+""" +from __future__ import annotations + +import logging +import random +import shutil +import stat +from contextlib import chdir +from pathlib import Path +from typing import Iterable +from tempfile import TemporaryDirectory + +from datalad_next.commands import ( + EnsureCommandParameterization, + ValidatedInterface, + Parameter, + build_doc, + datasetmethod, + eval_results, + get_status_dict, +) +from datalad_next.constraints import ( + EnsureDataset, + EnsureListOf, + EnsureStr, EnsurePath, +) +from datalad_next.datasets import Dataset +from datalad_next.runners import call_git_lines + +from datalad_compute.utils.glob import resolve_patterns +from ..commands.compute_cmd import read_list + + +__docformat__ = 'restructuredtext' + + +lgr = logging.getLogger('datalad.compute.provision_cmd') + + +# decoration auto-generates standard help +@build_doc +# all commands must be derived from Interface +class Provision(ValidatedInterface): + # first docstring line is used a short description in the cmdline help + # the rest is put in the verbose help and manpage + """Provision inputs for a compute command + """ + + _validator_ = EnsureCommandParameterization(dict( + dataset=EnsureDataset(installed=True), + input=EnsureListOf(EnsureStr(min_len=1)), + input_list=EnsureStr(min_len=1), + tmp_dir=EnsurePath(is_mode=stat.S_ISDIR), + delete=EnsurePath(lexists=True, is_mode=stat.S_ISDIR), + )) + + # parameters of the command, must be exhaustive + _params_ = dict( + dataset=Parameter( + args=('-d', '--dataset'), + doc="Dataset to be used as a configuration source. Beyond " + "reading configuration items, this command does not interact with " + "the dataset."), + branch=Parameter( + args=('-b', '--branch',), + doc="Branch (or commit) that should be provisioned, if " + "not specified HEAD will be used"), + delete=Parameter( + args=('--delete',), + doc="Delete the temporary worktree WORKTREE that belongs the the " + "dataset (cannot be used with `-b`, `--branch`, `-i`, or " + "`--input`)"), + input=Parameter( + args=('-i', '--input',), + action='append', + doc="An input file pattern (repeat for multiple inputs, " + "file pattern support python globbing, globbing is expanded " + "in the source dataset"), + input_list=Parameter( + args=('-I', '--input-list',), + doc="Name of a file that contains a list of input file patterns. " + "Format is one file per line, relative path from `dataset`. " + "Empty lines, i.e. lines that contain only newlines, and lines " + "that start with '#' are ignored. Line content is stripped " + "before used. This is useful if a large number of input file " + "patterns should be provided."), + temp_dir=Parameter( + args=('-t', '--temp-dir',), + doc="Path of the directory where temporary worktrees should be " + "created. 
The default is `$TMP` if set, otherwise `/tmp`."), + ) + + @staticmethod + @datasetmethod(name='compute') + @eval_results + def __call__(dataset=None, + branch=None, + delete=None, + input=None, + input_list=None, + temp_dir=None, + ): + + dataset : Dataset = dataset.ds if dataset else Dataset('.') + if delete: + if branch or input: + raise ValueError( + 'Cannot use `-d`, `--delete` with `-b`, `--branch`,' + ' `-i`, or `--input`') + + remove(dataset, delete) + yield get_status_dict( + action='provision [delete]', + path=delete, + status='ok', + message=f'delete workspace: {delete!r} from dataset {dataset}',) + + temp_dir: Path = temp_dir or Path(TemporaryDirectory().name) + inputs = input or [] + read_list(input_list) + provision_dir = provide(dataset, temp_dir, branch, inputs) + yield get_status_dict( + action='provision', + path=str(provision_dir), + status='ok', + message=f'provisioned dataset: {dataset} in workspace: {provision_dir!r}',) + + +def remove(dataset: Dataset, + worktree: str + ) -> None: + remove_subdatasets(worktree) + shutil.rmtree(worktree) + prune_worktrees(dataset) + + +def remove_subdatasets(worktree: str): + dataset = Dataset(worktree) + for subdataset_info in dataset.subdatasets(result_renderer='disabled'): + dataset.drop( + subdataset_info['path'], + recursive=True, + reckless='kill', + what='all', + result_renderer='disabled') + + +def prune_worktrees(dataset: Dataset) -> None: + call_git_lines(['worktree', 'prune'], cwd=dataset.pathobj) + for result in dataset.subdatasets(result_renderer='disabled'): + prune_worktrees(Dataset(result['path'])) + + +def provide(dataset: Dataset, + worktree_dir: Path, + source_branch: str | None = None, + input_patterns: Iterable[str] | None = None, + ) -> Path: + + lgr.debug('Provisioning dataset %s at %s', dataset, worktree_dir) + + worktree_dir.mkdir(parents=True, exist_ok=True) + worktree_name = worktree_dir.parts[-1] + + # Resolve input file patterns in the original dataset + input_files = resolve_patterns(dataset.path, input_patterns) + + # Create a worktree + args = ['worktree', 'add', '-b', worktree_name] + [str(worktree_dir)] + ( + [source_branch] + if source_branch + else [] + ) + call_git_lines(args, cwd=dataset.pathobj) + + # Get all input files in the worktree + worktree_dataset = Dataset(worktree_dir) + with chdir(worktree_dataset.path): + for file in input_files: + lgr.debug('Provisioning file %s', file) + worktree_dataset.get(file, result_renderer='disabled') + + return worktree_dir diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/dataprovider/gitworktree.py index 591bc23..e69de29 100644 --- a/datalad_compute/dataprovider/gitworktree.py +++ b/datalad_compute/dataprovider/gitworktree.py @@ -1,182 +0,0 @@ -""" -A data provisioner that works with local git repositories. -Data is provisioned in a temporary worktree. All subdatasets -are currently also provisioned. 
-""" -from __future__ import annotations - -import logging -import os -import random -import shutil -from argparse import ArgumentParser -from contextlib import chdir -from pathlib import Path -from typing import Iterable - -from datalad_next.datasets import Dataset -from datalad_next.runners import call_git_success - -from datalad_compute.utils.glob import resolve_patterns -from ..commands.compute_cmd import read_list - - -lgr = logging.getLogger('datalad.compute.dataprovider.gitworktree') - -argument_parser = ArgumentParser() -argument_parser.add_argument( - 'dataset', - default='.', - help='Path to source dataset (default: current directory)', -) -argument_parser.add_argument( - '-d', '--delete', - metavar='WORKTREE', - help='Delete the temporary worktree WORKTREE that belongs the the ' - 'dataset (cannot be used with `-b`, `--branch`, `-i`, or `--input`)', -) -argument_parser.add_argument( - '-b', '--branch', - help='Branch (name, sha, or tag) of `dataset` that should be provisioned. ' - 'If not given the default branch will be used', -) -argument_parser.add_argument( - '-i', '--input', - action='append', - metavar='PATH', - help='File pattern that should be provisioned (relative from dataset ' - 'root), at least one input has to be provided (use multiple times to ' - 'define multiple inputs). Patterns are resolved by Python\'s globbing ' - 'rules. They are resolved in the source dataset.', -) -argument_parser.add_argument( - '-I', '--input-list', - metavar='PATH', - default=None, - help='Path of a file that contains a list of input file patterns', -) -argument_parser.add_argument( - '-t', '--temp-dir', - metavar='PATH', - default=os.getenv('TMP', '/tmp'), - help='Path of the directory where temporary worktrees should be created. ' - 'The default is `$TMP` if set, otherwise `/tmp`.', -) - - -def remove(dataset: str, - worktree: str - ) -> None: - remove_subdatasets(worktree) - shutil.rmtree(worktree) - dataset = Dataset(dataset) - prune_worktrees(dataset) - - -def remove_subdatasets(worktree: str): - dataset = Dataset(worktree) - for subdataset_info in dataset.subdatasets(result_renderer='disabled'): - dataset.drop( - subdataset_info['path'], - recursive=True, - reckless='kill', - what='all', - result_renderer='disabled') - - -def prune_worktrees(dataset: Dataset) -> None: - with chdir(dataset.path): - call_git_success( - ['worktree', 'prune'], - capture_output=True) - - for result in dataset.subdatasets(result_renderer='disabled'): - prune_worktrees(Dataset(result['path'])) - - -def provide(dataset_dir: str, - temp_dir: str, - source_branch: str | None = None, - input_patterns: Iterable[str] | None = None, - ) -> Path: - - lgr.debug('Provisioning dataset %s', dataset_dir) - worktree_name = random_name() - worktree_dir = Path(temp_dir) / worktree_name - if not worktree_dir.exists(): - worktree_dir.mkdir(parents=True, exist_ok=True) - - # Resolve input file patterns in the original dataset - input_files = resolve_patterns(dataset_dir, input_patterns) - - # Create a worktree - with chdir(dataset_dir): - args = ['worktree', 'add', '-b', worktree_name] + [str(worktree_dir)] + ( - [source_branch] - if source_branch - else [] - ) - call_git_success(args, capture_output=True) - - source_dataset = Dataset(dataset_dir) - - # get candidate environment variables for each subdataset - env_vars = get_candidate_env_vars(source_dataset) - - # Get all input files in the worktree - worktree_dataset = Dataset(worktree_dir) - with chdir(worktree_dataset.path): - stored_environ = dict(os.environ) - 
os.environ.update(env_vars) - for file in input_files: - lgr.debug('Provisioning file %s', file) - worktree_dataset.get(file, result_renderer='disabled') - os.environ.clear() - os.environ.update(stored_environ) - - return worktree_dir - - -def get_candidate_env_vars(dataset: Dataset, counter: int = 1) -> dict[str, str]: - env_vars = {} - for result in dataset.subdatasets(result_renderer='disabled'): - env_vars[f'DATALAD_GET_SUBDATASET__SOURCE__CANDIDATE__100__{counter}'] = result['path'] - counter += 1 - subdataset = Dataset(result['path']) - env_vars = { - **env_vars, - **get_candidate_env_vars(subdataset, counter) - } - return env_vars - - -def random_name() -> str: - return 'tmp_' + ''.join( - random.choices('abcdefghijklmnopqrstuvwxyz', k=10)) - - -def main(): - arguments = argument_parser.parse_args() - if arguments.delete: - - if arguments.branch or arguments.input: - raise ValueError( - 'Cannot use `-d`, `--delete` with `-b`, `--branch`,' - ' `-i`, or `--input`') - - remove(arguments.dataset, arguments.delete) - return - - inputs = arguments.input or [] + read_list(arguments.input_list) - - provision_dir = provide( - arguments.dataset, - arguments.temp_dir, - arguments.branch, - inputs, - ) - print(provision_dir) - - -if __name__ == '__main__': - main() From 6fb23091654e092f42d880c784561247f50530d9 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 2 Oct 2024 06:44:35 +0200 Subject: [PATCH 078/148] add get-candidate env-vars to provision cmd --- datalad_compute/commands/provision_cmd.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py index 3e1e7cf..79d89ec 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_compute/commands/provision_cmd.py @@ -6,7 +6,7 @@ from __future__ import annotations import logging -import random +import os import shutil import stat from contextlib import chdir @@ -176,11 +176,31 @@ def provide(dataset: Dataset, ) call_git_lines(args, cwd=dataset.pathobj) + # get candidate environment variables for each subdataset + env_vars = get_candidate_env_vars(dataset) + # Get all input files in the worktree worktree_dataset = Dataset(worktree_dir) with chdir(worktree_dataset.path): + stored_environ = dict(os.environ) + os.environ.update(env_vars) for file in input_files: lgr.debug('Provisioning file %s', file) worktree_dataset.get(file, result_renderer='disabled') + os.environ.clear() + os.environ.update(stored_environ) return worktree_dir + + +def get_candidate_env_vars(dataset: Dataset, counter: int = 1) -> dict[str, str]: + env_vars = {} + for result in dataset.subdatasets(result_renderer='disabled'): + env_vars[f'DATALAD_GET_SUBDATASET__SOURCE__CANDIDATE__100__{counter}'] = result['path'] + counter += 1 + subdataset = Dataset(result['path']) + env_vars = { + **env_vars, + **get_candidate_env_vars(subdataset, counter) + } + return env_vars From 094eeeaa2745cf368d6260a39424277455f97570 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 2 Oct 2024 14:18:38 +0200 Subject: [PATCH 079/148] adapt provision tests to new implementation --- .../annexremotes/tests/test_hierarchies.py | 2 +- datalad_compute/commands/compute_cmd.py | 15 ++-- .../{dataprovider => commands}/gitworktree.py | 0 datalad_compute/commands/provision_cmd.py | 68 +++++++------------ .../tests}/__init__.py | 0 .../tests/create_datasets.py | 0 .../tests/test_provision.py} | 47 ++++++------- .../dataprovider/tests/__init__.py | 0 setup.cfg | 67 
++++++++++++++++++ 9 files changed, 118 insertions(+), 81 deletions(-) rename datalad_compute/{dataprovider => commands}/gitworktree.py (100%) rename datalad_compute/{dataprovider => commands/tests}/__init__.py (100%) rename datalad_compute/{dataprovider => commands}/tests/create_datasets.py (100%) rename datalad_compute/{dataprovider/tests/test_gitworktree.py => commands/tests/test_provision.py} (81%) delete mode 100644 datalad_compute/dataprovider/tests/__init__.py diff --git a/datalad_compute/annexremotes/tests/test_hierarchies.py b/datalad_compute/annexremotes/tests/test_hierarchies.py index a16e21d..4b60694 100644 --- a/datalad_compute/annexremotes/tests/test_hierarchies.py +++ b/datalad_compute/annexremotes/tests/test_hierarchies.py @@ -8,7 +8,7 @@ template_dir, url_scheme, ) -from datalad_compute.dataprovider.tests.create_datasets import create_ds_hierarchy +from datalad_compute.commands.tests.create_datasets import create_ds_hierarchy test_method = """ diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 9353519..fce1c0f 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -150,7 +150,7 @@ def __call__(dataset=None, worktree = provide(dataset, branch, input_pattern) execute(worktree, template, parameter, output) collect(worktree, dataset, output) - un_provide(dataset, worktree) + dataset.provision(delete=worktree) url_base = get_url( dataset, @@ -241,13 +241,8 @@ def provide(dataset: Dataset, ) -> Path: lgr.debug('provide: %s %s %s', dataset, branch, input_patterns) - - args = ['provide-gitworktree', dataset.path, ] + ( - ['--branch', branch] if branch else [] - ) - args.extend(chain(*[('--input', i) for i in (input_patterns or [])])) - stdout = subprocess.run(args, stdout=subprocess.PIPE, check=True).stdout - return Path(stdout.splitlines()[-1].decode()) + result = dataset.provision(input=input_patterns, branch=branch) + return Path(result[0]['path']) def execute(worktree: Path, @@ -331,6 +326,4 @@ def un_provide(dataset: Dataset, ) -> None: lgr.debug('un_provide: %s %s', dataset, str(worktree)) - - args = ['provide-gitworktree', dataset.path, '--delete', str(worktree)] - subprocess.run(args, check=True) + dataset.provision(delete=worktree) diff --git a/datalad_compute/dataprovider/gitworktree.py b/datalad_compute/commands/gitworktree.py similarity index 100% rename from datalad_compute/dataprovider/gitworktree.py rename to datalad_compute/commands/gitworktree.py diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py index 79d89ec..c011326 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_compute/commands/provision_cmd.py @@ -55,7 +55,7 @@ class Provision(ValidatedInterface): input=EnsureListOf(EnsureStr(min_len=1)), input_list=EnsureStr(min_len=1), tmp_dir=EnsurePath(is_mode=stat.S_ISDIR), - delete=EnsurePath(lexists=True, is_mode=stat.S_ISDIR), + delete=EnsureDataset(installed=True), )) # parameters of the command, must be exhaustive @@ -88,21 +88,21 @@ class Provision(ValidatedInterface): "that start with '#' are ignored. Line content is stripped " "before used. This is useful if a large number of input file " "patterns should be provided."), - temp_dir=Parameter( - args=('-t', '--temp-dir',), - doc="Path of the directory where temporary worktrees should be " - "created. 
The default is `$TMP` if set, otherwise `/tmp`."), + worktree_dir=Parameter( + args=('-w', '--worktree-dir',), + doc="Path of the directory that should become the temporary worktree" + ", defaults to `tempfile.TemporaryDirectory().name`."), ) @staticmethod - @datasetmethod(name='compute') + @datasetmethod(name='provision') @eval_results def __call__(dataset=None, branch=None, delete=None, input=None, input_list=None, - temp_dir=None, + worktree_dir=None, ): dataset : Dataset = dataset.ds if dataset else Dataset('.') @@ -112,16 +112,17 @@ def __call__(dataset=None, 'Cannot use `-d`, `--delete` with `-b`, `--branch`,' ' `-i`, or `--input`') - remove(dataset, delete) + remove(dataset, delete.ds) yield get_status_dict( action='provision [delete]', - path=delete, + path=delete.ds.path, status='ok', - message=f'delete workspace: {delete!r} from dataset {dataset}',) + message=f'delete workspace: {delete.ds.path!r} from dataset {dataset}') + return - temp_dir: Path = temp_dir or Path(TemporaryDirectory().name) + worktree_dir: Path = worktree_dir or Path(TemporaryDirectory().name) inputs = input or [] + read_list(input_list) - provision_dir = provide(dataset, temp_dir, branch, inputs) + provision_dir = provide(dataset, worktree_dir, branch, inputs) yield get_status_dict( action='provision', path=str(provision_dir), @@ -130,17 +131,22 @@ def __call__(dataset=None, def remove(dataset: Dataset, - worktree: str + worktree: Dataset ) -> None: - remove_subdatasets(worktree) - shutil.rmtree(worktree) + worktree.drop( + what='all', + reckless='kill', + recursive=True, + result_renderer='disabled') prune_worktrees(dataset) -def remove_subdatasets(worktree: str): - dataset = Dataset(worktree) - for subdataset_info in dataset.subdatasets(result_renderer='disabled'): - dataset.drop( +def remove_subdatasets(worktree: Dataset): + for subdataset_info in worktree.subdatasets( + recursive=True, + result_renderer='disabled' + ): + worktree.drop( subdataset_info['path'], recursive=True, reckless='kill', @@ -163,44 +169,22 @@ def provide(dataset: Dataset, lgr.debug('Provisioning dataset %s at %s', dataset, worktree_dir) worktree_dir.mkdir(parents=True, exist_ok=True) - worktree_name = worktree_dir.parts[-1] # Resolve input file patterns in the original dataset input_files = resolve_patterns(dataset.path, input_patterns) # Create a worktree - args = ['worktree', 'add', '-b', worktree_name] + [str(worktree_dir)] + ( + args = ['worktree', 'add'] + [str(worktree_dir)] + ( [source_branch] if source_branch else [] ) call_git_lines(args, cwd=dataset.pathobj) - # get candidate environment variables for each subdataset - env_vars = get_candidate_env_vars(dataset) - # Get all input files in the worktree worktree_dataset = Dataset(worktree_dir) with chdir(worktree_dataset.path): - stored_environ = dict(os.environ) - os.environ.update(env_vars) for file in input_files: lgr.debug('Provisioning file %s', file) worktree_dataset.get(file, result_renderer='disabled') - os.environ.clear() - os.environ.update(stored_environ) - return worktree_dir - - -def get_candidate_env_vars(dataset: Dataset, counter: int = 1) -> dict[str, str]: - env_vars = {} - for result in dataset.subdatasets(result_renderer='disabled'): - env_vars[f'DATALAD_GET_SUBDATASET__SOURCE__CANDIDATE__100__{counter}'] = result['path'] - counter += 1 - subdataset = Dataset(result['path']) - env_vars = { - **env_vars, - **get_candidate_env_vars(subdataset, counter) - } - return env_vars diff --git a/datalad_compute/dataprovider/__init__.py 
b/datalad_compute/commands/tests/__init__.py similarity index 100% rename from datalad_compute/dataprovider/__init__.py rename to datalad_compute/commands/tests/__init__.py diff --git a/datalad_compute/dataprovider/tests/create_datasets.py b/datalad_compute/commands/tests/create_datasets.py similarity index 100% rename from datalad_compute/dataprovider/tests/create_datasets.py rename to datalad_compute/commands/tests/create_datasets.py diff --git a/datalad_compute/dataprovider/tests/test_gitworktree.py b/datalad_compute/commands/tests/test_provision.py similarity index 81% rename from datalad_compute/dataprovider/tests/test_gitworktree.py rename to datalad_compute/commands/tests/test_provision.py index a0a2bc4..4e8259e 100644 --- a/datalad_compute/dataprovider/tests/test_gitworktree.py +++ b/datalad_compute/commands/tests/test_provision.py @@ -7,11 +7,6 @@ from datalad_next.datasets import Dataset from datalad_next.runners import call_git_lines - -from ..gitworktree import ( - provide, - remove, -) from .create_datasets import create_ds_hierarchy @@ -39,26 +34,24 @@ for path in file_path_templates ] -all_paths = a_paths + b_paths - def test_worktree_basic(tmp_path): dataset = create_ds_hierarchy(tmp_path, 'ds1', 3)[0][2] - worktree = Dataset(provide( - dataset.path, - str(tmp_path), - input_patterns=[ + provision_result = dataset.provision( + worktree_dir=tmp_path / 'ds1_worktree1', + input=[ 'a.txt', 'b.txt', 'ds1_subds0/a0.txt', 'ds1_subds0/b0.txt', 'ds1_subds0/ds1_subds1/a1.txt', 'ds1_subds0/ds1_subds1/b1.txt' ], - )) + )[0] + worktree = Dataset(provision_result['path']) r_orig = [r['gitmodule_url'] for r in dataset.subdatasets(recursive=True, result_renderer='disabled')] r_worktree = [r['gitmodule_url'] for r in worktree.subdatasets(recursive=True, result_renderer='disabled')] assert r_orig == r_worktree - remove(dataset.path, worktree.path) + dataset.provision(delete=worktree.path) def check_deleted_worktrees(ds: Dataset): with chdir(ds.path): @@ -78,41 +71,41 @@ def check_deleted_worktrees(ds: Dataset): def test_worktree_globbing(tmp_path): dataset = create_ds_hierarchy(tmp_path, 'ds1', 3)[0][2] - worktree = Dataset(provide( - dataset.path, - str(tmp_path), - input_patterns=[ + result = dataset.provision( + worktree_dir=tmp_path / 'ds1_worktree2', + input=[ '*.txt', '*_subds0/*.txt', '*_subds0/*_subds1/*.txt', '*_subds0/*_subds1/*_subds2/*.txt', ], - )) + )[0] - worktree_set = set(get_file_list(worktree.pathobj)) + worktree = Path(result['path']) + worktree_set = set(get_file_list(worktree)) assert worktree_set == set( path.format(ds_name='ds1') for path in all_paths ) - remove(dataset.path, worktree.path) + dataset.provision(delete=worktree) - worktree = Dataset(provide( - dataset.path, - str(tmp_path), - input_patterns=[ + result = dataset.provision( + worktree_dir=tmp_path / 'ds1_worktree2', + input=[ 'b*txt', '*_subds0/b*txt', '*_subds0/*_subds1/b*txt', '*_subds0/*_subds1/*_subds2/b*txt', ], - )) + )[0] - worktree_set = set(get_file_list(worktree.pathobj)) + worktree = Path(result['path']) + worktree_set = set(get_file_list(worktree)) assert set( path.format(ds_name='ds1') for path in b_paths ).issubset(worktree_set) - remove(dataset.path, worktree.path) + dataset.provision(delete=worktree) dataset.drop( what='all', diff --git a/datalad_compute/dataprovider/tests/__init__.py b/datalad_compute/dataprovider/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/setup.cfg b/setup.cfg index e69de29..0622c63 100644 --- a/setup.cfg +++ b/setup.cfg @@ -0,0 +1,67 
@@ +[metadata] +url = https://github.com/datalad/datalad-extension-template +author = The DataLad Team and Contributors +author_email = team@datalad.org +description = demo DataLad extension package +long_description = file:README.md +long_description_content_type = text/markdown; charset=UTF-8 +license = MIT +classifiers = + Programming Language :: Python + License :: OSI Approved :: MIT License + Programming Language :: Python :: 3 + +[options] +python_requires = >= 3.11 +install_requires = + annexremote + datalad >= 1.1.1 + datalad_next >= 1.5.0 + datasalad +packages = find_namespace: +include_package_data = True + +[options.packages.find] +include = datalad_compute* + +[options.extras_require] +# this matches the name used by -core and what is expected by some CI setups +devel = + coverage + hypothesis + pytest + pytest-cov + sphinx + sphinx_rtd_theme + sphinx_copybutton +devel-utils = + pytest-xdist + scriv + +[options.entry_points] +# 'datalad.extensions' is THE entrypoint inspected by the datalad API builders +datalad.extensions = + # the label in front of '=' is the command suite label + # the entrypoint can point to any symbol of any name, as long it is + # valid datalad interface specification (see demo in this extensions) + compute = datalad_compute:command_suite + +console_scripts = + git-annex-remote-compute = datalad_compute.annexremotes.compute:main + +[versioneer] +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. +VCS = git +style = pep440 +versionfile_source = datalad_compute/_version.py +versionfile_build = datalad_compute/_version.py +tag_prefix = +parentdir_prefix = + +[coverage:report] +show_missing = True +omit = + # versioneer code + datalad_compute/_version.py From ccb82fda5fff43a23b7d80e3eba632dbe82c7756 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20M=C3=B6nch?= Date: Thu, 3 Oct 2024 09:03:36 +0200 Subject: [PATCH 080/148] create worktree from locally available datasets This change enables provision to install necessary subdatasets from already available datasets of the dataset that should be provisioned. 
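As an illustration, the adjustment boils down to re-pointing the submodule URL
in the worktree at the locally available subdataset and installing it without
annexed data. A minimal sketch of that step (the names `parent_ds` and
`sub_ds` are purely illustrative, `worktree`, `source_dataset`, and
`call_git_lines` are the objects used in the code below):

    # point the worktree's submodule at the local subdataset clone
    call_git_lines([
        '-C', str(worktree.pathobj / 'parent_ds'),
        'submodule', 'set-url', '--', 'sub_ds',
        'file://' + str(source_dataset.pathobj / 'parent_ds' / 'sub_ds')])
    # install the subdataset in the worktree, without fetching file content
    worktree.get('parent_ds/sub_ds', get_data=False)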
--- datalad_compute/commands/provision_cmd.py | 111 ++++++++++++++++++++-- 1 file changed, 104 insertions(+), 7 deletions(-) diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py index c011326..b5d25e6 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_compute/commands/provision_cmd.py @@ -6,12 +6,13 @@ from __future__ import annotations import logging -import os -import shutil import stat from contextlib import chdir from pathlib import Path -from typing import Iterable +from typing import ( + Any, + Iterable, +) from tempfile import TemporaryDirectory from datalad_next.commands import ( @@ -170,9 +171,6 @@ def provide(dataset: Dataset, worktree_dir.mkdir(parents=True, exist_ok=True) - # Resolve input file patterns in the original dataset - input_files = resolve_patterns(dataset.path, input_patterns) - # Create a worktree args = ['worktree', 'add'] + [str(worktree_dir)] + ( [source_branch] @@ -181,10 +179,109 @@ def provide(dataset: Dataset, ) call_git_lines(args, cwd=dataset.pathobj) - # Get all input files in the worktree + input_files = resolve_patterns(dataset.path, input_patterns) + worktree_dataset = Dataset(worktree_dir) + install_required_locally_available_datasets( + dataset, + [Path(i) for i in input_files], + worktree_dataset) + + # Get all input files in the worktree with chdir(worktree_dataset.path): for file in input_files: lgr.debug('Provisioning file %s', file) worktree_dataset.get(file, result_renderer='disabled') return worktree_dir + + +def check_results(results: Iterable[dict[str, Any]]): + assert not any( + result['status'] in ('impossible', 'error') + for result in results) + + +def install_required_locally_available_datasets(root_dataset: Dataset, + input_files: list[Path], + worktree: Dataset, + ) -> None: + """Ensure that local and locally changed subdatasets can be provisioned. + + If subdatasets are only available within the root dataset, either because + they are not published or because they are locally modified, the provision + has to use those. + + This means we have to adapt cloning candidates before trying to install + a subdataset. This is done by: + + - Determining which subdatasets are installed in the root dataset + - Determining which of those subdatasets are required by the input files + - Adjust the `.gitmodules` files and install the required local datasets + - All other datasets are installed as usual, e.g. via `datalad get`. + """ + + # Determine which subdatasets are installed in the root dataset + subdataset_info = get_subdataset_info(root_dataset) + + # Determine which subdatasets are required by the input files + required_subdatasets = determine_required_subdatasets( + subdataset_info, + input_files) + + install_locally_available_subdatasets( + root_dataset, + required_subdatasets, + worktree) + + +def get_subdataset_info(dataset: Dataset) -> Iterable[tuple[Path, Path, Path]]: + results = dataset.subdatasets( + recursive=True, + result_renderer='disabled') + check_results(results) + return [ + ( + Path(result['path']), + Path(result['parentds']), + Path(result['path']).relative_to(dataset.pathobj) + ) + for result in results + ] + + +def determine_required_subdatasets(subdataset_info: Iterable[tuple[Path, Path, Path]], + input_files: list[Path], + ) -> set[tuple[Path, Path, Path]]: + required_set = set() + for file in input_files: + # if the path can be expressed as relative to the subdataset path. + # the subdataset is required, and so are all subdatasets above it. 
+ for subdataset_path, parent_path, path_from_root in subdataset_info: + try: + file.relative_to(path_from_root) + required_set.add((subdataset_path, parent_path, path_from_root)) + except ValueError as e: + pass + return required_set + + +def install_locally_available_subdatasets(source_dataset: Dataset, + required_subdatasets: set[tuple[Path, Path, Path]], + worktree: Dataset, + ) -> None: + """Install the required subdatasets from the source dataset in the worktree. + """ + todo = [Path('.')] + while todo: + current_root = todo.pop() + for subdataset_path, parent_path, path_from_root in required_subdatasets: + if not current_root == parent_path.relative_to(source_dataset.pathobj): + continue + # Set the URL to the full source path + args = ['-C', str(worktree.pathobj / current_root), + 'submodule', 'set-url', '--', + str(subdataset_path.relative_to(parent_path)), + 'file://' + str(source_dataset.pathobj / path_from_root)] + call_git_lines(args) + worktree.get(path_from_root, get_data=False) + todo.append(path_from_root) From 74016e0b2bde22f53bdf9714d65a598e205904d9 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 4 Oct 2024 12:56:02 +0200 Subject: [PATCH 081/148] remove an unused variable --- datalad_compute/commands/provision_cmd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py index b5d25e6..392f2e0 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_compute/commands/provision_cmd.py @@ -260,7 +260,7 @@ def determine_required_subdatasets(subdataset_info: Iterable[tuple[Path, Path, P try: file.relative_to(path_from_root) required_set.add((subdataset_path, parent_path, path_from_root)) - except ValueError as e: + except ValueError: pass return required_set From 7fa4f1387c45a8c2de90366f15ad0b38fe34a9af Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 4 Oct 2024 13:11:37 +0200 Subject: [PATCH 082/148] adapt provision tests to modified provision code --- datalad_compute/commands/tests/test_provision.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/datalad_compute/commands/tests/test_provision.py b/datalad_compute/commands/tests/test_provision.py index 4e8259e..7eeb4b3 100644 --- a/datalad_compute/commands/tests/test_provision.py +++ b/datalad_compute/commands/tests/test_provision.py @@ -37,19 +37,19 @@ def test_worktree_basic(tmp_path): dataset = create_ds_hierarchy(tmp_path, 'ds1', 3)[0][2] + inputs = [ + 'a.txt', 'b.txt', + 'ds1_subds0/a0.txt', 'ds1_subds0/b0.txt', + 'ds1_subds0/ds1_subds1/a1.txt', 'ds1_subds0/ds1_subds1/b1.txt' + ] provision_result = dataset.provision( worktree_dir=tmp_path / 'ds1_worktree1', - input=[ - 'a.txt', 'b.txt', - 'ds1_subds0/a0.txt', 'ds1_subds0/b0.txt', - 'ds1_subds0/ds1_subds1/a1.txt', 'ds1_subds0/ds1_subds1/b1.txt' - ], + input=inputs, )[0] worktree = Dataset(provision_result['path']) - r_orig = [r['gitmodule_url'] for r in dataset.subdatasets(recursive=True, result_renderer='disabled')] - r_worktree = [r['gitmodule_url'] for r in worktree.subdatasets(recursive=True, result_renderer='disabled')] - assert r_orig == r_worktree + # Check input availability + assert all((worktree.pathobj / path).exists() for path in inputs) dataset.provision(delete=worktree.path) From ab0baa18b6faf283c1ec0ca6d298438f32dfa795 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 4 Oct 2024 13:14:33 +0200 Subject: [PATCH 083/148] add registration test for provision-cmd --- 
datalad_compute/tests/test_register.py | 1 + 1 file changed, 1 insertion(+) diff --git a/datalad_compute/tests/test_register.py b/datalad_compute/tests/test_register.py index 8e2b9f9..fc9ebdb 100644 --- a/datalad_compute/tests/test_register.py +++ b/datalad_compute/tests/test_register.py @@ -4,3 +4,4 @@ def test_register(): import datalad.api as da assert hasattr(da, 'compute') + assert hasattr(da, 'provision') From 2f6d8f9b134de794c245fdc7fa442f4cbaf66ee0 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 4 Oct 2024 15:53:30 +0200 Subject: [PATCH 084/148] remove an unused empty file --- datalad_compute/commands/gitworktree.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 datalad_compute/commands/gitworktree.py diff --git a/datalad_compute/commands/gitworktree.py b/datalad_compute/commands/gitworktree.py deleted file mode 100644 index e69de29..0000000 From 43e67031d0653219356dc0cbf37b799dc0e375cf Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sat, 5 Oct 2024 09:45:46 +0200 Subject: [PATCH 085/148] improve docstrings and log-messages --- datalad_compute/commands/provision_cmd.py | 31 +++++++++-------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py index 392f2e0..e9419bd 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_compute/commands/provision_cmd.py @@ -49,6 +49,13 @@ class Provision(ValidatedInterface): # first docstring line is used a short description in the cmdline help # the rest is put in the verbose help and manpage """Provision inputs for a compute command + + This command provides a temporary, partial copy of the dataset in a separate + tree, called a "worktree". The worktree will contain all files that are + specified by the input patterns. All necessary subdatasets will be + installed. If a subdataset is locally available in the source dataset, it + will be installed from there. Its main purpose is to provide an isolated + environment for "compute" commands. 
""" _validator_ = EnsureCommandParameterization(dict( @@ -73,8 +80,8 @@ class Provision(ValidatedInterface): delete=Parameter( args=('--delete',), doc="Delete the temporary worktree WORKTREE that belongs the the " - "dataset (cannot be used with `-b`, `--branch`, `-i`, or " - "`--input`)"), + "dataset (cannot be used with `-b`, `--branch`, `-i`," + "`--input`, `-I`, or `--input-list`)."), input=Parameter( args=('-i', '--input',), action='append', @@ -91,8 +98,8 @@ class Provision(ValidatedInterface): "patterns should be provided."), worktree_dir=Parameter( args=('-w', '--worktree-dir',), - doc="Path of the directory that should become the temporary worktree" - ", defaults to `tempfile.TemporaryDirectory().name`."), + doc="Path of the directory that should become the temporary " + "worktree, defaults to `tempfile.TemporaryDirectory().name`."), ) @staticmethod @@ -142,19 +149,6 @@ def remove(dataset: Dataset, prune_worktrees(dataset) -def remove_subdatasets(worktree: Dataset): - for subdataset_info in worktree.subdatasets( - recursive=True, - result_renderer='disabled' - ): - worktree.drop( - subdataset_info['path'], - recursive=True, - reckless='kill', - what='all', - result_renderer='disabled') - - def prune_worktrees(dataset: Dataset) -> None: call_git_lines(['worktree', 'prune'], cwd=dataset.pathobj) for result in dataset.subdatasets(result_renderer='disabled'): @@ -190,7 +184,7 @@ def provide(dataset: Dataset, # Get all input files in the worktree with chdir(worktree_dataset.path): for file in input_files: - lgr.debug('Provisioning file %s', file) + lgr.debug('provisioning input file %s', file) worktree_dataset.get(file, result_renderer='disabled') return worktree_dir @@ -238,7 +232,6 @@ def get_subdataset_info(dataset: Dataset) -> Iterable[tuple[Path, Path, Path]]: results = dataset.subdatasets( recursive=True, result_renderer='disabled') - check_results(results) return [ ( Path(result['path']), From d8bc2c78c4a89d28a9fd1391767025faefea80e3 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sat, 5 Oct 2024 13:06:13 +0200 Subject: [PATCH 086/148] improve collection in annex remote This commit adds code to the collection stage of the annex remote that checks whether non-annexed output files are identicall to their counterpart in the source dataset. 
--- datalad_compute/annexremotes/compute.py | 32 +++++++++++++++++++------ 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/datalad_compute/annexremotes/compute.py b/datalad_compute/annexremotes/compute.py index 7c7a4a8..b58e0a1 100644 --- a/datalad_compute/annexremotes/compute.py +++ b/datalad_compute/annexremotes/compute.py @@ -1,9 +1,11 @@ from __future__ import annotations +import hashlib import json import logging import shutil import subprocess +from hashlib import md5 from pathlib import Path from typing import Any from urllib.parse import ( @@ -160,17 +162,33 @@ def _collect(self, if output == this: continue dataset_path, file_path = get_file_dataset(dataset.pathobj / output) - call_git_success([ - '-C', str(dataset_path), - 'annex', 'reinject', - str(worktree / output), - str(file_path)], + is_annexed = call_git_success( + ['annex', 'whereis', str(file_path)], + cwd=dataset_path, capture_output=True) - - # Collect `this` file + if is_annexed: + call_git_success( + ['annex', 'reinject', str(worktree / output), str(file_path)], + cwd=dataset_path, + capture_output=True) + else: + # Check that the files in worktree and source dataset are identical + assert ( + _hash_file(dataset.pathobj / output) + == _hash_file(worktree / output), + f'calculated output: ({worktree / output}) differs from ' + f'original output: ({dataset.pathobj / output}).') + + # Collect `this` file. It has to be copied to the destination given + # by git-annex. Git-annex will check its integrity. shutil.copyfile(worktree / this, this_destination) +def _hash_file(file: str | Path) -> str: + with open(file, 'rb') as f: + return hashlib.file_digest(f, 'md5').hexdigest() + + def main(): """cmdline entry point""" super_main( From 82e3d27d466a4593aa59c3a33fd038b6285f2a90 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sat, 5 Oct 2024 13:39:13 +0200 Subject: [PATCH 087/148] ensure that addurl is only called on annexed files --- datalad_compute/commands/compute_cmd.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index fce1c0f..c9535a1 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -213,12 +213,20 @@ def add_url(dataset: Dataset, # Build the file-specific URL and store it in the annex url = url_base + f'&this={quote(file_path)}' file_dataset_path, file_path = get_file_dataset(dataset.pathobj / file_path) - success = call_git_success( - ['-C', str(file_dataset_path), 'annex', 'addurl', url, '--file', file_path] - + (['--relaxed'] if url_only else []), - capture_output=True,) - - assert success, f'\naddurl failed:\nfile_dataset_path: {file_dataset_path}\nurl: {url!r}\nfile_path: {file_path!r}' + is_annexed = call_git_success( + ['annex', 'whereis', str(file_path)], + cwd=file_dataset_path, + capture_output=True) + if is_annexed: + success = call_git_success( + ['annex', 'addurl', url, '--file', file_path] + + (['--relaxed'] if url_only else []), + cwd=file_dataset_path, + capture_output=True) + assert ( + success, + f'\naddurl failed:\nfile_dataset_path: {file_dataset_path}\n' + f'url: {url!r}\nfile_path: {file_path!r}') return url From 068280f1426627c40282f88abfc37beca2131936 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sat, 5 Oct 2024 13:43:52 +0200 Subject: [PATCH 088/148] use `pwd`-kwarg instead of `-C ` --- datalad_compute/commands/compute_cmd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index c9535a1..520e0c1 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -236,8 +236,8 @@ def get_file_dataset(file: Path) -> tuple[Path, Path]: Determine the path of the dataset that contains the file and the relative path of the file in this dataset.""" top_level = Path(call_git_oneline( - ['-C', str(file.parent), 'rev-parse', '--show-toplevel'] - )) + ['rev-parse', '--show-toplevel'], + cwd=file.parent)) return ( Path(top_level), file.absolute().relative_to(top_level)) From ab0068097bbd1d60b7509188cda0962a52db25df Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sat, 5 Oct 2024 13:52:52 +0200 Subject: [PATCH 089/148] change python version to 3.11 in tests --- .appveyor.yml | 4 ++-- .github/workflows/test_crippledfs.yml | 0 2 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/test_crippledfs.yml diff --git a/.appveyor.yml b/.appveyor.yml index ac2768b..a179521 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -68,14 +68,14 @@ environment: # Ubuntu core tests - job_name: test-linux APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2204 - PY: 3.9 + PY: 3.11 INSTALL_GITANNEX: git-annex -m snapshot # same as 'test-linux', but TMPDIR is on a crippled filesystem, causing # most, if not all test datasets to be created on that filesystem - job_name: test-linux-crippled APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2204 - PY: 3.9 + PY: 3.11 # datalad-annex git remote needs something after git-annex_8.20211x INSTALL_GITANNEX: git-annex -m snapshot diff --git a/.github/workflows/test_crippledfs.yml b/.github/workflows/test_crippledfs.yml new file mode 100644 index 0000000..e69de29 From bfe95c7b000cfce50ea1a915c69d5f96468b52af Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sat, 5 Oct 2024 13:55:16 +0200 Subject: [PATCH 090/148] convert util tests to subdataset --- datalad_compute/utils/tests/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 datalad_compute/utils/tests/__init__.py diff --git a/datalad_compute/utils/tests/__init__.py b/datalad_compute/utils/tests/__init__.py new file mode 100644 index 0000000..e69de29 From 412f1d43e001a988b29e19ca59fcd47cf9437c54 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sat, 5 Oct 2024 21:58:14 +0200 Subject: [PATCH 091/148] remove hash comparison of worktree and dataset files --- datalad_compute/annexremotes/compute.py | 14 -------------- datalad_compute/commands/tests/test_collection.py | 0 2 files changed, 14 deletions(-) create mode 100644 datalad_compute/commands/tests/test_collection.py diff --git a/datalad_compute/annexremotes/compute.py b/datalad_compute/annexremotes/compute.py index b58e0a1..25d4c9d 100644 --- a/datalad_compute/annexremotes/compute.py +++ b/datalad_compute/annexremotes/compute.py @@ -1,11 +1,9 @@ from __future__ import annotations -import hashlib import json import logging import shutil import subprocess -from hashlib import md5 from pathlib import Path from typing import Any from urllib.parse import ( @@ -171,24 +169,12 @@ def _collect(self, ['annex', 'reinject', str(worktree / output), str(file_path)], cwd=dataset_path, capture_output=True) - else: - # Check that the files in worktree and source dataset are identical - assert ( - _hash_file(dataset.pathobj / output) - == _hash_file(worktree / output), - f'calculated output: ({worktree / output}) differs from ' - f'original output: ({dataset.pathobj / output}).') # Collect `this` file. 
It has to be copied to the destination given # by git-annex. Git-annex will check its integrity. shutil.copyfile(worktree / this, this_destination) -def _hash_file(file: str | Path) -> str: - with open(file, 'rb') as f: - return hashlib.file_digest(f, 'md5').hexdigest() - - def main(): """cmdline entry point""" super_main( diff --git a/datalad_compute/commands/tests/test_collection.py b/datalad_compute/commands/tests/test_collection.py new file mode 100644 index 0000000..e69de29 From dba0ea0d4af8587c31318f9f9a700020aecb6c11 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sun, 6 Oct 2024 05:44:10 +0200 Subject: [PATCH 092/148] add output pattern support for computation This commit adds support for output patterns to the compute-command and the compute special remote. --- datalad_compute/annexremotes/compute.py | 13 +++- .../annexremotes/tests/test_hierarchies.py | 62 ++++++++++++++++++- datalad_compute/commands/compute_cmd.py | 53 +++++++++------- 3 files changed, 102 insertions(+), 26 deletions(-) diff --git a/datalad_compute/annexremotes/compute.py b/datalad_compute/annexremotes/compute.py index 25d4c9d..cdca9af 100644 --- a/datalad_compute/annexremotes/compute.py +++ b/datalad_compute/annexremotes/compute.py @@ -5,7 +5,10 @@ import shutil import subprocess from pathlib import Path -from typing import Any +from typing import ( + Any, + Iterable, +) from urllib.parse import ( unquote, urlparse, @@ -27,7 +30,7 @@ provide, un_provide, ) - +from ..utils.glob import resolve_patterns lgr = logging.getLogger('datalad.compute.annexremotes.compute') @@ -148,12 +151,15 @@ def _find_dataset(self, def _collect(self, worktree: Path, dataset: Dataset, - outputs: list[str], + output_patterns: Iterable[str], this: str, this_destination: str, ) -> None: """Collect computation results for `this` (and all other outputs) """ + # Get all outputs that were created during computation + outputs = resolve_patterns(root_dir=worktree, patterns=output_patterns) + # Collect all output files that have been created while creating # `this` file. for output in outputs: @@ -165,6 +171,7 @@ def _collect(self, cwd=dataset_path, capture_output=True) if is_annexed: + self.annex.debug(f'_collect: reinject: {worktree / output} -> {dataset_path}:{file_path}') call_git_success( ['annex', 'reinject', str(worktree / output), str(file_path)], cwd=dataset_path, diff --git a/datalad_compute/annexremotes/tests/test_hierarchies.py b/datalad_compute/annexremotes/tests/test_hierarchies.py index 4b60694..ed4021e 100644 --- a/datalad_compute/annexremotes/tests/test_hierarchies.py +++ b/datalad_compute/annexremotes/tests/test_hierarchies.py @@ -2,6 +2,7 @@ from datalad.api import get as datalad_get from datalad_next.datasets import Dataset +from datalad_next.runners import call_git_success from datalad_next.tests.fixtures import datalad_cfg from ... 
import ( @@ -40,6 +41,15 @@ 'd2_subds0/d2_subds1/d2_subds2/a2.txt', 'd2_subds0/d2_subds1/d2_subds2/b2.txt', 'd2_subds0/d2_subds1/d2_subds2/new.txt', ] + +output_pattern = [ + '*.txt', + 'd2_subds0/*.txt', + 'd2_subds0/d2_subds1/*.txt', + 'd2_subds0/d2_subds1/d2_subds2/*.txt', +] + + test_file_content = [ (file, content) for file, content in @@ -50,7 +60,7 @@ def _drop_files(dataset: Dataset, files: Iterable[str]): for file in files: - dataset.drop(file, result_renderer='disabled') + dataset.drop(file, reckless='availability', result_renderer='disabled') assert not (dataset.pathobj / file).exists() @@ -109,3 +119,53 @@ def test_end_to_end(tmp_path, datalad_cfg, monkeypatch): datalad_get('d2_subds0/d2_subds1/a1.txt') _check_content(root_dataset, test_file_content) + + +def test_end_to_end_with_pattern(tmp_path, datalad_cfg, monkeypatch): + + datasets = create_ds_hierarchy(tmp_path, 'd2', 3) + root_dataset = datasets[0][2] + + # add method template + template_path = root_dataset.pathobj / template_dir + template_path.mkdir(parents=True) + (template_path / 'test_method').write_text(test_method) + root_dataset.save(result_renderer='disabled') + + # set annex security related variables to allow compute-URLs + datalad_cfg.set('annex.security.allowed-url-schemes', url_scheme, scope='global') + datalad_cfg.set('annex.security.allowed-ip-addresses', 'all', scope='global') + datalad_cfg.set('annex.security.allow-unverified-downloads', 'ACKTHPPT', scope='global') + + # run compute command + root_dataset.compute( + template='test_method', + parameter=[ + 'first=first', + 'second=second', + 'third=third', + ], + output=output_pattern, + result_renderer='disabled') + + # check computation success + _check_content(root_dataset, test_file_content) + + # Drop all computed content + _drop_files(root_dataset, output) + + # Go to the subdataset `d2_subds0/d2_subds1` and fetch the content of `a1.txt` + # from a compute remote. + monkeypatch.chdir(root_dataset.pathobj / 'd2_subds0' / 'd2_subds1') + datalad_get('a1.txt') + + # check that all known files that were computed are added to the annex + _check_content(root_dataset, test_file_content) + + _drop_files(root_dataset, output) + + # check get in subdatasets + monkeypatch.chdir(root_dataset.pathobj) + datalad_get('d2_subds0/d2_subds1/a1.txt') + + _check_content(root_dataset, test_file_content) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 520e0c1..a3b7ba1 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -7,8 +7,6 @@ import logging import os import shutil -import subprocess -from itertools import chain from pathlib import Path from typing import Iterable from urllib.parse import quote @@ -39,6 +37,7 @@ url_scheme, ) from ..utils.compute import compute +from ..utils.glob import resolve_patterns __docformat__ = 'restructuredtext' @@ -90,7 +89,7 @@ class Compute(ValidatedInterface): action='append', doc="An input file pattern (repeat for multiple inputs, " "file pattern support python globbing, globbing is expanded " - "in the source dataset"), + "in the source dataset)"), input_list=Parameter( args=('-I', '--input-list',), doc="Name of a file that contains a list of input file patterns. 
" @@ -102,10 +101,12 @@ class Compute(ValidatedInterface): output=Parameter( args=('-o', '--output',), action='append', - doc="Name of an output file (repeat for multiple outputs)"), + doc="An output file pattern (repeat for multiple outputs)" + "file pattern support python globbing, globbing is expanded " + "in the worktree)"), output_list=Parameter( args=('-O', '--output-list',), - doc="Name of a file that contains a list of output files. Format " + doc="Name of a file that contains a list of output patterns. Format " "is one file per line, relative path from `dataset`. Empty " "lines, i.e. lines that contain only newlines, arg ignored. " "This is useful if a large number of output files should be " @@ -143,13 +144,13 @@ def __call__(dataset=None, dataset : Dataset = dataset.ds if dataset else Dataset('.') input_pattern = (input or []) + read_list(input_list) - output = (output or []) + read_list(output_list) + output_pattern = (output or []) + read_list(output_list) parameter = (parameter or []) + read_list(parameter_list) if not url_only: worktree = provide(dataset, branch, input_pattern) - execute(worktree, template, parameter, output) - collect(worktree, dataset, output) + execute(worktree, template, parameter, output_pattern) + output = collect(worktree, dataset, output_pattern) dataset.provision(delete=worktree) url_base = get_url( @@ -158,7 +159,7 @@ def __call__(dataset=None, template, parameter, input_pattern, - output) + output_pattern) for out in output: url = add_url(dataset, out, url_base, url_only) @@ -184,8 +185,8 @@ def get_url(dataset: Dataset, branch: str | None, template_name: str, parameters: dict[str, str], - input_files: list[str], - output_files: list[str], + input_pattern: list[str], + output_pattern: list[str], ) -> str: branch = dataset.repo.get_hexsha() if branch is None else branch @@ -194,8 +195,8 @@ def get_url(dataset: Dataset, + f'?root_id={quote(dataset.id)}' + f'&default_root_version={quote(branch)}' + f'&method={quote(template_name)}' - + f'&input={quote(json.dumps(input_files))}' - + f'&output={quote(json.dumps(output_files))}' + + f'&input={quote(json.dumps(input_pattern))}' + + f'&output={quote(json.dumps(output_pattern))}' + f'¶ms={quote(json.dumps(parameters))}' ) @@ -256,20 +257,25 @@ def provide(dataset: Dataset, def execute(worktree: Path, template_name: str, parameter: list[str], - output: list[str], + output_pattern: list[str], ) -> None: lgr.debug( 'execute: %s %s %s %s', str(worktree), - template_name, repr(parameter), repr(output)) + template_name, repr(parameter), repr(output_pattern)) worktree_ds = Dataset(worktree) - # Get the subdatasets, directories, and files that are part of the output - # space. 
- create_output_space(worktree_ds, output) - # Unlock output files in the output space (worktree-directory) - unlock_files(worktree_ds, output) + # Determine which outputs already exist + existing_outputs = resolve_patterns( + root_dir=worktree, + patterns=output_pattern) + + # Get the subdatasets, directories, and files of the existing output space + create_output_space(worktree_ds, existing_outputs) + + # Unlock existing output files in the output space (worktree-directory) + unlock_files(worktree_ds, existing_outputs) # Run the computation in the worktree-directory template_path = worktree / template_dir / template_name @@ -282,8 +288,10 @@ def execute(worktree: Path, def collect(worktree: Path, dataset: Dataset, - output: Iterable[str], - ) -> None: + output_pattern: Iterable[str], + ) -> set[str]: + + output = resolve_patterns(root_dir=worktree, patterns=output_pattern) # Unlock output files in the dataset-directory and copy the result unlock_files(dataset, output) @@ -295,6 +303,7 @@ def collect(worktree: Path, # Save the dataset dataset.save(recursive=True, result_renderer='disabled') + return output def unlock_files(dataset: Dataset, From 9305747c29bcb5599c51f1f325fbfcb4a237e230 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sun, 6 Oct 2024 05:55:15 +0200 Subject: [PATCH 093/148] remove an unused import --- datalad_compute/annexremotes/tests/test_hierarchies.py | 1 - 1 file changed, 1 deletion(-) diff --git a/datalad_compute/annexremotes/tests/test_hierarchies.py b/datalad_compute/annexremotes/tests/test_hierarchies.py index ed4021e..968f52a 100644 --- a/datalad_compute/annexremotes/tests/test_hierarchies.py +++ b/datalad_compute/annexremotes/tests/test_hierarchies.py @@ -2,7 +2,6 @@ from datalad.api import get as datalad_get from datalad_next.datasets import Dataset -from datalad_next.runners import call_git_success from datalad_next.tests.fixtures import datalad_cfg from ... import ( From c3168e5ed448ac431b277eaf39ad64dff440d7cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20M=C3=B6nch?= Date: Sun, 6 Oct 2024 08:00:25 +0200 Subject: [PATCH 094/148] enable recursive globbing --- datalad_compute/utils/glob.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_compute/utils/glob.py b/datalad_compute/utils/glob.py index f11a68b..34fb19b 100644 --- a/datalad_compute/utils/glob.py +++ b/datalad_compute/utils/glob.py @@ -12,5 +12,5 @@ def resolve_patterns(root_dir: str | Path, ) -> set[str]: return set( chain.from_iterable( - glob(pattern, root_dir=str(root_dir)) + glob(pattern, root_dir=str(root_dir), recursive=True) for pattern in patterns)) From 9042cadde14f6a4772f6093f384c60cd2e0ff1d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20M=C3=B6nch?= Date: Sun, 6 Oct 2024 09:01:22 +0200 Subject: [PATCH 095/148] update fmriprep docker example This commit updates the example to use input/output globbing. In addition it improves the description and fixes and error in the invocation. 
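The recursive patterns the updated example relies on (for instance `datasets/ds000102/sub-01/**` for inputs and `derivatives/ds000102/**` for outputs) are expanded with Python's standard globbing, which patch 094 above switches to `recursive=True` in `resolve_patterns`. A minimal sketch of that expansion, assuming only stdlib `glob`/`pathlib` behaviour; the `expand` helper name and the example paths are illustrative, not part of the code base:

```python
# Expand one pattern relative to a dataset root, keeping files only,
# mirroring resolve_patterns() with recursive=True.
from glob import glob
from pathlib import Path

def expand(root_dir: str, pattern: str) -> set[str]:
    return {
        path for path in glob(pattern, root_dir=root_dir, recursive=True)
        if not (Path(root_dir) / path).is_dir()
    }

# expand('/data/root-dataset', 'datasets/ds000102/sub-01/**') would list
# every file below sub-01/ (anat, func, ...); directories are filtered out.
```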
--- examples/fmriprep docker/input.txt | 7 +-- examples/fmriprep docker/output.txt | 85 +------------------------- examples/fmriprep docker/parameter.txt | 2 +- examples/fmriprep docker/readme.md | 12 +++- 4 files changed, 13 insertions(+), 93 deletions(-) diff --git a/examples/fmriprep docker/input.txt b/examples/fmriprep docker/input.txt index 9693bad..108c779 100644 --- a/examples/fmriprep docker/input.txt +++ b/examples/fmriprep docker/input.txt @@ -4,9 +4,4 @@ datasets/ds000102/participants.tsv datasets/ds000102/T1w.json datasets/ds000102/task-flanker_bold.json -datasets/ds000102/sub-01/anat/sub-01_T1w.nii.gz - -datasets/ds000102/sub-01/func/sub-01_task-flanker_run-1_bold.nii.gz -datasets/ds000102/sub-01/func/sub-01_task-flanker_run-1_events.tsv -datasets/ds000102/sub-01/func/sub-01_task-flanker_run-2_bold.nii.gz -datasets/ds000102/sub-01/func/sub-01_task-flanker_run-2_events.tsv +datasets/ds000102/sub-01/** diff --git a/examples/fmriprep docker/output.txt b/examples/fmriprep docker/output.txt index 14ac0f4..829130f 100644 --- a/examples/fmriprep docker/output.txt +++ b/examples/fmriprep docker/output.txt @@ -1,85 +1,2 @@ # Paths are relative to the dataset in which `datalad compute` was executed -datasets/ds000102/derivatives/logs/CITATION.md -datasets/ds000102/derivatives/logs/CITATION.bib -datasets/ds000102/derivatives/logs/CITATION.html -datasets/ds000102/derivatives/logs/CITATION.tex - -datasets/ds000102/derivatives/sub-01/figures/sub-01_desc-about_T1w.html -datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-1_desc-validation_bold.html -datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-2_desc-validation_bold.html -datasets/ds000102/derivatives/sub-01/figures/sub-01_desc-summary_T1w.html -datasets/ds000102/derivatives/sub-01/figures/sub-01_desc-conform_T1w.html -datasets/ds000102/derivatives/sub-01/figures/sub-01_dseg.svg -datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-1_desc-summary_bold.html -datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-2_desc-summary_bold.html -datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-1_desc-coreg_bold.svg -datasets/ds000102/derivatives/sub-01/figures/sub-01_space-MNI152NLin2009cAsym_T1w.svg -datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-2_desc-coreg_bold.svg -datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-1_desc-rois_bold.svg -datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-1_desc-compcorvar_bold.svg -datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-1_desc-confoundcorr_bold.svg -datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-2_desc-compcorvar_bold.svg -datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-2_desc-rois_bold.svg -datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-2_desc-confoundcorr_bold.svg -datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-1_desc-carpetplot_bold.svg -datasets/ds000102/derivatives/sub-01/figures/sub-01_task-flanker_run-2_desc-carpetplot_bold.svg - -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_desc-hmc_boldref.nii.gz -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_desc-hmc_boldref.json -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_desc-hmc_boldref.nii.gz -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_desc-hmc_boldref.json 
-datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_from-orig_to-boldref_mode-image_desc-hmc_xfm.txt -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_from-orig_to-boldref_mode-image_desc-hmc_xfm.json -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_from-orig_to-boldref_mode-image_desc-hmc_xfm.txt -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_from-orig_to-boldref_mode-image_desc-hmc_xfm.json -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_desc-coreg_boldref.nii.gz -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_desc-coreg_boldref.json -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_desc-coreg_boldref.nii.gz -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_desc-coreg_boldref.json -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_desc-brain_mask.nii.gz -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_desc-brain_mask.json -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_desc-brain_mask.nii.gz -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_desc-brain_mask.json -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_from-boldref_to-T1w_mode-image_desc-coreg_xfm.txt -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_from-boldref_to-T1w_mode-image_desc-coreg_xfm.json -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_from-boldref_to-T1w_mode-image_desc-coreg_xfm.txt -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_from-boldref_to-T1w_mode-image_desc-coreg_xfm.json -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_space-MNI152NLin2009cAsym_boldref.nii.gz -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_space-MNI152NLin2009cAsym_boldref.json -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_space-MNI152NLin2009cAsym_desc-brain_mask.nii.gz -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_space-MNI152NLin2009cAsym_desc-brain_mask.json -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_space-MNI152NLin2009cAsym_boldref.nii.gz -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_space-MNI152NLin2009cAsym_boldref.json -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_space-MNI152NLin2009cAsym_desc-brain_mask.nii.gz -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_space-MNI152NLin2009cAsym_desc-brain_mask.json -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_space-MNI152NLin2009cAsym_desc-preproc_bold.nii.gz -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_space-MNI152NLin2009cAsym_desc-preproc_bold.json -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_space-MNI152NLin2009cAsym_desc-preproc_bold.nii.gz -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_space-MNI152NLin2009cAsym_desc-preproc_bold.json -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_desc-confounds_timeseries.tsv -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-1_desc-confounds_timeseries.json -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_desc-confounds_timeseries.tsv -datasets/ds000102/derivatives/sub-01/func/sub-01_task-flanker_run-2_desc-confounds_timeseries.json - -datasets/ds000102/derivatives/sub-01/anat/sub-01_desc-preproc_T1w.nii.gz 
-datasets/ds000102/derivatives/sub-01/anat/sub-01_desc-preproc_T1w.json -datasets/ds000102/derivatives/sub-01/anat/sub-01_desc-brain_mask.nii.gz -datasets/ds000102/derivatives/sub-01/anat/sub-01_desc-brain_mask.json -datasets/ds000102/derivatives/sub-01/anat/sub-01_dseg.nii.gz -datasets/ds000102/derivatives/sub-01/anat/sub-01_label-GM_probseg.nii.gz -datasets/ds000102/derivatives/sub-01/anat/sub-01_label-WM_probseg.nii.gz -datasets/ds000102/derivatives/sub-01/anat/sub-01_label-CSF_probseg.nii.gz -datasets/ds000102/derivatives/sub-01/anat/sub-01_from-T1w_to-MNI152NLin2009cAsym_mode-image_xfm.h5 -datasets/ds000102/derivatives/sub-01/anat/sub-01_from-MNI152NLin2009cAsym_to-T1w_mode-image_xfm.h5 -datasets/ds000102/derivatives/sub-01/anat/sub-01_space-MNI152NLin2009cAsym_desc-brain_mask.nii.gz -datasets/ds000102/derivatives/sub-01/anat/sub-01_space-MNI152NLin2009cAsym_desc-brain_mask.json -datasets/ds000102/derivatives/sub-01/anat/sub-01_space-MNI152NLin2009cAsym_desc-preproc_T1w.nii.gz -datasets/ds000102/derivatives/sub-01/anat/sub-01_space-MNI152NLin2009cAsym_desc-preproc_T1w.json -datasets/ds000102/derivatives/sub-01/anat/sub-01_space-MNI152NLin2009cAsym_dseg.nii.gz -datasets/ds000102/derivatives/sub-01/anat/sub-01_space-MNI152NLin2009cAsym_label-GM_probseg.nii.gz -datasets/ds000102/derivatives/sub-01/anat/sub-01_space-MNI152NLin2009cAsym_label-WM_probseg.nii.gz -datasets/ds000102/derivatives/sub-01/anat/sub-01_space-MNI152NLin2009cAsym_label-CSF_probseg.nii.gz - -datasets/ds000102/derivatives/sub-01.html -datasets/ds000102/derivatives/dataset_description.json -datasets/ds000102/derivatives/.bidsignore +derivatives/ds000102/** diff --git a/examples/fmriprep docker/parameter.txt b/examples/fmriprep docker/parameter.txt index 7398855..421e49b 100644 --- a/examples/fmriprep docker/parameter.txt +++ b/examples/fmriprep docker/parameter.txt @@ -1,4 +1,4 @@ input_dir=datasets/ds000102 -output_dir=datasets/ds000102/derivatives +output_dir=derivatives/ds000102 participant_label=01 license_file=license.txt diff --git a/examples/fmriprep docker/readme.md b/examples/fmriprep docker/readme.md index 4c35ec6..7e9d6d1 100644 --- a/examples/fmriprep docker/readme.md +++ b/examples/fmriprep docker/readme.md @@ -1,6 +1,14 @@ This directory contains a simple example for running `fmriprep-docker` on a single subject of a BIDS dataset. The template is `fmriprep-docker`, input, output, and parameter files are defined in `input.txt`, `output.txt`, and `parameter.txt`, respectively. -The example assumes that the BIDS dataset referenced in `input_dir` is a subdataset of the dataset in which the computation is started (the root-dataset), as outlined in the fairly-big-follow-up document (https://hackmd.io/7oRB8qwuRtCm6BkV44Ubww). +The example assumes that the BIDS dataset referenced in `input_dir` is a subdataset of the dataset in which the computation is started (the root-dataset), as outlined in the fairly-big-follow-up document (https://hackmd.io/7oRB8qwuRtCm6BkV44Ubww). In contrast to the fairly-big-follow-up document, the example uses another subdataset, that collects the results of the computation. The dataset layout is therefore as follows: +``` +root-dataset +├── datasets +│ ├── ds000102 +│ +├── derivatives + ├── ds000102 +``` Executing the computation requires installation of this extension (see https://github.com/christian-monch/datalad-compute/tree/main/README.md), and the installation of the python package `fmriprep-docker`. The template, i.e. 
`fmriprep-docker` has to be placed in the folder `.datalad/compute/methods` of the root-dataset (and the dataset has to be saved). @@ -12,5 +20,5 @@ To keep the command line short, input files, output files, and parameter for the The computation can be executed with the following command: ```bash -> datalad compute -I input.txt -O output.txt -P parameter.txt fmriprep_template +> datalad compute -I input.txt -O output.txt -P parameter.txt fmriprep-docker ``` From 4c909871949d2a1585f52e12228fcb8f9b824726 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sun, 6 Oct 2024 12:01:08 +0200 Subject: [PATCH 096/148] refactor compute special remote tests --- .../annexremotes/tests/test_hierarchies.py | 76 +++++-------------- 1 file changed, 18 insertions(+), 58 deletions(-) diff --git a/datalad_compute/annexremotes/tests/test_hierarchies.py b/datalad_compute/annexremotes/tests/test_hierarchies.py index 968f52a..78289d5 100644 --- a/datalad_compute/annexremotes/tests/test_hierarchies.py +++ b/datalad_compute/annexremotes/tests/test_hierarchies.py @@ -1,4 +1,6 @@ -from collections.abc import Iterable +from typing import Iterable + +import pytest from datalad.api import get as datalad_get from datalad_next.datasets import Dataset @@ -33,7 +35,7 @@ """ -output = [ +output_pattern_static = [ 'a.txt', 'b.txt', 'new.txt', 'd2_subds0/a0.txt', 'd2_subds0/b0.txt', 'd2_subds0/new.txt', 'd2_subds0/d2_subds1/a1.txt', 'd2_subds0/d2_subds1/b1.txt', 'd2_subds0/d2_subds1/new.txt', @@ -41,7 +43,7 @@ ] -output_pattern = [ +output_pattern_glob = [ '*.txt', 'd2_subds0/*.txt', 'd2_subds0/d2_subds1/*.txt', @@ -52,7 +54,9 @@ test_file_content = [ (file, content) for file, content in - zip(output, ['content: first\n', 'content: second\n', 'content: third\n'] * 4) + zip( + output_pattern_static, + ['content: first\n', 'content: second\n', 'content: third\n'] * 4) ] @@ -70,57 +74,8 @@ def _check_content(dataset, assert (dataset.pathobj / file).read_text() == content -def test_end_to_end(tmp_path, datalad_cfg, monkeypatch): - - datasets = create_ds_hierarchy(tmp_path, 'd2', 3) - root_dataset = datasets[0][2] - - # add method template - template_path = root_dataset.pathobj / template_dir - template_path.mkdir(parents=True) - (template_path / 'test_method').write_text(test_method) - root_dataset.save(result_renderer='disabled') - - # set annex security related variables to allow compute-URLs - datalad_cfg.set('annex.security.allowed-url-schemes', url_scheme, scope='global') - datalad_cfg.set('annex.security.allowed-ip-addresses', 'all', scope='global') - datalad_cfg.set('annex.security.allow-unverified-downloads', 'ACKTHPPT', scope='global') - - # run compute command - root_dataset.compute( - template='test_method', - parameter=[ - 'first=first', - 'second=second', - 'third=third', - ], - output=output, - result_renderer='disabled') - - # check computation success - _check_content(root_dataset, test_file_content) - - # Drop all computed content - _drop_files(root_dataset, output) - - # Go to the subdataset `d2_subds0/d2_subds1` and fetch the content of `a1.txt` - # from a compute remote. 
- monkeypatch.chdir(root_dataset.pathobj / 'd2_subds0' / 'd2_subds1') - datalad_get('a1.txt') - - # check that all files are computed - _check_content(root_dataset, test_file_content) - - _drop_files(root_dataset, output) - - # check get in subdatasets - monkeypatch.chdir(root_dataset.pathobj) - datalad_get('d2_subds0/d2_subds1/a1.txt') - - _check_content(root_dataset, test_file_content) - - -def test_end_to_end_with_pattern(tmp_path, datalad_cfg, monkeypatch): +@pytest.mark.parametrize('output_pattern', [output_pattern_static, output_pattern_glob]) +def test_end_to_end(tmp_path, datalad_cfg, monkeypatch, output_pattern): datasets = create_ds_hierarchy(tmp_path, 'd2', 3) root_dataset = datasets[0][2] @@ -137,7 +92,7 @@ def test_end_to_end_with_pattern(tmp_path, datalad_cfg, monkeypatch): datalad_cfg.set('annex.security.allow-unverified-downloads', 'ACKTHPPT', scope='global') # run compute command - root_dataset.compute( + results = root_dataset.compute( template='test_method', parameter=[ 'first=first', @@ -147,11 +102,16 @@ def test_end_to_end_with_pattern(tmp_path, datalad_cfg, monkeypatch): output=output_pattern, result_renderer='disabled') + collected_output = [ + str(result['path'].relative_to(root_dataset.pathobj)) + for result in results] + assert set(collected_output) == set(output_pattern_static) + # check computation success _check_content(root_dataset, test_file_content) # Drop all computed content - _drop_files(root_dataset, output) + _drop_files(root_dataset, collected_output) # Go to the subdataset `d2_subds0/d2_subds1` and fetch the content of `a1.txt` # from a compute remote. @@ -161,7 +121,7 @@ def test_end_to_end_with_pattern(tmp_path, datalad_cfg, monkeypatch): # check that all known files that were computed are added to the annex _check_content(root_dataset, test_file_content) - _drop_files(root_dataset, output) + _drop_files(root_dataset, collected_output) # check get in subdatasets monkeypatch.chdir(root_dataset.pathobj) From aff673554402e5ed1f75794df9136e2acb4025a8 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sun, 6 Oct 2024 15:30:40 +0200 Subject: [PATCH 097/148] fix globing and add tests --- .../commands/tests/test_collection.py | 27 +++++++++++++++++++ datalad_compute/utils/glob.py | 8 +++--- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/datalad_compute/commands/tests/test_collection.py b/datalad_compute/commands/tests/test_collection.py index e69de29..1e3781f 100644 --- a/datalad_compute/commands/tests/test_collection.py +++ b/datalad_compute/commands/tests/test_collection.py @@ -0,0 +1,27 @@ +from pathlib import Path + +from .create_datasets import create_ds_hierarchy +from .test_provision import get_file_list +from ..compute_cmd import collect + + +def test_collect(tmp_path): + + dataset = create_ds_hierarchy(tmp_path, 'ds1', 1)[0][2] + + worktree_dir = tmp_path / 'ds1_worktree' + worktree_dir.mkdir(parents=True, exist_ok=False) + worktree = dataset.provision(worktree_dir=worktree_dir) + + result_dir = worktree_dir / 'results' / 'sub-01' + result_dir.mkdir(parents=True) + (result_dir / 'a.txt').write_text('content: a\n') + (result_dir / 'b.txt').write_text('content: b\n') + + result = collect( + worktree=Path(worktree[0]['path']), + dataset=dataset, + output_pattern=['results/**'] + ) + assert result == {'results/sub-01/a.txt', 'results/sub-01/b.txt'} + assert set(get_file_list(dataset.pathobj / 'results')) == {'sub-01/a.txt', 'sub-01/b.txt'} diff --git a/datalad_compute/utils/glob.py b/datalad_compute/utils/glob.py index 
34fb19b..90cae72 100644 --- a/datalad_compute/utils/glob.py +++ b/datalad_compute/utils/glob.py @@ -11,6 +11,8 @@ def resolve_patterns(root_dir: str | Path, patterns: Iterable[str] ) -> set[str]: return set( - chain.from_iterable( - glob(pattern, root_dir=str(root_dir), recursive=True) - for pattern in patterns)) + filter( + lambda p: not (Path(root_dir) / p).is_dir(), + chain.from_iterable( + glob(pattern, root_dir=str(root_dir), recursive=True) + for pattern in patterns))) From 81ee14272e13d8f09a4c8ad7ee3539b3c6e8e2a5 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sun, 6 Oct 2024 15:54:14 +0200 Subject: [PATCH 098/148] add a provisioning context-manager --- datalad_compute/annexremotes/compute.py | 26 +++++++------- datalad_compute/commands/compute_cmd.py | 34 ++++++++++++------- .../commands/tests/test_provision.py | 9 +++++ 3 files changed, 44 insertions(+), 25 deletions(-) diff --git a/datalad_compute/annexremotes/compute.py b/datalad_compute/annexremotes/compute.py index cdca9af..e1030f9 100644 --- a/datalad_compute/annexremotes/compute.py +++ b/datalad_compute/annexremotes/compute.py @@ -27,8 +27,7 @@ from ..commands.compute_cmd import ( execute, get_file_dataset, - provide, - un_provide, + provide_context, ) from ..utils.glob import resolve_patterns @@ -112,16 +111,19 @@ def transfer_retrieve(self, key: str, file_name: str) -> None: # Perform the computation, and collect the results lgr.debug('Starting provision') self.annex.debug('Starting provision') - worktree = provide(dataset, compute_info['root_version'], compute_info['input']) - lgr.debug('Starting execution') - self.annex.debug('Starting execution') - execute(worktree, compute_info['method'], compute_info['parameter'], compute_info['output']) - lgr.debug('Starting collection') - self.annex.debug('Starting collection') - self._collect(worktree, dataset, compute_info['output'], compute_info['this'], file_name) - lgr.debug('Starting unprovision') - self.annex.debug('Starting unprovision') - un_provide(dataset, worktree) + with provide_context( + dataset, + compute_info['root_version'], + compute_info['input'] + ) as worktree: + lgr.debug('Starting execution') + self.annex.debug('Starting execution') + execute(worktree, compute_info['method'], compute_info['parameter'], compute_info['output']) + lgr.debug('Starting collection') + self.annex.debug('Starting collection') + self._collect(worktree, dataset, compute_info['output'], compute_info['this'], file_name) + lgr.debug('Starting unprovision') + self.annex.debug('Starting unprovision') def checkpresent(self, key: str) -> bool: # See if at least one URL with the compute url-scheme is present diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index a3b7ba1..60c8c15 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -8,7 +8,10 @@ import os import shutil from pathlib import Path -from typing import Iterable +from typing import ( + Generator, + Iterable, +) from urllib.parse import quote from datalad.support.exceptions import IncompleteResultsError @@ -148,10 +151,9 @@ def __call__(dataset=None, parameter = (parameter or []) + read_list(parameter_list) if not url_only: - worktree = provide(dataset, branch, input_pattern) - execute(worktree, template, parameter, output_pattern) - output = collect(worktree, dataset, output_pattern) - dataset.provision(delete=worktree) + with provide_context(dataset, branch, input_pattern) as worktree: + execute(worktree, template, parameter, 
output_pattern) + output = collect(worktree, dataset, output_pattern) url_base = get_url( dataset, @@ -254,6 +256,20 @@ def provide(dataset: Dataset, return Path(result[0]['path']) +@contextlib.contextmanager +def provide_context(dataset: Dataset, + branch: str | None, + input_patterns: list[str], + ) -> Generator: + + worktree = provide(dataset, branch=branch, input_patterns=input_patterns) + try: + yield worktree + finally: + lgr.debug('un_provide: %s %s', dataset, str(worktree)) + dataset.provision(delete=worktree) + + def execute(worktree: Path, template_name: str, parameter: list[str], @@ -336,11 +352,3 @@ def create_output_space(dataset: Dataset, except IncompleteResultsError: # Ignore non-existing files pass - - -def un_provide(dataset: Dataset, - worktree: Path, - ) -> None: - - lgr.debug('un_provide: %s %s', dataset, str(worktree)) - dataset.provision(delete=worktree) diff --git a/datalad_compute/commands/tests/test_provision.py b/datalad_compute/commands/tests/test_provision.py index 7eeb4b3..c45aac6 100644 --- a/datalad_compute/commands/tests/test_provision.py +++ b/datalad_compute/commands/tests/test_provision.py @@ -8,6 +8,7 @@ from datalad_next.runners import call_git_lines from .create_datasets import create_ds_hierarchy +from ..compute_cmd import provide_context file_path_templates = [ @@ -126,3 +127,11 @@ def get_file_list(root: Path, yield from get_file_list(root, child, prefix=prefix / child) else: yield str((prefix / child).relative_to(root)) + + +def test_provision_context(tmp_path): + dataset = create_ds_hierarchy(tmp_path, 'ds1')[0][2] + with provide_context(dataset, branch=None, input_patterns=['**']) as worktree: + files = set(get_file_list(worktree)) + assert files + assert not worktree.exists() From 8efedeec8d8df274608518d22f9755dc05ec5599 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sun, 6 Oct 2024 16:40:07 +0200 Subject: [PATCH 099/148] yield str instead of Path-object --- datalad_compute/commands/compute_cmd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 60c8c15..2994e8f 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -167,7 +167,7 @@ def __call__(dataset=None, url = add_url(dataset, out, url_base, url_only) yield get_status_dict( action='compute', - path=dataset.pathobj / out, + path=str(dataset.pathobj / out), status='ok', message=f'added url: {url!r} to {out!r} in {dataset.pathobj}',) From 3534a779b5345051aa3d8bdb876614a40f5564fc Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 7 Oct 2024 10:46:21 +0200 Subject: [PATCH 100/148] fix end-to-end tests --- datalad_compute/annexremotes/tests/test_hierarchies.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datalad_compute/annexremotes/tests/test_hierarchies.py b/datalad_compute/annexremotes/tests/test_hierarchies.py index 78289d5..c1d60ef 100644 --- a/datalad_compute/annexremotes/tests/test_hierarchies.py +++ b/datalad_compute/annexremotes/tests/test_hierarchies.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Iterable import pytest @@ -103,7 +104,7 @@ def test_end_to_end(tmp_path, datalad_cfg, monkeypatch, output_pattern): result_renderer='disabled') collected_output = [ - str(result['path'].relative_to(root_dataset.pathobj)) + str(Path(result['path']).relative_to(root_dataset.pathobj)) for result in results] assert set(collected_output) == set(output_pattern_static) From 
0f7491659d5c060e467c109bcfe835a57db7c6ce Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 7 Oct 2024 11:05:52 +0200 Subject: [PATCH 101/148] rename annex special remote source file --- datalad_compute/annexremotes/{compute.py => compute_remote.py} | 0 setup.cfg | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename datalad_compute/annexremotes/{compute.py => compute_remote.py} (100%) diff --git a/datalad_compute/annexremotes/compute.py b/datalad_compute/annexremotes/compute_remote.py similarity index 100% rename from datalad_compute/annexremotes/compute.py rename to datalad_compute/annexremotes/compute_remote.py diff --git a/setup.cfg b/setup.cfg index 0622c63..d639b0a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,7 +47,7 @@ datalad.extensions = compute = datalad_compute:command_suite console_scripts = - git-annex-remote-compute = datalad_compute.annexremotes.compute:main + git-annex-remote-compute = datalad_compute.annexremotes.compute_remote:main [versioneer] # See the docstring in versioneer.py for instructions. Note that you must From b1d67f897a273a81216bd1bdd605d66d40f46f0e Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 7 Oct 2024 15:10:59 +0200 Subject: [PATCH 102/148] fix root_dataset search in compute remote --- datalad_compute/annexremotes/compute_remote.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/datalad_compute/annexremotes/compute_remote.py b/datalad_compute/annexremotes/compute_remote.py index e1030f9..ccced91 100644 --- a/datalad_compute/annexremotes/compute_remote.py +++ b/datalad_compute/annexremotes/compute_remote.py @@ -133,7 +133,7 @@ def _find_dataset(self, root_id: str ) -> Dataset: """Find the first enclosing dataset with the given root_id""" - current_dir = Path(self.annex.getgitdir()) / '..' + current_dir = Path(self.annex.getgitdir()).parent.absolute() while current_dir != Path('/'): result = subprocess.run( @@ -143,11 +143,10 @@ def _find_dataset(self, '--get', 'datalad.dataset.id' ], stdout=subprocess.PIPE) - if result.returncode != 0: - continue - if result.stdout.decode().strip() == root_id: - return Dataset(current_dir) - current_dir = current_dir / '..' 
+ if result.returncode == 0: + if result.stdout.decode().strip() == root_id: + return Dataset(current_dir) + current_dir = current_dir.parent raise RemoteError(f'Could not find dataset {root_id!r}') def _collect(self, From da37d8315b49f18cc871598ab72af174bc00e9ca Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 7 Oct 2024 16:16:32 +0200 Subject: [PATCH 103/148] add a test for the compute remote --- .../annexremotes/tests/test_compute_remote.py | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 datalad_compute/annexremotes/tests/test_compute_remote.py diff --git a/datalad_compute/annexremotes/tests/test_compute_remote.py b/datalad_compute/annexremotes/tests/test_compute_remote.py new file mode 100644 index 0000000..f6b0f59 --- /dev/null +++ b/datalad_compute/annexremotes/tests/test_compute_remote.py @@ -0,0 +1,121 @@ +import subprocess +import sys +from queue import Queue + +from annexremote import Master + +from ..compute_remote import ComputeRemote +from datalad_compute.commands.tests.create_datasets import create_ds_hierarchy + + +template = """ +inputs = ['content'] + +use_shell = 'true' +executable = "echo" + +arguments = [ + "content: {content} > 'a.txt';", +] +""" + + +class MockedOutput: + def __init__(self): + self.output = '' + self.lines = [] + + def write(self, *args, **kwargs): + self.output += ''.join(args) + lineswith = self.output.splitlines(keepends=True) + lineswithout = self.output.splitlines(keepends=False) + if not lineswith: + pass + elif lineswithout[-1] == lineswith[-1]: + self.lines = lineswithout[:-1] + self.output = lineswith[-1] + else: + self.lines = lineswithout + self.output = '' + print(repr(self.output), repr(self.lines)) + + def flush(self): + pass + + def next_line(self): + print('next_line:', self.lines) + if self.lines: + while True: + line = self.lines.pop(0) + if line.startswith('DEBUG '): + print('XXX DEBUG XXX ' + line[6:], file=sys.stderr) + continue + return line + return None + + +class MockedInput: + def __init__(self): + self.input = Queue() + + def readline(self): + return self.input.get() + + def send(self, value): + self.input.put(value) + + +def test_compute_remote_main(tmp_path, monkeypatch): + + dataset = create_ds_hierarchy(tmp_path, 'ds1', 0)[0][2] + monkeypatch.chdir(dataset.path) + + template_path = dataset.pathobj / '.datalad' / 'compute' / 'methods' + template_path.mkdir(parents=True) + (template_path / 'echo').write_text(template) + dataset.save() + + key = tuple( + filter( + lambda line: line.startswith(b'key: '), + subprocess.run( + ['git', 'annex', 'info', 'a.txt'], + stdout=subprocess.PIPE, + check=True).stdout.splitlines()))[0].split(b': ')[1] + + result = dataset.configuration('get', 'datalad.dataset.id') + dataset_id = result[0]['value'] + dataset_version = dataset.repo.get_hexsha() + + input = MockedInput() + + # We send all messages into the queue upfront because we do the test in a + # single thread and do not get back control once `master.listen` is called + # below. + input.send('PREPARE\n') + input.send(f'TRANSFER RETRIEVE {key} {str(tmp_path / "computed.txt")}\n') + url = ( + 'datalad-make:///?' 
+ f'root_id={dataset_id}' + f'&default_root_version={dataset_version}' + '&method=echo' + '&input=%5B%5D' + '&output=%5B"a.txt"%5D' + '¶ms=%5B"content=some_string"%5D' + '&this=a.txt' + ) + input.send(f'VALUE {url}\n') + input.send('VALUE\n') + input.send('VALUE .git\n') + input.send('') + + output = MockedOutput() + + master = Master(output=output) + remote = ComputeRemote(master) + master.LinkRemote(remote) + master.Listen(input=input) + + # At this point the compute remote should have executed the computation + # and written the result. + assert (tmp_path / 'computed.txt').read_text().strip() == 'content: some_string' From 92e328cadbfed902b9eff70fe3fa98bfcfade0b8 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 7 Oct 2024 16:16:59 +0200 Subject: [PATCH 104/148] improve log messages --- datalad_compute/annexremotes/compute_remote.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datalad_compute/annexremotes/compute_remote.py b/datalad_compute/annexremotes/compute_remote.py index ccced91..74a6d77 100644 --- a/datalad_compute/annexremotes/compute_remote.py +++ b/datalad_compute/annexremotes/compute_remote.py @@ -122,8 +122,8 @@ def transfer_retrieve(self, key: str, file_name: str) -> None: lgr.debug('Starting collection') self.annex.debug('Starting collection') self._collect(worktree, dataset, compute_info['output'], compute_info['this'], file_name) - lgr.debug('Starting unprovision') - self.annex.debug('Starting unprovision') + lgr.debug('Leaving provision context') + self.annex.debug('Leaving provision context') def checkpresent(self, key: str) -> bool: # See if at least one URL with the compute url-scheme is present From b61c8eb7c049c005bb51bb5e1c594b0a0f7e455f Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 8 Oct 2024 08:50:38 +0200 Subject: [PATCH 105/148] remove `dataset-id` from recompute-URLs --- datalad_compute/annexremotes/compute_remote.py | 18 ++++++------------ .../annexremotes/tests/test_compute_remote.py | 5 +---- datalad_compute/commands/compute_cmd.py | 3 +-- 3 files changed, 8 insertions(+), 18 deletions(-) diff --git a/datalad_compute/annexremotes/compute_remote.py b/datalad_compute/annexremotes/compute_remote.py index 74a6d77..4830383 100644 --- a/datalad_compute/annexremotes/compute_remote.py +++ b/datalad_compute/annexremotes/compute_remote.py @@ -86,11 +86,10 @@ def get_compute_info(self, key: str) -> dict[str, Any]: def get_assigned_value(assignment: str) -> str: return assignment.split('=', 1)[1] - root_id, root_version, method, inputs, outputs, parameters, this \ + root_version, method, inputs, outputs, parameters, this \ = self.get_url_encoded_info(self.get_url_for_key(key)) return { - 'root_id': unquote(get_assigned_value(root_id)), 'root_version': unquote(get_assigned_value(root_version)), 'method': unquote(get_assigned_value(method)), 'input': json.loads(unquote(get_assigned_value(inputs))), @@ -106,7 +105,7 @@ def transfer_retrieve(self, key: str, file_name: str) -> None: self.annex.debug(f'TRANSFER RETRIEVE compute_info: {compute_info!r}') # TODO: get version override from configuration - dataset = self._find_dataset(compute_info['root_id']) + dataset = self._find_dataset(compute_info['root_version']) # Perform the computation, and collect the results lgr.debug('Starting provision') @@ -130,24 +129,19 @@ def checkpresent(self, key: str) -> bool: return self.annex.geturls(key, f'{url_scheme}:') != [] def _find_dataset(self, - root_id: str + root_version: str ) -> Dataset: """Find the first enclosing dataset with the 
given root_id""" current_dir = Path(self.annex.getgitdir()).parent.absolute() while current_dir != Path('/'): result = subprocess.run( - [ - 'git', 'config', '-f', - str(current_dir/ '.datalad' / 'config'), - '--get', 'datalad.dataset.id' - ], + ['git', 'cat-file', '-t', root_version], stdout=subprocess.PIPE) if result.returncode == 0: - if result.stdout.decode().strip() == root_id: - return Dataset(current_dir) + return Dataset(current_dir) current_dir = current_dir.parent - raise RemoteError(f'Could not find dataset {root_id!r}') + raise RemoteError(f'Could not find dataset with commit {root_version!r}') def _collect(self, worktree: Path, diff --git a/datalad_compute/annexremotes/tests/test_compute_remote.py b/datalad_compute/annexremotes/tests/test_compute_remote.py index f6b0f59..1a36087 100644 --- a/datalad_compute/annexremotes/tests/test_compute_remote.py +++ b/datalad_compute/annexremotes/tests/test_compute_remote.py @@ -83,8 +83,6 @@ def test_compute_remote_main(tmp_path, monkeypatch): stdout=subprocess.PIPE, check=True).stdout.splitlines()))[0].split(b': ')[1] - result = dataset.configuration('get', 'datalad.dataset.id') - dataset_id = result[0]['value'] dataset_version = dataset.repo.get_hexsha() input = MockedInput() @@ -96,8 +94,7 @@ def test_compute_remote_main(tmp_path, monkeypatch): input.send(f'TRANSFER RETRIEVE {key} {str(tmp_path / "computed.txt")}\n') url = ( 'datalad-make:///?' - f'root_id={dataset_id}' - f'&default_root_version={dataset_version}' + f'root_version={dataset.repo.get_hexsha()}' '&method=echo' '&input=%5B%5D' '&output=%5B"a.txt"%5D' diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 2994e8f..2085d71 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -194,8 +194,7 @@ def get_url(dataset: Dataset, branch = dataset.repo.get_hexsha() if branch is None else branch return ( f'{url_scheme}:///' - + f'?root_id={quote(dataset.id)}' - + f'&default_root_version={quote(branch)}' + + f'&root_version={quote(branch)}' + f'&method={quote(template_name)}' + f'&input={quote(json.dumps(input_pattern))}' + f'&output={quote(json.dumps(output_pattern))}' From 87c2244a0eac5353f8b9d01d6e94f334be81f043 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 8 Oct 2024 10:09:14 +0200 Subject: [PATCH 106/148] fix bugs in compute-URL and dataset search --- .../annexremotes/compute_remote.py | 21 +++++++++++-------- datalad_compute/commands/compute_cmd.py | 2 +- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/datalad_compute/annexremotes/compute_remote.py b/datalad_compute/annexremotes/compute_remote.py index 4830383..0862c45 100644 --- a/datalad_compute/annexremotes/compute_remote.py +++ b/datalad_compute/annexremotes/compute_remote.py @@ -73,7 +73,7 @@ def getcost(self) -> int: return 100 def get_url_encoded_info(self, url: str) -> list[str]: - parts = urlparse(url).query.split('&', 6) + parts = urlparse(url).query.split('&', 5) self.annex.debug(f'get_url_encoded_info: url: {url!r}, parts: {parts!r}') return parts @@ -129,19 +129,22 @@ def checkpresent(self, key: str) -> bool: return self.annex.geturls(key, f'{url_scheme}:') != [] def _find_dataset(self, - root_version: str + commit: str ) -> Dataset: - """Find the first enclosing dataset with the given root_id""" - current_dir = Path(self.annex.getgitdir()).parent.absolute() - + """Find the first enclosing dataset with the given commit""" + start_dir = Path(self.annex.getgitdir()).parent.absolute() + current_dir = 
start_dir while current_dir != Path('/'): result = subprocess.run( - ['git', 'cat-file', '-t', root_version], - stdout=subprocess.PIPE) - if result.returncode == 0: + ['git', 'cat-file', '-t', commit], + stdout=subprocess.PIPE, + cwd=current_dir) + if result.returncode == 0 and result.stdout.strip() == b'commit': return Dataset(current_dir) current_dir = current_dir.parent - raise RemoteError(f'Could not find dataset with commit {root_version!r}') + raise RemoteError( + f'Could not find dataset with commit {commit!r}, starting from ' + f'{start_dir}') def _collect(self, worktree: Path, diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 2085d71..ac83f51 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -194,7 +194,7 @@ def get_url(dataset: Dataset, branch = dataset.repo.get_hexsha() if branch is None else branch return ( f'{url_scheme}:///' - + f'&root_version={quote(branch)}' + + f'?root_version={quote(branch)}' + f'&method={quote(template_name)}' + f'&input={quote(json.dumps(input_pattern))}' + f'&output={quote(json.dumps(output_pattern))}' From 1b95b6fbb74e201facea915cfd331c7510f4b3ba Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 8 Oct 2024 11:20:40 +0200 Subject: [PATCH 107/148] use stored specs in compute-URLs This commit adds the ability to store the specs for a computation ---i.e. method-name, inputs, outputs, and parameters--- in `.datalad`, and reference it in the compute-URL. That keeps the compute URLs short. Note the the dataset version that is generated by storing the specs is used as reference for a computation. If anything goes wrong in the computation, the specs are still stored in the dataset. --- datalad_compute/__init__.py | 1 + .../annexremotes/compute_remote.py | 42 +++++++---- .../annexremotes/tests/test_compute_remote.py | 16 +++-- datalad_compute/commands/compute_cmd.py | 69 +++++++++++++------ 4 files changed, 87 insertions(+), 41 deletions(-) diff --git a/datalad_compute/__init__.py b/datalad_compute/__init__.py index acd9b56..301b153 100644 --- a/datalad_compute/__init__.py +++ b/datalad_compute/__init__.py @@ -42,3 +42,4 @@ url_scheme = 'datalad-make' template_dir = '.datalad/compute/methods' +specification_dir = '.datalad/compute/specifications' diff --git a/datalad_compute/annexremotes/compute_remote.py b/datalad_compute/annexremotes/compute_remote.py index 0862c45..60cedd3 100644 --- a/datalad_compute/annexremotes/compute_remote.py +++ b/datalad_compute/annexremotes/compute_remote.py @@ -23,7 +23,11 @@ from datalad_next.datasets import Dataset from datalad_next.runners import call_git_success -from .. import url_scheme +from .. 
import ( + specification_dir, + url_scheme, +) + from ..commands.compute_cmd import ( execute, get_file_dataset, @@ -82,31 +86,38 @@ def get_url_for_key(self, key: str) -> str: self.annex.debug(f'get_url_for_key: key: {key!r}, urls: {urls!r}') return urls[0] - def get_compute_info(self, key: str) -> dict[str, Any]: + def get_compute_info(self, + key: str + ) -> tuple[dict[str, Any], Dataset]: + def get_assigned_value(assignment: str) -> str: return assignment.split('=', 1)[1] - root_version, method, inputs, outputs, parameters, this \ - = self.get_url_encoded_info(self.get_url_for_key(key)) + root_version, spec_name, this = list( + map( + lambda expr: unquote(get_assigned_value(expr)), + self.get_url_encoded_info(self.get_url_for_key(key)))) + + dataset = self._find_dataset(root_version) + spec_path = dataset.pathobj / specification_dir / spec_name + with open(spec_path, 'rb') as f: + spec = json.load(f) return { - 'root_version': unquote(get_assigned_value(root_version)), - 'method': unquote(get_assigned_value(method)), - 'input': json.loads(unquote(get_assigned_value(inputs))), - 'output': json.loads(unquote(get_assigned_value(outputs))), - 'parameter': json.loads(unquote(get_assigned_value(parameters))), - 'this': unquote(get_assigned_value(this)), - } + 'root_version': root_version, + 'this': this, + **{ + name: spec[name] + for name in ['method', 'input', 'output', 'parameter'] + } + }, dataset def transfer_retrieve(self, key: str, file_name: str) -> None: self.annex.debug(f'TRANSFER RETRIEVE key: {key!r}, file_name: {file_name!r}') - compute_info = self.get_compute_info(key) + compute_info, dataset = self.get_compute_info(key) self.annex.debug(f'TRANSFER RETRIEVE compute_info: {compute_info!r}') - # TODO: get version override from configuration - dataset = self._find_dataset(compute_info['root_version']) - # Perform the computation, and collect the results lgr.debug('Starting provision') self.annex.debug('Starting provision') @@ -132,6 +143,7 @@ def _find_dataset(self, commit: str ) -> Dataset: """Find the first enclosing dataset with the given commit""" + # TODO: get version override from configuration start_dir = Path(self.annex.getgitdir()).parent.absolute() current_dir = start_dir while current_dir != Path('/'): diff --git a/datalad_compute/annexremotes/tests/test_compute_remote.py b/datalad_compute/annexremotes/tests/test_compute_remote.py index 1a36087..8ae198b 100644 --- a/datalad_compute/annexremotes/tests/test_compute_remote.py +++ b/datalad_compute/annexremotes/tests/test_compute_remote.py @@ -6,7 +6,8 @@ from ..compute_remote import ComputeRemote from datalad_compute.commands.tests.create_datasets import create_ds_hierarchy - +from ... import specification_dir +from ...commands.compute_cmd import build_json template = """ inputs = ['content'] @@ -83,7 +84,13 @@ def test_compute_remote_main(tmp_path, monkeypatch): stdout=subprocess.PIPE, check=True).stdout.splitlines()))[0].split(b': ')[1] - dataset_version = dataset.repo.get_hexsha() + (dataset.pathobj / specification_dir).mkdir(parents=True) + (dataset.pathobj / specification_dir / '000001111122222').write_text( + build_json( + 'echo', + [], + ['a.txt'], + {'content': 'some_string'})) input = MockedInput() @@ -95,10 +102,7 @@ def test_compute_remote_main(tmp_path, monkeypatch): url = ( 'datalad-make:///?' 
f'root_version={dataset.repo.get_hexsha()}' - '&method=echo' - '&input=%5B%5D' - '&output=%5B"a.txt"%5D' - '¶ms=%5B"content=some_string"%5D' + '&specification=000001111122222' '&this=a.txt' ) input.send(f'VALUE {url}\n') diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index ac83f51..c9f94f0 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -3,6 +3,7 @@ from __future__ import annotations import contextlib +import hashlib import json import logging import os @@ -36,6 +37,7 @@ ) from .. import ( + specification_dir, template_dir, url_scheme, ) @@ -150,19 +152,25 @@ def __call__(dataset=None, output_pattern = (output or []) + read_list(output_list) parameter = (parameter or []) + read_list(parameter_list) - if not url_only: - with provide_context(dataset, branch, input_pattern) as worktree: - execute(worktree, template, parameter, output_pattern) - output = collect(worktree, dataset, output_pattern) + parameter_dict = { + p.split('=', 1)[0]: p.split('=', 1)[1] + for p in parameter} - url_base = get_url( + # We have to get the URL first, because saving the specification to + # the dataset will change the version. + url_base, reset_commit = get_url( dataset, branch, template, - parameter, + parameter_dict, input_pattern, output_pattern) + if not url_only: + with provide_context(dataset, branch, input_pattern) as worktree: + execute(worktree, template, parameter_dict, output_pattern) + output = collect(worktree, dataset, output_pattern) + for out in output: url = add_url(dataset, out, url_base, url_only) yield get_status_dict( @@ -189,17 +197,42 @@ def get_url(dataset: Dataset, parameters: dict[str, str], input_pattern: list[str], output_pattern: list[str], - ) -> str: + ) -> tuple[str, str]: + + # If something goes wrong after the compute specification was saved, + # the dataset state should be reset to `branch` + reset_branch = branch or dataset.repo.get_hexsha() + + # create the specification and hash it + spec = build_json(template_name, input_pattern, output_pattern, parameters) + hasher = hashlib.sha256() + hasher.update(spec.encode()) + digest = hasher.hexdigest() + + # write the specification file + (dataset.pathobj / specification_dir).mkdir(exist_ok=True) + (dataset.pathobj / specification_dir / digest).write_text(spec) + dataset.save( + message=f'[DATALAD] saving computation spec: {digest}', + recursive=True, result_renderer='disabled') - branch = dataset.repo.get_hexsha() if branch is None else branch return ( f'{url_scheme}:///' - + f'?root_version={quote(branch)}' - + f'&method={quote(template_name)}' - + f'&input={quote(json.dumps(input_pattern))}' - + f'&output={quote(json.dumps(output_pattern))}' - + f'¶ms={quote(json.dumps(parameters))}' - ) + + f'?root_version={quote(dataset.repo.get_hexsha())}' + + f'&specification={quote(digest)}' + ), reset_branch + + +def build_json(method: str, + inputs: list[str], + outputs: list[str], + parameters: dict[str, str] + ) -> str: + return json.dumps({ + 'method': method, + 'input': inputs, + 'output': outputs, + 'parameter': parameters}) def add_url(dataset: Dataset, @@ -271,7 +304,7 @@ def provide_context(dataset: Dataset, def execute(worktree: Path, template_name: str, - parameter: list[str], + parameter: dict[str, str], output_pattern: list[str], ) -> None: @@ -294,11 +327,7 @@ def execute(worktree: Path, # Run the computation in the worktree-directory template_path = worktree / template_dir / template_name - parameter_dict = { - p.split('=', 
1)[0]: p.split('=', 1)[1] - for p in parameter - } - compute(worktree, template_path, parameter_dict) + compute(worktree, template_path, parameter) def collect(worktree: Path, From 181618076dd0c12d41cf0b52dbf16ecd78a77ac7 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 9 Oct 2024 10:10:53 +0200 Subject: [PATCH 108/148] add a regression test for duplicated computations This commit adds a test that ensures that duplicated computations can be performed. Earlier this raised an error because the spec-files were not unlocked before writing them. --- .../commands/tests/test_compute.py | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 datalad_compute/commands/tests/test_compute.py diff --git a/datalad_compute/commands/tests/test_compute.py b/datalad_compute/commands/tests/test_compute.py new file mode 100644 index 0000000..91200cb --- /dev/null +++ b/datalad_compute/commands/tests/test_compute.py @@ -0,0 +1,71 @@ +from pathlib import Path +from typing import Iterable + +import pytest + +from datalad.api import get as datalad_get +from datalad_next.datasets import Dataset +from datalad_next.tests.fixtures import datalad_cfg + +from ... import ( + template_dir, + url_scheme, +) +from datalad_compute.commands.tests.create_datasets import create_ds_hierarchy + + +test_method = """ +inputs = ['name'] +use_shell = 'true' +executable = 'echo' +arguments = ["Hello {name} > a.txt"] +""" + + +output_pattern = ['a.txt'] + + +def _drop_files(dataset: Dataset, + files: Iterable[str]): + for file in files: + dataset.drop(file, reckless='availability', result_renderer='disabled') + assert not (dataset.pathobj / file).exists() + + +def _check_content(dataset, + file_content: Iterable[tuple[str, str]] + ): + for file, content in file_content: + assert (dataset.pathobj / file).read_text() == content + + +def test_duplicated_compuation(tmp_path, datalad_cfg, monkeypatch): + + datasets = create_ds_hierarchy(tmp_path, 'd1', 0) + root_dataset = datasets[0][2] + + # add method template + template_path = root_dataset.pathobj / template_dir + template_path.mkdir(parents=True) + (template_path / 'test_method').write_text(test_method) + root_dataset.save(result_renderer='disabled') + + # set annex security related variables to allow compute-URLs + datalad_cfg.set('annex.security.allowed-url-schemes', url_scheme, scope='global') + datalad_cfg.set('annex.security.allowed-ip-addresses', 'all', scope='global') + datalad_cfg.set('annex.security.allow-unverified-downloads', 'ACKTHPPT', scope='global') + + # run the same command twice + _run_simple_computation(root_dataset) + _run_simple_computation(root_dataset) + + +def _run_simple_computation(root_dataset: Dataset): + results = root_dataset.compute( + template='test_method', + parameter=['name=Robert'], + output=['a.txt'], + result_renderer='disabled') + + # check that the output is correct + assert (root_dataset.pathobj / 'a.txt').read_text() == 'Hello Robert\n' From 6f7a4aefec95ddd11f209deb13aadbb7795cbb73 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 9 Oct 2024 10:12:24 +0200 Subject: [PATCH 109/148] unlock spec files This commit fixes a bug where the spec files were not unlocked. This leads to errors if an identical computation is executed a second time. 
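The root cause is that a previously saved spec file may be a locked, read-only annex pointer, so writing the identical spec (same hash, same file name) a second time fails. The hunk below therefore unlocks the file before rewriting it. In essence, as a sketch only (the `rewrite_spec` name is illustrative; `call_git_success` and `contextlib.chdir` are used the same way elsewhere in this series):

```python
# Unlock a possibly locked spec file before overwriting it, so re-running
# an identical computation does not fail on the second write.
import contextlib
from pathlib import Path

from datalad_next.runners import call_git_success

def rewrite_spec(dataset_root: Path, spec_file: Path, spec: str) -> None:
    with contextlib.chdir(dataset_root):
        # the return value is ignored, so this is harmless if the file
        # is not annexed or does not exist yet
        call_git_success(
            ['annex', 'unlock', str(spec_file)],
            capture_output=True)
    spec_file.write_text(spec)
```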
--- datalad_compute/commands/compute_cmd.py | 43 +++++++++++++++++++------ 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index c9f94f0..ec28536 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -203,24 +203,47 @@ def get_url(dataset: Dataset, # the dataset state should be reset to `branch` reset_branch = branch or dataset.repo.get_hexsha() + # Write the specification to a file in the dataset + digest = write_spec( + dataset, + template_name, + input_pattern, + output_pattern, + parameters) + + return ( + f'{url_scheme}:///' + + f'?root_version={quote(dataset.repo.get_hexsha())}' + + f'&specification={quote(digest)}' + ), reset_branch + + +def write_spec(dataset: Dataset, + method: str, + input_pattern: list[str], + output_pattern: list[str], + parameters: dict[str, str] + ) -> str: + # create the specification and hash it - spec = build_json(template_name, input_pattern, output_pattern, parameters) + spec = build_json(method, input_pattern, output_pattern, parameters) hasher = hashlib.sha256() hasher.update(spec.encode()) digest = hasher.hexdigest() # write the specification file - (dataset.pathobj / specification_dir).mkdir(exist_ok=True) - (dataset.pathobj / specification_dir / digest).write_text(spec) + spec_dir = dataset.pathobj / specification_dir + spec_dir.mkdir(exist_ok=True) + spec_file = spec_dir / digest + with contextlib.chdir(dataset.pathobj): + call_git_success( + ['annex', 'unlock', str(spec_file)], + capture_output=True) + spec_file.write_text(spec) dataset.save( - message=f'[DATALAD] saving computation spec: {digest}', + message=f'[DATALAD] saving computation spec\n\nfile name: {digest}', recursive=True, result_renderer='disabled') - - return ( - f'{url_scheme}:///' - + f'?root_version={quote(dataset.repo.get_hexsha())}' - + f'&specification={quote(digest)}' - ), reset_branch + return digest def build_json(method: str, From 590355e8094bc1a4a0d96e0d2e54c3736179f13e Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 9 Oct 2024 09:41:54 +0200 Subject: [PATCH 110/148] do not provision unclean files This commit modified provision to report an error if untracked oder modified files should be provisioned. 
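Concretely, the change below checks the requested input paths with `Dataset.status()` and refuses to provision anything whose state is not `clean`. A compact sketch of that check; the `unclean_inputs` name is illustrative, while the call and the result keys follow the hunk below:

```python
# Report input paths that are modified or untracked instead of
# provisioning them from a stale committed state.
from datalad_next.datasets import Dataset

def unclean_inputs(dataset: Dataset, paths: list[str]) -> list[dict]:
    if not paths:
        return []
    return [
        record for record in dataset.status(paths)
        if record['state'] != 'clean'
    ]

# provision() then yields one error result per returned record (carrying
# its 'path' and 'state') and returns before populating the worktree
# with subdatasets and file content.
```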
--- datalad_compute/commands/provision_cmd.py | 36 ++++++++++++++----- .../commands/tests/test_provision.py | 30 ++++++++++++++++ 2 files changed, 57 insertions(+), 9 deletions(-) diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py index e9419bd..de5423b 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_compute/commands/provision_cmd.py @@ -11,7 +11,7 @@ from pathlib import Path from typing import ( Any, - Iterable, + Iterable, Generator, ) from tempfile import TemporaryDirectory @@ -130,12 +130,7 @@ def __call__(dataset=None, worktree_dir: Path = worktree_dir or Path(TemporaryDirectory().name) inputs = input or [] + read_list(input_list) - provision_dir = provide(dataset, worktree_dir, branch, inputs) - yield get_status_dict( - action='provision', - path=str(provision_dir), - status='ok', - message=f'provisioned dataset: {dataset} in workspace: {provision_dir!r}',) + yield from provide(dataset, worktree_dir, branch, inputs) def remove(dataset: Dataset, @@ -159,7 +154,7 @@ def provide(dataset: Dataset, worktree_dir: Path, source_branch: str | None = None, input_patterns: Iterable[str] | None = None, - ) -> Path: + ) -> Generator: lgr.debug('Provisioning dataset %s at %s', dataset, worktree_dir) @@ -175,6 +170,17 @@ def provide(dataset: Dataset, input_files = resolve_patterns(dataset.path, input_patterns) + unclean_elements = get_unclean_elements(dataset, input_files) + if unclean_elements: + for element in unclean_elements: + yield get_status_dict( + action='provision', + path=element['path'], + status='error', + state=element['state'], + message=f'cannot provision {element["state"]} input: {element["path"]!r} from dataset {dataset}') + return + worktree_dataset = Dataset(worktree_dir) install_required_locally_available_datasets( dataset, @@ -186,7 +192,11 @@ def provide(dataset: Dataset, for file in input_files: lgr.debug('provisioning input file %s', file) worktree_dataset.get(file, result_renderer='disabled') - return worktree_dir + yield get_status_dict( + action='provision', + path=str(worktree_dir), + status='ok', + message=f'provisioned dataset: {dataset} in workspace: {worktree_dir!r}',) def check_results(results: Iterable[dict[str, Any]]): @@ -195,6 +205,14 @@ def check_results(results: Iterable[dict[str, Any]]): for result in results) +def get_unclean_elements(dataset: Dataset, + paths: Iterable[str] + ) -> list[dict]: + if not paths: + return [] + return list(filter(lambda x: x['state'] != 'clean', dataset.status(paths))) + + def install_required_locally_available_datasets(root_dataset: Dataset, input_files: list[Path], worktree: Dataset, diff --git a/datalad_compute/commands/tests/test_provision.py b/datalad_compute/commands/tests/test_provision.py index c45aac6..6894366 100644 --- a/datalad_compute/commands/tests/test_provision.py +++ b/datalad_compute/commands/tests/test_provision.py @@ -135,3 +135,33 @@ def test_provision_context(tmp_path): files = set(get_file_list(worktree)) assert files assert not worktree.exists() + + +def test_unclean_dataset(tmp_path): + dataset = Dataset(tmp_path / 'ds1') + dataset.create(cfg_proc='text2git', result_renderer='disabled') + (dataset.pathobj / 'a.txt').write_text('content') + dataset.save() + (dataset.pathobj / 'a.txt').write_text('changed content') + (dataset.pathobj / 'b.txt').write_text('untracked content') + + # Check that provision of unclean input results in errors + input_pattern = ['a.txt', 'b.txt'] + results = dataset.provision( + input=input_pattern, + 
worktree_dir=tmp_path / 'ds1_worktree1', + on_failure='ignore') + assert set((result['status'], result['state']) for result in results) == \ + {('error', 'modified'), ('error', 'untracked')} + + # Check that a saved dataset can be provisioned + dataset.save() + dataset.provision( + input=input_pattern, + worktree_dir=tmp_path / 'ds1_worktree2') + + # Check that non-input file `c.txt` is ignored + (dataset.pathobj / 'c.txt').write_text('content') + dataset.provision( + input=input_pattern, + worktree_dir=tmp_path / 'ds1_worktree3') From aada1a9527db12ca17ba93f4578ca188d3b73c77 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 9 Oct 2024 12:17:22 +0200 Subject: [PATCH 111/148] remove DEBUG output from compute-remote tests --- datalad_compute/annexremotes/tests/test_compute_remote.py | 1 - 1 file changed, 1 deletion(-) diff --git a/datalad_compute/annexremotes/tests/test_compute_remote.py b/datalad_compute/annexremotes/tests/test_compute_remote.py index 8ae198b..450b5fa 100644 --- a/datalad_compute/annexremotes/tests/test_compute_remote.py +++ b/datalad_compute/annexremotes/tests/test_compute_remote.py @@ -49,7 +49,6 @@ def next_line(self): while True: line = self.lines.pop(0) if line.startswith('DEBUG '): - print('XXX DEBUG XXX ' + line[6:], file=sys.stderr) continue return line return None From 6fe64352fa628221a090fcba3b8a7f5e19c7cd40 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 9 Oct 2024 14:23:41 +0200 Subject: [PATCH 112/148] remove stale provision branches from datasets This commit adds code the removes the branches from datasets that were created during provision. It also adds a regression test. --- datalad_compute/commands/provision_cmd.py | 5 ++++- .../commands/tests/test_provision.py | 19 ++++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py index de5423b..5ec2a98 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_compute/commands/provision_cmd.py @@ -30,7 +30,7 @@ EnsureStr, EnsurePath, ) from datalad_next.datasets import Dataset -from datalad_next.runners import call_git_lines +from datalad_next.runners import call_git_lines, call_git_success from datalad_compute.utils.glob import resolve_patterns from ..commands.compute_cmd import read_list @@ -142,6 +142,9 @@ def remove(dataset: Dataset, recursive=True, result_renderer='disabled') prune_worktrees(dataset) + call_git_success( + ['branch', '-d', worktree.pathobj.name], + cwd=dataset.pathobj) def prune_worktrees(dataset: Dataset) -> None: diff --git a/datalad_compute/commands/tests/test_provision.py b/datalad_compute/commands/tests/test_provision.py index 6894366..f4e7fa0 100644 --- a/datalad_compute/commands/tests/test_provision.py +++ b/datalad_compute/commands/tests/test_provision.py @@ -1,6 +1,7 @@ from __future__ import annotations -from contextlib import chdir +import contextlib +from contextlib import chdir, contextmanager from pathlib import Path from typing import Iterable @@ -165,3 +166,19 @@ def test_unclean_dataset(tmp_path): dataset.provision( input=input_pattern, worktree_dir=tmp_path / 'ds1_worktree3') + + +def test_branch_deletion_after_provision(tmp_path): + dataset = create_ds_hierarchy(tmp_path, 'ds1', 3)[0][2] + with provide_context( + dataset=dataset, + branch=None, + input_patterns=['a.txt'] + ) as worktree: + assert worktree.exists() + assert not worktree.exists() + with contextlib.chdir(dataset.path): + branches = [ + l.strip() + for l in 
call_git_lines(['branch'])] + assert worktree.name not in branches From ec024424e964c8b578e61b2badb00b47444782ec Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 9 Oct 2024 15:37:04 +0200 Subject: [PATCH 113/148] fix the deletion of the worktree --- datalad_compute/commands/provision_cmd.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py index 5ec2a98..2c35cd1 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_compute/commands/provision_cmd.py @@ -149,8 +149,6 @@ def remove(dataset: Dataset, def prune_worktrees(dataset: Dataset) -> None: call_git_lines(['worktree', 'prune'], cwd=dataset.pathobj) - for result in dataset.subdatasets(result_renderer='disabled'): - prune_worktrees(Dataset(result['path'])) def provide(dataset: Dataset, From 0261229713aaadcd74e16b8dceb129af711846cc Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 9 Oct 2024 16:46:38 +0200 Subject: [PATCH 114/148] ensure that an annexed template is fetched --- datalad_compute/commands/compute_cmd.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index ec28536..1912ce0 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -349,8 +349,9 @@ def execute(worktree: Path, unlock_files(worktree_ds, existing_outputs) # Run the computation in the worktree-directory - template_path = worktree / template_dir / template_name - compute(worktree, template_path, parameter) + template_path = template_dir / template_name + worktree_ds.get(template_path) + compute(worktree, worktree / template_path, parameter) def collect(worktree: Path, From 67c688b7c759c0b1334899e1f906898b8d5f69e5 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 9 Oct 2024 21:49:56 +0200 Subject: [PATCH 115/148] fix a bug in template directory calculation --- datalad_compute/commands/compute_cmd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 1912ce0..257d24c 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -349,7 +349,7 @@ def execute(worktree: Path, unlock_files(worktree_ds, existing_outputs) # Run the computation in the worktree-directory - template_path = template_dir / template_name + template_path = Path(template_dir) / template_name worktree_ds.get(template_path) compute(worktree, worktree / template_path, parameter) From c5f4b53003a41eec14282affbc4cfd04d38f8bf5 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 10 Oct 2024 12:55:49 +0200 Subject: [PATCH 116/148] remove print-statements from annexremote tests --- datalad_compute/annexremotes/tests/test_compute_remote.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/datalad_compute/annexremotes/tests/test_compute_remote.py b/datalad_compute/annexremotes/tests/test_compute_remote.py index 450b5fa..e2f0961 100644 --- a/datalad_compute/annexremotes/tests/test_compute_remote.py +++ b/datalad_compute/annexremotes/tests/test_compute_remote.py @@ -38,13 +38,11 @@ def write(self, *args, **kwargs): else: self.lines = lineswithout self.output = '' - print(repr(self.output), repr(self.lines)) def flush(self): pass def next_line(self): - print('next_line:', self.lines) if self.lines: while True: line = self.lines.pop(0) From b74f95cac33332e17e40941fa2c4863bf9897c9b Mon Sep 17 
00:00:00 2001 From: Christian Monch Date: Thu, 10 Oct 2024 15:33:08 +0200 Subject: [PATCH 117/148] use only "present" local subdatasets in provision --- datalad_compute/commands/provision_cmd.py | 1 + .../commands/tests/test_provision.py | 34 ++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py index 2c35cd1..c093a1f 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_compute/commands/provision_cmd.py @@ -258,6 +258,7 @@ def get_subdataset_info(dataset: Dataset) -> Iterable[tuple[Path, Path, Path]]: Path(result['path']).relative_to(dataset.pathobj) ) for result in results + if result['state'] == 'present' ] diff --git a/datalad_compute/commands/tests/test_provision.py b/datalad_compute/commands/tests/test_provision.py index f4e7fa0..8a88ac6 100644 --- a/datalad_compute/commands/tests/test_provision.py +++ b/datalad_compute/commands/tests/test_provision.py @@ -1,7 +1,7 @@ from __future__ import annotations import contextlib -from contextlib import chdir, contextmanager +from contextlib import chdir from pathlib import Path from typing import Iterable @@ -182,3 +182,35 @@ def test_branch_deletion_after_provision(tmp_path): l.strip() for l in call_git_lines(['branch'])] assert worktree.name not in branches + + +def test_not_present_local_datasets(tmp_path): + root_ds = Dataset(tmp_path / 'ds1') + root_ds.create(cfg_proc='text2git', result_renderer='disabled') + root_ds.clone( + 'https://github.com/OpenNeuroDatasets/ds000102', + result_renderer='disabled') + provisioned_dataset = Dataset( + root_ds.provision( + input=['ds000102/README'])[0]['path']) + url = _get_submodule_url(provisioned_dataset, 'ds000102') + assert url.startswith(f'file://{root_ds.path}') + + root_ds.drop( + 'ds000102', + what='all', + reckless='availability', + result_renderer='disabled') + + provisioned_dataset_2 = Dataset( + root_ds.provision( + input=['ds000102/README'])[0]['path']) + url_2 = _get_submodule_url(provisioned_dataset_2, 'ds000102') + assert url_2 == 'https://github.com/OpenNeuroDatasets/ds000102' + + +def _get_submodule_url(dataset: Dataset, submodule_path: str) -> str: + x = call_git_lines( + ['config', '-f', str(dataset.pathobj / '.gitmodules'), '--get', + f'submodule.{submodule_path}.url']) + return x[0].strip() From 062d0c5e47b4fdd6e6d89953ba715860f3916ed5 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 11 Oct 2024 12:28:01 +0200 Subject: [PATCH 118/148] add --no-globbing option and provisioning tests --- datalad_compute/commands/compute_cmd.py | 27 ++++++++++++++++--- datalad_compute/commands/provision_cmd.py | 18 +++++++++++-- .../commands/tests/test_provision.py | 4 ++- 3 files changed, 43 insertions(+), 6 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 257d24c..c1c569f 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -129,6 +129,12 @@ class Compute(ValidatedInterface): "that start with '#' are ignored. Line content is stripped " "before used. This is useful if a large number of parameters " "should be provided."), + no_globbing=Parameter( + args=('-n', '--no-globbing',), + doc="Use input pattern as file names and do not apply globbing. " + "This allows to specify files that are no currently present " + "in the source dataset as input. 
Those files will be made " + "available in the worktree."), ) @staticmethod @@ -144,6 +150,7 @@ def __call__(dataset=None, output_list=None, parameter=None, parameter_list=None, + no_globbing=False, ): dataset : Dataset = dataset.ds if dataset else Dataset('.') @@ -167,7 +174,12 @@ def __call__(dataset=None, output_pattern) if not url_only: - with provide_context(dataset, branch, input_pattern) as worktree: + with provide_context( + dataset, + branch, + input_pattern, + no_globbing, + ) as worktree: execute(worktree, template, parameter_dict, output_pattern) output = collect(worktree, dataset, output_pattern) @@ -304,10 +316,14 @@ def get_file_dataset(file: Path) -> tuple[Path, Path]: def provide(dataset: Dataset, branch: str | None, input_patterns: list[str], + no_globbing: bool, ) -> Path: lgr.debug('provide: %s %s %s', dataset, branch, input_patterns) - result = dataset.provision(input=input_patterns, branch=branch) + result = dataset.provision( + input=input_patterns, + branch=branch, + no_globbing=no_globbing) return Path(result[0]['path']) @@ -315,9 +331,14 @@ def provide(dataset: Dataset, def provide_context(dataset: Dataset, branch: str | None, input_patterns: list[str], + no_globbing: bool, ) -> Generator: - worktree = provide(dataset, branch=branch, input_patterns=input_patterns) + worktree = provide( + dataset, + branch=branch, + input_patterns=input_patterns, + no_globbing=no_globbing) try: yield worktree finally: diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py index c093a1f..bee1013 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_compute/commands/provision_cmd.py @@ -15,6 +15,7 @@ ) from tempfile import TemporaryDirectory +from datalad.support.constraints import EnsureBool from datalad_next.commands import ( EnsureCommandParameterization, ValidatedInterface, @@ -64,6 +65,7 @@ class Provision(ValidatedInterface): input_list=EnsureStr(min_len=1), tmp_dir=EnsurePath(is_mode=stat.S_ISDIR), delete=EnsureDataset(installed=True), + no_globbing=EnsureBool(), )) # parameters of the command, must be exhaustive @@ -96,6 +98,13 @@ class Provision(ValidatedInterface): "that start with '#' are ignored. Line content is stripped " "before used. This is useful if a large number of input file " "patterns should be provided."), + no_globbing=Parameter( + args=('-n', '--no-globbing',), + action='store_true', + doc="Interpret input pattern as file names and do not apply " + "globbing. This allows to specify files that are no currently " + "present in the source dataset as input. 
Those files will be " + "made available in the worktree by the provisioning stage."), worktree_dir=Parameter( args=('-w', '--worktree-dir',), doc="Path of the directory that should become the temporary " @@ -110,6 +119,7 @@ def __call__(dataset=None, delete=None, input=None, input_list=None, + no_globbing=False, worktree_dir=None, ): @@ -130,7 +140,7 @@ def __call__(dataset=None, worktree_dir: Path = worktree_dir or Path(TemporaryDirectory().name) inputs = input or [] + read_list(input_list) - yield from provide(dataset, worktree_dir, branch, inputs) + yield from provide(dataset, worktree_dir, branch, inputs, no_globbing) def remove(dataset: Dataset, @@ -155,6 +165,7 @@ def provide(dataset: Dataset, worktree_dir: Path, source_branch: str | None = None, input_patterns: Iterable[str] | None = None, + no_globbing: bool = False, ) -> Generator: lgr.debug('Provisioning dataset %s at %s', dataset, worktree_dir) @@ -169,7 +180,10 @@ def provide(dataset: Dataset, ) call_git_lines(args, cwd=dataset.pathobj) - input_files = resolve_patterns(dataset.path, input_patterns) + if no_globbing: + input_files = set(input_patterns) + else: + input_files = resolve_patterns(dataset.path, input_patterns) unclean_elements = get_unclean_elements(dataset, input_files) if unclean_elements: diff --git a/datalad_compute/commands/tests/test_provision.py b/datalad_compute/commands/tests/test_provision.py index 8a88ac6..07bd4fc 100644 --- a/datalad_compute/commands/tests/test_provision.py +++ b/datalad_compute/commands/tests/test_provision.py @@ -204,7 +204,9 @@ def test_not_present_local_datasets(tmp_path): provisioned_dataset_2 = Dataset( root_ds.provision( - input=['ds000102/README'])[0]['path']) + input=['ds000102/README'], + no_globbing=True, + on_failure='ignore')[0]['path']) url_2 = _get_submodule_url(provisioned_dataset_2, 'ds000102') assert url_2 == 'https://github.com/OpenNeuroDatasets/ds000102' From 47f219bab159553022f1888eec14a15653fe77a4 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 11 Oct 2024 16:07:37 +0200 Subject: [PATCH 119/148] add missing provision enhancement fixes --- datalad_compute/commands/compute_cmd.py | 2 +- .../commands/tests/test_collection.py | 4 ++- .../commands/tests/test_provision.py | 25 ++++++++++++------- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index c1c569f..91934bc 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -331,7 +331,7 @@ def provide(dataset: Dataset, def provide_context(dataset: Dataset, branch: str | None, input_patterns: list[str], - no_globbing: bool, + no_globbing: bool = False, ) -> Generator: worktree = provide( diff --git a/datalad_compute/commands/tests/test_collection.py b/datalad_compute/commands/tests/test_collection.py index 1e3781f..c63b5aa 100644 --- a/datalad_compute/commands/tests/test_collection.py +++ b/datalad_compute/commands/tests/test_collection.py @@ -11,7 +11,9 @@ def test_collect(tmp_path): worktree_dir = tmp_path / 'ds1_worktree' worktree_dir.mkdir(parents=True, exist_ok=False) - worktree = dataset.provision(worktree_dir=worktree_dir) + worktree = dataset.provision( + worktree_dir=worktree_dir, + result_renderer='disabled') result_dir = worktree_dir / 'results' / 'sub-01' result_dir.mkdir(parents=True) diff --git a/datalad_compute/commands/tests/test_provision.py b/datalad_compute/commands/tests/test_provision.py index 07bd4fc..6aef0d7 100644 --- 
a/datalad_compute/commands/tests/test_provision.py +++ b/datalad_compute/commands/tests/test_provision.py @@ -47,13 +47,13 @@ def test_worktree_basic(tmp_path): provision_result = dataset.provision( worktree_dir=tmp_path / 'ds1_worktree1', input=inputs, - )[0] + result_renderer='disabled')[0] worktree = Dataset(provision_result['path']) # Check input availability assert all((worktree.pathobj / path).exists() for path in inputs) - dataset.provision(delete=worktree.path) + dataset.provision(delete=worktree.path, result_renderer='disabled') def check_deleted_worktrees(ds: Dataset): with chdir(ds.path): @@ -81,6 +81,7 @@ def test_worktree_globbing(tmp_path): '*_subds0/*_subds1/*.txt', '*_subds0/*_subds1/*_subds2/*.txt', ], + result_renderer='disabled', )[0] worktree = Path(result['path']) @@ -89,7 +90,7 @@ def test_worktree_globbing(tmp_path): path.format(ds_name='ds1') for path in all_paths ) - dataset.provision(delete=worktree) + dataset.provision(delete=worktree, result_renderer='disabled') result = dataset.provision( worktree_dir=tmp_path / 'ds1_worktree2', @@ -99,6 +100,7 @@ def test_worktree_globbing(tmp_path): '*_subds0/*_subds1/b*txt', '*_subds0/*_subds1/*_subds2/b*txt', ], + result_renderer='disabled', )[0] worktree = Path(result['path']) @@ -107,7 +109,7 @@ def test_worktree_globbing(tmp_path): path.format(ds_name='ds1') for path in b_paths ).issubset(worktree_set) - dataset.provision(delete=worktree) + dataset.provision(delete=worktree, result_renderer='disabled') dataset.drop( what='all', @@ -151,7 +153,8 @@ def test_unclean_dataset(tmp_path): results = dataset.provision( input=input_pattern, worktree_dir=tmp_path / 'ds1_worktree1', - on_failure='ignore') + on_failure='ignore', + result_renderer='disabled') assert set((result['status'], result['state']) for result in results) == \ {('error', 'modified'), ('error', 'untracked')} @@ -159,13 +162,15 @@ def test_unclean_dataset(tmp_path): dataset.save() dataset.provision( input=input_pattern, - worktree_dir=tmp_path / 'ds1_worktree2') + worktree_dir=tmp_path / 'ds1_worktree2', + result_renderer='disabled') # Check that non-input file `c.txt` is ignored (dataset.pathobj / 'c.txt').write_text('content') dataset.provision( input=input_pattern, - worktree_dir=tmp_path / 'ds1_worktree3') + worktree_dir=tmp_path / 'ds1_worktree3', + result_renderer='disabled') def test_branch_deletion_after_provision(tmp_path): @@ -192,7 +197,8 @@ def test_not_present_local_datasets(tmp_path): result_renderer='disabled') provisioned_dataset = Dataset( root_ds.provision( - input=['ds000102/README'])[0]['path']) + input=['ds000102/README'], + result_renderer='disabled')[0]['path']) url = _get_submodule_url(provisioned_dataset, 'ds000102') assert url.startswith(f'file://{root_ds.path}') @@ -206,7 +212,8 @@ def test_not_present_local_datasets(tmp_path): root_ds.provision( input=['ds000102/README'], no_globbing=True, - on_failure='ignore')[0]['path']) + on_failure='ignore', + result_renderer='disabled')[0]['path']) url_2 = _get_submodule_url(provisioned_dataset_2, 'ds000102') assert url_2 == 'https://github.com/OpenNeuroDatasets/ds000102' From 7acadbc5bb94bf5ab5a0dd33e698895114174ca1 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 11 Oct 2024 17:21:30 +0200 Subject: [PATCH 120/148] fix a bug in speculative computation URL-adding --- datalad_compute/commands/compute_cmd.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 
91934bc..41b5d58 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -283,11 +283,20 @@ def add_url(dataset: Dataset, # Build the file-specific URL and store it in the annex url = url_base + f'&this={quote(file_path)}' file_dataset_path, file_path = get_file_dataset(dataset.pathobj / file_path) - is_annexed = call_git_success( - ['annex', 'whereis', str(file_path)], - cwd=file_dataset_path, - capture_output=True) - if is_annexed: + + # If the file does not exist and speculative computation is requested, we + # can just add the URL. + if not (dataset.pathobj / file_path).exists() and url_only: + can_add = True + else: + # Check if the file is annexed, otherwise we cannot add a URL + can_add = call_git_success( + ['annex', 'whereis', str(file_path)], + cwd=file_dataset_path, + capture_output=True) + + # Add the URL + if can_add: success = call_git_success( ['annex', 'addurl', url, '--file', file_path] + (['--relaxed'] if url_only else []), From 0f49b3224ecf4ee5eecaa597aa82cd87c6e868c2 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 11 Oct 2024 17:38:30 +0200 Subject: [PATCH 121/148] add a test for speculative computation --- .../commands/tests/test_compute.py | 49 ++++++++++++------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/datalad_compute/commands/tests/test_compute.py b/datalad_compute/commands/tests/test_compute.py index 91200cb..311ee57 100644 --- a/datalad_compute/commands/tests/test_compute.py +++ b/datalad_compute/commands/tests/test_compute.py @@ -1,24 +1,17 @@ -from pathlib import Path from typing import Iterable -import pytest - -from datalad.api import get as datalad_get from datalad_next.datasets import Dataset from datalad_next.tests.fixtures import datalad_cfg -from ... import ( - template_dir, - url_scheme, -) +from ... 
import template_dir from datalad_compute.commands.tests.create_datasets import create_ds_hierarchy test_method = """ -inputs = ['name'] +inputs = ['name', 'file'] use_shell = 'true' executable = 'echo' -arguments = ["Hello {name} > a.txt"] +arguments = ["Hello {name} > {file}"] """ @@ -39,7 +32,7 @@ def _check_content(dataset, assert (dataset.pathobj / file).read_text() == content -def test_duplicated_compuation(tmp_path, datalad_cfg, monkeypatch): +def test_duplicated_computation(tmp_path, datalad_cfg, monkeypatch): datasets = create_ds_hierarchy(tmp_path, 'd1', 0) root_dataset = datasets[0][2] @@ -50,20 +43,40 @@ def test_duplicated_compuation(tmp_path, datalad_cfg, monkeypatch): (template_path / 'test_method').write_text(test_method) root_dataset.save(result_renderer='disabled') - # set annex security related variables to allow compute-URLs - datalad_cfg.set('annex.security.allowed-url-schemes', url_scheme, scope='global') - datalad_cfg.set('annex.security.allowed-ip-addresses', 'all', scope='global') - datalad_cfg.set('annex.security.allow-unverified-downloads', 'ACKTHPPT', scope='global') - # run the same command twice _run_simple_computation(root_dataset) _run_simple_computation(root_dataset) +def test_speculative_computation(tmp_path, datalad_cfg, monkeypatch): + + datasets = create_ds_hierarchy(tmp_path, 'd2', 0) + root_dataset = datasets[0][2] + + # add method template + template_path = root_dataset.pathobj / template_dir + template_path.mkdir(parents=True) + (template_path / 'test_method').write_text(test_method) + root_dataset.save(result_renderer='disabled') + + root_dataset.compute( + template='test_method', + parameter=['name=Robert', 'file=spec.txt'], + output=['spec.txt'], + url_only=True, + result_renderer='disabled') + + # set annex security related variables to allow compute-URLs + datalad_cfg.set('annex.security.allow-unverified-downloads', 'ACKTHPPT', scope='global') + + root_dataset.get('spec.txt') + assert (root_dataset.pathobj / 'spec.txt').read_text() == 'Hello Robert\n' + + def _run_simple_computation(root_dataset: Dataset): - results = root_dataset.compute( + root_dataset.compute( template='test_method', - parameter=['name=Robert'], + parameter=['name=Robert', 'file=a.txt'], output=['a.txt'], result_renderer='disabled') From b99d441093d7100d41bb9453cad53f82403b4d41 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 11 Oct 2024 21:11:22 +0200 Subject: [PATCH 122/148] reduce output --- datalad_compute/commands/compute_cmd.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 41b5d58..33c2089 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -332,7 +332,8 @@ def provide(dataset: Dataset, result = dataset.provision( input=input_patterns, branch=branch, - no_globbing=no_globbing) + no_globbing=no_globbing, + result_renderer='disabled') return Path(result[0]['path']) @@ -352,7 +353,7 @@ def provide_context(dataset: Dataset, yield worktree finally: lgr.debug('un_provide: %s %s', dataset, str(worktree)) - dataset.provision(delete=worktree) + dataset.provision(delete=worktree, result_renderer='disabled') def execute(worktree: Path, From d78105b1f27b512a8e8914ea89c3e21612e758bf Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 11 Oct 2024 17:10:19 +0200 Subject: [PATCH 123/148] remove unnecessary configuration settings from README.md --- README.md | 8 +++----- 1 file changed, 3 insertions(+), 5 
deletions(-) diff --git a/README.md b/README.md index ad1beea..ac4afe3 100644 --- a/README.md +++ b/README.md @@ -40,14 +40,12 @@ pip install . ## Example usage -Install the extension, create a dataset, configure it to use `compute`-URLs +Install the extension and create a dataset ```bash > datalad create compute-test-1 > cd compute-test-1 -> git config annex.security.allowed-url-schemes datalad-make -> git config annex.security.allowed-ip-addresses all ``` Create the template directory and a template @@ -89,9 +87,9 @@ copied there and the output files must be copied back). ```bash > cat name-1.txt -bob +content: bob > cat name-2.txt -alice +content: alice ``` Drop the content of `name-1.txt`, verify it is gone, recreate it via From 87566119ce7d47a5669fa24b51e6dc8f084049f9 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 11 Oct 2024 21:21:03 +0200 Subject: [PATCH 124/148] add instructions for compute special remote to example --- examples/fmriprep docker/readme.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/fmriprep docker/readme.md b/examples/fmriprep docker/readme.md index 7e9d6d1..414b42c 100644 --- a/examples/fmriprep docker/readme.md +++ b/examples/fmriprep docker/readme.md @@ -17,6 +17,12 @@ To keep the command line short, input files, output files, and parameter for the - `output.txt` - `parameter.txt` +Be sure to add a compute special remote to the dataset that contains the folder `derivatives/ds000102`. +This can be done with the following command: +```bash +> git annex initremote compute type=external externaltype=compute encryption=none +``` + The computation can be executed with the following command: ```bash From 69fa9a480ced21b5631e6761afe7bc776714936f Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 11 Oct 2024 21:23:09 +0200 Subject: [PATCH 125/148] remove unnecessary configurations from code --- datalad_compute/annexremotes/tests/test_hierarchies.py | 2 -- datalad_compute/commands/tests/create_datasets.py | 6 +----- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/datalad_compute/annexremotes/tests/test_hierarchies.py b/datalad_compute/annexremotes/tests/test_hierarchies.py index c1d60ef..5e5b9b1 100644 --- a/datalad_compute/annexremotes/tests/test_hierarchies.py +++ b/datalad_compute/annexremotes/tests/test_hierarchies.py @@ -88,8 +88,6 @@ def test_end_to_end(tmp_path, datalad_cfg, monkeypatch, output_pattern): root_dataset.save(result_renderer='disabled') # set annex security related variables to allow compute-URLs - datalad_cfg.set('annex.security.allowed-url-schemes', url_scheme, scope='global') - datalad_cfg.set('annex.security.allowed-ip-addresses', 'all', scope='global') datalad_cfg.set('annex.security.allow-unverified-downloads', 'ACKTHPPT', scope='global') # run compute command diff --git a/datalad_compute/commands/tests/create_datasets.py b/datalad_compute/commands/tests/create_datasets.py index 703b1e6..ba421e5 100644 --- a/datalad_compute/commands/tests/create_datasets.py +++ b/datalad_compute/commands/tests/create_datasets.py @@ -14,11 +14,7 @@ def update_config_for_compute(dataset: Dataset): action='set', scope='local', recursive=True, - spec=[ - ('annex.security.allowed-url-schemes', url_scheme), - ('annex.security.allowed-ip-addresses', 'all'), - ('annex.security.allow-unverified-downloads', 'ACKTHPPT'), - ], + spec=[('annex.security.allow-unverified-downloads', 'ACKTHPPT')], result_renderer='disabled') From 9a2f0efddbda8d0d079cf75e58f1f37cbf6e3d70 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 
11 Oct 2024 21:32:09 +0200 Subject: [PATCH 126/148] disable result renderer in `get`-command --- datalad_compute/commands/provision_cmd.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py index bee1013..948b788 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_compute/commands/provision_cmd.py @@ -310,5 +310,8 @@ def install_locally_available_subdatasets(source_dataset: Dataset, str(subdataset_path.relative_to(parent_path)), 'file://' + str(source_dataset.pathobj / path_from_root)] call_git_lines(args) - worktree.get(path_from_root, get_data=False) + worktree.get( + path_from_root, + get_data=False, + result_renderer='disabled') todo.append(path_from_root) From da57b2afbc853e074d4652e2d8279b7862a547a0 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sun, 13 Oct 2024 08:11:03 +0200 Subject: [PATCH 127/148] fix a bug in worktree_dir handling --- datalad_compute/commands/provision_cmd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py index 948b788..bda8af3 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_compute/commands/provision_cmd.py @@ -138,7 +138,7 @@ def __call__(dataset=None, message=f'delete workspace: {delete.ds.path!r} from dataset {dataset}') return - worktree_dir: Path = worktree_dir or Path(TemporaryDirectory().name) + worktree_dir: Path = Path(worktree_dir) or Path(TemporaryDirectory().name) inputs = input or [] + read_list(input_list) yield from provide(dataset, worktree_dir, branch, inputs, no_globbing) From dfc7a649b66dd70529ee28dc1b895e7fd2a43731 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sat, 12 Oct 2024 12:06:17 +0200 Subject: [PATCH 128/148] remove duplicated code form test_compute.py --- .../commands/tests/test_compute.py | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/datalad_compute/commands/tests/test_compute.py b/datalad_compute/commands/tests/test_compute.py index 311ee57..1397b0b 100644 --- a/datalad_compute/commands/tests/test_compute.py +++ b/datalad_compute/commands/tests/test_compute.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Iterable from datalad_next.datasets import Dataset @@ -34,14 +35,7 @@ def _check_content(dataset, def test_duplicated_computation(tmp_path, datalad_cfg, monkeypatch): - datasets = create_ds_hierarchy(tmp_path, 'd1', 0) - root_dataset = datasets[0][2] - - # add method template - template_path = root_dataset.pathobj / template_dir - template_path.mkdir(parents=True) - (template_path / 'test_method').write_text(test_method) - root_dataset.save(result_renderer='disabled') + root_dataset = _setup_simple_computation(tmp_path) # run the same command twice _run_simple_computation(root_dataset) @@ -50,14 +44,7 @@ def test_duplicated_computation(tmp_path, datalad_cfg, monkeypatch): def test_speculative_computation(tmp_path, datalad_cfg, monkeypatch): - datasets = create_ds_hierarchy(tmp_path, 'd2', 0) - root_dataset = datasets[0][2] - - # add method template - template_path = root_dataset.pathobj / template_dir - template_path.mkdir(parents=True) - (template_path / 'test_method').write_text(test_method) - root_dataset.save(result_renderer='disabled') + root_dataset = _setup_simple_computation(tmp_path) root_dataset.compute( template='test_method', @@ -69,10 +56,24 @@ def test_speculative_computation(tmp_path, datalad_cfg, 
monkeypatch): # set annex security related variables to allow compute-URLs datalad_cfg.set('annex.security.allow-unverified-downloads', 'ACKTHPPT', scope='global') + # Perform the speculative computation root_dataset.get('spec.txt') assert (root_dataset.pathobj / 'spec.txt').read_text() == 'Hello Robert\n' +def _setup_simple_computation(tmp_path: Path) -> Dataset: + datasets = create_ds_hierarchy(tmp_path, 'd1', 0) + root_dataset = datasets[0][2] + + # add method template + template_path = root_dataset.pathobj / template_dir + template_path.mkdir(parents=True) + (template_path / 'test_method').write_text(test_method) + root_dataset.save(result_renderer='disabled') + + return root_dataset + + def _run_simple_computation(root_dataset: Dataset): root_dataset.compute( template='test_method', From 335ecf46d77f7713f4011342a81b22e4d44b004b Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sat, 12 Oct 2024 12:26:57 +0200 Subject: [PATCH 129/148] removed duplicated code --- .../annexremotes/tests/test_hierarchies.py | 19 ++------- .../commands/tests/create_datasets.py | 20 ++++++++- .../commands/tests/test_compute.py | 42 ++++--------------- 3 files changed, 30 insertions(+), 51 deletions(-) diff --git a/datalad_compute/annexremotes/tests/test_hierarchies.py b/datalad_compute/annexremotes/tests/test_hierarchies.py index 5e5b9b1..e8c9048 100644 --- a/datalad_compute/annexremotes/tests/test_hierarchies.py +++ b/datalad_compute/annexremotes/tests/test_hierarchies.py @@ -7,11 +7,9 @@ from datalad_next.datasets import Dataset from datalad_next.tests.fixtures import datalad_cfg -from ... import ( - template_dir, - url_scheme, +from datalad_compute.commands.tests.create_datasets import ( + create_simple_computation_dataset, ) -from datalad_compute.commands.tests.create_datasets import create_ds_hierarchy test_method = """ @@ -78,17 +76,8 @@ def _check_content(dataset, @pytest.mark.parametrize('output_pattern', [output_pattern_static, output_pattern_glob]) def test_end_to_end(tmp_path, datalad_cfg, monkeypatch, output_pattern): - datasets = create_ds_hierarchy(tmp_path, 'd2', 3) - root_dataset = datasets[0][2] - - # add method template - template_path = root_dataset.pathobj / template_dir - template_path.mkdir(parents=True) - (template_path / 'test_method').write_text(test_method) - root_dataset.save(result_renderer='disabled') - - # set annex security related variables to allow compute-URLs - datalad_cfg.set('annex.security.allow-unverified-downloads', 'ACKTHPPT', scope='global') + root_dataset = create_simple_computation_dataset( + tmp_path, 'd2', 3, test_method) # run compute command results = root_dataset.compute( diff --git a/datalad_compute/commands/tests/create_datasets.py b/datalad_compute/commands/tests/create_datasets.py index ba421e5..b760f60 100644 --- a/datalad_compute/commands/tests/create_datasets.py +++ b/datalad_compute/commands/tests/create_datasets.py @@ -5,7 +5,7 @@ from datalad_next.datasets import Dataset from datalad_next.runners import call_git_success -from datalad_compute import url_scheme +from datalad_compute import template_dir def update_config_for_compute(dataset: Dataset): @@ -71,3 +71,21 @@ def create_ds_hierarchy(tmp_path: Path, add_compute_remote(Dataset(root_dataset.pathobj / subdataset_path)) return datasets + + +def create_simple_computation_dataset(tmp_path: Path, + dataset_name: str, + subdataset_levels: int, + test_method: str, + ) -> Dataset: + + datasets = create_ds_hierarchy(tmp_path, dataset_name, subdataset_levels) + root_dataset = datasets[0][2] + + # add 
method template + template_path = root_dataset.pathobj / template_dir + template_path.mkdir(parents=True) + (template_path / 'test_method').write_text(test_method) + root_dataset.save(result_renderer='disabled') + + return root_dataset diff --git a/datalad_compute/commands/tests/test_compute.py b/datalad_compute/commands/tests/test_compute.py index 1397b0b..affd11e 100644 --- a/datalad_compute/commands/tests/test_compute.py +++ b/datalad_compute/commands/tests/test_compute.py @@ -1,11 +1,9 @@ -from pathlib import Path -from typing import Iterable - from datalad_next.datasets import Dataset from datalad_next.tests.fixtures import datalad_cfg -from ... import template_dir -from datalad_compute.commands.tests.create_datasets import create_ds_hierarchy +from datalad_compute.commands.tests.create_datasets import ( + create_simple_computation_dataset, +) test_method = """ @@ -15,27 +13,13 @@ arguments = ["Hello {name} > {file}"] """ - output_pattern = ['a.txt'] -def _drop_files(dataset: Dataset, - files: Iterable[str]): - for file in files: - dataset.drop(file, reckless='availability', result_renderer='disabled') - assert not (dataset.pathobj / file).exists() - - -def _check_content(dataset, - file_content: Iterable[tuple[str, str]] - ): - for file, content in file_content: - assert (dataset.pathobj / file).read_text() == content - - def test_duplicated_computation(tmp_path, datalad_cfg, monkeypatch): - root_dataset = _setup_simple_computation(tmp_path) + root_dataset = create_simple_computation_dataset( + tmp_path, 'ds1', 0, test_method) # run the same command twice _run_simple_computation(root_dataset) @@ -44,7 +28,8 @@ def test_duplicated_computation(tmp_path, datalad_cfg, monkeypatch): def test_speculative_computation(tmp_path, datalad_cfg, monkeypatch): - root_dataset = _setup_simple_computation(tmp_path) + root_dataset = create_simple_computation_dataset( + tmp_path, 'ds1', 0, test_method) root_dataset.compute( template='test_method', @@ -61,19 +46,6 @@ def test_speculative_computation(tmp_path, datalad_cfg, monkeypatch): assert (root_dataset.pathobj / 'spec.txt').read_text() == 'Hello Robert\n' -def _setup_simple_computation(tmp_path: Path) -> Dataset: - datasets = create_ds_hierarchy(tmp_path, 'd1', 0) - root_dataset = datasets[0][2] - - # add method template - template_path = root_dataset.pathobj / template_dir - template_path.mkdir(parents=True) - (template_path / 'test_method').write_text(test_method) - root_dataset.save(result_renderer='disabled') - - return root_dataset - - def _run_simple_computation(root_dataset: Dataset): root_dataset.compute( template='test_method', From 195f0b77abf2bde8c8ff8162ee0f9ffda804009f Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 14 Oct 2024 14:08:33 +0200 Subject: [PATCH 130/148] use Get()-interface, remove unused variable --- datalad_compute/annexremotes/tests/test_compute_remote.py | 2 +- datalad_compute/annexremotes/tests/test_hierarchies.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datalad_compute/annexremotes/tests/test_compute_remote.py b/datalad_compute/annexremotes/tests/test_compute_remote.py index e2f0961..efed519 100644 --- a/datalad_compute/annexremotes/tests/test_compute_remote.py +++ b/datalad_compute/annexremotes/tests/test_compute_remote.py @@ -26,7 +26,7 @@ def __init__(self): self.output = '' self.lines = [] - def write(self, *args, **kwargs): + def write(self, *args, **_): self.output += ''.join(args) lineswith = self.output.splitlines(keepends=True) lineswithout = 
self.output.splitlines(keepends=False) diff --git a/datalad_compute/annexremotes/tests/test_hierarchies.py b/datalad_compute/annexremotes/tests/test_hierarchies.py index e8c9048..0a7c799 100644 --- a/datalad_compute/annexremotes/tests/test_hierarchies.py +++ b/datalad_compute/annexremotes/tests/test_hierarchies.py @@ -3,7 +3,7 @@ import pytest -from datalad.api import get as datalad_get +from datalad.distribution.get import Get as datalad_get from datalad_next.datasets import Dataset from datalad_next.tests.fixtures import datalad_cfg From 7d7a3959d1f1c8a9f8ac0c472cd482e6047b51e2 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 15 Oct 2024 11:55:41 +0200 Subject: [PATCH 131/148] fix a bug in worktree parameter handling --- datalad_compute/commands/provision_cmd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py index bda8af3..ab56871 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_compute/commands/provision_cmd.py @@ -138,7 +138,7 @@ def __call__(dataset=None, message=f'delete workspace: {delete.ds.path!r} from dataset {dataset}') return - worktree_dir: Path = Path(worktree_dir) or Path(TemporaryDirectory().name) + worktree_dir: Path = Path(worktree_dir or TemporaryDirectory().name) inputs = input or [] + read_list(input_list) yield from provide(dataset, worktree_dir, branch, inputs, no_globbing) From f9f1d0589d946ee666839b5641655702356a9a99 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 14 Oct 2024 20:46:01 +0200 Subject: [PATCH 132/148] add subdataset aware input globbing This commit adds input globing that works similarly to the input globbing of `datalad run`. It performs globbing on the worktree, installing all necessary subdatasets that are matched in a globbing expression. As a result, an expression like `**` would install all subdatasets that are reachable from the root dataset. --- datalad_compute/commands/compute_cmd.py | 14 +-- datalad_compute/commands/provision_cmd.py | 120 +++++++++++++++------- 2 files changed, 84 insertions(+), 50 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 33c2089..989bec9 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -129,12 +129,6 @@ class Compute(ValidatedInterface): "that start with '#' are ignored. Line content is stripped " "before used. This is useful if a large number of parameters " "should be provided."), - no_globbing=Parameter( - args=('-n', '--no-globbing',), - doc="Use input pattern as file names and do not apply globbing. " - "This allows to specify files that are no currently present " - "in the source dataset as input. 
Those files will be made " - "available in the worktree."), ) @staticmethod @@ -150,7 +144,6 @@ def __call__(dataset=None, output_list=None, parameter=None, parameter_list=None, - no_globbing=False, ): dataset : Dataset = dataset.ds if dataset else Dataset('.') @@ -178,7 +171,6 @@ def __call__(dataset=None, dataset, branch, input_pattern, - no_globbing, ) as worktree: execute(worktree, template, parameter_dict, output_pattern) output = collect(worktree, dataset, output_pattern) @@ -325,14 +317,12 @@ def get_file_dataset(file: Path) -> tuple[Path, Path]: def provide(dataset: Dataset, branch: str | None, input_patterns: list[str], - no_globbing: bool, ) -> Path: lgr.debug('provide: %s %s %s', dataset, branch, input_patterns) result = dataset.provision( input=input_patterns, branch=branch, - no_globbing=no_globbing, result_renderer='disabled') return Path(result[0]['path']) @@ -341,14 +331,12 @@ def provide(dataset: Dataset, def provide_context(dataset: Dataset, branch: str | None, input_patterns: list[str], - no_globbing: bool = False, ) -> Generator: worktree = provide( dataset, branch=branch, - input_patterns=input_patterns, - no_globbing=no_globbing) + input_patterns=input_patterns) try: yield worktree finally: diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py index ab56871..3264f57 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_compute/commands/provision_cmd.py @@ -6,7 +6,9 @@ from __future__ import annotations import logging +import os import stat +import sys from contextlib import chdir from pathlib import Path from typing import ( @@ -32,6 +34,7 @@ ) from datalad_next.datasets import Dataset from datalad_next.runners import call_git_lines, call_git_success +from hypothesis.strategies import recursive from datalad_compute.utils.glob import resolve_patterns from ..commands.compute_cmd import read_list @@ -98,13 +101,6 @@ class Provision(ValidatedInterface): "that start with '#' are ignored. Line content is stripped " "before used. This is useful if a large number of input file " "patterns should be provided."), - no_globbing=Parameter( - args=('-n', '--no-globbing',), - action='store_true', - doc="Interpret input pattern as file names and do not apply " - "globbing. This allows to specify files that are no currently " - "present in the source dataset as input. 
Those files will be " - "made available in the worktree by the provisioning stage."), worktree_dir=Parameter( args=('-w', '--worktree-dir',), doc="Path of the directory that should become the temporary " @@ -119,7 +115,6 @@ def __call__(dataset=None, delete=None, input=None, input_list=None, - no_globbing=False, worktree_dir=None, ): @@ -140,7 +135,7 @@ def __call__(dataset=None, worktree_dir: Path = Path(worktree_dir or TemporaryDirectory().name) inputs = input or [] + read_list(input_list) - yield from provide(dataset, worktree_dir, branch, inputs, no_globbing) + yield from provide(dataset, worktree_dir, branch, inputs) def remove(dataset: Dataset, @@ -165,7 +160,6 @@ def provide(dataset: Dataset, worktree_dir: Path, source_branch: str | None = None, input_patterns: Iterable[str] | None = None, - no_globbing: bool = False, ) -> Generator: lgr.debug('Provisioning dataset %s at %s', dataset, worktree_dir) @@ -180,33 +174,26 @@ def provide(dataset: Dataset, ) call_git_lines(args, cwd=dataset.pathobj) - if no_globbing: - input_files = set(input_patterns) - else: - input_files = resolve_patterns(dataset.path, input_patterns) - - unclean_elements = get_unclean_elements(dataset, input_files) - if unclean_elements: - for element in unclean_elements: - yield get_status_dict( - action='provision', - path=element['path'], - status='error', - state=element['state'], - message=f'cannot provision {element["state"]} input: {element["path"]!r} from dataset {dataset}') + is_dirty = False + for element in get_dirty_elements(dataset): + is_dirty = True + yield get_status_dict( + action='provision', + path=element['path'], + status='error', + state=element['state'], + message=f'cannot provision {element["state"]} input: {element["path"]!r} from dataset {dataset}') + if is_dirty: return worktree_dataset = Dataset(worktree_dir) - install_required_locally_available_datasets( - dataset, - [Path(i) for i in input_files], - worktree_dataset) # Get all input files in the worktree with chdir(worktree_dataset.path): - for file in input_files: - lgr.debug('provisioning input file %s', file) - worktree_dataset.get(file, result_renderer='disabled') + for pattern in input_patterns: + print(f'XXXXX: pattern: {pattern}', file=sys.stderr, flush=True) + worktree_dataset.get(pattern) + yield get_status_dict( action='provision', path=str(worktree_dir), @@ -214,18 +201,77 @@ def provide(dataset: Dataset, message=f'provisioned dataset: {dataset} in workspace: {worktree_dir!r}',) +def resolve_pattern(dataset: Dataset, + pattern_list: list[str] + ) -> set[Path]: + """Resolve a pattern in the dataset, install all subdatasets that might lead + to a potential match. + + Once all subdatasets are determined and installed, get the content of the + matching files. 
+ """ + + uninstalled_subdataset = get_uninstalled_subdatasets(dataset) + result = set() + for pattern in pattern_list: + pattern_elements = pattern.split(os.sep) + if pattern_elements[0] == '': + lgr.warning('Ignoring absolute input pattern %s', pattern) + continue + result = result.union( + set( + glob_pattern( + dataset, + Path(), + pattern.split(os.sep), + uninstalled_subdataset))) + return result + + +def get_uninstalled_subdatasets(dataset: Dataset) -> list[Path]: + # get the list of all non-installed subdatasets + return [ + Path(result['path']) + for result in dataset.subdatasets(recursive=True, result_renderer='disabled') + if result['state'] == 'absent'] + + +def glob_pattern(root: Dataset, + position: Path, + pattern: list[str], + uninstalled_subdatasets: list[Path] + ) -> list[Path]: + + if not pattern: + return [position] + + if pattern[0] == '**': + result = glob_pattern(root, position, pattern[1:], uninstalled_subdatasets) + else: + result = [] + + for match in (root.pathobj / position).glob(pattern[0]): + if match.is_dir() and match in uninstalled_subdatasets: + lgr.info('Installing subdataset %s to glob input', match) + root.get(str(match), get_data=False) + uninstalled_subdatasets.remove(match) + uninstalled_subdatasets.extend(get_uninstalled_subdatasets(root)) + for submatch in glob_pattern(root, match, pattern[1:], uninstalled_subdatasets): + result.append(submatch) + return result + + def check_results(results: Iterable[dict[str, Any]]): assert not any( result['status'] in ('impossible', 'error') for result in results) -def get_unclean_elements(dataset: Dataset, - paths: Iterable[str] - ) -> list[dict]: - if not paths: - return [] - return list(filter(lambda x: x['state'] != 'clean', dataset.status(paths))) +def get_dirty_elements(dataset: Dataset) -> Generator: + for result in dataset.status(recursive=True): + if result['state'] != 'clean': + if result['type'] == 'file': + yield result def install_required_locally_available_datasets(root_dataset: Dataset, From 198da6b7d41ddde27ca047aab83c9c7314d64452 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 15 Oct 2024 15:33:22 +0200 Subject: [PATCH 133/148] fix an assert --- datalad_compute/commands/compute_cmd.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_compute/commands/compute_cmd.py index 989bec9..b74d36b 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_compute/commands/compute_cmd.py @@ -294,10 +294,10 @@ def add_url(dataset: Dataset, + (['--relaxed'] if url_only else []), cwd=file_dataset_path, capture_output=True) - assert ( - success, - f'\naddurl failed:\nfile_dataset_path: {file_dataset_path}\n' - f'url: {url!r}\nfile_path: {file_path!r}') + assert \ + success, \ + f'\naddurl failed:\nfile_dataset_path: {file_dataset_path}\n' \ + f'url: {url!r}\nfile_path: {file_path!r}' return url From 36c5295390c17ea11bc413a14d31c387fcbe1c6c Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 15 Oct 2024 16:19:27 +0200 Subject: [PATCH 134/148] fix use of `Get()` --- datalad_compute/annexremotes/tests/test_hierarchies.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datalad_compute/annexremotes/tests/test_hierarchies.py b/datalad_compute/annexremotes/tests/test_hierarchies.py index 0a7c799..10f36b8 100644 --- a/datalad_compute/annexremotes/tests/test_hierarchies.py +++ b/datalad_compute/annexremotes/tests/test_hierarchies.py @@ -104,7 +104,7 @@ def test_end_to_end(tmp_path, datalad_cfg, 
monkeypatch, output_pattern): # Go to the subdataset `d2_subds0/d2_subds1` and fetch the content of `a1.txt` # from a compute remote. monkeypatch.chdir(root_dataset.pathobj / 'd2_subds0' / 'd2_subds1') - datalad_get('a1.txt') + datalad_get()('a1.txt') # check that all known files that were computed are added to the annex _check_content(root_dataset, test_file_content) @@ -113,6 +113,6 @@ def test_end_to_end(tmp_path, datalad_cfg, monkeypatch, output_pattern): # check get in subdatasets monkeypatch.chdir(root_dataset.pathobj) - datalad_get('d2_subds0/d2_subds1/a1.txt') + datalad_get()('d2_subds0/d2_subds1/a1.txt') _check_content(root_dataset, test_file_content) From 62a81185012270b650d8422896582f20cd1aefb3 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 15 Oct 2024 16:52:23 +0200 Subject: [PATCH 135/148] remove unused imports --- datalad_compute/commands/provision_cmd.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py index 3264f57..1d62152 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_compute/commands/provision_cmd.py @@ -34,9 +34,7 @@ ) from datalad_next.datasets import Dataset from datalad_next.runners import call_git_lines, call_git_success -from hypothesis.strategies import recursive -from datalad_compute.utils.glob import resolve_patterns from ..commands.compute_cmd import read_list @@ -91,8 +89,9 @@ class Provision(ValidatedInterface): args=('-i', '--input',), action='append', doc="An input file pattern (repeat for multiple inputs, " - "file pattern support python globbing, globbing is expanded " - "in the source dataset"), + "file pattern support python globbing, globbing is done in the " + "worktree and through all matching subdatasets, installing " + "if necessary)."), input_list=Parameter( args=('-I', '--input-list',), doc="Name of a file that contains a list of input file patterns. " From 0742e18c0842ea9cd37b8f7b24452b305c00ce51 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 15 Oct 2024 20:55:40 +0200 Subject: [PATCH 136/148] use `glob.glob` to glob input patterns The method `Path.glob` does not yield dangling links, even if they match the pattern. 
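A quick way to see this behaviour is the following hedged sketch; the temporary directory and link name are made up, and the exact output can differ between Python versions:

```python
# A hand-made dangling symlink stands in for an annexed file whose content
# has been dropped. Path.glob() typically skips it, glob.glob() reports it.
import os
import tempfile
from glob import glob
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    os.symlink('dangling-target', os.path.join(tmp, 'a.txt'))
    print(list(Path(tmp).glob('a.txt')))   # typically []
    print(glob('a.txt', root_dir=tmp))     # ['a.txt']
```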
We use `glob.glob` therefore --- datalad_compute/commands/provision_cmd.py | 49 ++++++++++--------- .../commands/tests/test_provision.py | 7 --- 2 files changed, 26 insertions(+), 30 deletions(-) diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py index 1d62152..76f1818 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_compute/commands/provision_cmd.py @@ -8,8 +8,8 @@ import logging import os import stat -import sys from contextlib import chdir +from glob import glob from pathlib import Path from typing import ( Any, @@ -158,7 +158,7 @@ def prune_worktrees(dataset: Dataset) -> None: def provide(dataset: Dataset, worktree_dir: Path, source_branch: str | None = None, - input_patterns: Iterable[str] | None = None, + input_patterns: list[str] | None = None, ) -> Generator: lgr.debug('Provisioning dataset %s at %s', dataset, worktree_dir) @@ -189,9 +189,8 @@ def provide(dataset: Dataset, # Get all input files in the worktree with chdir(worktree_dataset.path): - for pattern in input_patterns: - print(f'XXXXX: pattern: {pattern}', file=sys.stderr, flush=True) - worktree_dataset.get(pattern) + for path in resolve_patterns(worktree_dataset, input_patterns): + worktree_dataset.get(path) yield get_status_dict( action='provision', @@ -200,30 +199,31 @@ def provide(dataset: Dataset, message=f'provisioned dataset: {dataset} in workspace: {worktree_dir!r}',) -def resolve_pattern(dataset: Dataset, - pattern_list: list[str] - ) -> set[Path]: - """Resolve a pattern in the dataset, install all subdatasets that might lead - to a potential match. +def resolve_patterns(dataset: Dataset, + pattern_list: list[str] + ) -> set[Path]: + """Resolve patterns in the dataset, install all necessary subdatasets and get content Once all subdatasets are determined and installed, get the content of the matching files. 
""" uninstalled_subdataset = get_uninstalled_subdatasets(dataset) + result = set() for pattern in pattern_list: - pattern_elements = pattern.split(os.sep) - if pattern_elements[0] == '': + pattern_parts = pattern.split(os.sep) + + if pattern_parts[0] == '': lgr.warning('Ignoring absolute input pattern %s', pattern) continue - result = result.union( - set( - glob_pattern( - dataset, - Path(), - pattern.split(os.sep), - uninstalled_subdataset))) + + result = result.union(set( + glob_pattern( + dataset, + Path(), + pattern_parts, + uninstalled_subdataset))) return result @@ -249,13 +249,16 @@ def glob_pattern(root: Dataset, else: result = [] - for match in (root.pathobj / position).glob(pattern[0]): - if match.is_dir() and match in uninstalled_subdatasets: + for match in glob(pattern[0], root_dir=root.pathobj / position): + match = position / match + absolute_match = root.pathobj / match + if absolute_match.is_dir() and absolute_match in uninstalled_subdatasets: lgr.info('Installing subdataset %s to glob input', match) root.get(str(match), get_data=False) - uninstalled_subdatasets.remove(match) + uninstalled_subdatasets.remove(absolute_match) uninstalled_subdatasets.extend(get_uninstalled_subdatasets(root)) - for submatch in glob_pattern(root, match, pattern[1:], uninstalled_subdatasets): + submatch_pattern = pattern if pattern[0] == '**' else pattern[1:] + for submatch in glob_pattern(root, match, submatch_pattern, uninstalled_subdatasets): result.append(submatch) return result diff --git a/datalad_compute/commands/tests/test_provision.py b/datalad_compute/commands/tests/test_provision.py index 6aef0d7..e42c388 100644 --- a/datalad_compute/commands/tests/test_provision.py +++ b/datalad_compute/commands/tests/test_provision.py @@ -165,13 +165,6 @@ def test_unclean_dataset(tmp_path): worktree_dir=tmp_path / 'ds1_worktree2', result_renderer='disabled') - # Check that non-input file `c.txt` is ignored - (dataset.pathobj / 'c.txt').write_text('content') - dataset.provision( - input=input_pattern, - worktree_dir=tmp_path / 'ds1_worktree3', - result_renderer='disabled') - def test_branch_deletion_after_provision(tmp_path): dataset = create_ds_hierarchy(tmp_path, 'ds1', 3)[0][2] From eab5a60e476d39bd6fbc5af1ce687b2dd57ff373 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 16 Oct 2024 14:01:26 +0200 Subject: [PATCH 137/148] refactor, add doc-strings, etc. 
--- datalad_compute/commands/provision_cmd.py | 148 ++++++++++++------ .../commands/tests/test_provision.py | 2 + 2 files changed, 103 insertions(+), 47 deletions(-) diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py index 76f1818..f82b26b 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_compute/commands/provision_cmd.py @@ -12,8 +12,8 @@ from glob import glob from pathlib import Path from typing import ( - Any, - Iterable, Generator, + Iterable, + Generator, ) from tempfile import TemporaryDirectory @@ -38,9 +38,6 @@ from ..commands.compute_cmd import read_list -__docformat__ = 'restructuredtext' - - lgr = logging.getLogger('datalad.compute.provision_cmd') @@ -134,7 +131,7 @@ def __call__(dataset=None, worktree_dir: Path = Path(worktree_dir or TemporaryDirectory().name) inputs = input or [] + read_list(input_list) - yield from provide(dataset, worktree_dir, branch, inputs) + yield from provide(dataset, worktree_dir, inputs, branch) def remove(dataset: Dataset, @@ -157,9 +154,26 @@ def prune_worktrees(dataset: Dataset) -> None: def provide(dataset: Dataset, worktree_dir: Path, + input_patterns: list[str], source_branch: str | None = None, - input_patterns: list[str] | None = None, ) -> Generator: + """Provide paths defined by input_patterns in a temporary worktree + + Parameters + ---------- + dataset: Dataset + Dataset that should be provisioned + worktree_dir: Path + Path to a directory that should contain the provisioned worktree + input_patterns: list[str] + List of patterns that describe the input files + source_branch: str | None + Branch that should be provisioned, if None HEAD will be used [optional] + + Returns + ------- + + """ lgr.debug('Provisioning dataset %s at %s', dataset, worktree_dir) @@ -202,15 +216,26 @@ def provide(dataset: Dataset, def resolve_patterns(dataset: Dataset, pattern_list: list[str] ) -> set[Path]: - """Resolve patterns in the dataset, install all necessary subdatasets and get content - - Once all subdatasets are determined and installed, get the content of the - matching files. + """Resolve file patterns in the dataset + + This method will resolve relative path-patterns in the dataset. It will + install all subdatasets that are matched by the patterns. Pattern are + described as outline in `glob.glob`. The method support recursive globbing + of zero or more directories with the pattern: `**`. + + Parameters + ---------- + dataset : Dataset + Dataset in which the patterns should be resolved. + pattern_list : list[str] + List of patterns thatThat should be resolved. + + Returns + ------- + set[Path] + Set of paths that match the patterns. 
""" - - uninstalled_subdataset = get_uninstalled_subdatasets(dataset) - - result = set() + matches = set() for pattern in pattern_list: pattern_parts = pattern.split(os.sep) @@ -218,62 +243,91 @@ def resolve_patterns(dataset: Dataset, lgr.warning('Ignoring absolute input pattern %s', pattern) continue - result = result.union(set( + matches.update( glob_pattern( dataset, Path(), pattern_parts, - uninstalled_subdataset))) - return result + get_uninstalled_subdatasets(dataset))) + return matches -def get_uninstalled_subdatasets(dataset: Dataset) -> list[Path]: - # get the list of all non-installed subdatasets - return [ - Path(result['path']) +def get_uninstalled_subdatasets(dataset: Dataset) -> set[Path]: + """Get a list of the paths of all visible, non-installed subdatasets""" + return set([ + Path(result['path']).relative_to(dataset.pathobj) for result in dataset.subdatasets(recursive=True, result_renderer='disabled') - if result['state'] == 'absent'] + if result['state'] == 'absent']) def glob_pattern(root: Dataset, position: Path, pattern: list[str], - uninstalled_subdatasets: list[Path] - ) -> list[Path]: - + uninstalled_subdatasets: set[Path], + ) -> set[Path]: + """Glob a pattern in a dataset installing subdatasets if necessary + + Parameters + ---------- + root: Dataset + The dataset in which the pattern should be resolved. + position: Path + A relative path that denotes the position in the dataset from which a + pattern is matched. + pattern: list[str] + The path-elements of the pattern. For example `['*', 'a', '*.txt']` + represents the pattern `'*/a/*.txt'`. + uninstalled_subdatasets: set[Path] + A set that contains all currently known uninstalled subdatasets. This + set will be updated in the method + + Returns + ------- + set[Path] + A set that contains all paths that match the pattern. + """ if not pattern: - return [position] + return {position} + # If the pattern starts with `**` we have to glob the remainder of the + # pattern from this position. if pattern[0] == '**': - result = glob_pattern(root, position, pattern[1:], uninstalled_subdatasets) + result = glob_pattern( + root, + position, + pattern[1:], + uninstalled_subdatasets) else: - result = [] - - for match in glob(pattern[0], root_dir=root.pathobj / position): + result = set() + + # Match all elements at the current position with the first part of the + # pattern. + for match in glob( + '*' if pattern[0] == '**' else pattern[0], + root_dir=root.pathobj / position + ): match = position / match - absolute_match = root.pathobj / match - if absolute_match.is_dir() and absolute_match in uninstalled_subdatasets: + + # If the match is a directory that is in uninstalled subdatasets, + # install the dataset and updated uninstalled datasets before proceeding + # with matching the pattern. + if match.is_dir() and match in uninstalled_subdatasets: lgr.info('Installing subdataset %s to glob input', match) - root.get(str(match), get_data=False) - uninstalled_subdatasets.remove(absolute_match) - uninstalled_subdatasets.extend(get_uninstalled_subdatasets(root)) + root.get(str(match), get_data=False, result_renderer='disabled') + uninstalled_subdatasets.remove(match) + uninstalled_subdatasets.update(get_uninstalled_subdatasets(root)) + + # We have a match, try to match the remainder of the pattern. 
submatch_pattern = pattern if pattern[0] == '**' else pattern[1:] - for submatch in glob_pattern(root, match, submatch_pattern, uninstalled_subdatasets): - result.append(submatch) + result.update(glob_pattern(root, match, submatch_pattern, uninstalled_subdatasets)) return result -def check_results(results: Iterable[dict[str, Any]]): - assert not any( - result['status'] in ('impossible', 'error') - for result in results) - - def get_dirty_elements(dataset: Dataset) -> Generator: + """Get all dirty elements in the dataset""" for result in dataset.status(recursive=True): - if result['state'] != 'clean': - if result['type'] == 'file': - yield result + if result['type'] == 'file' and result['state'] != 'clean': + yield result def install_required_locally_available_datasets(root_dataset: Dataset, diff --git a/datalad_compute/commands/tests/test_provision.py b/datalad_compute/commands/tests/test_provision.py index e42c388..991a5b3 100644 --- a/datalad_compute/commands/tests/test_provision.py +++ b/datalad_compute/commands/tests/test_provision.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import Iterable +import pytest from datalad_next.datasets import Dataset from datalad_next.runners import call_git_lines @@ -182,6 +183,7 @@ def test_branch_deletion_after_provision(tmp_path): assert worktree.name not in branches +@pytest.mark.skip(reason="local subdatasets are currently ignored") def test_not_present_local_datasets(tmp_path): root_ds = Dataset(tmp_path / 'ds1') root_ds.create(cfg_proc='text2git', result_renderer='disabled') From 485ac98329eb6b21060ae7d86e015eb9bd6824fa Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 16 Oct 2024 16:46:57 +0200 Subject: [PATCH 138/148] install locally available subdataset first --- datalad_compute/commands/provision_cmd.py | 142 +++++++----------- .../commands/tests/test_provision.py | 2 - 2 files changed, 57 insertions(+), 87 deletions(-) diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_compute/commands/provision_cmd.py index f82b26b..7200101 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_compute/commands/provision_cmd.py @@ -203,7 +203,7 @@ def provide(dataset: Dataset, # Get all input files in the worktree with chdir(worktree_dataset.path): - for path in resolve_patterns(worktree_dataset, input_patterns): + for path in resolve_patterns(dataset, worktree_dataset, input_patterns): worktree_dataset.get(path) yield get_status_dict( @@ -214,6 +214,7 @@ def provide(dataset: Dataset, def resolve_patterns(dataset: Dataset, + worktree: Dataset, pattern_list: list[str] ) -> set[Path]: """Resolve file patterns in the dataset @@ -225,8 +226,10 @@ def resolve_patterns(dataset: Dataset, Parameters ---------- - dataset : Dataset - Dataset in which the patterns should be resolved. + dataset: Dataset, + Dataset that should be provisioned. + worktree : Dataset + Worktree dataset, in which the patterns should be resolved. pattern_list : list[str] List of patterns thatThat should be resolved. 
@@ -245,10 +248,11 @@ def resolve_patterns(dataset: Dataset, matches.update( glob_pattern( - dataset, + worktree, Path(), pattern_parts, - get_uninstalled_subdatasets(dataset))) + get_uninstalled_subdatasets(worktree), + get_installed_subdatasets(dataset))) return matches @@ -264,6 +268,7 @@ def glob_pattern(root: Dataset, position: Path, pattern: list[str], uninstalled_subdatasets: set[Path], + locally_available_subdatasets: Iterable[tuple[Path, Path, Path]], ) -> set[Path]: """Glob a pattern in a dataset installing subdatasets if necessary @@ -279,7 +284,10 @@ def glob_pattern(root: Dataset, represents the pattern `'*/a/*.txt'`. uninstalled_subdatasets: set[Path] A set that contains all currently known uninstalled subdatasets. This - set will be updated in the method + set will be updated in the method. + locally_available_subdatasets: set[Path] + A set that contains all datasets that are available in the dataset for + which the worktree is created. Returns ------- @@ -296,7 +304,8 @@ def glob_pattern(root: Dataset, root, position, pattern[1:], - uninstalled_subdatasets) + uninstalled_subdatasets, + locally_available_subdatasets) else: result = set() @@ -313,13 +322,22 @@ def glob_pattern(root: Dataset, # with matching the pattern. if match.is_dir() and match in uninstalled_subdatasets: lgr.info('Installing subdataset %s to glob input', match) - root.get(str(match), get_data=False, result_renderer='disabled') - uninstalled_subdatasets.remove(match) - uninstalled_subdatasets.update(get_uninstalled_subdatasets(root)) + install_subdataset( + root, + match, + uninstalled_subdatasets, + locally_available_subdatasets) # We have a match, try to match the remainder of the pattern. submatch_pattern = pattern if pattern[0] == '**' else pattern[1:] - result.update(glob_pattern(root, match, submatch_pattern, uninstalled_subdatasets)) + result.update( + glob_pattern( + root, + match, + submatch_pattern, + uninstalled_subdatasets, + locally_available_subdatasets)) + return result @@ -330,90 +348,44 @@ def get_dirty_elements(dataset: Dataset) -> Generator: yield result -def install_required_locally_available_datasets(root_dataset: Dataset, - input_files: list[Path], - worktree: Dataset, - ) -> None: - """Ensure that local and locally changed subdatasets can be provisioned. - - If subdatasets are only available within the root dataset, either because - they are not published or because they are locally modified, the provision - has to use those. - - This means we have to adapt cloning candidates before trying to install - a subdataset. This is done by: - - - Determining which subdatasets are installed in the root dataset - - Determining which of those subdatasets are required by the input files - - Adjust the `.gitmodules` files and install the required local datasets - - All other datasets are installed as usual, e.g. via `datalad get`. 
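The rationale removed above is preserved by the new `install_subdataset` helper below: point the recorded submodule URL at the locally available copy, then install from it. A minimal sketch of that single step, using `call_git_lines` from datalad-next as in this patch (all paths are placeholders):

    from pathlib import Path

    from datalad_next.datasets import Dataset
    from datalad_next.runners import call_git_lines

    worktree = Dataset('/tmp/worktree')           # provisioned worktree (placeholder)
    local_copy = Path('/data/source_ds/sub_ds')   # subdataset copy in the source dataset (placeholder)
    parent_in_worktree = Path('.')                # superdataset that records the submodule
    submodule_path = Path('sub_ds')               # submodule path relative to its parent

    # Rewrite the recorded submodule URL so that cloning uses the local copy ...
    call_git_lines([
        '-C', str(worktree.pathobj / parent_in_worktree),
        'submodule', 'set-url', '--',
        str(submodule_path), 'file://' + str(local_copy),
    ])
    # ... and install the subdataset (without file content) from there.
    worktree.get(str(submodule_path), get_data=False, result_renderer='disabled')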
- """ - - # Determine which subdatasets are installed in the root dataset - subdataset_info = get_subdataset_info(root_dataset) - - # Determine which subdatasets are required by the input files - required_subdatasets = determine_required_subdatasets( - subdataset_info, - input_files) - - install_locally_available_subdatasets( - root_dataset, - required_subdatasets, - worktree) +def install_subdataset(worktree: Dataset, + subdataset_path: Path, + uninstalled_subdatasets: set[Path], + locally_available_datasets: Iterable[tuple[Path, Path, Path]], + ) -> None: + """Install a subdataset, prefer locally available subdatasets""" + local_subdataset = ([ + l + for l in locally_available_datasets + if l[2] == subdataset_path] or [None])[0] + + if local_subdataset: + absolute_path, parent_ds_path, path_from_root = local_subdataset + # Set the URL to the full source path + args = ['-C', str(worktree.pathobj / parent_ds_path), + 'submodule', 'set-url', '--', + str(path_from_root.relative_to(parent_ds_path)), + 'file://' + str(absolute_path)] + call_git_lines(args) + worktree.get( + str(subdataset_path), + get_data=False, + result_renderer='disabled') + uninstalled_subdatasets.remove(subdataset_path) + uninstalled_subdatasets.update(get_uninstalled_subdatasets(worktree)) -def get_subdataset_info(dataset: Dataset) -> Iterable[tuple[Path, Path, Path]]: +def get_installed_subdatasets(dataset: Dataset + ) -> Iterable[tuple[Path, Path, Path]]: results = dataset.subdatasets( recursive=True, result_renderer='disabled') return [ ( Path(result['path']), - Path(result['parentds']), + Path(result['parentds']).relative_to(dataset.pathobj), Path(result['path']).relative_to(dataset.pathobj) ) for result in results if result['state'] == 'present' ] - - -def determine_required_subdatasets(subdataset_info: Iterable[tuple[Path, Path, Path]], - input_files: list[Path], - ) -> set[tuple[Path, Path, Path]]: - required_set = set() - for file in input_files: - # if the path can be expressed as relative to the subdataset path. - # the subdataset is required, and so are all subdatasets above it. - for subdataset_path, parent_path, path_from_root in subdataset_info: - try: - file.relative_to(path_from_root) - required_set.add((subdataset_path, parent_path, path_from_root)) - except ValueError: - pass - return required_set - - -def install_locally_available_subdatasets(source_dataset: Dataset, - required_subdatasets: set[tuple[Path, Path, Path]], - worktree: Dataset, - ) -> None: - """Install the required subdatasets from the source dataset in the worktree. 
- """ - todo = [Path('.')] - while todo: - current_root = todo.pop() - for subdataset_path, parent_path, path_from_root in required_subdatasets: - if not current_root == parent_path.relative_to(source_dataset.pathobj): - continue - # Set the URL to the full source path - args = ['-C', str(worktree.pathobj / current_root), - 'submodule', 'set-url', '--', - str(subdataset_path.relative_to(parent_path)), - 'file://' + str(source_dataset.pathobj / path_from_root)] - call_git_lines(args) - worktree.get( - path_from_root, - get_data=False, - result_renderer='disabled') - todo.append(path_from_root) diff --git a/datalad_compute/commands/tests/test_provision.py b/datalad_compute/commands/tests/test_provision.py index 991a5b3..66b5eee 100644 --- a/datalad_compute/commands/tests/test_provision.py +++ b/datalad_compute/commands/tests/test_provision.py @@ -183,7 +183,6 @@ def test_branch_deletion_after_provision(tmp_path): assert worktree.name not in branches -@pytest.mark.skip(reason="local subdatasets are currently ignored") def test_not_present_local_datasets(tmp_path): root_ds = Dataset(tmp_path / 'ds1') root_ds.create(cfg_proc='text2git', result_renderer='disabled') @@ -206,7 +205,6 @@ def test_not_present_local_datasets(tmp_path): provisioned_dataset_2 = Dataset( root_ds.provision( input=['ds000102/README'], - no_globbing=True, on_failure='ignore', result_renderer='disabled')[0]['path']) url_2 = _get_submodule_url(provisioned_dataset_2, 'ds000102') From 15470ef2248efbd8f0b0ebb09e6ad47945b7d257 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 17 Oct 2024 15:44:08 +0200 Subject: [PATCH 139/148] rename `compute` to `remake` This commit performs renaming of the old `compute` names into the `remake` names. The following names are used now: - command: `datalad make` - annex remote: `datalad-remake` - url scheme: `datalad-make:` - remake subdirectory name: `.datalad/make` --- .github/workflows/mypy-pr.yml | 2 +- .github/workflows/mypy-project.yml | 2 +- datalad_compute/__init__.py | 45 --------------- datalad_compute/tests/test_register.py | 7 --- datalad_remake/__init__.py | 56 +++++++++++++------ .../annexremotes/__init__.py | 0 .../annexremotes/remake_remote.py | 15 ++--- .../annexremotes/tests/__init__.py | 0 .../annexremotes/tests/test_hierarchies.py | 8 +-- .../annexremotes/tests/test_remake_remote.py | 20 +++---- .../commands/__init__.py | 0 .../commands/make_cmd.py | 16 +++--- .../commands/provision_cmd.py | 8 +-- .../commands/tests/__init__.py | 0 .../commands/tests/create_datasets.py | 20 +++---- .../commands/tests/test_collection.py | 2 +- .../commands/tests/test_compute.py | 9 +-- .../commands/tests/test_listhandling.py | 2 +- .../commands/tests/test_provision.py | 3 +- datalad_remake/tests/test_dummy.py | 6 -- datalad_remake/tests/test_register.py | 6 ++ .../utils/__init__.py | 0 .../utils/compute.py | 4 +- .../utils/glob.py | 0 .../utils/tests/__init__.py | 0 .../utils/tests/test_substitution.py | 0 pyproject.toml | 21 +++++-- requirements-devel.txt | 3 + 28 files changed, 118 insertions(+), 137 deletions(-) delete mode 100644 datalad_compute/__init__.py delete mode 100644 datalad_compute/tests/test_register.py rename {datalad_compute => datalad_remake}/annexremotes/__init__.py (100%) rename datalad_compute/annexremotes/compute_remote.py => datalad_remake/annexremotes/remake_remote.py (94%) rename {datalad_compute => datalad_remake}/annexremotes/tests/__init__.py (100%) rename {datalad_compute => datalad_remake}/annexremotes/tests/test_hierarchies.py (95%) rename 
datalad_compute/annexremotes/tests/test_compute_remote.py => datalad_remake/annexremotes/tests/test_remake_remote.py (82%) rename {datalad_compute => datalad_remake}/commands/__init__.py (100%) rename datalad_compute/commands/compute_cmd.py => datalad_remake/commands/make_cmd.py (97%) rename {datalad_compute => datalad_remake}/commands/provision_cmd.py (98%) rename {datalad_compute => datalad_remake}/commands/tests/__init__.py (100%) rename {datalad_compute => datalad_remake}/commands/tests/create_datasets.py (84%) rename {datalad_compute => datalad_remake}/commands/tests/test_collection.py (96%) rename {datalad_compute => datalad_remake}/commands/tests/test_compute.py (87%) rename {datalad_compute => datalad_remake}/commands/tests/test_listhandling.py (95%) rename {datalad_compute => datalad_remake}/commands/tests/test_provision.py (99%) delete mode 100644 datalad_remake/tests/test_dummy.py create mode 100644 datalad_remake/tests/test_register.py rename {datalad_compute => datalad_remake}/utils/__init__.py (100%) rename {datalad_compute => datalad_remake}/utils/compute.py (91%) rename {datalad_compute => datalad_remake}/utils/glob.py (100%) rename {datalad_compute => datalad_remake}/utils/tests/__init__.py (100%) rename {datalad_compute => datalad_remake}/utils/tests/test_substitution.py (100%) diff --git a/.github/workflows/mypy-pr.yml b/.github/workflows/mypy-pr.yml index a49660d..b936223 100644 --- a/.github/workflows/mypy-pr.yml +++ b/.github/workflows/mypy-pr.yml @@ -38,4 +38,4 @@ jobs: # with focused error reports, rather than barfing a huge complaint # that is unrelated to the changeset someone has been working on. # run on the oldest supported Python version - hatch run types:mypy --python-version 3.9 --follow-imports skip --pretty --show-error-context ${{ steps.changed-py-files.outputs.all_changed_files }} + hatch run types:mypy --python-version 3.11 --follow-imports skip --pretty --show-error-context ${{ steps.changed-py-files.outputs.all_changed_files }} diff --git a/.github/workflows/mypy-project.yml b/.github/workflows/mypy-project.yml index 3a1bb1b..276506d 100644 --- a/.github/workflows/mypy-project.yml +++ b/.github/workflows/mypy-project.yml @@ -26,4 +26,4 @@ jobs: hatch run types:mypy --install-types --non-interactive --follow-imports skip datalad_core # run mypy on the full project. # run on the oldest supported Python version - hatch run types:mypy --python-version 3.9 --pretty --show-error-context datalad_core + hatch run types:mypy --python-version 3.11 --pretty --show-error-context datalad_core diff --git a/datalad_compute/__init__.py b/datalad_compute/__init__.py deleted file mode 100644 index 301b153..0000000 --- a/datalad_compute/__init__.py +++ /dev/null @@ -1,45 +0,0 @@ -"""DataLad compute extension""" - -__docformat__ = 'restructuredtext' - -import logging -lgr = logging.getLogger('datalad.compute') - -# Defines a datalad command suite. 
-# This variable must be bound as a setuptools entrypoint -# to be found by datalad -command_suite = ( - # description of the command suite, displayed in cmdline help - "Demo DataLad command suite", - [ - # specification of a command, any number of commands can be defined - ( - # importable module that contains the command implementation - 'datalad_compute.commands.compute_cmd', - # name of the command class implementation in above module - 'Compute', - # optional name of the command in the cmdline API - 'compute', - # optional name of the command in the Python API - 'compute' - ), - ( - # importable module that contains the command implementation - 'datalad_compute.commands.provision_cmd', - # name of the command class implementation in above module - 'Provision', - # optional name of the command in the cmdline API - 'provision', - # optional name of the command in the Python API - 'provision' - ), - ] -) - -from . import _version -__version__ = _version.get_versions()['version'] - - -url_scheme = 'datalad-make' -template_dir = '.datalad/compute/methods' -specification_dir = '.datalad/compute/specifications' diff --git a/datalad_compute/tests/test_register.py b/datalad_compute/tests/test_register.py deleted file mode 100644 index fc9ebdb..0000000 --- a/datalad_compute/tests/test_register.py +++ /dev/null @@ -1,7 +0,0 @@ -from datalad.tests.utils_pytest import assert_result_count - - -def test_register(): - import datalad.api as da - assert hasattr(da, 'compute') - assert hasattr(da, 'provision') diff --git a/datalad_remake/__init__.py b/datalad_remake/__init__.py index 3ac1200..15d46c4 100644 --- a/datalad_remake/__init__.py +++ b/datalad_remake/__init__.py @@ -1,25 +1,47 @@ +"""DataLad remake extension""" + from __future__ import annotations from datalad_remake._version import __version__ + __all__ = [ '__version__', ] -# command_suite = ( -# # description of the command suite, displayed in cmdline help -# "Demo DataLad command suite", -# [ -# # specification of a command, any number of commands can be defined -# ( -# # importable module that contains the command implementation -# 'datalad_remake.commands.compute_cmd', -# # name of the command class implementation in above module -# 'Compute', -# # optional name of the command in the cmdline API -# 'compute', -# # optional name of the command in the Python API -# 'compute' -# ), -# ] -# ) + +# Defines a datalad command suite. 
+# This variable must be bound as a setuptools entrypoint +# to be found by datalad +command_suite = ( + # description of the command suite, displayed in cmdline help + "DataLad remake command suite", + [ + # specification of a command, any number of commands can be defined + ( + # importable module that contains the command implementation + 'datalad_remake.commands.make_cmd', + # name of the command class implementation in above module + 'Make', + # optional name of the command in the cmdline API + 'make', + # optional name of the command in the Python API + 'make' + ), + ( + # importable module that contains the command implementation + 'datalad_remake.commands.provision_cmd', + # name of the command class implementation in above module + 'Provision', + # optional name of the command in the cmdline API + 'provision', + # optional name of the command in the Python API + 'provision' + ), + ] +) + + +url_scheme = 'datalad-remake' +template_dir = '.datalad/make/methods' +specification_dir = '.datalad/make/specifications' diff --git a/datalad_compute/annexremotes/__init__.py b/datalad_remake/annexremotes/__init__.py similarity index 100% rename from datalad_compute/annexremotes/__init__.py rename to datalad_remake/annexremotes/__init__.py diff --git a/datalad_compute/annexremotes/compute_remote.py b/datalad_remake/annexremotes/remake_remote.py similarity index 94% rename from datalad_compute/annexremotes/compute_remote.py rename to datalad_remake/annexremotes/remake_remote.py index 60cedd3..2102492 100644 --- a/datalad_compute/annexremotes/compute_remote.py +++ b/datalad_remake/annexremotes/remake_remote.py @@ -28,17 +28,18 @@ url_scheme, ) -from ..commands.compute_cmd import ( +from ..commands.make_cmd import ( execute, get_file_dataset, provide_context, ) from ..utils.glob import resolve_patterns -lgr = logging.getLogger('datalad.compute.annexremotes.compute') +lgr = logging.getLogger('datalad.remake.annexremotes.remake') -class ComputeRemote(SpecialRemote): + +class RemakeRemote(SpecialRemote): def __init__(self, annex: Master): super().__init__(annex) @@ -136,7 +137,7 @@ def transfer_retrieve(self, key: str, file_name: str) -> None: self.annex.debug('Leaving provision context') def checkpresent(self, key: str) -> bool: - # See if at least one URL with the compute url-scheme is present + # See if at least one URL with the remake url-scheme is present return self.annex.geturls(key, f'{url_scheme}:') != [] def _find_dataset(self, @@ -195,7 +196,7 @@ def _collect(self, def main(): """cmdline entry point""" super_main( - cls=ComputeRemote, - remote_name='compute', - description="Access to computed data", + cls=RemakeRemote, + remote_name='datalad-remake', + description='Remake data based on datalad-remake specifications', ) diff --git a/datalad_compute/annexremotes/tests/__init__.py b/datalad_remake/annexremotes/tests/__init__.py similarity index 100% rename from datalad_compute/annexremotes/tests/__init__.py rename to datalad_remake/annexremotes/tests/__init__.py diff --git a/datalad_compute/annexremotes/tests/test_hierarchies.py b/datalad_remake/annexremotes/tests/test_hierarchies.py similarity index 95% rename from datalad_compute/annexremotes/tests/test_hierarchies.py rename to datalad_remake/annexremotes/tests/test_hierarchies.py index 10f36b8..ccb344a 100644 --- a/datalad_compute/annexremotes/tests/test_hierarchies.py +++ b/datalad_remake/annexremotes/tests/test_hierarchies.py @@ -7,7 +7,7 @@ from datalad_next.datasets import Dataset from datalad_next.tests.fixtures import datalad_cfg 
-from datalad_compute.commands.tests.create_datasets import ( +from datalad_remake.commands.tests.create_datasets import ( create_simple_computation_dataset, ) @@ -79,8 +79,8 @@ def test_end_to_end(tmp_path, datalad_cfg, monkeypatch, output_pattern): root_dataset = create_simple_computation_dataset( tmp_path, 'd2', 3, test_method) - # run compute command - results = root_dataset.compute( + # run `make` command + results = root_dataset.make( template='test_method', parameter=[ 'first=first', @@ -102,7 +102,7 @@ def test_end_to_end(tmp_path, datalad_cfg, monkeypatch, output_pattern): _drop_files(root_dataset, collected_output) # Go to the subdataset `d2_subds0/d2_subds1` and fetch the content of `a1.txt` - # from a compute remote. + # from a datalad-remake remote. monkeypatch.chdir(root_dataset.pathobj / 'd2_subds0' / 'd2_subds1') datalad_get()('a1.txt') diff --git a/datalad_compute/annexremotes/tests/test_compute_remote.py b/datalad_remake/annexremotes/tests/test_remake_remote.py similarity index 82% rename from datalad_compute/annexremotes/tests/test_compute_remote.py rename to datalad_remake/annexremotes/tests/test_remake_remote.py index efed519..8fcbebe 100644 --- a/datalad_compute/annexremotes/tests/test_compute_remote.py +++ b/datalad_remake/annexremotes/tests/test_remake_remote.py @@ -1,13 +1,13 @@ import subprocess -import sys from queue import Queue from annexremote import Master -from ..compute_remote import ComputeRemote -from datalad_compute.commands.tests.create_datasets import create_ds_hierarchy +from ..remake_remote import RemakeRemote from ... import specification_dir -from ...commands.compute_cmd import build_json +from ...commands.make_cmd import build_json +from datalad_remake.commands.tests.create_datasets import create_ds_hierarchy + template = """ inputs = ['content'] @@ -68,7 +68,7 @@ def test_compute_remote_main(tmp_path, monkeypatch): dataset = create_ds_hierarchy(tmp_path, 'ds1', 0)[0][2] monkeypatch.chdir(dataset.path) - template_path = dataset.pathobj / '.datalad' / 'compute' / 'methods' + template_path = dataset.pathobj / '.datalad' / 'make' / 'methods' template_path.mkdir(parents=True) (template_path / 'echo').write_text(template) dataset.save() @@ -95,7 +95,7 @@ def test_compute_remote_main(tmp_path, monkeypatch): # single thread and do not get back control once `master.listen` is called # below. input.send('PREPARE\n') - input.send(f'TRANSFER RETRIEVE {key} {str(tmp_path / "computed.txt")}\n') + input.send(f'TRANSFER RETRIEVE {key} {str(tmp_path / "remade.txt")}\n') url = ( 'datalad-make:///?' f'root_version={dataset.repo.get_hexsha()}' @@ -110,10 +110,10 @@ def test_compute_remote_main(tmp_path, monkeypatch): output = MockedOutput() master = Master(output=output) - remote = ComputeRemote(master) + remote = RemakeRemote(master) master.LinkRemote(remote) master.Listen(input=input) - # At this point the compute remote should have executed the computation - # and written the result. - assert (tmp_path / 'computed.txt').read_text().strip() == 'content: some_string' + # At this point the datalad-remake remote should have executed the + # computation and written the result. 
+ assert (tmp_path / 'remade.txt').read_text().strip() == 'content: some_string' diff --git a/datalad_compute/commands/__init__.py b/datalad_remake/commands/__init__.py similarity index 100% rename from datalad_compute/commands/__init__.py rename to datalad_remake/commands/__init__.py diff --git a/datalad_compute/commands/compute_cmd.py b/datalad_remake/commands/make_cmd.py similarity index 97% rename from datalad_compute/commands/compute_cmd.py rename to datalad_remake/commands/make_cmd.py index b74d36b..153f14f 100644 --- a/datalad_compute/commands/compute_cmd.py +++ b/datalad_remake/commands/make_cmd.py @@ -1,4 +1,4 @@ -"""DataLad compute command""" +"""DataLad make command""" from __future__ import annotations @@ -45,15 +45,13 @@ from ..utils.glob import resolve_patterns -__docformat__ = 'restructuredtext' - -lgr = logging.getLogger('datalad.compute.compute_cmd') +lgr = logging.getLogger('datalad.remake.make_cmd') # decoration auto-generates standard help @build_doc # all commands must be derived from Interface -class Compute(ValidatedInterface): +class Make(ValidatedInterface): # first docstring line is used a short description in the cmdline help # the rest is put in the verbose help and manpage """Specify a computation and optionally execute it @@ -84,7 +82,7 @@ class Compute(ValidatedInterface): template=Parameter( args=('template',), doc="Name of the computing template (template should be present " - "in $DATASET/.datalad/compute/methods)"), + "in $DATASET/.datalad/remake/methods)"), branch=Parameter( args=('-b', '--branch',), doc="Branch (or commit) that should be used for computation, if " @@ -132,7 +130,7 @@ class Compute(ValidatedInterface): ) @staticmethod - @datasetmethod(name='compute') + @datasetmethod(name='make') @eval_results def __call__(dataset=None, url_only=False, @@ -178,7 +176,7 @@ def __call__(dataset=None, for out in output: url = add_url(dataset, out, url_base, url_only) yield get_status_dict( - action='compute', + action='make', path=str(dataset.pathobj / out), status='ok', message=f'added url: {url!r} to {out!r} in {dataset.pathobj}',) @@ -203,7 +201,7 @@ def get_url(dataset: Dataset, output_pattern: list[str], ) -> tuple[str, str]: - # If something goes wrong after the compute specification was saved, + # If something goes wrong after the make specification was saved, # the dataset state should be reset to `branch` reset_branch = branch or dataset.repo.get_hexsha() diff --git a/datalad_compute/commands/provision_cmd.py b/datalad_remake/commands/provision_cmd.py similarity index 98% rename from datalad_compute/commands/provision_cmd.py rename to datalad_remake/commands/provision_cmd.py index 7200101..efe0cc9 100644 --- a/datalad_compute/commands/provision_cmd.py +++ b/datalad_remake/commands/provision_cmd.py @@ -35,10 +35,10 @@ from datalad_next.datasets import Dataset from datalad_next.runners import call_git_lines, call_git_success -from ..commands.compute_cmd import read_list +from ..commands.make_cmd import read_list -lgr = logging.getLogger('datalad.compute.provision_cmd') +lgr = logging.getLogger('datalad.remake.provision_cmd') # decoration auto-generates standard help @@ -47,14 +47,14 @@ class Provision(ValidatedInterface): # first docstring line is used a short description in the cmdline help # the rest is put in the verbose help and manpage - """Provision inputs for a compute command + """Provision inputs for a `make` command This command provides a temporary, partial copy of the dataset in a separate tree, called a "worktree". 
The worktree will contain all files that are specified by the input patterns. All necessary subdatasets will be installed. If a subdataset is locally available in the source dataset, it will be installed from there. Its main purpose is to provide an isolated - environment for "compute" commands. + environment for `make` commands. """ _validator_ = EnsureCommandParameterization(dict( diff --git a/datalad_compute/commands/tests/__init__.py b/datalad_remake/commands/tests/__init__.py similarity index 100% rename from datalad_compute/commands/tests/__init__.py rename to datalad_remake/commands/tests/__init__.py diff --git a/datalad_compute/commands/tests/create_datasets.py b/datalad_remake/commands/tests/create_datasets.py similarity index 84% rename from datalad_compute/commands/tests/create_datasets.py rename to datalad_remake/commands/tests/create_datasets.py index b760f60..ca2c545 100644 --- a/datalad_compute/commands/tests/create_datasets.py +++ b/datalad_remake/commands/tests/create_datasets.py @@ -5,11 +5,11 @@ from datalad_next.datasets import Dataset from datalad_next.runners import call_git_success -from datalad_compute import template_dir +from datalad_remake import template_dir -def update_config_for_compute(dataset: Dataset): - # set annex security related variables to allow compute-URLs +def update_config_for_remake(dataset: Dataset): + # set annex security related variables to allow remake-URLs dataset.configuration( action='set', scope='local', @@ -18,11 +18,11 @@ def update_config_for_compute(dataset: Dataset): result_renderer='disabled') -def add_compute_remote(dataset: Dataset): +def add_remake_remote(dataset: Dataset): call_git_success([ '-C', dataset.path, - 'annex', 'initremote', 'compute', - 'type=external', 'externaltype=compute', + 'annex', 'initremote', 'remake', + 'type=external', 'externaltype=datalad-remake', 'encryption=none'], capture_output=True) @@ -61,14 +61,14 @@ def create_ds_hierarchy(tmp_path: Path, dataset[2].save(result_renderer='disabled') root_dataset.get(recursive=True, result_renderer='disabled') - update_config_for_compute(root_dataset) + update_config_for_remake(root_dataset) - # Add compute remotes to the root dataset and all subdatasets - add_compute_remote(root_dataset) + # Add datalad-remake remotes to the root dataset and all subdatasets + add_remake_remote(root_dataset) subdataset_path = Path() for index in range(subdataset_levels): subdataset_path /= f'{name}_subds{index}' - add_compute_remote(Dataset(root_dataset.pathobj / subdataset_path)) + add_remake_remote(Dataset(root_dataset.pathobj / subdataset_path)) return datasets diff --git a/datalad_compute/commands/tests/test_collection.py b/datalad_remake/commands/tests/test_collection.py similarity index 96% rename from datalad_compute/commands/tests/test_collection.py rename to datalad_remake/commands/tests/test_collection.py index c63b5aa..0c6a3d2 100644 --- a/datalad_compute/commands/tests/test_collection.py +++ b/datalad_remake/commands/tests/test_collection.py @@ -2,7 +2,7 @@ from .create_datasets import create_ds_hierarchy from .test_provision import get_file_list -from ..compute_cmd import collect +from ..make_cmd import collect def test_collect(tmp_path): diff --git a/datalad_compute/commands/tests/test_compute.py b/datalad_remake/commands/tests/test_compute.py similarity index 87% rename from datalad_compute/commands/tests/test_compute.py rename to datalad_remake/commands/tests/test_compute.py index affd11e..de94f2c 100644 --- a/datalad_compute/commands/tests/test_compute.py +++ 
b/datalad_remake/commands/tests/test_compute.py @@ -1,7 +1,7 @@ from datalad_next.datasets import Dataset from datalad_next.tests.fixtures import datalad_cfg -from datalad_compute.commands.tests.create_datasets import ( +from datalad_remake.commands.tests.create_datasets import ( create_simple_computation_dataset, ) @@ -31,14 +31,15 @@ def test_speculative_computation(tmp_path, datalad_cfg, monkeypatch): root_dataset = create_simple_computation_dataset( tmp_path, 'ds1', 0, test_method) - root_dataset.compute( + root_dataset.make( template='test_method', parameter=['name=Robert', 'file=spec.txt'], output=['spec.txt'], url_only=True, result_renderer='disabled') - # set annex security related variables to allow compute-URLs + # set annex security related variables to allow datalad-remake-URLs + # in speculative make commands datalad_cfg.set('annex.security.allow-unverified-downloads', 'ACKTHPPT', scope='global') # Perform the speculative computation @@ -47,7 +48,7 @@ def test_speculative_computation(tmp_path, datalad_cfg, monkeypatch): def _run_simple_computation(root_dataset: Dataset): - root_dataset.compute( + root_dataset.make( template='test_method', parameter=['name=Robert', 'file=a.txt'], output=['a.txt'], diff --git a/datalad_compute/commands/tests/test_listhandling.py b/datalad_remake/commands/tests/test_listhandling.py similarity index 95% rename from datalad_compute/commands/tests/test_listhandling.py rename to datalad_remake/commands/tests/test_listhandling.py index 3c9e53d..cfd82c5 100644 --- a/datalad_compute/commands/tests/test_listhandling.py +++ b/datalad_remake/commands/tests/test_listhandling.py @@ -1,6 +1,6 @@ import tempfile from pathlib import Path -from datalad_compute.commands.compute_cmd import read_list +from datalad_remake.commands.make_cmd import read_list from hypothesis import given from hypothesis.strategies import lists, text diff --git a/datalad_compute/commands/tests/test_provision.py b/datalad_remake/commands/tests/test_provision.py similarity index 99% rename from datalad_compute/commands/tests/test_provision.py rename to datalad_remake/commands/tests/test_provision.py index 66b5eee..5ff40b8 100644 --- a/datalad_compute/commands/tests/test_provision.py +++ b/datalad_remake/commands/tests/test_provision.py @@ -5,12 +5,11 @@ from pathlib import Path from typing import Iterable -import pytest from datalad_next.datasets import Dataset from datalad_next.runners import call_git_lines from .create_datasets import create_ds_hierarchy -from ..compute_cmd import provide_context +from ..make_cmd import provide_context file_path_templates = [ diff --git a/datalad_remake/tests/test_dummy.py b/datalad_remake/tests/test_dummy.py deleted file mode 100644 index ebdfde4..0000000 --- a/datalad_remake/tests/test_dummy.py +++ /dev/null @@ -1,6 +0,0 @@ -import datalad_remake # noqa: F401 - - -def test_dummy(): - # nothing but a placeholder - pass diff --git a/datalad_remake/tests/test_register.py b/datalad_remake/tests/test_register.py new file mode 100644 index 0000000..a8781a3 --- /dev/null +++ b/datalad_remake/tests/test_register.py @@ -0,0 +1,6 @@ + + +def test_register(): + import datalad.api as da + assert hasattr(da, 'make') + assert hasattr(da, 'provision') diff --git a/datalad_compute/utils/__init__.py b/datalad_remake/utils/__init__.py similarity index 100% rename from datalad_compute/utils/__init__.py rename to datalad_remake/utils/__init__.py diff --git a/datalad_compute/utils/compute.py b/datalad_remake/utils/compute.py similarity index 91% rename from 
datalad_compute/utils/compute.py rename to datalad_remake/utils/compute.py index 745ffd7..bc8d2eb 100644 --- a/datalad_compute/utils/compute.py +++ b/datalad_remake/utils/compute.py @@ -73,8 +73,8 @@ def compute(root_directory: Path, with contextlib.chdir(root_directory): if template.get('use_shell', 'false') == 'true': - lgr.debug(f'compute(): RUNNING: with shell=True: {" ".join([substituted_executable] + substituted_arguments)}') + lgr.debug(f'compute: RUNNING: with shell=True: {" ".join([substituted_executable] + substituted_arguments)}') subprocess.run(' '.join([substituted_executable] + substituted_arguments), shell=True, check=True) else: - lgr.debug(f'compute(): RUNNING: {[substituted_executable] + substituted_arguments}') + lgr.debug(f'compute: RUNNING: {[substituted_executable] + substituted_arguments}') subprocess.run([substituted_executable] + substituted_arguments, check=True) diff --git a/datalad_compute/utils/glob.py b/datalad_remake/utils/glob.py similarity index 100% rename from datalad_compute/utils/glob.py rename to datalad_remake/utils/glob.py diff --git a/datalad_compute/utils/tests/__init__.py b/datalad_remake/utils/tests/__init__.py similarity index 100% rename from datalad_compute/utils/tests/__init__.py rename to datalad_remake/utils/tests/__init__.py diff --git a/datalad_compute/utils/tests/test_substitution.py b/datalad_remake/utils/tests/test_substitution.py similarity index 100% rename from datalad_compute/utils/tests/test_substitution.py rename to datalad_remake/utils/tests/test_substitution.py diff --git a/pyproject.toml b/pyproject.toml index caa7f0f..35d6e08 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ name = "datalad-remake" dynamic = ["version"] description = '' readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.11" license = "MIT" keywords = [ "datalad", @@ -35,15 +35,15 @@ classifiers = [ "Topic :: Software Development :: Version Control :: Git", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ + "annexremote", "datalad_next", + "datasalad", ] [project.urls] @@ -53,6 +53,12 @@ Issues = "https://github.com/datalad/datalad-remake/issues" Source = "https://github.com/datalad/datalad-remake" Changelog = "https://github.com/datalad/datalad-remake/blob/main/CHANGELOG.md" +[project.scripts] +git-annex-remote-datalad-remake = "datalad_remake.annexremotes.remake_remote:main" + +[project.entry-points."datalad.extensions"] +remake = "datalad_remake:command_suite" + [tool.hatch.version] source = "vcs" @@ -69,13 +75,12 @@ extra-dependencies = [ "pytest-cov", ] - [tool.hatch.envs.tests] description = "run tests across Python versions" template = "hatch-test" [[tool.hatch.envs.tests.matrix]] -python = ["3.9", "3.10", "3.11", "3.12"] +python = ["3.11", "3.12"] [tool.hatch.envs.tests.scripts] run = 'python -m pytest {args}' @@ -86,9 +91,10 @@ extra-dependencies = [ "mypy>=1.0.0", "pytest", ] + [tool.hatch.envs.types.scripts] check = [ - "mypy --install-types --non-interactive --python-version 3.9 --pretty --show-error-context datalad_remake", + "mypy --install-types --non-interactive --python-version 3.11 --pretty --show-error-context datalad_remake", ] [tool.hatch.envs.docs] @@ -96,6 +102,7 @@ 
description = "build Sphinx-based docs" extra-dependencies = [ "sphinx", ] + [tool.hatch.envs.docs.scripts] build = [ "make -C docs html", @@ -111,6 +118,7 @@ detached = true extra-dependencies = [ "commitizen", ] + [tool.hatch.envs.cz.scripts] check-commits = [ # check all commit messages since the (before) beginning @@ -131,6 +139,7 @@ detached = true extra-dependencies = [ "codespell", ] + [tool.hatch.envs.codespell.scripts] check = "codespell" fix = "codespell --write-changes" diff --git a/requirements-devel.txt b/requirements-devel.txt index 9ddee5e..dfab594 100644 --- a/requirements-devel.txt +++ b/requirements-devel.txt @@ -1,8 +1,11 @@ # requirements for a development environment +annexremote coverage datalad datalad-next datasalad +hatch +hatch-vcs hypothesis pytest pytest-cov From 1a8f3613d5fc6e2ab49e44695dbfa54be4d1095e Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 17 Oct 2024 15:51:18 +0200 Subject: [PATCH 140/148] add missing `hypothesis` dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 35d6e08..7f6ffdb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,6 +68,7 @@ version-file = "datalad_remake/_version.py" [tool.hatch.envs.hatch-test] default-args = ["datalad_remake"] extra-dependencies = [ + "hypothesis", "pytest", # if you come here, because coverage combination crashed for you # run `hatch test --cover` and/or see From e591531014b4fa3de05040e1b2eb2e297018c2b0 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 21 Oct 2024 16:12:07 +0200 Subject: [PATCH 141/148] apply linter suggestions --- TODO.txt | 13 --- datalad_remake/__init__.py | 2 +- datalad_remake/annexremotes/remake_remote.py | 44 +++++----- .../annexremotes/tests/test_hierarchies.py | 24 ++--- .../annexremotes/tests/test_remake_remote.py | 28 +++--- datalad_remake/commands/make_cmd.py | 87 ++++++++++--------- datalad_remake/commands/provision_cmd.py | 74 ++++++++-------- .../commands/tests/test_collection.py | 2 +- datalad_remake/commands/tests/test_compute.py | 6 +- .../commands/tests/test_listhandling.py | 3 +- .../commands/tests/test_provision.py | 20 +++-- datalad_remake/utils/compute.py | 25 +++--- datalad_remake/utils/glob.py | 5 +- pyproject.toml | 10 ++- setup.cfg | 67 -------------- 15 files changed, 173 insertions(+), 237 deletions(-) delete mode 100644 TODO.txt delete mode 100644 setup.cfg diff --git a/TODO.txt b/TODO.txt deleted file mode 100644 index 9ad2e4c..0000000 --- a/TODO.txt +++ /dev/null @@ -1,13 +0,0 @@ - -- Provide a way to overwrite the root dataset version during `datalad get`. - This could be done via a config variable or an option that is used when - creating the annex remote. - -- Implement opportunistic collection of all results of a single computation - that are not yet present (via `git annex reinject`). 
- -- Consolidate compute-URL construction and splitting code - -- Use dataclass for compute_info in annex remote - -- Implement locking for result collection diff --git a/datalad_remake/__init__.py b/datalad_remake/__init__.py index 15d46c4..45fc1e0 100644 --- a/datalad_remake/__init__.py +++ b/datalad_remake/__init__.py @@ -4,9 +4,9 @@ from datalad_remake._version import __version__ - __all__ = [ '__version__', + 'command_suite', ] diff --git a/datalad_remake/annexremotes/remake_remote.py b/datalad_remake/annexremotes/remake_remote.py index 2102492..edf6c5f 100644 --- a/datalad_remake/annexremotes/remake_remote.py +++ b/datalad_remake/annexremotes/remake_remote.py @@ -6,35 +6,34 @@ import subprocess from pathlib import Path from typing import ( + TYPE_CHECKING, Any, - Iterable, ) from urllib.parse import ( unquote, urlparse, ) -from annexremote import Master from datalad.customremotes import RemoteError -from datalad_next.annexremotes import ( - SpecialRemote, - super_main -) +from datalad_next.annexremotes import SpecialRemote, super_main from datalad_next.datasets import Dataset from datalad_next.runners import call_git_success -from .. import ( +from datalad_remake import ( specification_dir, url_scheme, ) - -from ..commands.make_cmd import ( +from datalad_remake.commands.make_cmd import ( execute, get_file_dataset, provide_context, ) -from ..utils.glob import resolve_patterns +from datalad_remake.utils.glob import resolve_patterns + +if TYPE_CHECKING: + from collections.abc import Iterable + from annexremote import Master lgr = logging.getLogger('datalad.remake.annexremotes.remake') @@ -51,19 +50,19 @@ def close(self) -> None: pass def _check_url(self, url: str) -> bool: - return url.startswith(f'URL--{url_scheme}:') or url.startswith(f'{url_scheme}:') + return url.startswith((f'URL--{url_scheme}:', f'{url_scheme}:')) def prepare(self): - self.annex.debug(f'PREPARE') + self.annex.debug('PREPARE') def initremote(self): - self.annex.debug(f'INITREMOTE') + self.annex.debug('INITREMOTE') def remove(self, key: str): self.annex.debug(f'REMOVE {key!r}') def transfer_store(self, key: str, local_file: str): - self.annex.debug(f'TRANSFER STORE') + self.annex.debug(f'TRANSFER STORE {key!r}, {local_file!r}') def claimurl(self, url: str) -> bool: self.annex.debug(f'CLAIMURL {url!r}') @@ -74,7 +73,7 @@ def checkurl(self, url: str) -> bool: return self._check_url(url) def getcost(self) -> int: - self.annex.debug(f'GETCOST') + self.annex.debug('GETCOST') return 100 def get_url_encoded_info(self, url: str) -> list[str]: @@ -94,10 +93,10 @@ def get_compute_info(self, def get_assigned_value(assignment: str) -> str: return assignment.split('=', 1)[1] - root_version, spec_name, this = list( - map( - lambda expr: unquote(get_assigned_value(expr)), - self.get_url_encoded_info(self.get_url_for_key(key)))) + root_version, spec_name, this = ( + unquote(get_assigned_value(expr)) + for expr in self.get_url_encoded_info(self.get_url_for_key(key)) + ) dataset = self._find_dataset(root_version) spec_path = dataset.pathobj / specification_dir / spec_name @@ -149,15 +148,16 @@ def _find_dataset(self, current_dir = start_dir while current_dir != Path('/'): result = subprocess.run( - ['git', 'cat-file', '-t', commit], + ['git', 'cat-file', '-t', commit], # noqa: S607 stdout=subprocess.PIPE, - cwd=current_dir) + cwd=current_dir, check=False) if result.returncode == 0 and result.stdout.strip() == b'commit': return Dataset(current_dir) current_dir = current_dir.parent - raise RemoteError( + msg = ( f'Could not find 
dataset with commit {commit!r}, starting from ' f'{start_dir}') + raise RemoteError(msg) def _collect(self, worktree: Path, diff --git a/datalad_remake/annexremotes/tests/test_hierarchies.py b/datalad_remake/annexremotes/tests/test_hierarchies.py index ccb344a..c87b160 100644 --- a/datalad_remake/annexremotes/tests/test_hierarchies.py +++ b/datalad_remake/annexremotes/tests/test_hierarchies.py @@ -1,17 +1,14 @@ +from collections.abc import Iterable from pathlib import Path -from typing import Iterable import pytest - -from datalad.distribution.get import Get as datalad_get +from datalad.distribution.get import Get as datalad_Get from datalad_next.datasets import Dataset -from datalad_next.tests.fixtures import datalad_cfg from datalad_remake.commands.tests.create_datasets import ( create_simple_computation_dataset, ) - test_method = """ inputs = ['first', 'second', 'third'] use_shell = 'true' @@ -50,13 +47,10 @@ ] -test_file_content = [ - (file, content) - for file, content in - zip( - output_pattern_static, - ['content: first\n', 'content: second\n', 'content: third\n'] * 4) -] +test_file_content = list(zip( + output_pattern_static, + ['content: first\n', 'content: second\n', 'content: third\n'] * 4, strict=False) +) def _drop_files(dataset: Dataset, @@ -74,7 +68,7 @@ def _check_content(dataset, @pytest.mark.parametrize('output_pattern', [output_pattern_static, output_pattern_glob]) -def test_end_to_end(tmp_path, datalad_cfg, monkeypatch, output_pattern): +def test_end_to_end(tmp_path, monkeypatch, output_pattern): root_dataset = create_simple_computation_dataset( tmp_path, 'd2', 3, test_method) @@ -104,7 +98,7 @@ def test_end_to_end(tmp_path, datalad_cfg, monkeypatch, output_pattern): # Go to the subdataset `d2_subds0/d2_subds1` and fetch the content of `a1.txt` # from a datalad-remake remote. monkeypatch.chdir(root_dataset.pathobj / 'd2_subds0' / 'd2_subds1') - datalad_get()('a1.txt') + datalad_Get()('a1.txt') # check that all known files that were computed are added to the annex _check_content(root_dataset, test_file_content) @@ -113,6 +107,6 @@ def test_end_to_end(tmp_path, datalad_cfg, monkeypatch, output_pattern): # check get in subdatasets monkeypatch.chdir(root_dataset.pathobj) - datalad_get()('d2_subds0/d2_subds1/a1.txt') + datalad_Get()('d2_subds0/d2_subds1/a1.txt') _check_content(root_dataset, test_file_content) diff --git a/datalad_remake/annexremotes/tests/test_remake_remote.py b/datalad_remake/annexremotes/tests/test_remake_remote.py index 8fcbebe..69408c5 100644 --- a/datalad_remake/annexremotes/tests/test_remake_remote.py +++ b/datalad_remake/annexremotes/tests/test_remake_remote.py @@ -3,11 +3,11 @@ from annexremote import Master -from ..remake_remote import RemakeRemote -from ... import specification_dir -from ...commands.make_cmd import build_json from datalad_remake.commands.tests.create_datasets import create_ds_hierarchy +from ... 
import specification_dir +from ...commands.make_cmd import build_json +from ..remake_remote import RemakeRemote template = """ inputs = ['content'] @@ -73,13 +73,13 @@ def test_compute_remote_main(tmp_path, monkeypatch): (template_path / 'echo').write_text(template) dataset.save() - key = tuple( + key = next( filter( lambda line: line.startswith(b'key: '), subprocess.run( - ['git', 'annex', 'info', 'a.txt'], + ['git', 'annex', 'info', 'a.txt'], # noqa: S607 stdout=subprocess.PIPE, - check=True).stdout.splitlines()))[0].split(b': ')[1] + check=True).stdout.splitlines())).split(b': ')[1] (dataset.pathobj / specification_dir).mkdir(parents=True) (dataset.pathobj / specification_dir / '000001111122222').write_text( @@ -89,30 +89,30 @@ def test_compute_remote_main(tmp_path, monkeypatch): ['a.txt'], {'content': 'some_string'})) - input = MockedInput() + input_ = MockedInput() # We send all messages into the queue upfront because we do the test in a # single thread and do not get back control once `master.listen` is called # below. - input.send('PREPARE\n') - input.send(f'TRANSFER RETRIEVE {key} {str(tmp_path / "remade.txt")}\n') + input_.send('PREPARE\n') + input_.send(f'TRANSFER RETRIEVE {key} {tmp_path / "remade.txt"!s}\n') url = ( 'datalad-make:///?' f'root_version={dataset.repo.get_hexsha()}' '&specification=000001111122222' '&this=a.txt' ) - input.send(f'VALUE {url}\n') - input.send('VALUE\n') - input.send('VALUE .git\n') - input.send('') + input_.send(f'VALUE {url}\n') + input_.send('VALUE\n') + input_.send('VALUE .git\n') + input_.send('') output = MockedOutput() master = Master(output=output) remote = RemakeRemote(master) master.LinkRemote(remote) - master.Listen(input=input) + master.Listen(input=input_) # At this point the datalad-remake remote should have executed the # computation and written the result. diff --git a/datalad_remake/commands/make_cmd.py b/datalad_remake/commands/make_cmd.py index 153f14f..f64797d 100644 --- a/datalad_remake/commands/make_cmd.py +++ b/datalad_remake/commands/make_cmd.py @@ -9,17 +9,14 @@ import os import shutil from pathlib import Path -from typing import ( - Generator, - Iterable, -) +from typing import TYPE_CHECKING from urllib.parse import quote from datalad.support.exceptions import IncompleteResultsError from datalad_next.commands import ( EnsureCommandParameterization, - ValidatedInterface, Parameter, + ValidatedInterface, build_doc, datasetmethod, eval_results, @@ -36,14 +33,20 @@ call_git_success, ) -from .. 
import ( +from datalad_remake import ( specification_dir, template_dir, url_scheme, ) -from ..utils.compute import compute -from ..utils.glob import resolve_patterns +from datalad_remake.utils.compute import compute +from datalad_remake.utils.glob import resolve_patterns +if TYPE_CHECKING: + from collections.abc import ( + Generator, + Iterable, + ) + from typing import ClassVar lgr = logging.getLogger('datalad.remake.make_cmd') @@ -57,43 +60,43 @@ class Make(ValidatedInterface): """Specify a computation and optionally execute it """ - _validator_ = EnsureCommandParameterization(dict( - dataset=EnsureDataset(installed=True), - input=EnsureListOf(EnsureStr(min_len=1)), - input_list=EnsureStr(min_len=1), - output=EnsureListOf(EnsureStr(min_len=1), min_len=1), - output_list=EnsureStr(min_len=1), - parameter=EnsureListOf(EnsureStr(min_len=3)), - parameter_list=EnsureStr(min_len=1), - )) + _validator_ = EnsureCommandParameterization({ + 'dataset': EnsureDataset(installed=True), + 'input': EnsureListOf(EnsureStr(min_len=1)), + 'input_list': EnsureStr(min_len=1), + 'output': EnsureListOf(EnsureStr(min_len=1), min_len=1), + 'output_list': EnsureStr(min_len=1), + 'parameter': EnsureListOf(EnsureStr(min_len=3)), + 'parameter_list': EnsureStr(min_len=1), + }) # parameters of the command, must be exhaustive - _params_ = dict( - dataset=Parameter( + _params_: ClassVar[dict[int, Parameter]] = { + 'dataset': Parameter( args=('-d', '--dataset'), doc="Dataset to be used as a configuration source. Beyond " "reading configuration items, this command does not interact with " "the dataset."), - url_only=Parameter( + 'url_only': Parameter( args=('-u', '--url-only'), action="store_true", doc="Don't perform the computation, register an URL-key " "instead. A `git annex get ` will trigger the computation"), - template=Parameter( + 'template': Parameter( args=('template',), doc="Name of the computing template (template should be present " "in $DATASET/.datalad/remake/methods)"), - branch=Parameter( + 'branch': Parameter( args=('-b', '--branch',), doc="Branch (or commit) that should be used for computation, if " "not specified HEAD will be used"), - input=Parameter( + 'input': Parameter( args=('-i', '--input',), action='append', doc="An input file pattern (repeat for multiple inputs, " "file pattern support python globbing, globbing is expanded " "in the source dataset)"), - input_list=Parameter( + 'input_list': Parameter( args=('-I', '--input-list',), doc="Name of a file that contains a list of input file patterns. " "Format is one file per line, relative path from `dataset`. " @@ -101,25 +104,25 @@ class Make(ValidatedInterface): "that start with '#' are ignored. Line content is stripped " "before used. This is useful if a large number of input file " "patterns should be provided."), - output=Parameter( + 'output': Parameter( args=('-o', '--output',), action='append', doc="An output file pattern (repeat for multiple outputs)" "file pattern support python globbing, globbing is expanded " "in the worktree)"), - output_list=Parameter( + 'output_list': Parameter( args=('-O', '--output-list',), doc="Name of a file that contains a list of output patterns. Format " "is one file per line, relative path from `dataset`. Empty " "lines, i.e. lines that contain only newlines, arg ignored. 
" "This is useful if a large number of output files should be " "provided."), - parameter=Parameter( + 'parameter': Parameter( args=('-p', '--parameter',), action='append', doc="Input parameter in the form = (repeat for " "multiple parameters)"), - parameter_list=Parameter( + 'parameter_list': Parameter( args=('-P', '--parameter-list',), doc="Name of a file that contains a list of parameters. Format " "is one `=` string per line. " @@ -127,16 +130,17 @@ class Make(ValidatedInterface): "that start with '#' are ignored. Line content is stripped " "before used. This is useful if a large number of parameters " "should be provided."), - ) + } @staticmethod @datasetmethod(name='make') @eval_results def __call__(dataset=None, + *, url_only=False, template=None, branch=None, - input=None, + input_=None, input_list=None, output=None, output_list=None, @@ -146,7 +150,7 @@ def __call__(dataset=None, dataset : Dataset = dataset.ds if dataset else Dataset('.') - input_pattern = (input or []) + read_list(input_list) + input_pattern = (input_ or []) + read_list(input_list) output_pattern = (output or []) + read_list(output_list) parameter = (parameter or []) + read_list(parameter_list) @@ -174,7 +178,7 @@ def __call__(dataset=None, output = collect(worktree, dataset, output_pattern) for out in output: - url = add_url(dataset, out, url_base, url_only) + url = add_url(dataset, out, url_base, url_only=url_only) yield get_status_dict( action='make', path=str(dataset.pathobj / out), @@ -215,8 +219,8 @@ def get_url(dataset: Dataset, return ( f'{url_scheme}:///' - + f'?root_version={quote(dataset.repo.get_hexsha())}' - + f'&specification={quote(digest)}' + f'?root_version={quote(dataset.repo.get_hexsha())}' + f'&specification={quote(digest)}' ), reset_branch @@ -263,6 +267,7 @@ def build_json(method: str, def add_url(dataset: Dataset, file_path: str, url_base: str, + *, url_only: bool ) -> str: @@ -292,10 +297,11 @@ def add_url(dataset: Dataset, + (['--relaxed'] if url_only else []), cwd=file_dataset_path, capture_output=True) - assert \ - success, \ - f'\naddurl failed:\nfile_dataset_path: {file_dataset_path}\n' \ - f'url: {url!r}\nfile_path: {file_path!r}' + if not success: + msg = ( + f'\naddurl failed:\nfile_dataset_path: {file_dataset_path}\n' + f'url: {url!r}\nfile_path: {file_path!r}') + raise RuntimeError(msg) return url @@ -416,8 +422,5 @@ def create_output_space(dataset: Dataset, ) -> None: """Get all files that are part of the output space.""" for f in files: - try: + with contextlib.suppress(IncompleteResultsError): dataset.get(f, result_renderer='disabled') - except IncompleteResultsError: - # Ignore non-existing files - pass diff --git a/datalad_remake/commands/provision_cmd.py b/datalad_remake/commands/provision_cmd.py index efe0cc9..19e8505 100644 --- a/datalad_remake/commands/provision_cmd.py +++ b/datalad_remake/commands/provision_cmd.py @@ -11,17 +11,17 @@ from contextlib import chdir from glob import glob from pathlib import Path +from tempfile import TemporaryDirectory from typing import ( - Iterable, - Generator, + TYPE_CHECKING, + ClassVar, ) -from tempfile import TemporaryDirectory from datalad.support.constraints import EnsureBool from datalad_next.commands import ( EnsureCommandParameterization, - ValidatedInterface, Parameter, + ValidatedInterface, build_doc, datasetmethod, eval_results, @@ -30,13 +30,16 @@ from datalad_next.constraints import ( EnsureDataset, EnsureListOf, - EnsureStr, EnsurePath, + EnsurePath, + EnsureStr, ) from datalad_next.datasets import Dataset from 
datalad_next.runners import call_git_lines, call_git_success -from ..commands.make_cmd import read_list +from datalad_remake.commands.make_cmd import read_list +if TYPE_CHECKING: + from collections.abc import Generator, Iterable lgr = logging.getLogger('datalad.remake.provision_cmd') @@ -57,39 +60,39 @@ class Provision(ValidatedInterface): environment for `make` commands. """ - _validator_ = EnsureCommandParameterization(dict( - dataset=EnsureDataset(installed=True), - input=EnsureListOf(EnsureStr(min_len=1)), - input_list=EnsureStr(min_len=1), - tmp_dir=EnsurePath(is_mode=stat.S_ISDIR), - delete=EnsureDataset(installed=True), - no_globbing=EnsureBool(), - )) + _validator_ = EnsureCommandParameterization({ + 'dataset': EnsureDataset(installed=True), + 'input': EnsureListOf(EnsureStr(min_len=1)), + 'input_list': EnsureStr(min_len=1), + 'tmp_dir': EnsurePath(is_mode=stat.S_ISDIR), + 'delete': EnsureDataset(installed=True), + 'no_globbing': EnsureBool(), + }) # parameters of the command, must be exhaustive - _params_ = dict( - dataset=Parameter( + _params_: ClassVar[dict[str, Parameter]] = { + 'dataset': Parameter( args=('-d', '--dataset'), doc="Dataset to be used as a configuration source. Beyond " "reading configuration items, this command does not interact with " "the dataset."), - branch=Parameter( + 'branch': Parameter( args=('-b', '--branch',), doc="Branch (or commit) that should be provisioned, if " "not specified HEAD will be used"), - delete=Parameter( + 'delete': Parameter( args=('--delete',), doc="Delete the temporary worktree WORKTREE that belongs the the " "dataset (cannot be used with `-b`, `--branch`, `-i`," "`--input`, `-I`, or `--input-list`)."), - input=Parameter( + 'input': Parameter( args=('-i', '--input',), action='append', doc="An input file pattern (repeat for multiple inputs, " "file pattern support python globbing, globbing is done in the " "worktree and through all matching subdatasets, installing " "if necessary)."), - input_list=Parameter( + 'input_list': Parameter( args=('-I', '--input-list',), doc="Name of a file that contains a list of input file patterns. " "Format is one file per line, relative path from `dataset`. " @@ -97,11 +100,11 @@ class Provision(ValidatedInterface): "that start with '#' are ignored. Line content is stripped " "before used. 
This is useful if a large number of input file " "patterns should be provided."), - worktree_dir=Parameter( + 'worktree_dir': Parameter( args=('-w', '--worktree-dir',), doc="Path of the directory that should become the temporary " "worktree, defaults to `tempfile.TemporaryDirectory().name`."), - ) + } @staticmethod @datasetmethod(name='provision') @@ -109,17 +112,20 @@ class Provision(ValidatedInterface): def __call__(dataset=None, branch=None, delete=None, - input=None, + input_=None, input_list=None, worktree_dir=None, ): dataset : Dataset = dataset.ds if dataset else Dataset('.') if delete: - if branch or input: - raise ValueError( + if branch or input_: + msg = ( 'Cannot use `-d`, `--delete` with `-b`, `--branch`,' - ' `-i`, or `--input`') + ' `-i`, or `--input`' + ) + raise ValueError( + msg) remove(dataset, delete.ds) yield get_status_dict( @@ -130,7 +136,7 @@ def __call__(dataset=None, return worktree_dir: Path = Path(worktree_dir or TemporaryDirectory().name) - inputs = input or [] + read_list(input_list) + inputs = input_ or [*read_list(input_list)] yield from provide(dataset, worktree_dir, inputs, branch) @@ -258,10 +264,10 @@ def resolve_patterns(dataset: Dataset, def get_uninstalled_subdatasets(dataset: Dataset) -> set[Path]: """Get a list of the paths of all visible, non-installed subdatasets""" - return set([ + return { Path(result['path']).relative_to(dataset.pathobj) for result in dataset.subdatasets(recursive=True, result_renderer='disabled') - if result['state'] == 'absent']) + if result['state'] == 'absent'} def glob_pattern(root: Dataset, @@ -311,11 +317,11 @@ def glob_pattern(root: Dataset, # Match all elements at the current position with the first part of the # pattern. - for match in glob( + for rec_match in glob( '*' if pattern[0] == '**' else pattern[0], root_dir=root.pathobj / position ): - match = position / match + match = position / rec_match # If the match is a directory that is in uninstalled subdatasets, # install the dataset and updated uninstalled datasets before proceeding @@ -355,9 +361,9 @@ def install_subdataset(worktree: Dataset, ) -> None: """Install a subdataset, prefer locally available subdatasets""" local_subdataset = ([ - l - for l in locally_available_datasets - if l[2] == subdataset_path] or [None])[0] + dataset + for dataset in locally_available_datasets + if dataset[2] == subdataset_path] or [None])[0] if local_subdataset: absolute_path, parent_ds_path, path_from_root = local_subdataset diff --git a/datalad_remake/commands/tests/test_collection.py b/datalad_remake/commands/tests/test_collection.py index 0c6a3d2..371e625 100644 --- a/datalad_remake/commands/tests/test_collection.py +++ b/datalad_remake/commands/tests/test_collection.py @@ -1,8 +1,8 @@ from pathlib import Path +from ..make_cmd import collect from .create_datasets import create_ds_hierarchy from .test_provision import get_file_list -from ..make_cmd import collect def test_collect(tmp_path): diff --git a/datalad_remake/commands/tests/test_compute.py b/datalad_remake/commands/tests/test_compute.py index de94f2c..4e51fbb 100644 --- a/datalad_remake/commands/tests/test_compute.py +++ b/datalad_remake/commands/tests/test_compute.py @@ -1,11 +1,9 @@ from datalad_next.datasets import Dataset -from datalad_next.tests.fixtures import datalad_cfg from datalad_remake.commands.tests.create_datasets import ( create_simple_computation_dataset, ) - test_method = """ inputs = ['name', 'file'] use_shell = 'true' @@ -16,7 +14,7 @@ output_pattern = ['a.txt'] -def 
test_duplicated_computation(tmp_path, datalad_cfg, monkeypatch): +def test_duplicated_computation(tmp_path): root_dataset = create_simple_computation_dataset( tmp_path, 'ds1', 0, test_method) @@ -26,7 +24,7 @@ def test_duplicated_computation(tmp_path, datalad_cfg, monkeypatch): _run_simple_computation(root_dataset) -def test_speculative_computation(tmp_path, datalad_cfg, monkeypatch): +def test_speculative_computation(tmp_path, datalad_cfg): root_dataset = create_simple_computation_dataset( tmp_path, 'ds1', 0, test_method) diff --git a/datalad_remake/commands/tests/test_listhandling.py b/datalad_remake/commands/tests/test_listhandling.py index cfd82c5..a1864f0 100644 --- a/datalad_remake/commands/tests/test_listhandling.py +++ b/datalad_remake/commands/tests/test_listhandling.py @@ -1,10 +1,11 @@ import tempfile from pathlib import Path -from datalad_remake.commands.make_cmd import read_list from hypothesis import given from hypothesis.strategies import lists, text +from datalad_remake.commands.make_cmd import read_list + def test_empty_list_reading(): assert read_list(None) == [] diff --git a/datalad_remake/commands/tests/test_provision.py b/datalad_remake/commands/tests/test_provision.py index 5ff40b8..cf4cc71 100644 --- a/datalad_remake/commands/tests/test_provision.py +++ b/datalad_remake/commands/tests/test_provision.py @@ -3,14 +3,16 @@ import contextlib from contextlib import chdir from pathlib import Path -from typing import Iterable +from typing import TYPE_CHECKING from datalad_next.datasets import Dataset from datalad_next.runners import call_git_lines -from .create_datasets import create_ds_hierarchy from ..make_cmd import provide_context +from .create_datasets import create_ds_hierarchy +if TYPE_CHECKING: + from collections.abc import Iterable file_path_templates = [ '{file}.txt', @@ -86,10 +88,10 @@ def test_worktree_globbing(tmp_path): worktree = Path(result['path']) worktree_set = set(get_file_list(worktree)) - assert worktree_set == set( + assert worktree_set == { path.format(ds_name='ds1') for path in all_paths - ) + } dataset.provision(delete=worktree, result_renderer='disabled') result = dataset.provision( @@ -105,10 +107,10 @@ def test_worktree_globbing(tmp_path): worktree = Path(result['path']) worktree_set = set(get_file_list(worktree)) - assert set( + assert { path.format(ds_name='ds1') for path in b_paths - ).issubset(worktree_set) + }.issubset(worktree_set) dataset.provision(delete=worktree, result_renderer='disabled') dataset.drop( @@ -155,7 +157,7 @@ def test_unclean_dataset(tmp_path): worktree_dir=tmp_path / 'ds1_worktree1', on_failure='ignore', result_renderer='disabled') - assert set((result['status'], result['state']) for result in results) == \ + assert {(result['status'], result['state']) for result in results} == \ {('error', 'modified'), ('error', 'untracked')} # Check that a saved dataset can be provisioned @@ -177,8 +179,8 @@ def test_branch_deletion_after_provision(tmp_path): assert not worktree.exists() with contextlib.chdir(dataset.path): branches = [ - l.strip() - for l in call_git_lines(['branch'])] + line.strip() + for line in call_git_lines(['branch'])] assert worktree.name not in branches diff --git a/datalad_remake/utils/compute.py b/datalad_remake/utils/compute.py index bc8d2eb..3f816ee 100644 --- a/datalad_remake/utils/compute.py +++ b/datalad_remake/utils/compute.py @@ -4,10 +4,10 @@ import logging import subprocess import tomllib +from typing import TYPE_CHECKING, Any -from pathlib import Path -from typing import Any - +if TYPE_CHECKING: + 
from pathlib import Path lgr = logging.getLogger('datalad.compute') @@ -40,12 +40,17 @@ def get_substitutions(template: dict[str, Any], # Check the user specified inputs inputs = template['inputs'] if len(inputs) != len(arguments.keys()): - raise ValueError('Template inputs and arguments have different lengths') + msg = 'Template inputs and arguments have different lengths' + raise ValueError(msg) if not all(input_name in arguments for input_name in inputs): - raise ValueError(f'Template inputs and arguments have different names: inputs: {inputs}, arguments: {arguments}') + msg = ( + f'Template inputs and arguments have different names: ' + f'inputs: {inputs}, arguments: {arguments}') + raise ValueError(msg) if len(inputs) != len(set(inputs)): - raise ValueError('Template inputs contain duplicates') + msg = 'Template inputs contain duplicates' + raise ValueError(msg) return { input_name: arguments[input_name] @@ -73,8 +78,8 @@ def compute(root_directory: Path, with contextlib.chdir(root_directory): if template.get('use_shell', 'false') == 'true': - lgr.debug(f'compute: RUNNING: with shell=True: {" ".join([substituted_executable] + substituted_arguments)}') - subprocess.run(' '.join([substituted_executable] + substituted_arguments), shell=True, check=True) + lgr.debug(f'compute: RUNNING: with shell=True: {" ".join([substituted_executable, *substituted_arguments])}') + subprocess.run(' '.join([substituted_executable, *substituted_arguments]), shell=True, check=True) # noqa: S602 else: - lgr.debug(f'compute: RUNNING: {[substituted_executable] + substituted_arguments}') - subprocess.run([substituted_executable] + substituted_arguments, check=True) + lgr.debug(f'compute: RUNNING: {[substituted_executable, *substituted_arguments]}') + subprocess.run([substituted_executable, *substituted_arguments], check=True) diff --git a/datalad_remake/utils/glob.py b/datalad_remake/utils/glob.py index 90cae72..607954e 100644 --- a/datalad_remake/utils/glob.py +++ b/datalad_remake/utils/glob.py @@ -3,7 +3,10 @@ from glob import glob from itertools import chain from pathlib import Path -from typing import Iterable +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Iterable # Resolve input file patterns in the original dataset diff --git a/pyproject.toml b/pyproject.toml index 7f6ffdb..0653f5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ keywords = [ "git-annex", ] authors = [ -# { name = "Michael Hanke", email = "michael.hanke@gmail.com" }, + { name = "The DataLad Team and Contributors", email = "team@datalad.org" }, ] maintainers = [ # { name = "Michael Hanke", email = "michael.hanke@gmail.com" }, @@ -42,6 +42,7 @@ classifiers = [ ] dependencies = [ "annexremote", + "datalad", "datalad_next", "datasalad", ] @@ -76,6 +77,9 @@ extra-dependencies = [ "pytest-cov", ] +[tool.hatch.envs.hatch-test.env-vars] +DATALAD_EXTENSIONS_LOAD = "next" + [tool.hatch.envs.tests] description = "run tests across Python versions" template = "hatch-test" @@ -159,7 +163,7 @@ data_file = "${COVERAGE_ROOT-.}/.coverage" [tool.coverage.paths] datalad_remake = ["src/datalad_remake", "*/datalad_remake/src/datalad_remake"] -tests = ["tests", "*/datalad_remake/tests"] +tests = ["tests", "*/datalad_remake/*/tests"] [tool.coverage.report] show_missing = true @@ -177,7 +181,7 @@ exclude = [ ] line-length = 88 indent-width = 4 -target-version = "py39" +target-version = "py311" [tool.ruff.format] # Prefer single quotes over double quotes. 
quote-style = "single" diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index d639b0a..0000000 --- a/setup.cfg +++ /dev/null @@ -1,67 +0,0 @@ -[metadata] -url = https://github.com/datalad/datalad-extension-template -author = The DataLad Team and Contributors -author_email = team@datalad.org -description = demo DataLad extension package -long_description = file:README.md -long_description_content_type = text/markdown; charset=UTF-8 -license = MIT -classifiers = - Programming Language :: Python - License :: OSI Approved :: MIT License - Programming Language :: Python :: 3 - -[options] -python_requires = >= 3.11 -install_requires = - annexremote - datalad >= 1.1.1 - datalad_next >= 1.5.0 - datasalad -packages = find_namespace: -include_package_data = True - -[options.packages.find] -include = datalad_compute* - -[options.extras_require] -# this matches the name used by -core and what is expected by some CI setups -devel = - coverage - hypothesis - pytest - pytest-cov - sphinx - sphinx_rtd_theme - sphinx_copybutton -devel-utils = - pytest-xdist - scriv - -[options.entry_points] -# 'datalad.extensions' is THE entrypoint inspected by the datalad API builders -datalad.extensions = - # the label in front of '=' is the command suite label - # the entrypoint can point to any symbol of any name, as long it is - # valid datalad interface specification (see demo in this extensions) - compute = datalad_compute:command_suite - -console_scripts = - git-annex-remote-compute = datalad_compute.annexremotes.compute_remote:main - -[versioneer] -# See the docstring in versioneer.py for instructions. Note that you must -# re-run 'versioneer.py setup' after changing this section, and commit the -# resulting files. -VCS = git -style = pep440 -versionfile_source = datalad_compute/_version.py -versionfile_build = datalad_compute/_version.py -tag_prefix = -parentdir_prefix = - -[coverage:report] -show_missing = True -omit = - # versioneer code - datalad_compute/_version.py From d2e8944ce1098349e39fcb00ca036da3bb489110 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 21 Oct 2024 17:50:30 +0200 Subject: [PATCH 142/148] rf: reformat code --- datalad_remake/__init__.py | 8 +- datalad_remake/annexremotes/remake_remote.py | 68 ++-- .../annexremotes/tests/test_hierarchies.py | 42 +- .../annexremotes/tests/test_remake_remote.py | 13 +- datalad_remake/commands/make_cmd.py | 376 +++++++++--------- datalad_remake/commands/provision_cmd.py | 241 ++++++----- .../commands/tests/create_datasets.py | 48 ++- .../commands/tests/test_collection.py | 12 +- datalad_remake/commands/tests/test_compute.py | 18 +- .../commands/tests/test_listhandling.py | 14 +- .../commands/tests/test_provision.py | 108 +++-- datalad_remake/tests/test_register.py | 3 +- datalad_remake/utils/compute.py | 63 +-- datalad_remake/utils/glob.py | 9 +- .../utils/tests/test_substitution.py | 18 +- 15 files changed, 548 insertions(+), 493 deletions(-) diff --git a/datalad_remake/__init__.py b/datalad_remake/__init__.py index 45fc1e0..cb44161 100644 --- a/datalad_remake/__init__.py +++ b/datalad_remake/__init__.py @@ -15,7 +15,7 @@ # to be found by datalad command_suite = ( # description of the command suite, displayed in cmdline help - "DataLad remake command suite", + 'DataLad remake command suite', [ # specification of a command, any number of commands can be defined ( @@ -26,7 +26,7 @@ # optional name of the command in the cmdline API 'make', # optional name of the command in the Python API - 'make' + 'make', ), ( # importable module that 
contains the command implementation @@ -36,9 +36,9 @@ # optional name of the command in the cmdline API 'provision', # optional name of the command in the Python API - 'provision' + 'provision', ), - ] + ], ) diff --git a/datalad_remake/annexremotes/remake_remote.py b/datalad_remake/annexremotes/remake_remote.py index edf6c5f..5b0417f 100644 --- a/datalad_remake/annexremotes/remake_remote.py +++ b/datalad_remake/annexremotes/remake_remote.py @@ -39,7 +39,6 @@ class RemakeRemote(SpecialRemote): - def __init__(self, annex: Master): super().__init__(annex) @@ -86,10 +85,7 @@ def get_url_for_key(self, key: str) -> str: self.annex.debug(f'get_url_for_key: key: {key!r}, urls: {urls!r}') return urls[0] - def get_compute_info(self, - key: str - ) -> tuple[dict[str, Any], Dataset]: - + def get_compute_info(self, key: str) -> tuple[dict[str, Any], Dataset]: def get_assigned_value(assignment: str) -> str: return assignment.split('=', 1)[1] @@ -106,10 +102,7 @@ def get_assigned_value(assignment: str) -> str: return { 'root_version': root_version, 'this': this, - **{ - name: spec[name] - for name in ['method', 'input', 'output', 'parameter'] - } + **{name: spec[name] for name in ['method', 'input', 'output', 'parameter']}, }, dataset def transfer_retrieve(self, key: str, file_name: str) -> None: @@ -122,16 +115,25 @@ def transfer_retrieve(self, key: str, file_name: str) -> None: lgr.debug('Starting provision') self.annex.debug('Starting provision') with provide_context( - dataset, - compute_info['root_version'], - compute_info['input'] + dataset, compute_info['root_version'], compute_info['input'] ) as worktree: lgr.debug('Starting execution') self.annex.debug('Starting execution') - execute(worktree, compute_info['method'], compute_info['parameter'], compute_info['output']) + execute( + worktree, + compute_info['method'], + compute_info['parameter'], + compute_info['output'], + ) lgr.debug('Starting collection') self.annex.debug('Starting collection') - self._collect(worktree, dataset, compute_info['output'], compute_info['this'], file_name) + self._collect( + worktree, + dataset, + compute_info['output'], + compute_info['this'], + file_name, + ) lgr.debug('Leaving provision context') self.annex.debug('Leaving provision context') @@ -139,9 +141,7 @@ def checkpresent(self, key: str) -> bool: # See if at least one URL with the remake url-scheme is present return self.annex.geturls(key, f'{url_scheme}:') != [] - def _find_dataset(self, - commit: str - ) -> Dataset: + def _find_dataset(self, commit: str) -> Dataset: """Find the first enclosing dataset with the given commit""" # TODO: get version override from configuration start_dir = Path(self.annex.getgitdir()).parent.absolute() @@ -150,23 +150,27 @@ def _find_dataset(self, result = subprocess.run( ['git', 'cat-file', '-t', commit], # noqa: S607 stdout=subprocess.PIPE, - cwd=current_dir, check=False) + cwd=current_dir, + check=False, + ) if result.returncode == 0 and result.stdout.strip() == b'commit': return Dataset(current_dir) current_dir = current_dir.parent msg = ( f'Could not find dataset with commit {commit!r}, starting from ' - f'{start_dir}') + f'{start_dir}' + ) raise RemoteError(msg) - def _collect(self, - worktree: Path, - dataset: Dataset, - output_patterns: Iterable[str], - this: str, - this_destination: str, - ) -> None: - """Collect computation results for `this` (and all other outputs) """ + def _collect( + self, + worktree: Path, + dataset: Dataset, + output_patterns: Iterable[str], + this: str, + this_destination: str, + ) -> None: + 
"""Collect computation results for `this` (and all other outputs)""" # Get all outputs that were created during computation outputs = resolve_patterns(root_dir=worktree, patterns=output_patterns) @@ -180,13 +184,17 @@ def _collect(self, is_annexed = call_git_success( ['annex', 'whereis', str(file_path)], cwd=dataset_path, - capture_output=True) + capture_output=True, + ) if is_annexed: - self.annex.debug(f'_collect: reinject: {worktree / output} -> {dataset_path}:{file_path}') + self.annex.debug( + f'_collect: reinject: {worktree / output} -> {dataset_path}:{file_path}' + ) call_git_success( ['annex', 'reinject', str(worktree / output), str(file_path)], cwd=dataset_path, - capture_output=True) + capture_output=True, + ) # Collect `this` file. It has to be copied to the destination given # by git-annex. Git-annex will check its integrity. diff --git a/datalad_remake/annexremotes/tests/test_hierarchies.py b/datalad_remake/annexremotes/tests/test_hierarchies.py index c87b160..a09060c 100644 --- a/datalad_remake/annexremotes/tests/test_hierarchies.py +++ b/datalad_remake/annexremotes/tests/test_hierarchies.py @@ -32,10 +32,18 @@ output_pattern_static = [ - 'a.txt', 'b.txt', 'new.txt', - 'd2_subds0/a0.txt', 'd2_subds0/b0.txt', 'd2_subds0/new.txt', - 'd2_subds0/d2_subds1/a1.txt', 'd2_subds0/d2_subds1/b1.txt', 'd2_subds0/d2_subds1/new.txt', - 'd2_subds0/d2_subds1/d2_subds2/a2.txt', 'd2_subds0/d2_subds1/d2_subds2/b2.txt', 'd2_subds0/d2_subds1/d2_subds2/new.txt', + 'a.txt', + 'b.txt', + 'new.txt', + 'd2_subds0/a0.txt', + 'd2_subds0/b0.txt', + 'd2_subds0/new.txt', + 'd2_subds0/d2_subds1/a1.txt', + 'd2_subds0/d2_subds1/b1.txt', + 'd2_subds0/d2_subds1/new.txt', + 'd2_subds0/d2_subds1/d2_subds2/a2.txt', + 'd2_subds0/d2_subds1/d2_subds2/b2.txt', + 'd2_subds0/d2_subds1/d2_subds2/new.txt', ] @@ -47,31 +55,29 @@ ] -test_file_content = list(zip( - output_pattern_static, - ['content: first\n', 'content: second\n', 'content: third\n'] * 4, strict=False) +test_file_content = list( + zip( + output_pattern_static, + ['content: first\n', 'content: second\n', 'content: third\n'] * 4, + strict=False, + ) ) -def _drop_files(dataset: Dataset, - files: Iterable[str]): +def _drop_files(dataset: Dataset, files: Iterable[str]): for file in files: dataset.drop(file, reckless='availability', result_renderer='disabled') assert not (dataset.pathobj / file).exists() -def _check_content(dataset, - file_content: Iterable[tuple[str, str]] - ): +def _check_content(dataset, file_content: Iterable[tuple[str, str]]): for file, content in file_content: assert (dataset.pathobj / file).read_text() == content @pytest.mark.parametrize('output_pattern', [output_pattern_static, output_pattern_glob]) def test_end_to_end(tmp_path, monkeypatch, output_pattern): - - root_dataset = create_simple_computation_dataset( - tmp_path, 'd2', 3, test_method) + root_dataset = create_simple_computation_dataset(tmp_path, 'd2', 3, test_method) # run `make` command results = root_dataset.make( @@ -82,11 +88,13 @@ def test_end_to_end(tmp_path, monkeypatch, output_pattern): 'third=third', ], output=output_pattern, - result_renderer='disabled') + result_renderer='disabled', + ) collected_output = [ str(Path(result['path']).relative_to(root_dataset.pathobj)) - for result in results] + for result in results + ] assert set(collected_output) == set(output_pattern_static) # check computation success diff --git a/datalad_remake/annexremotes/tests/test_remake_remote.py b/datalad_remake/annexremotes/tests/test_remake_remote.py index 69408c5..004e6e4 100644 --- 
a/datalad_remake/annexremotes/tests/test_remake_remote.py +++ b/datalad_remake/annexremotes/tests/test_remake_remote.py @@ -64,7 +64,6 @@ def send(self, value): def test_compute_remote_main(tmp_path, monkeypatch): - dataset = create_ds_hierarchy(tmp_path, 'ds1', 0)[0][2] monkeypatch.chdir(dataset.path) @@ -79,15 +78,15 @@ def test_compute_remote_main(tmp_path, monkeypatch): subprocess.run( ['git', 'annex', 'info', 'a.txt'], # noqa: S607 stdout=subprocess.PIPE, - check=True).stdout.splitlines())).split(b': ')[1] + check=True, + ).stdout.splitlines(), + ) + ).split(b': ')[1] (dataset.pathobj / specification_dir).mkdir(parents=True) (dataset.pathobj / specification_dir / '000001111122222').write_text( - build_json( - 'echo', - [], - ['a.txt'], - {'content': 'some_string'})) + build_json('echo', [], ['a.txt'], {'content': 'some_string'}) + ) input_ = MockedInput() diff --git a/datalad_remake/commands/make_cmd.py b/datalad_remake/commands/make_cmd.py index f64797d..3f98114 100644 --- a/datalad_remake/commands/make_cmd.py +++ b/datalad_remake/commands/make_cmd.py @@ -57,122 +57,148 @@ class Make(ValidatedInterface): # first docstring line is used a short description in the cmdline help # the rest is put in the verbose help and manpage - """Specify a computation and optionally execute it - """ - - _validator_ = EnsureCommandParameterization({ - 'dataset': EnsureDataset(installed=True), - 'input': EnsureListOf(EnsureStr(min_len=1)), - 'input_list': EnsureStr(min_len=1), - 'output': EnsureListOf(EnsureStr(min_len=1), min_len=1), - 'output_list': EnsureStr(min_len=1), - 'parameter': EnsureListOf(EnsureStr(min_len=3)), - 'parameter_list': EnsureStr(min_len=1), - }) + """Specify a computation and optionally execute it""" + + _validator_ = EnsureCommandParameterization( + { + 'dataset': EnsureDataset(installed=True), + 'input': EnsureListOf(EnsureStr(min_len=1)), + 'input_list': EnsureStr(min_len=1), + 'output': EnsureListOf(EnsureStr(min_len=1), min_len=1), + 'output_list': EnsureStr(min_len=1), + 'parameter': EnsureListOf(EnsureStr(min_len=3)), + 'parameter_list': EnsureStr(min_len=1), + } + ) # parameters of the command, must be exhaustive _params_: ClassVar[dict[int, Parameter]] = { 'dataset': Parameter( args=('-d', '--dataset'), - doc="Dataset to be used as a configuration source. Beyond " - "reading configuration items, this command does not interact with " - "the dataset."), + doc='Dataset to be used as a configuration source. Beyond ' + 'reading configuration items, this command does not interact with ' + 'the dataset.', + ), 'url_only': Parameter( args=('-u', '--url-only'), - action="store_true", + action='store_true', doc="Don't perform the computation, register an URL-key " - "instead. A `git annex get ` will trigger the computation"), + 'instead. 
A `git annex get ` will trigger the computation', + ), 'template': Parameter( args=('template',), - doc="Name of the computing template (template should be present " - "in $DATASET/.datalad/remake/methods)"), + doc='Name of the computing template (template should be present ' + 'in $DATASET/.datalad/remake/methods)', + ), 'branch': Parameter( - args=('-b', '--branch',), - doc="Branch (or commit) that should be used for computation, if " - "not specified HEAD will be used"), + args=( + '-b', + '--branch', + ), + doc='Branch (or commit) that should be used for computation, if ' + 'not specified HEAD will be used', + ), 'input': Parameter( - args=('-i', '--input',), + args=( + '-i', + '--input', + ), action='append', - doc="An input file pattern (repeat for multiple inputs, " - "file pattern support python globbing, globbing is expanded " - "in the source dataset)"), + doc='An input file pattern (repeat for multiple inputs, ' + 'file pattern support python globbing, globbing is expanded ' + 'in the source dataset)', + ), 'input_list': Parameter( - args=('-I', '--input-list',), - doc="Name of a file that contains a list of input file patterns. " - "Format is one file per line, relative path from `dataset`. " - "Empty lines, i.e. lines that contain only newlines, and lines " - "that start with '#' are ignored. Line content is stripped " - "before used. This is useful if a large number of input file " - "patterns should be provided."), + args=( + '-I', + '--input-list', + ), + doc='Name of a file that contains a list of input file patterns. ' + 'Format is one file per line, relative path from `dataset`. ' + 'Empty lines, i.e. lines that contain only newlines, and lines ' + "that start with '#' are ignored. Line content is stripped " + 'before used. This is useful if a large number of input file ' + 'patterns should be provided.', + ), 'output': Parameter( - args=('-o', '--output',), + args=( + '-o', + '--output', + ), action='append', - doc="An output file pattern (repeat for multiple outputs)" - "file pattern support python globbing, globbing is expanded " - "in the worktree)"), + doc='An output file pattern (repeat for multiple outputs)' + 'file pattern support python globbing, globbing is expanded ' + 'in the worktree)', + ), 'output_list': Parameter( - args=('-O', '--output-list',), - doc="Name of a file that contains a list of output patterns. Format " - "is one file per line, relative path from `dataset`. Empty " - "lines, i.e. lines that contain only newlines, arg ignored. " - "This is useful if a large number of output files should be " - "provided."), + args=( + '-O', + '--output-list', + ), + doc='Name of a file that contains a list of output patterns. Format ' + 'is one file per line, relative path from `dataset`. Empty ' + 'lines, i.e. lines that contain only newlines, arg ignored. ' + 'This is useful if a large number of output files should be ' + 'provided.', + ), 'parameter': Parameter( - args=('-p', '--parameter',), + args=( + '-p', + '--parameter', + ), action='append', - doc="Input parameter in the form = (repeat for " - "multiple parameters)"), + doc='Input parameter in the form = (repeat for ' + 'multiple parameters)', + ), 'parameter_list': Parameter( - args=('-P', '--parameter-list',), - doc="Name of a file that contains a list of parameters. Format " - "is one `=` string per line. " - "Empty lines, i.e. lines that contain only newlines, and lines " - "that start with '#' are ignored. Line content is stripped " - "before used. 
This is useful if a large number of parameters " - "should be provided."), + args=( + '-P', + '--parameter-list', + ), + doc='Name of a file that contains a list of parameters. Format ' + 'is one `=` string per line. ' + 'Empty lines, i.e. lines that contain only newlines, and lines ' + "that start with '#' are ignored. Line content is stripped " + 'before used. This is useful if a large number of parameters ' + 'should be provided.', + ), } @staticmethod @datasetmethod(name='make') @eval_results - def __call__(dataset=None, - *, - url_only=False, - template=None, - branch=None, - input_=None, - input_list=None, - output=None, - output_list=None, - parameter=None, - parameter_list=None, - ): - - dataset : Dataset = dataset.ds if dataset else Dataset('.') + def __call__( + dataset=None, + *, + url_only=False, + template=None, + branch=None, + input_=None, + input_list=None, + output=None, + output_list=None, + parameter=None, + parameter_list=None, + ): + dataset: Dataset = dataset.ds if dataset else Dataset('.') input_pattern = (input_ or []) + read_list(input_list) output_pattern = (output or []) + read_list(output_list) parameter = (parameter or []) + read_list(parameter_list) - parameter_dict = { - p.split('=', 1)[0]: p.split('=', 1)[1] - for p in parameter} + parameter_dict = {p.split('=', 1)[0]: p.split('=', 1)[1] for p in parameter} # We have to get the URL first, because saving the specification to # the dataset will change the version. url_base, reset_commit = get_url( - dataset, - branch, - template, - parameter_dict, - input_pattern, - output_pattern) + dataset, branch, template, parameter_dict, input_pattern, output_pattern + ) if not url_only: with provide_context( - dataset, - branch, - input_pattern, + dataset, + branch, + input_pattern, ) as worktree: execute(worktree, template, parameter_dict, output_pattern) output = collect(worktree, dataset, output_pattern) @@ -180,42 +206,43 @@ def __call__(dataset=None, for out in output: url = add_url(dataset, out, url_base, url_only=url_only) yield get_status_dict( - action='make', - path=str(dataset.pathobj / out), - status='ok', - message=f'added url: {url!r} to {out!r} in {dataset.pathobj}',) + action='make', + path=str(dataset.pathobj / out), + status='ok', + message=f'added url: {url!r} to {out!r} in {dataset.pathobj}', + ) def read_list(list_file: str | Path | None) -> list[str]: if list_file is None: return [] - return list(filter( - lambda s: s != '' and not s.startswith('#'), - [ - line.strip() - for line in Path(list_file).read_text().splitlines(keepends=False) - ])) - - -def get_url(dataset: Dataset, - branch: str | None, - template_name: str, - parameters: dict[str, str], - input_pattern: list[str], - output_pattern: list[str], - ) -> tuple[str, str]: + return list( + filter( + lambda s: s != '' and not s.startswith('#'), + [ + line.strip() + for line in Path(list_file).read_text().splitlines(keepends=False) + ], + ) + ) + +def get_url( + dataset: Dataset, + branch: str | None, + template_name: str, + parameters: dict[str, str], + input_pattern: list[str], + output_pattern: list[str], +) -> tuple[str, str]: # If something goes wrong after the make specification was saved, # the dataset state should be reset to `branch` reset_branch = branch or dataset.repo.get_hexsha() # Write the specification to a file in the dataset digest = write_spec( - dataset, - template_name, - input_pattern, - output_pattern, - parameters) + dataset, template_name, input_pattern, output_pattern, parameters + ) return ( f'{url_scheme}:///' @@ 
-224,13 +251,13 @@ def get_url(dataset: Dataset, ), reset_branch -def write_spec(dataset: Dataset, - method: str, - input_pattern: list[str], - output_pattern: list[str], - parameters: dict[str, str] - ) -> str: - +def write_spec( + dataset: Dataset, + method: str, + input_pattern: list[str], + output_pattern: list[str], + parameters: dict[str, str], +) -> str: # create the specification and hash it spec = build_json(method, input_pattern, output_pattern, parameters) hasher = hashlib.sha256() @@ -242,38 +269,28 @@ def write_spec(dataset: Dataset, spec_dir.mkdir(exist_ok=True) spec_file = spec_dir / digest with contextlib.chdir(dataset.pathobj): - call_git_success( - ['annex', 'unlock', str(spec_file)], - capture_output=True) + call_git_success(['annex', 'unlock', str(spec_file)], capture_output=True) spec_file.write_text(spec) dataset.save( message=f'[DATALAD] saving computation spec\n\nfile name: {digest}', - recursive=True, result_renderer='disabled') + recursive=True, + result_renderer='disabled', + ) return digest -def build_json(method: str, - inputs: list[str], - outputs: list[str], - parameters: dict[str, str] - ) -> str: - return json.dumps({ - 'method': method, - 'input': inputs, - 'output': outputs, - 'parameter': parameters}) - +def build_json( + method: str, inputs: list[str], outputs: list[str], parameters: dict[str, str] +) -> str: + return json.dumps( + {'method': method, 'input': inputs, 'output': outputs, 'parameter': parameters} + ) -def add_url(dataset: Dataset, - file_path: str, - url_base: str, - *, - url_only: bool - ) -> str: +def add_url(dataset: Dataset, file_path: str, url_base: str, *, url_only: bool) -> str: lgr.debug( - 'add_url: %s %s %s %s', - str(dataset), str(file_path), url_base, repr(url_only)) + 'add_url: %s %s %s %s', str(dataset), str(file_path), url_base, repr(url_only) + ) # Build the file-specific URL and store it in the annex url = url_base + f'&this={quote(file_path)}' @@ -288,7 +305,8 @@ def add_url(dataset: Dataset, can_add = call_git_success( ['annex', 'whereis', str(file_path)], cwd=file_dataset_path, - capture_output=True) + capture_output=True, + ) # Add the URL if can_add: @@ -296,51 +314,47 @@ def add_url(dataset: Dataset, ['annex', 'addurl', url, '--file', file_path] + (['--relaxed'] if url_only else []), cwd=file_dataset_path, - capture_output=True) + capture_output=True, + ) if not success: msg = ( f'\naddurl failed:\nfile_dataset_path: {file_dataset_path}\n' - f'url: {url!r}\nfile_path: {file_path!r}') + f'url: {url!r}\nfile_path: {file_path!r}' + ) raise RuntimeError(msg) return url def get_file_dataset(file: Path) -> tuple[Path, Path]: - """ Get dataset of file and relative path of file from the dataset + """Get dataset of file and relative path of file from the dataset Determine the path of the dataset that contains the file and the relative path of the file in this dataset.""" - top_level = Path(call_git_oneline( - ['rev-parse', '--show-toplevel'], - cwd=file.parent)) - return ( - Path(top_level), - file.absolute().relative_to(top_level)) - + top_level = Path( + call_git_oneline(['rev-parse', '--show-toplevel'], cwd=file.parent) + ) + return (Path(top_level), file.absolute().relative_to(top_level)) -def provide(dataset: Dataset, - branch: str | None, - input_patterns: list[str], - ) -> Path: +def provide( + dataset: Dataset, + branch: str | None, + input_patterns: list[str], +) -> Path: lgr.debug('provide: %s %s %s', dataset, branch, input_patterns) result = dataset.provision( - input=input_patterns, - branch=branch, - 
result_renderer='disabled') + input=input_patterns, branch=branch, result_renderer='disabled' + ) return Path(result[0]['path']) @contextlib.contextmanager -def provide_context(dataset: Dataset, - branch: str | None, - input_patterns: list[str], - ) -> Generator: - - worktree = provide( - dataset, - branch=branch, - input_patterns=input_patterns) +def provide_context( + dataset: Dataset, + branch: str | None, + input_patterns: list[str], +) -> Generator: + worktree = provide(dataset, branch=branch, input_patterns=input_patterns) try: yield worktree finally: @@ -348,22 +362,24 @@ def provide_context(dataset: Dataset, dataset.provision(delete=worktree, result_renderer='disabled') -def execute(worktree: Path, - template_name: str, - parameter: dict[str, str], - output_pattern: list[str], - ) -> None: - +def execute( + worktree: Path, + template_name: str, + parameter: dict[str, str], + output_pattern: list[str], +) -> None: lgr.debug( - 'execute: %s %s %s %s', str(worktree), - template_name, repr(parameter), repr(output_pattern)) + 'execute: %s %s %s %s', + str(worktree), + template_name, + repr(parameter), + repr(output_pattern), + ) worktree_ds = Dataset(worktree) # Determine which outputs already exist - existing_outputs = resolve_patterns( - root_dir=worktree, - patterns=output_pattern) + existing_outputs = resolve_patterns(root_dir=worktree, patterns=output_pattern) # Get the subdatasets, directories, and files of the existing output space create_output_space(worktree_ds, existing_outputs) @@ -377,11 +393,11 @@ def execute(worktree: Path, compute(worktree, worktree / template_path, parameter) -def collect(worktree: Path, - dataset: Dataset, - output_pattern: Iterable[str], - ) -> set[str]: - +def collect( + worktree: Path, + dataset: Dataset, + output_pattern: Iterable[str], +) -> set[str]: output = resolve_patterns(root_dir=worktree, patterns=output_pattern) # Unlock output files in the dataset-directory and copy the result @@ -397,9 +413,7 @@ def collect(worktree: Path, return output -def unlock_files(dataset: Dataset, - files: Iterable[str] - ) -> None: +def unlock_files(dataset: Dataset, files: Iterable[str]) -> None: """Use datalad to resolve subdatasets and unlock files in the dataset.""" # TODO: for some reason `dataset unlock` does not operate in the # context of `dataset.pathobj`, so we need to change the working @@ -417,9 +431,7 @@ def unlock_files(dataset: Dataset, dataset.unlock(file, result_renderer='disabled') -def create_output_space(dataset: Dataset, - files: Iterable[str] - ) -> None: +def create_output_space(dataset: Dataset, files: Iterable[str]) -> None: """Get all files that are part of the output space.""" for f in files: with contextlib.suppress(IncompleteResultsError): diff --git a/datalad_remake/commands/provision_cmd.py b/datalad_remake/commands/provision_cmd.py index 19e8505..3bca2fd 100644 --- a/datalad_remake/commands/provision_cmd.py +++ b/datalad_remake/commands/provision_cmd.py @@ -3,6 +3,7 @@ Data is provisioned in a temporary worktree. All subdatasets are currently also provisioned. """ + from __future__ import annotations import logging @@ -60,79 +61,99 @@ class Provision(ValidatedInterface): environment for `make` commands. 
""" - _validator_ = EnsureCommandParameterization({ - 'dataset': EnsureDataset(installed=True), - 'input': EnsureListOf(EnsureStr(min_len=1)), - 'input_list': EnsureStr(min_len=1), - 'tmp_dir': EnsurePath(is_mode=stat.S_ISDIR), - 'delete': EnsureDataset(installed=True), - 'no_globbing': EnsureBool(), - }) + _validator_ = EnsureCommandParameterization( + { + 'dataset': EnsureDataset(installed=True), + 'input': EnsureListOf(EnsureStr(min_len=1)), + 'input_list': EnsureStr(min_len=1), + 'tmp_dir': EnsurePath(is_mode=stat.S_ISDIR), + 'delete': EnsureDataset(installed=True), + 'no_globbing': EnsureBool(), + } + ) # parameters of the command, must be exhaustive _params_: ClassVar[dict[str, Parameter]] = { 'dataset': Parameter( args=('-d', '--dataset'), - doc="Dataset to be used as a configuration source. Beyond " - "reading configuration items, this command does not interact with " - "the dataset."), + doc='Dataset to be used as a configuration source. Beyond ' + 'reading configuration items, this command does not interact with ' + 'the dataset.', + ), 'branch': Parameter( - args=('-b', '--branch',), - doc="Branch (or commit) that should be provisioned, if " - "not specified HEAD will be used"), + args=( + '-b', + '--branch', + ), + doc='Branch (or commit) that should be provisioned, if ' + 'not specified HEAD will be used', + ), 'delete': Parameter( args=('--delete',), - doc="Delete the temporary worktree WORKTREE that belongs the the " - "dataset (cannot be used with `-b`, `--branch`, `-i`," - "`--input`, `-I`, or `--input-list`)."), + doc='Delete the temporary worktree WORKTREE that belongs the the ' + 'dataset (cannot be used with `-b`, `--branch`, `-i`,' + '`--input`, `-I`, or `--input-list`).', + ), 'input': Parameter( - args=('-i', '--input',), + args=( + '-i', + '--input', + ), action='append', - doc="An input file pattern (repeat for multiple inputs, " - "file pattern support python globbing, globbing is done in the " - "worktree and through all matching subdatasets, installing " - "if necessary)."), + doc='An input file pattern (repeat for multiple inputs, ' + 'file pattern support python globbing, globbing is done in the ' + 'worktree and through all matching subdatasets, installing ' + 'if necessary).', + ), 'input_list': Parameter( - args=('-I', '--input-list',), - doc="Name of a file that contains a list of input file patterns. " - "Format is one file per line, relative path from `dataset`. " - "Empty lines, i.e. lines that contain only newlines, and lines " - "that start with '#' are ignored. Line content is stripped " - "before used. This is useful if a large number of input file " - "patterns should be provided."), + args=( + '-I', + '--input-list', + ), + doc='Name of a file that contains a list of input file patterns. ' + 'Format is one file per line, relative path from `dataset`. ' + 'Empty lines, i.e. lines that contain only newlines, and lines ' + "that start with '#' are ignored. Line content is stripped " + 'before used. 
This is useful if a large number of input file ' + 'patterns should be provided.', + ), 'worktree_dir': Parameter( - args=('-w', '--worktree-dir',), - doc="Path of the directory that should become the temporary " - "worktree, defaults to `tempfile.TemporaryDirectory().name`."), + args=( + '-w', + '--worktree-dir', + ), + doc='Path of the directory that should become the temporary ' + 'worktree, defaults to `tempfile.TemporaryDirectory().name`.', + ), } @staticmethod @datasetmethod(name='provision') @eval_results - def __call__(dataset=None, - branch=None, - delete=None, - input_=None, - input_list=None, - worktree_dir=None, - ): - - dataset : Dataset = dataset.ds if dataset else Dataset('.') + def __call__( + dataset=None, + branch=None, + delete=None, + input_=None, + input_list=None, + worktree_dir=None, + ): + dataset: Dataset = dataset.ds if dataset else Dataset('.') if delete: if branch or input_: msg = ( 'Cannot use `-d`, `--delete` with `-b`, `--branch`,' ' `-i`, or `--input`' ) - raise ValueError( - msg) + raise ValueError(msg) remove(dataset, delete.ds) yield get_status_dict( action='provision [delete]', path=delete.ds.path, status='ok', - message=f'delete workspace: {delete.ds.path!r} from dataset {dataset}') + message=f'delete workspace: {delete.ds.path!r} from dataset {dataset}', + ) return worktree_dir: Path = Path(worktree_dir or TemporaryDirectory().name) @@ -140,29 +161,24 @@ def __call__(dataset=None, yield from provide(dataset, worktree_dir, inputs, branch) -def remove(dataset: Dataset, - worktree: Dataset - ) -> None: +def remove(dataset: Dataset, worktree: Dataset) -> None: worktree.drop( - what='all', - reckless='kill', - recursive=True, - result_renderer='disabled') + what='all', reckless='kill', recursive=True, result_renderer='disabled' + ) prune_worktrees(dataset) - call_git_success( - ['branch', '-d', worktree.pathobj.name], - cwd=dataset.pathobj) + call_git_success(['branch', '-d', worktree.pathobj.name], cwd=dataset.pathobj) def prune_worktrees(dataset: Dataset) -> None: call_git_lines(['worktree', 'prune'], cwd=dataset.pathobj) -def provide(dataset: Dataset, - worktree_dir: Path, - input_patterns: list[str], - source_branch: str | None = None, - ) -> Generator: +def provide( + dataset: Dataset, + worktree_dir: Path, + input_patterns: list[str], + source_branch: str | None = None, +) -> Generator: """Provide paths defined by input_patterns in a temporary worktree Parameters @@ -186,10 +202,10 @@ def provide(dataset: Dataset, worktree_dir.mkdir(parents=True, exist_ok=True) # Create a worktree - args = ['worktree', 'add'] + [str(worktree_dir)] + ( - [source_branch] - if source_branch - else [] + args = ( + ['worktree', 'add'] + + [str(worktree_dir)] + + ([source_branch] if source_branch else []) ) call_git_lines(args, cwd=dataset.pathobj) @@ -201,7 +217,8 @@ def provide(dataset: Dataset, path=element['path'], status='error', state=element['state'], - message=f'cannot provision {element["state"]} input: {element["path"]!r} from dataset {dataset}') + message=f'cannot provision {element["state"]} input: {element["path"]!r} from dataset {dataset}', + ) if is_dirty: return @@ -216,13 +233,13 @@ def provide(dataset: Dataset, action='provision', path=str(worktree_dir), status='ok', - message=f'provisioned dataset: {dataset} in workspace: {worktree_dir!r}',) + message=f'provisioned dataset: {dataset} in workspace: {worktree_dir!r}', + ) -def resolve_patterns(dataset: Dataset, - worktree: Dataset, - pattern_list: list[str] - ) -> set[Path]: +def resolve_patterns( + 
dataset: Dataset, worktree: Dataset, pattern_list: list[str] +) -> set[Path]: """Resolve file patterns in the dataset This method will resolve relative path-patterns in the dataset. It will @@ -258,7 +275,9 @@ def resolve_patterns(dataset: Dataset, Path(), pattern_parts, get_uninstalled_subdatasets(worktree), - get_installed_subdatasets(dataset))) + get_installed_subdatasets(dataset), + ) + ) return matches @@ -267,15 +286,17 @@ def get_uninstalled_subdatasets(dataset: Dataset) -> set[Path]: return { Path(result['path']).relative_to(dataset.pathobj) for result in dataset.subdatasets(recursive=True, result_renderer='disabled') - if result['state'] == 'absent'} + if result['state'] == 'absent' + } -def glob_pattern(root: Dataset, - position: Path, - pattern: list[str], - uninstalled_subdatasets: set[Path], - locally_available_subdatasets: Iterable[tuple[Path, Path, Path]], - ) -> set[Path]: +def glob_pattern( + root: Dataset, + position: Path, + pattern: list[str], + uninstalled_subdatasets: set[Path], + locally_available_subdatasets: Iterable[tuple[Path, Path, Path]], +) -> set[Path]: """Glob a pattern in a dataset installing subdatasets if necessary Parameters @@ -311,15 +332,15 @@ def glob_pattern(root: Dataset, position, pattern[1:], uninstalled_subdatasets, - locally_available_subdatasets) + locally_available_subdatasets, + ) else: result = set() # Match all elements at the current position with the first part of the # pattern. for rec_match in glob( - '*' if pattern[0] == '**' else pattern[0], - root_dir=root.pathobj / position + '*' if pattern[0] == '**' else pattern[0], root_dir=root.pathobj / position ): match = position / rec_match @@ -329,10 +350,8 @@ def glob_pattern(root: Dataset, if match.is_dir() and match in uninstalled_subdatasets: lgr.info('Installing subdataset %s to glob input', match) install_subdataset( - root, - match, - uninstalled_subdatasets, - locally_available_subdatasets) + root, match, uninstalled_subdatasets, locally_available_subdatasets + ) # We have a match, try to match the remainder of the pattern. 
submatch_pattern = pattern if pattern[0] == '**' else pattern[1:] @@ -342,7 +361,9 @@ def glob_pattern(root: Dataset, match, submatch_pattern, uninstalled_subdatasets, - locally_available_subdatasets)) + locally_available_subdatasets, + ) + ) return result @@ -354,43 +375,47 @@ def get_dirty_elements(dataset: Dataset) -> Generator: yield result -def install_subdataset(worktree: Dataset, - subdataset_path: Path, - uninstalled_subdatasets: set[Path], - locally_available_datasets: Iterable[tuple[Path, Path, Path]], - ) -> None: +def install_subdataset( + worktree: Dataset, + subdataset_path: Path, + uninstalled_subdatasets: set[Path], + locally_available_datasets: Iterable[tuple[Path, Path, Path]], +) -> None: """Install a subdataset, prefer locally available subdatasets""" - local_subdataset = ([ - dataset - for dataset in locally_available_datasets - if dataset[2] == subdataset_path] or [None])[0] + local_subdataset = ( + [ + dataset + for dataset in locally_available_datasets + if dataset[2] == subdataset_path + ] + or [None] + )[0] if local_subdataset: absolute_path, parent_ds_path, path_from_root = local_subdataset # Set the URL to the full source path - args = ['-C', str(worktree.pathobj / parent_ds_path), - 'submodule', 'set-url', '--', - str(path_from_root.relative_to(parent_ds_path)), - 'file://' + str(absolute_path)] + args = [ + '-C', + str(worktree.pathobj / parent_ds_path), + 'submodule', + 'set-url', + '--', + str(path_from_root.relative_to(parent_ds_path)), + 'file://' + str(absolute_path), + ] call_git_lines(args) - worktree.get( - str(subdataset_path), - get_data=False, - result_renderer='disabled') + worktree.get(str(subdataset_path), get_data=False, result_renderer='disabled') uninstalled_subdatasets.remove(subdataset_path) uninstalled_subdatasets.update(get_uninstalled_subdatasets(worktree)) -def get_installed_subdatasets(dataset: Dataset - ) -> Iterable[tuple[Path, Path, Path]]: - results = dataset.subdatasets( - recursive=True, - result_renderer='disabled') +def get_installed_subdatasets(dataset: Dataset) -> Iterable[tuple[Path, Path, Path]]: + results = dataset.subdatasets(recursive=True, result_renderer='disabled') return [ ( Path(result['path']), Path(result['parentds']).relative_to(dataset.pathobj), - Path(result['path']).relative_to(dataset.pathobj) + Path(result['path']).relative_to(dataset.pathobj), ) for result in results if result['state'] == 'present' diff --git a/datalad_remake/commands/tests/create_datasets.py b/datalad_remake/commands/tests/create_datasets.py index ca2c545..06e6a0b 100644 --- a/datalad_remake/commands/tests/create_datasets.py +++ b/datalad_remake/commands/tests/create_datasets.py @@ -15,23 +15,29 @@ def update_config_for_remake(dataset: Dataset): scope='local', recursive=True, spec=[('annex.security.allow-unverified-downloads', 'ACKTHPPT')], - result_renderer='disabled') + result_renderer='disabled', + ) def add_remake_remote(dataset: Dataset): - call_git_success([ - '-C', dataset.path, - 'annex', 'initremote', 'remake', - 'type=external', 'externaltype=datalad-remake', - 'encryption=none'], - capture_output=True) - - -def create_ds_hierarchy(tmp_path: Path, - name: str, - subdataset_levels: int = 2 - ) -> list[tuple[str, Path, Dataset]]: - + call_git_success( + [ + '-C', + dataset.path, + 'annex', + 'initremote', + 'remake', + 'type=external', + 'externaltype=datalad-remake', + 'encryption=none', + ], + capture_output=True, + ) + + +def create_ds_hierarchy( + tmp_path: Path, name: str, subdataset_levels: int = 2 +) -> list[tuple[str, Path, 
Dataset]]: # Create root dataset root_dataset = Dataset(tmp_path / name) root_dataset.create(force=True, result_renderer='disabled') @@ -52,7 +58,7 @@ def create_ds_hierarchy(tmp_path: Path, # Link the datasets for index in range(len(datasets) - 2, -1, -1): - dataset, subdataset = datasets[index:index+2] + dataset, subdataset = datasets[index : index + 2] dataset[2].install( path=subdataset[0], source='file://' + subdataset[2].path, @@ -73,12 +79,12 @@ def create_ds_hierarchy(tmp_path: Path, return datasets -def create_simple_computation_dataset(tmp_path: Path, - dataset_name: str, - subdataset_levels: int, - test_method: str, - ) -> Dataset: - +def create_simple_computation_dataset( + tmp_path: Path, + dataset_name: str, + subdataset_levels: int, + test_method: str, +) -> Dataset: datasets = create_ds_hierarchy(tmp_path, dataset_name, subdataset_levels) root_dataset = datasets[0][2] diff --git a/datalad_remake/commands/tests/test_collection.py b/datalad_remake/commands/tests/test_collection.py index 371e625..e0c1590 100644 --- a/datalad_remake/commands/tests/test_collection.py +++ b/datalad_remake/commands/tests/test_collection.py @@ -6,14 +6,11 @@ def test_collect(tmp_path): - dataset = create_ds_hierarchy(tmp_path, 'ds1', 1)[0][2] worktree_dir = tmp_path / 'ds1_worktree' worktree_dir.mkdir(parents=True, exist_ok=False) - worktree = dataset.provision( - worktree_dir=worktree_dir, - result_renderer='disabled') + worktree = dataset.provision(worktree_dir=worktree_dir, result_renderer='disabled') result_dir = worktree_dir / 'results' / 'sub-01' result_dir.mkdir(parents=True) @@ -23,7 +20,10 @@ def test_collect(tmp_path): result = collect( worktree=Path(worktree[0]['path']), dataset=dataset, - output_pattern=['results/**'] + output_pattern=['results/**'], ) assert result == {'results/sub-01/a.txt', 'results/sub-01/b.txt'} - assert set(get_file_list(dataset.pathobj / 'results')) == {'sub-01/a.txt', 'sub-01/b.txt'} + assert set(get_file_list(dataset.pathobj / 'results')) == { + 'sub-01/a.txt', + 'sub-01/b.txt', + } diff --git a/datalad_remake/commands/tests/test_compute.py b/datalad_remake/commands/tests/test_compute.py index 4e51fbb..6b763da 100644 --- a/datalad_remake/commands/tests/test_compute.py +++ b/datalad_remake/commands/tests/test_compute.py @@ -15,9 +15,7 @@ def test_duplicated_computation(tmp_path): - - root_dataset = create_simple_computation_dataset( - tmp_path, 'ds1', 0, test_method) + root_dataset = create_simple_computation_dataset(tmp_path, 'ds1', 0, test_method) # run the same command twice _run_simple_computation(root_dataset) @@ -25,20 +23,21 @@ def test_duplicated_computation(tmp_path): def test_speculative_computation(tmp_path, datalad_cfg): - - root_dataset = create_simple_computation_dataset( - tmp_path, 'ds1', 0, test_method) + root_dataset = create_simple_computation_dataset(tmp_path, 'ds1', 0, test_method) root_dataset.make( template='test_method', parameter=['name=Robert', 'file=spec.txt'], output=['spec.txt'], url_only=True, - result_renderer='disabled') + result_renderer='disabled', + ) # set annex security related variables to allow datalad-remake-URLs # in speculative make commands - datalad_cfg.set('annex.security.allow-unverified-downloads', 'ACKTHPPT', scope='global') + datalad_cfg.set( + 'annex.security.allow-unverified-downloads', 'ACKTHPPT', scope='global' + ) # Perform the speculative computation root_dataset.get('spec.txt') @@ -50,7 +49,8 @@ def _run_simple_computation(root_dataset: Dataset): template='test_method', parameter=['name=Robert', 
'file=a.txt'], output=['a.txt'], - result_renderer='disabled') + result_renderer='disabled', + ) # check that the output is correct assert (root_dataset.pathobj / 'a.txt').read_text() == 'Hello Robert\n' diff --git a/datalad_remake/commands/tests/test_listhandling.py b/datalad_remake/commands/tests/test_listhandling.py index a1864f0..dd18095 100644 --- a/datalad_remake/commands/tests/test_listhandling.py +++ b/datalad_remake/commands/tests/test_listhandling.py @@ -27,17 +27,19 @@ def test_list_reading_strip(tmp_path: Path): assert read_list(str(list_file)) == ['a', 'b', 'c'] -def _test_wordlist(tmp_path: Path, - word_list: list[str], - ) -> None: +def _test_wordlist( + tmp_path: Path, + word_list: list[str], +) -> None: list_file = _write_list(tmp_path, word_list) assert read_list(str(list_file)) == word_list assert read_list(list_file) == word_list -def _write_list(tmp_path: Path, - word_list: list[str], - ) -> Path: +def _write_list( + tmp_path: Path, + word_list: list[str], +) -> Path: list_file = tmp_path / 'list.txt' list_file.write_text('\n'.join(word_list)) return list_file diff --git a/datalad_remake/commands/tests/test_provision.py b/datalad_remake/commands/tests/test_provision.py index cf4cc71..3fcb8f7 100644 --- a/datalad_remake/commands/tests/test_provision.py +++ b/datalad_remake/commands/tests/test_provision.py @@ -23,33 +23,29 @@ all_paths = [ - template.format(file=f) - for template in file_path_templates - for f in ['a', 'b'] + template.format(file=f) for template in file_path_templates for f in ['a', 'b'] ] -a_paths = [ - path.format(file='a') - for path in file_path_templates -] +a_paths = [path.format(file='a') for path in file_path_templates] -b_paths = [ - path.format(file='b') - for path in file_path_templates -] +b_paths = [path.format(file='b') for path in file_path_templates] def test_worktree_basic(tmp_path): dataset = create_ds_hierarchy(tmp_path, 'ds1', 3)[0][2] inputs = [ - 'a.txt', 'b.txt', - 'ds1_subds0/a0.txt', 'ds1_subds0/b0.txt', - 'ds1_subds0/ds1_subds1/a1.txt', 'ds1_subds0/ds1_subds1/b1.txt' + 'a.txt', + 'b.txt', + 'ds1_subds0/a0.txt', + 'ds1_subds0/b0.txt', + 'ds1_subds0/ds1_subds1/a1.txt', + 'ds1_subds0/ds1_subds1/b1.txt', ] provision_result = dataset.provision( worktree_dir=tmp_path / 'ds1_worktree1', input=inputs, - result_renderer='disabled')[0] + result_renderer='disabled', + )[0] worktree = Dataset(provision_result['path']) # Check input availability @@ -67,10 +63,8 @@ def check_deleted_worktrees(ds: Dataset): check_deleted_worktrees(dataset) dataset.drop( - what='all', - reckless='kill', - recursive=True, - result_renderer='disabled') + what='all', reckless='kill', recursive=True, result_renderer='disabled' + ) def test_worktree_globbing(tmp_path): @@ -88,10 +82,7 @@ def test_worktree_globbing(tmp_path): worktree = Path(result['path']) worktree_set = set(get_file_list(worktree)) - assert worktree_set == { - path.format(ds_name='ds1') - for path in all_paths - } + assert worktree_set == {path.format(ds_name='ds1') for path in all_paths} dataset.provision(delete=worktree, result_renderer='disabled') result = dataset.provision( @@ -107,23 +98,17 @@ def test_worktree_globbing(tmp_path): worktree = Path(result['path']) worktree_set = set(get_file_list(worktree)) - assert { - path.format(ds_name='ds1') - for path in b_paths - }.issubset(worktree_set) + assert {path.format(ds_name='ds1') for path in b_paths}.issubset(worktree_set) dataset.provision(delete=worktree, result_renderer='disabled') dataset.drop( - what='all', - reckless='kill', - 
recursive=True, - result_renderer='disabled') + what='all', reckless='kill', recursive=True, result_renderer='disabled' + ) -def get_file_list(root: Path, - path: Path|None = None, - prefix: Path|None = None - ) -> Iterable[str]: +def get_file_list( + root: Path, path: Path | None = None, prefix: Path | None = None +) -> Iterable[str]: prefix = prefix or Path('') path = path or root for child in path.iterdir(): @@ -156,31 +141,31 @@ def test_unclean_dataset(tmp_path): input=input_pattern, worktree_dir=tmp_path / 'ds1_worktree1', on_failure='ignore', - result_renderer='disabled') - assert {(result['status'], result['state']) for result in results} == \ - {('error', 'modified'), ('error', 'untracked')} + result_renderer='disabled', + ) + assert {(result['status'], result['state']) for result in results} == { + ('error', 'modified'), + ('error', 'untracked'), + } # Check that a saved dataset can be provisioned dataset.save() dataset.provision( input=input_pattern, worktree_dir=tmp_path / 'ds1_worktree2', - result_renderer='disabled') + result_renderer='disabled', + ) def test_branch_deletion_after_provision(tmp_path): dataset = create_ds_hierarchy(tmp_path, 'ds1', 3)[0][2] with provide_context( - dataset=dataset, - branch=None, - input_patterns=['a.txt'] + dataset=dataset, branch=None, input_patterns=['a.txt'] ) as worktree: assert worktree.exists() assert not worktree.exists() with contextlib.chdir(dataset.path): - branches = [ - line.strip() - for line in call_git_lines(['branch'])] + branches = [line.strip() for line in call_git_lines(['branch'])] assert worktree.name not in branches @@ -188,32 +173,37 @@ def test_not_present_local_datasets(tmp_path): root_ds = Dataset(tmp_path / 'ds1') root_ds.create(cfg_proc='text2git', result_renderer='disabled') root_ds.clone( - 'https://github.com/OpenNeuroDatasets/ds000102', - result_renderer='disabled') + 'https://github.com/OpenNeuroDatasets/ds000102', result_renderer='disabled' + ) provisioned_dataset = Dataset( - root_ds.provision( - input=['ds000102/README'], - result_renderer='disabled')[0]['path']) + root_ds.provision(input=['ds000102/README'], result_renderer='disabled')[0][ + 'path' + ] + ) url = _get_submodule_url(provisioned_dataset, 'ds000102') assert url.startswith(f'file://{root_ds.path}') root_ds.drop( - 'ds000102', - what='all', - reckless='availability', - result_renderer='disabled') + 'ds000102', what='all', reckless='availability', result_renderer='disabled' + ) provisioned_dataset_2 = Dataset( root_ds.provision( - input=['ds000102/README'], - on_failure='ignore', - result_renderer='disabled')[0]['path']) + input=['ds000102/README'], on_failure='ignore', result_renderer='disabled' + )[0]['path'] + ) url_2 = _get_submodule_url(provisioned_dataset_2, 'ds000102') assert url_2 == 'https://github.com/OpenNeuroDatasets/ds000102' def _get_submodule_url(dataset: Dataset, submodule_path: str) -> str: x = call_git_lines( - ['config', '-f', str(dataset.pathobj / '.gitmodules'), '--get', - f'submodule.{submodule_path}.url']) + [ + 'config', + '-f', + str(dataset.pathobj / '.gitmodules'), + '--get', + f'submodule.{submodule_path}.url', + ] + ) return x[0].strip() diff --git a/datalad_remake/tests/test_register.py b/datalad_remake/tests/test_register.py index a8781a3..89f3a7f 100644 --- a/datalad_remake/tests/test_register.py +++ b/datalad_remake/tests/test_register.py @@ -1,6 +1,5 @@ - - def test_register(): import datalad.api as da + assert hasattr(da, 'make') assert hasattr(da, 'provision') diff --git a/datalad_remake/utils/compute.py 
b/datalad_remake/utils/compute.py index 3f816ee..01a9f50 100644 --- a/datalad_remake/utils/compute.py +++ b/datalad_remake/utils/compute.py @@ -12,9 +12,10 @@ lgr = logging.getLogger('datalad.compute') -def substitute_string(format_str: str, - replacements: dict[str, str], - ) -> str: +def substitute_string( + format_str: str, + replacements: dict[str, str], +) -> str: for variable_name, replacement in replacements.items(): place_holder = '{' + variable_name + '}' if place_holder in format_str: @@ -22,21 +23,21 @@ def substitute_string(format_str: str, return format_str -def substitute_arguments(spec: dict[str, Any], - replacements: dict[str, str], - format_list_id: str, - ) -> list[str]: - +def substitute_arguments( + spec: dict[str, Any], + replacements: dict[str, str], + format_list_id: str, +) -> list[str]: return [ substitute_string(str(format_str), replacements) for format_str in spec[format_list_id] ] -def get_substitutions(template: dict[str, Any], - arguments: dict[str, str], - ) -> dict[str, str]: - +def get_substitutions( + template: dict[str, Any], + arguments: dict[str, str], +) -> dict[str, str]: # Check the user specified inputs inputs = template['inputs'] if len(inputs) != len(arguments.keys()): @@ -45,24 +46,22 @@ def get_substitutions(template: dict[str, Any], if not all(input_name in arguments for input_name in inputs): msg = ( f'Template inputs and arguments have different names: ' - f'inputs: {inputs}, arguments: {arguments}') + f'inputs: {inputs}, arguments: {arguments}' + ) raise ValueError(msg) if len(inputs) != len(set(inputs)): msg = 'Template inputs contain duplicates' raise ValueError(msg) - return { - input_name: arguments[input_name] - for input_name in inputs - } - + return {input_name: arguments[input_name] for input_name in inputs} -def compute(root_directory: Path, - template_path: Path, - compute_arguments: dict[str, str], - ) -> None: +def compute( + root_directory: Path, + template_path: Path, + compute_arguments: dict[str, str], +) -> None: with template_path.open('rb') as f: template = tomllib.load(f) @@ -70,16 +69,20 @@ def compute(root_directory: Path, substitutions['root_directory'] = str(root_directory) substituted_executable = substitute_string(template['executable'], substitutions) - substituted_arguments = substitute_arguments( - template, - substitutions, - 'arguments' - ) + substituted_arguments = substitute_arguments(template, substitutions, 'arguments') with contextlib.chdir(root_directory): if template.get('use_shell', 'false') == 'true': - lgr.debug(f'compute: RUNNING: with shell=True: {" ".join([substituted_executable, *substituted_arguments])}') - subprocess.run(' '.join([substituted_executable, *substituted_arguments]), shell=True, check=True) # noqa: S602 + lgr.debug( + f'compute: RUNNING: with shell=True: {" ".join([substituted_executable, *substituted_arguments])}' + ) + subprocess.run( + ' '.join([substituted_executable, *substituted_arguments]), + shell=True, + check=True, + ) # noqa: S602 else: - lgr.debug(f'compute: RUNNING: {[substituted_executable, *substituted_arguments]}') + lgr.debug( + f'compute: RUNNING: {[substituted_executable, *substituted_arguments]}' + ) subprocess.run([substituted_executable, *substituted_arguments], check=True) diff --git a/datalad_remake/utils/glob.py b/datalad_remake/utils/glob.py index 607954e..3ffbdbb 100644 --- a/datalad_remake/utils/glob.py +++ b/datalad_remake/utils/glob.py @@ -10,12 +10,13 @@ # Resolve input file patterns in the original dataset -def resolve_patterns(root_dir: str | Path, - 
patterns: Iterable[str] - ) -> set[str]: +def resolve_patterns(root_dir: str | Path, patterns: Iterable[str]) -> set[str]: return set( filter( lambda p: not (Path(root_dir) / p).is_dir(), chain.from_iterable( glob(pattern, root_dir=str(root_dir), recursive=True) - for pattern in patterns))) + for pattern in patterns + ), + ) + ) diff --git a/datalad_remake/utils/tests/test_substitution.py b/datalad_remake/utils/tests/test_substitution.py index 2ee2480..86d4ff9 100644 --- a/datalad_remake/utils/tests/test_substitution.py +++ b/datalad_remake/utils/tests/test_substitution.py @@ -1,5 +1,3 @@ - - from ..compute import ( substitute_arguments, substitute_string, @@ -7,10 +5,13 @@ def test_multiple_substitutions(): - assert substitute_string( - 'This is a {test} with {multiple} substitutions', - {'test': 'string', 'multiple': 'multiple'}, - ) == 'This is a string with multiple substitutions' + assert ( + substitute_string( + 'This is a {test} with {multiple} substitutions', + {'test': 'string', 'multiple': 'multiple'}, + ) + == 'This is a string with multiple substitutions' + ) def test_argument_substitution(): @@ -20,11 +21,12 @@ def test_argument_substitution(): ] s = substitute_arguments( {'arguments': arguments}, - {'root_directory': '/path/to/root', + { + 'root_directory': '/path/to/root', 'input_dir': 'input', 'output_dir': 'output', }, - 'arguments' + 'arguments', ) assert s == [ '/path/to/root/input', From 630c2f514fd8d42c4def3d7ee588487ffa64cc38 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 22 Oct 2024 09:02:30 +0200 Subject: [PATCH 143/148] rf: improve subprocess calls in compute --- datalad_remake/utils/compute.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/datalad_remake/utils/compute.py b/datalad_remake/utils/compute.py index 01a9f50..19b474e 100644 --- a/datalad_remake/utils/compute.py +++ b/datalad_remake/utils/compute.py @@ -73,16 +73,10 @@ def compute( with contextlib.chdir(root_directory): if template.get('use_shell', 'false') == 'true': - lgr.debug( - f'compute: RUNNING: with shell=True: {" ".join([substituted_executable, *substituted_arguments])}' - ) - subprocess.run( - ' '.join([substituted_executable, *substituted_arguments]), - shell=True, - check=True, - ) # noqa: S602 + cmd = ' '.join([substituted_executable, *substituted_arguments]) + lgr.debug(f'compute: RUNNING: with shell=True: {cmd}') + subprocess.run(cmd, shell=True, check=True) # noqa: S602 else: - lgr.debug( - f'compute: RUNNING: {[substituted_executable, *substituted_arguments]}' - ) - subprocess.run([substituted_executable, *substituted_arguments], check=True) + cmd_list = [substituted_executable, *substituted_arguments] + lgr.debug(f'compute: RUNNING: {cmd_list}') + subprocess.run(cmd_list, check=True) From 588b2c39c59688632bc6faed51bf55b12945ecb6 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 22 Oct 2024 09:16:50 +0200 Subject: [PATCH 144/148] ci: adjust lowest commitizen-checked sha --- .github/workflows/conventional-commits.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/conventional-commits.yml b/.github/workflows/conventional-commits.yml index 8c86ddc..88a37b3 100644 --- a/.github/workflows/conventional-commits.yml +++ b/.github/workflows/conventional-commits.yml @@ -20,4 +20,4 @@ jobs: run: python -m pip install commitizen - name: Run commit message checks run: | - cz check --rev-range ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }} + cz check --rev-range 
630c2f514fd8d42c4def3d7ee588487ffa64cc38..${{ github.event.pull_request.head.sha }} From 7f6ddb66508974353a0c403f94ece7bc1dfc9320 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 22 Oct 2024 11:06:14 +0200 Subject: [PATCH 145/148] fix: fix type errors --- datalad_remake/commands/make_cmd.py | 87 ++++++++++++++---------- datalad_remake/commands/provision_cmd.py | 48 ++++++------- 2 files changed, 73 insertions(+), 62 deletions(-) diff --git a/datalad_remake/commands/make_cmd.py b/datalad_remake/commands/make_cmd.py index 3f98114..1aa6c9a 100644 --- a/datalad_remake/commands/make_cmd.py +++ b/datalad_remake/commands/make_cmd.py @@ -25,7 +25,9 @@ from datalad_next.constraints import ( EnsureDataset, EnsureListOf, + EnsurePath, EnsureStr, + DatasetParameter, ) from datalad_next.datasets import Dataset from datalad_next.runners import ( @@ -62,17 +64,18 @@ class Make(ValidatedInterface): _validator_ = EnsureCommandParameterization( { 'dataset': EnsureDataset(installed=True), + 'template': EnsureStr(min_len=1), 'input': EnsureListOf(EnsureStr(min_len=1)), - 'input_list': EnsureStr(min_len=1), + 'input_list': EnsurePath(), 'output': EnsureListOf(EnsureStr(min_len=1), min_len=1), - 'output_list': EnsureStr(min_len=1), + 'output_list': EnsurePath(), 'parameter': EnsureListOf(EnsureStr(min_len=3)), - 'parameter_list': EnsureStr(min_len=1), + 'parameter_list': EnsurePath(), } ) # parameters of the command, must be exhaustive - _params_: ClassVar[dict[int, Parameter]] = { + _params_: ClassVar[dict[str, Parameter]] = { 'dataset': Parameter( args=('-d', '--dataset'), doc='Dataset to be used as a configuration source. Beyond ' @@ -168,48 +171,54 @@ class Make(ValidatedInterface): @datasetmethod(name='make') @eval_results def __call__( - dataset=None, + dataset: DatasetParameter | None = None, *, - url_only=False, - template=None, - branch=None, - input_=None, - input_list=None, - output=None, - output_list=None, - parameter=None, - parameter_list=None, - ): - dataset: Dataset = dataset.ds if dataset else Dataset('.') - - input_pattern = (input_ or []) + read_list(input_list) + template: str = '', + url_only: bool = False, + branch: str | None = None, + input: list[str] | None = None, + input_list: Path | None = None, + output: list[str] | None = None, + output_list: Path | None = None, + parameter: list[str] | None = None, + parameter_list: Path | None = None, + ) -> Generator: + + ds: Dataset = dataset.ds if dataset else Dataset('.') + + input_pattern = (input or []) + read_list(input_list) output_pattern = (output or []) + read_list(output_list) parameter = (parameter or []) + read_list(parameter_list) - parameter_dict = {p.split('=', 1)[0]: p.split('=', 1)[1] for p in parameter} + parameter_dict = { + p.split('=', 1)[0]: p.split('=', 1)[1] + for p in parameter + } # We have to get the URL first, because saving the specification to # the dataset will change the version. 
url_base, reset_commit = get_url( - dataset, branch, template, parameter_dict, input_pattern, output_pattern + ds, branch, template, parameter_dict, input_pattern, output_pattern ) if not url_only: with provide_context( - dataset, + ds, branch, input_pattern, ) as worktree: execute(worktree, template, parameter_dict, output_pattern) - output = collect(worktree, dataset, output_pattern) + resolved_output = collect(worktree, ds, output_pattern) + else: + resolved_output = set(output_pattern) - for out in output: - url = add_url(dataset, out, url_base, url_only=url_only) + for out in resolved_output: + url = add_url(ds, out, url_base, url_only=url_only) yield get_status_dict( action='make', - path=str(dataset.pathobj / out), + path=str(ds.pathobj / out), status='ok', - message=f'added url: {url!r} to {out!r} in {dataset.pathobj}', + message=f'added url: {url!r} to {out!r} in {ds.pathobj}', ) @@ -287,39 +296,45 @@ def build_json( ) -def add_url(dataset: Dataset, file_path: str, url_base: str, *, url_only: bool) -> str: +def add_url(dataset: Dataset, + file_path: str, + url_base: str, + *, + url_only: bool + ) -> str: lgr.debug( - 'add_url: %s %s %s %s', str(dataset), str(file_path), url_base, repr(url_only) + 'add_url: %s %s %s %s', + str(dataset), file_path, url_base, repr(url_only) ) # Build the file-specific URL and store it in the annex url = url_base + f'&this={quote(file_path)}' - file_dataset_path, file_path = get_file_dataset(dataset.pathobj / file_path) + dataset_path, path = get_file_dataset(dataset.pathobj / file_path) # If the file does not exist and speculative computation is requested, we # can just add the URL. - if not (dataset.pathobj / file_path).exists() and url_only: + if not (dataset.pathobj / path).exists() and url_only: can_add = True else: # Check if the file is annexed, otherwise we cannot add a URL can_add = call_git_success( - ['annex', 'whereis', str(file_path)], - cwd=file_dataset_path, + ['annex', 'whereis', str(path)], + cwd=dataset_path, capture_output=True, ) # Add the URL if can_add: success = call_git_success( - ['annex', 'addurl', url, '--file', file_path] + ['annex', 'addurl', url, '--file', str(path)] + (['--relaxed'] if url_only else []), - cwd=file_dataset_path, + cwd=dataset_path, capture_output=True, ) if not success: msg = ( - f'\naddurl failed:\nfile_dataset_path: {file_dataset_path}\n' - f'url: {url!r}\nfile_path: {file_path!r}' + f'\naddurl failed:\ndataset_path: {dataset_path}\n' + f'url: {url!r}\nfile_path: {path!r}' ) raise RuntimeError(msg) return url diff --git a/datalad_remake/commands/provision_cmd.py b/datalad_remake/commands/provision_cmd.py index 3bca2fd..7fc37af 100644 --- a/datalad_remake/commands/provision_cmd.py +++ b/datalad_remake/commands/provision_cmd.py @@ -18,7 +18,6 @@ ClassVar, ) -from datalad.support.constraints import EnsureBool from datalad_next.commands import ( EnsureCommandParameterization, Parameter, @@ -33,6 +32,7 @@ EnsureListOf, EnsurePath, EnsureStr, + DatasetParameter, AnyOf, ) from datalad_next.datasets import Dataset from datalad_next.runners import call_git_lines, call_git_success @@ -65,10 +65,9 @@ class Provision(ValidatedInterface): { 'dataset': EnsureDataset(installed=True), 'input': EnsureListOf(EnsureStr(min_len=1)), - 'input_list': EnsureStr(min_len=1), - 'tmp_dir': EnsurePath(is_mode=stat.S_ISDIR), + 'input_list': EnsurePath(), 'delete': EnsureDataset(installed=True), - 'no_globbing': EnsureBool(), + 'worktree_dir': AnyOf(EnsurePath(), EnsureStr(min_len=1)), } ) @@ -131,34 +130,34 @@ class 
Provision(ValidatedInterface): @datasetmethod(name='provision') @eval_results def __call__( - dataset=None, - branch=None, - delete=None, - input_=None, - input_list=None, - worktree_dir=None, + dataset: DatasetParameter | None = None, + branch: str | None = None, + delete: DatasetParameter | None = None, + input: list[str] | None = None, + input_list: Path | None = None, + worktree_dir: str | Path | None = None, ): - dataset: Dataset = dataset.ds if dataset else Dataset('.') + ds: Dataset = dataset.ds if dataset else Dataset('.') if delete: - if branch or input_: + if branch or input: msg = ( 'Cannot use `-d`, `--delete` with `-b`, `--branch`,' ' `-i`, or `--input`' ) raise ValueError(msg) - remove(dataset, delete.ds) + remove(ds, delete.ds) yield get_status_dict( action='provision [delete]', path=delete.ds.path, status='ok', - message=f'delete workspace: {delete.ds.path!r} from dataset {dataset}', + message=f'delete workspace: {delete.ds.path!r} from dataset {ds!r}', ) return - worktree_dir: Path = Path(worktree_dir or TemporaryDirectory().name) - inputs = input_ or [*read_list(input_list)] - yield from provide(dataset, worktree_dir, inputs, branch) + resolved_worktree_dir: Path = Path(worktree_dir or TemporaryDirectory().name) + inputs = input or [*read_list(input_list)] + yield from provide(ds, resolved_worktree_dir, inputs, branch) def remove(dataset: Dataset, worktree: Dataset) -> None: @@ -382,17 +381,14 @@ def install_subdataset( locally_available_datasets: Iterable[tuple[Path, Path, Path]], ) -> None: """Install a subdataset, prefer locally available subdatasets""" - local_subdataset = ( - [ - dataset - for dataset in locally_available_datasets - if dataset[2] == subdataset_path - ] - or [None] - )[0] + local_subdataset = [ + dataset_info + for dataset_info in locally_available_datasets + if dataset_info[2] == subdataset_path + ] if local_subdataset: - absolute_path, parent_ds_path, path_from_root = local_subdataset + absolute_path, parent_ds_path, path_from_root = local_subdataset[0] # Set the URL to the full source path args = [ '-C', From 400b9a8bfcf9ddfc01c0ca49cdcf23be7cebcf21 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 22 Oct 2024 11:09:22 +0200 Subject: [PATCH 146/148] fix: fix linter errors --- datalad_remake/commands/make_cmd.py | 4 ++-- datalad_remake/commands/provision_cmd.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/datalad_remake/commands/make_cmd.py b/datalad_remake/commands/make_cmd.py index 1aa6c9a..9fd22d0 100644 --- a/datalad_remake/commands/make_cmd.py +++ b/datalad_remake/commands/make_cmd.py @@ -23,11 +23,11 @@ get_status_dict, ) from datalad_next.constraints import ( + DatasetParameter, EnsureDataset, EnsureListOf, EnsurePath, EnsureStr, - DatasetParameter, ) from datalad_next.datasets import Dataset from datalad_next.runners import ( @@ -176,7 +176,7 @@ def __call__( template: str = '', url_only: bool = False, branch: str | None = None, - input: list[str] | None = None, + input: list[str] | None = None, # noqa: A002 input_list: Path | None = None, output: list[str] | None = None, output_list: Path | None = None, diff --git a/datalad_remake/commands/provision_cmd.py b/datalad_remake/commands/provision_cmd.py index 7fc37af..501a3e9 100644 --- a/datalad_remake/commands/provision_cmd.py +++ b/datalad_remake/commands/provision_cmd.py @@ -8,7 +8,6 @@ import logging import os -import stat from contextlib import chdir from glob import glob from pathlib import Path @@ -28,11 +27,12 @@ get_status_dict, ) from 
datalad_next.constraints import ( + AnyOf, + DatasetParameter, EnsureDataset, EnsureListOf, EnsurePath, EnsureStr, - DatasetParameter, AnyOf, ) from datalad_next.datasets import Dataset from datalad_next.runners import call_git_lines, call_git_success @@ -133,7 +133,7 @@ def __call__( dataset: DatasetParameter | None = None, branch: str | None = None, delete: DatasetParameter | None = None, - input: list[str] | None = None, + input: list[str] | None = None, # noqa: A002 input_list: Path | None = None, worktree_dir: str | Path | None = None, ): From 3bac8b125bb8d973a3408b7546f8e2bdabc5c64e Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 22 Oct 2024 11:17:30 +0200 Subject: [PATCH 147/148] fix: use automated formatting results --- datalad_remake/commands/make_cmd.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/datalad_remake/commands/make_cmd.py b/datalad_remake/commands/make_cmd.py index 9fd22d0..cb7a5c3 100644 --- a/datalad_remake/commands/make_cmd.py +++ b/datalad_remake/commands/make_cmd.py @@ -183,17 +183,13 @@ def __call__( parameter: list[str] | None = None, parameter_list: Path | None = None, ) -> Generator: - ds: Dataset = dataset.ds if dataset else Dataset('.') input_pattern = (input or []) + read_list(input_list) output_pattern = (output or []) + read_list(output_list) parameter = (parameter or []) + read_list(parameter_list) - parameter_dict = { - p.split('=', 1)[0]: p.split('=', 1)[1] - for p in parameter - } + parameter_dict = {p.split('=', 1)[0]: p.split('=', 1)[1] for p in parameter} # We have to get the URL first, because saving the specification to # the dataset will change the version. @@ -296,16 +292,8 @@ def build_json( ) -def add_url(dataset: Dataset, - file_path: str, - url_base: str, - *, - url_only: bool - ) -> str: - lgr.debug( - 'add_url: %s %s %s %s', - str(dataset), file_path, url_base, repr(url_only) - ) +def add_url(dataset: Dataset, file_path: str, url_base: str, *, url_only: bool) -> str: + lgr.debug('add_url: %s %s %s %s', str(dataset), file_path, url_base, repr(url_only)) # Build the file-specific URL and store it in the annex url = url_base + f'&this={quote(file_path)}' From 81f08a368a67796c4fb8d47254b5f8c4c6bd18ba Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 22 Oct 2024 13:10:05 +0200 Subject: [PATCH 148/148] ci: ignore missing imports in mypy for now This commit instructs mypy to ignore missing imports. Otherwise a number of errors are created by importing datalad-modules. This is due to the fact that no type-stubs exist for those modules yet. Once the type-stubs are created, this commit should be reverted. --- .github/workflows/mypy-pr.yml | 8 +++++--- .github/workflows/mypy-project.yml | 6 ++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/mypy-pr.yml b/.github/workflows/mypy-pr.yml index b936223..cbc3b2e 100644 --- a/.github/workflows/mypy-pr.yml +++ b/.github/workflows/mypy-pr.yml @@ -31,11 +31,13 @@ jobs: if: steps.changed-py-files.outputs.any_changed == 'true' run: | # get any type stubs that mypy thinks it needs - hatch run types:mypy --install-types --non-interactive --follow-imports skip ${{ steps.changed-py-files.outputs.all_changed_files }} + hatch run types:mypy --install-types --non-interactive --ignore-missing-imports --follow-imports skip ${{ steps.changed-py-files.outputs.all_changed_files }} # run mypy on the modified files only, and do not even follow imports. 
# this results is a fairly superficial test, but given the overall # state of annotations, we strive to become more correct incrementally # with focused error reports, rather than barfing a huge complaint # that is unrelated to the changeset someone has been working on. - # run on the oldest supported Python version - hatch run types:mypy --python-version 3.11 --follow-imports skip --pretty --show-error-context ${{ steps.changed-py-files.outputs.all_changed_files }} + # run on the oldest supported Python version. + # specify `--ignore-missing-imports` until the datalad-packages have + # type stubs for all their modules. + hatch run types:mypy --python-version 3.11 --ignore-missing-imports --follow-imports skip --pretty --show-error-context ${{ steps.changed-py-files.outputs.all_changed_files }} diff --git a/.github/workflows/mypy-project.yml b/.github/workflows/mypy-project.yml index 276506d..c6b2654 100644 --- a/.github/workflows/mypy-project.yml +++ b/.github/workflows/mypy-project.yml @@ -25,5 +25,7 @@ jobs: # get any type stubs that mypy thinks it needs hatch run types:mypy --install-types --non-interactive --follow-imports skip datalad_core # run mypy on the full project. - # run on the oldest supported Python version - hatch run types:mypy --python-version 3.11 --pretty --show-error-context datalad_core + # run on the oldest supported Python version. + # specify `--ignore-missing-imports` until the datalad-packages have + # type stubs for all their modules. + hatch run types:mypy --python-version 3.11 --ignore-missing-imports --pretty --show-error-context datalad_core