diff --git a/.appveyor.yml b/.appveyor.yml index ac2768b..a179521 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -68,14 +68,14 @@ environment: # Ubuntu core tests - job_name: test-linux APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2204 - PY: 3.9 + PY: 3.11 INSTALL_GITANNEX: git-annex -m snapshot # same as 'test-linux', but TMPDIR is on a crippled filesystem, causing # most, if not all test datasets to be created on that filesystem - job_name: test-linux-crippled APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2204 - PY: 3.9 + PY: 3.11 # datalad-annex git remote needs something after git-annex_8.20211x INSTALL_GITANNEX: git-annex -m snapshot diff --git a/.github/workflows/conventional-commits.yml b/.github/workflows/conventional-commits.yml index 8c86ddc..88a37b3 100644 --- a/.github/workflows/conventional-commits.yml +++ b/.github/workflows/conventional-commits.yml @@ -20,4 +20,4 @@ jobs: run: python -m pip install commitizen - name: Run commit message checks run: | - cz check --rev-range ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }} + cz check --rev-range 630c2f514fd8d42c4def3d7ee588487ffa64cc38..${{ github.event.pull_request.head.sha }} diff --git a/.github/workflows/docbuild.yml b/.github/workflows/docbuild.yml new file mode 100644 index 0000000..a7319f8 --- /dev/null +++ b/.github/workflows/docbuild.yml @@ -0,0 +1,27 @@ +name: docs + +on: [push, pull_request] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - name: Set up environment + run: | + git config --global user.email "test@github.land" + git config --global user.name "GitHub Almighty" + - uses: actions/checkout@v4 + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: 3.11 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-devel.txt + pip install . + - name: Build docs + run: | + make -C docs html diff --git a/.github/workflows/mypy-pr.yml b/.github/workflows/mypy-pr.yml index a49660d..cbc3b2e 100644 --- a/.github/workflows/mypy-pr.yml +++ b/.github/workflows/mypy-pr.yml @@ -31,11 +31,13 @@ jobs: if: steps.changed-py-files.outputs.any_changed == 'true' run: | # get any type stubs that mypy thinks it needs - hatch run types:mypy --install-types --non-interactive --follow-imports skip ${{ steps.changed-py-files.outputs.all_changed_files }} + hatch run types:mypy --install-types --non-interactive --ignore-missing-imports --follow-imports skip ${{ steps.changed-py-files.outputs.all_changed_files }} # run mypy on the modified files only, and do not even follow imports. # this results is a fairly superficial test, but given the overall # state of annotations, we strive to become more correct incrementally # with focused error reports, rather than barfing a huge complaint # that is unrelated to the changeset someone has been working on. - # run on the oldest supported Python version - hatch run types:mypy --python-version 3.9 --follow-imports skip --pretty --show-error-context ${{ steps.changed-py-files.outputs.all_changed_files }} + # run on the oldest supported Python version. + # specify `--ignore-missing-imports` until the datalad-packages have + # type stubs for all their modules. 
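+      # keep the flag list here in sync with the `--install-types` call above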
+ hatch run types:mypy --python-version 3.11 --ignore-missing-imports --follow-imports skip --pretty --show-error-context ${{ steps.changed-py-files.outputs.all_changed_files }} diff --git a/.github/workflows/mypy-project.yml b/.github/workflows/mypy-project.yml index 3a1bb1b..c6b2654 100644 --- a/.github/workflows/mypy-project.yml +++ b/.github/workflows/mypy-project.yml @@ -25,5 +25,7 @@ jobs: # get any type stubs that mypy thinks it needs hatch run types:mypy --install-types --non-interactive --follow-imports skip datalad_core # run mypy on the full project. - # run on the oldest supported Python version - hatch run types:mypy --python-version 3.9 --pretty --show-error-context datalad_core + # run on the oldest supported Python version. + # specify `--ignore-missing-imports` until the datalad-packages have + # type stubs for all their modules. + hatch run types:mypy --python-version 3.11 --ignore-missing-imports --pretty --show-error-context datalad_core diff --git a/.github/workflows/test_crippledfs.yml b/.github/workflows/test_crippledfs.yml new file mode 100644 index 0000000..e69de29 diff --git a/README.md b/README.md index 181b343..ac4afe3 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,138 @@ [![Hatch project](https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg)](https://github.com/pypa/hatch) +**This code is a POC**, that means currently: +- code does not thoroughly validate inputs +- names might be inconsistent +- few tests +- fewer docs +- no support for locking + +This is a naive datalad compute extension that serves as a playground for +the datalad remake-project. + +It contains an annex remote that can compute content on demand. It uses template +files that specify the operations. It encodes computation parameters in URLs +that are associated with annex keys, which allows to compute dropped content +instead of fetching it from some storage system. It also contains the new +datalad command `compute` that +can trigger the computation of content, generate the parameterized URLs, and +associate this URL with the respective annex key. This information can then +be used by the annex remote to repeat the computation. + +## Installation + +There is no pypi-package yet. To install the extension, clone the repository +and install it via `pip` (preferably in a virtual environment): + +```bash +git clone https://github.com/christian-monch/datalad-compute.git +cd datalad-compute +pip install -r requirements-devel.txt +pip install . +``` + + +## Example usage + +Install the extension and create a dataset + + +```bash +> datalad create compute-test-1 +> cd compute-test-1 +``` + +Create the template directory and a template + +```bash +> mkdir -p .datalad/compute/methods +> cat > .datalad/compute/methods/one-to-many < '{output}-1.txt';", + "echo content: {second} > '{output}-2.txt'", +] +EOF +> datalad save -m "add `one-to-many` compute method" +``` + +Create a "compute" annex special remote: +```bash +> git annex initremote compute encryption=none type=external externaltype=compute +``` + +Execute a computation and save the result: +```bash +> datalad compute -p first=bob -p second=alice -p output=name -o name-1.txt \ +-o name-2.txt one-to-many +``` +The method `one-to-many` will create two files with the names `-1.txt` +and `-2.txt`. That is why the two files `name-1.txt` and `name-2.txt` +are listed as outputs in the command above. + +Note that only output files that are defined by the `-o/--output` option will +be available in the dataset after `datalad compute`. 
Similarly, only the files +defined by `-i/--input` will be available as inputs to the computation (the +computation is performed in a "scratch" directory, so the input files must be +copied there and the output files must be copied back). + +```bash +> cat name-1.txt +content: bob +> cat name-2.txt +content: alice +``` + +Drop the content of `name-1.txt`, verify it is gone, recreate it via +`datalad get`, which "fetches" is from the compute remote: + +```bash +> datalad drop name-1.txt +> cat name-1.txt +> datalad get name-1.txt +> cat name-1.txt +``` + +The command `datalad compute` does also support to just record the parameters +that would lead to a certain computation, without actually performing the +computation. We refer to this as *speculative computation*. + +To use this feature, the following configuration value has to be set: + +```bash +> git config annex.security.allow-unverified-downloads ACKTHPPT +``` + +Afterward, a speculative computation can be recorded by providing the `-u` option +(url-only) to `datalad compute`. + +```bash +> datalad compute -p first=john -p second=susan -p output=person \ +-o person-1.txt -o person-2.txt -u one-to-many +> cat person-1.txt # this will fail, because the computation has not yet been performed +``` + +`ls -l person-1.txt` will show a link to a not-downloaded URL-KEY. +`git annex whereis person-1.txt` will show the associated computation description URL. +No computation has been performed yet, `datalad compute` just creates an URL-KEY and +associates a computation description URL with the URL-KEY. + +Use `datalad get` to perform the computation for the first time and receive the result:: +```bash +> datalad get person-1.txt +> cat person-1.txt +``` + + +# Contributing + +See [CONTRIBUTING.md](CONTRIBUTING.md) if you are interested in internals or +contributing to the project. + ## Acknowledgements This development was supported by European Union’s Horizon research and diff --git a/datalad_remake/__init__.py b/datalad_remake/__init__.py index 3ac1200..cb44161 100644 --- a/datalad_remake/__init__.py +++ b/datalad_remake/__init__.py @@ -1,25 +1,47 @@ +"""DataLad remake extension""" + from __future__ import annotations from datalad_remake._version import __version__ __all__ = [ '__version__', + 'command_suite', ] -# command_suite = ( -# # description of the command suite, displayed in cmdline help -# "Demo DataLad command suite", -# [ -# # specification of a command, any number of commands can be defined -# ( -# # importable module that contains the command implementation -# 'datalad_remake.commands.compute_cmd', -# # name of the command class implementation in above module -# 'Compute', -# # optional name of the command in the cmdline API -# 'compute', -# # optional name of the command in the Python API -# 'compute' -# ), -# ] -# ) + +# Defines a datalad command suite. 
+# This variable must be bound as a setuptools entrypoint +# to be found by datalad +command_suite = ( + # description of the command suite, displayed in cmdline help + 'DataLad remake command suite', + [ + # specification of a command, any number of commands can be defined + ( + # importable module that contains the command implementation + 'datalad_remake.commands.make_cmd', + # name of the command class implementation in above module + 'Make', + # optional name of the command in the cmdline API + 'make', + # optional name of the command in the Python API + 'make', + ), + ( + # importable module that contains the command implementation + 'datalad_remake.commands.provision_cmd', + # name of the command class implementation in above module + 'Provision', + # optional name of the command in the cmdline API + 'provision', + # optional name of the command in the Python API + 'provision', + ), + ], +) + + +url_scheme = 'datalad-remake' +template_dir = '.datalad/make/methods' +specification_dir = '.datalad/make/specifications' diff --git a/datalad_remake/annexremotes/__init__.py b/datalad_remake/annexremotes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datalad_remake/annexremotes/remake_remote.py b/datalad_remake/annexremotes/remake_remote.py new file mode 100644 index 0000000..5b0417f --- /dev/null +++ b/datalad_remake/annexremotes/remake_remote.py @@ -0,0 +1,210 @@ +from __future__ import annotations + +import json +import logging +import shutil +import subprocess +from pathlib import Path +from typing import ( + TYPE_CHECKING, + Any, +) +from urllib.parse import ( + unquote, + urlparse, +) + +from datalad.customremotes import RemoteError +from datalad_next.annexremotes import SpecialRemote, super_main +from datalad_next.datasets import Dataset +from datalad_next.runners import call_git_success + +from datalad_remake import ( + specification_dir, + url_scheme, +) +from datalad_remake.commands.make_cmd import ( + execute, + get_file_dataset, + provide_context, +) +from datalad_remake.utils.glob import resolve_patterns + +if TYPE_CHECKING: + from collections.abc import Iterable + + from annexremote import Master + +lgr = logging.getLogger('datalad.remake.annexremotes.remake') + + +class RemakeRemote(SpecialRemote): + def __init__(self, annex: Master): + super().__init__(annex) + + def __del__(self): + self.close() + + def close(self) -> None: + pass + + def _check_url(self, url: str) -> bool: + return url.startswith((f'URL--{url_scheme}:', f'{url_scheme}:')) + + def prepare(self): + self.annex.debug('PREPARE') + + def initremote(self): + self.annex.debug('INITREMOTE') + + def remove(self, key: str): + self.annex.debug(f'REMOVE {key!r}') + + def transfer_store(self, key: str, local_file: str): + self.annex.debug(f'TRANSFER STORE {key!r}, {local_file!r}') + + def claimurl(self, url: str) -> bool: + self.annex.debug(f'CLAIMURL {url!r}') + return self._check_url(url) + + def checkurl(self, url: str) -> bool: + self.annex.debug(f'CHECKURL {url!r}') + return self._check_url(url) + + def getcost(self) -> int: + self.annex.debug('GETCOST') + return 100 + + def get_url_encoded_info(self, url: str) -> list[str]: + parts = urlparse(url).query.split('&', 5) + self.annex.debug(f'get_url_encoded_info: url: {url!r}, parts: {parts!r}') + return parts + + def get_url_for_key(self, key: str) -> str: + urls = self.annex.geturls(key, f'{url_scheme}:') + self.annex.debug(f'get_url_for_key: key: {key!r}, urls: {urls!r}') + return urls[0] + + def get_compute_info(self, key: str) -> 
tuple[dict[str, Any], Dataset]: + def get_assigned_value(assignment: str) -> str: + return assignment.split('=', 1)[1] + + root_version, spec_name, this = ( + unquote(get_assigned_value(expr)) + for expr in self.get_url_encoded_info(self.get_url_for_key(key)) + ) + + dataset = self._find_dataset(root_version) + spec_path = dataset.pathobj / specification_dir / spec_name + with open(spec_path, 'rb') as f: + spec = json.load(f) + + return { + 'root_version': root_version, + 'this': this, + **{name: spec[name] for name in ['method', 'input', 'output', 'parameter']}, + }, dataset + + def transfer_retrieve(self, key: str, file_name: str) -> None: + self.annex.debug(f'TRANSFER RETRIEVE key: {key!r}, file_name: {file_name!r}') + + compute_info, dataset = self.get_compute_info(key) + self.annex.debug(f'TRANSFER RETRIEVE compute_info: {compute_info!r}') + + # Perform the computation, and collect the results + lgr.debug('Starting provision') + self.annex.debug('Starting provision') + with provide_context( + dataset, compute_info['root_version'], compute_info['input'] + ) as worktree: + lgr.debug('Starting execution') + self.annex.debug('Starting execution') + execute( + worktree, + compute_info['method'], + compute_info['parameter'], + compute_info['output'], + ) + lgr.debug('Starting collection') + self.annex.debug('Starting collection') + self._collect( + worktree, + dataset, + compute_info['output'], + compute_info['this'], + file_name, + ) + lgr.debug('Leaving provision context') + self.annex.debug('Leaving provision context') + + def checkpresent(self, key: str) -> bool: + # See if at least one URL with the remake url-scheme is present + return self.annex.geturls(key, f'{url_scheme}:') != [] + + def _find_dataset(self, commit: str) -> Dataset: + """Find the first enclosing dataset with the given commit""" + # TODO: get version override from configuration + start_dir = Path(self.annex.getgitdir()).parent.absolute() + current_dir = start_dir + while current_dir != Path('/'): + result = subprocess.run( + ['git', 'cat-file', '-t', commit], # noqa: S607 + stdout=subprocess.PIPE, + cwd=current_dir, + check=False, + ) + if result.returncode == 0 and result.stdout.strip() == b'commit': + return Dataset(current_dir) + current_dir = current_dir.parent + msg = ( + f'Could not find dataset with commit {commit!r}, starting from ' + f'{start_dir}' + ) + raise RemoteError(msg) + + def _collect( + self, + worktree: Path, + dataset: Dataset, + output_patterns: Iterable[str], + this: str, + this_destination: str, + ) -> None: + """Collect computation results for `this` (and all other outputs)""" + + # Get all outputs that were created during computation + outputs = resolve_patterns(root_dir=worktree, patterns=output_patterns) + + # Collect all output files that have been created while creating + # `this` file. + for output in outputs: + if output == this: + continue + dataset_path, file_path = get_file_dataset(dataset.pathobj / output) + is_annexed = call_git_success( + ['annex', 'whereis', str(file_path)], + cwd=dataset_path, + capture_output=True, + ) + if is_annexed: + self.annex.debug( + f'_collect: reinject: {worktree / output} -> {dataset_path}:{file_path}' + ) + call_git_success( + ['annex', 'reinject', str(worktree / output), str(file_path)], + cwd=dataset_path, + capture_output=True, + ) + + # Collect `this` file. It has to be copied to the destination given + # by git-annex. Git-annex will check its integrity. 
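+        # `this_destination` is the temporary path that git-annex passed to
+        # `transfer_retrieve`; git-annex verifies the key and moves the file
+        # into its object store after the remote reports success.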
+ shutil.copyfile(worktree / this, this_destination) + + +def main(): + """cmdline entry point""" + super_main( + cls=RemakeRemote, + remote_name='datalad-remake', + description='Remake data based on datalad-remake specifications', + ) diff --git a/datalad_remake/annexremotes/tests/__init__.py b/datalad_remake/annexremotes/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datalad_remake/annexremotes/tests/test_hierarchies.py b/datalad_remake/annexremotes/tests/test_hierarchies.py new file mode 100644 index 0000000..a09060c --- /dev/null +++ b/datalad_remake/annexremotes/tests/test_hierarchies.py @@ -0,0 +1,120 @@ +from collections.abc import Iterable +from pathlib import Path + +import pytest +from datalad.distribution.get import Get as datalad_Get +from datalad_next.datasets import Dataset + +from datalad_remake.commands.tests.create_datasets import ( + create_simple_computation_dataset, +) + +test_method = """ +inputs = ['first', 'second', 'third'] +use_shell = 'true' +executable = 'echo' +arguments = [ + "content: {first} > 'a.txt';", + "mkdir -p 'd2_subds0/d2_subds1/d2_subds2';", + "echo content: {second} > 'b.txt';", + "echo content: {third} > 'new.txt';", + "echo content: {first} > 'd2_subds0/a0.txt';", + "echo content: {second} > 'd2_subds0/b0.txt';", + "echo content: {third} > 'd2_subds0/new.txt';", + "echo content: {first} > 'd2_subds0/d2_subds1/a1.txt';", + "echo content: {second} > 'd2_subds0/d2_subds1/b1.txt';", + "echo content: {third} > 'd2_subds0/d2_subds1/new.txt';", + "echo content: {first} > 'd2_subds0/d2_subds1/d2_subds2/a2.txt';", + "echo content: {second} > 'd2_subds0/d2_subds1/d2_subds2/b2.txt';", + "echo content: {third} > 'd2_subds0/d2_subds1/d2_subds2/new.txt';", +] +""" + + +output_pattern_static = [ + 'a.txt', + 'b.txt', + 'new.txt', + 'd2_subds0/a0.txt', + 'd2_subds0/b0.txt', + 'd2_subds0/new.txt', + 'd2_subds0/d2_subds1/a1.txt', + 'd2_subds0/d2_subds1/b1.txt', + 'd2_subds0/d2_subds1/new.txt', + 'd2_subds0/d2_subds1/d2_subds2/a2.txt', + 'd2_subds0/d2_subds1/d2_subds2/b2.txt', + 'd2_subds0/d2_subds1/d2_subds2/new.txt', +] + + +output_pattern_glob = [ + '*.txt', + 'd2_subds0/*.txt', + 'd2_subds0/d2_subds1/*.txt', + 'd2_subds0/d2_subds1/d2_subds2/*.txt', +] + + +test_file_content = list( + zip( + output_pattern_static, + ['content: first\n', 'content: second\n', 'content: third\n'] * 4, + strict=False, + ) +) + + +def _drop_files(dataset: Dataset, files: Iterable[str]): + for file in files: + dataset.drop(file, reckless='availability', result_renderer='disabled') + assert not (dataset.pathobj / file).exists() + + +def _check_content(dataset, file_content: Iterable[tuple[str, str]]): + for file, content in file_content: + assert (dataset.pathobj / file).read_text() == content + + +@pytest.mark.parametrize('output_pattern', [output_pattern_static, output_pattern_glob]) +def test_end_to_end(tmp_path, monkeypatch, output_pattern): + root_dataset = create_simple_computation_dataset(tmp_path, 'd2', 3, test_method) + + # run `make` command + results = root_dataset.make( + template='test_method', + parameter=[ + 'first=first', + 'second=second', + 'third=third', + ], + output=output_pattern, + result_renderer='disabled', + ) + + collected_output = [ + str(Path(result['path']).relative_to(root_dataset.pathobj)) + for result in results + ] + assert set(collected_output) == set(output_pattern_static) + + # check computation success + _check_content(root_dataset, test_file_content) + + # Drop all computed content + _drop_files(root_dataset, 
collected_output) + + # Go to the subdataset `d2_subds0/d2_subds1` and fetch the content of `a1.txt` + # from a datalad-remake remote. + monkeypatch.chdir(root_dataset.pathobj / 'd2_subds0' / 'd2_subds1') + datalad_Get()('a1.txt') + + # check that all known files that were computed are added to the annex + _check_content(root_dataset, test_file_content) + + _drop_files(root_dataset, collected_output) + + # check get in subdatasets + monkeypatch.chdir(root_dataset.pathobj) + datalad_Get()('d2_subds0/d2_subds1/a1.txt') + + _check_content(root_dataset, test_file_content) diff --git a/datalad_remake/annexremotes/tests/test_remake_remote.py b/datalad_remake/annexremotes/tests/test_remake_remote.py new file mode 100644 index 0000000..004e6e4 --- /dev/null +++ b/datalad_remake/annexremotes/tests/test_remake_remote.py @@ -0,0 +1,118 @@ +import subprocess +from queue import Queue + +from annexremote import Master + +from datalad_remake.commands.tests.create_datasets import create_ds_hierarchy + +from ... import specification_dir +from ...commands.make_cmd import build_json +from ..remake_remote import RemakeRemote + +template = """ +inputs = ['content'] + +use_shell = 'true' +executable = "echo" + +arguments = [ + "content: {content} > 'a.txt';", +] +""" + + +class MockedOutput: + def __init__(self): + self.output = '' + self.lines = [] + + def write(self, *args, **_): + self.output += ''.join(args) + lineswith = self.output.splitlines(keepends=True) + lineswithout = self.output.splitlines(keepends=False) + if not lineswith: + pass + elif lineswithout[-1] == lineswith[-1]: + self.lines = lineswithout[:-1] + self.output = lineswith[-1] + else: + self.lines = lineswithout + self.output = '' + + def flush(self): + pass + + def next_line(self): + if self.lines: + while True: + line = self.lines.pop(0) + if line.startswith('DEBUG '): + continue + return line + return None + + +class MockedInput: + def __init__(self): + self.input = Queue() + + def readline(self): + return self.input.get() + + def send(self, value): + self.input.put(value) + + +def test_compute_remote_main(tmp_path, monkeypatch): + dataset = create_ds_hierarchy(tmp_path, 'ds1', 0)[0][2] + monkeypatch.chdir(dataset.path) + + template_path = dataset.pathobj / '.datalad' / 'make' / 'methods' + template_path.mkdir(parents=True) + (template_path / 'echo').write_text(template) + dataset.save() + + key = next( + filter( + lambda line: line.startswith(b'key: '), + subprocess.run( + ['git', 'annex', 'info', 'a.txt'], # noqa: S607 + stdout=subprocess.PIPE, + check=True, + ).stdout.splitlines(), + ) + ).split(b': ')[1] + + (dataset.pathobj / specification_dir).mkdir(parents=True) + (dataset.pathobj / specification_dir / '000001111122222').write_text( + build_json('echo', [], ['a.txt'], {'content': 'some_string'}) + ) + + input_ = MockedInput() + + # We send all messages into the queue upfront because we do the test in a + # single thread and do not get back control once `master.listen` is called + # below. + input_.send('PREPARE\n') + input_.send(f'TRANSFER RETRIEVE {key} {tmp_path / "remade.txt"!s}\n') + url = ( + 'datalad-make:///?' 
+ f'root_version={dataset.repo.get_hexsha()}' + '&specification=000001111122222' + '&this=a.txt' + ) + input_.send(f'VALUE {url}\n') + input_.send('VALUE\n') + input_.send('VALUE .git\n') + input_.send('') + + output = MockedOutput() + + master = Master(output=output) + remote = RemakeRemote(master) + master.LinkRemote(remote) + master.Listen(input=input_) + + # At this point the datalad-remake remote should have executed the + # computation and written the result. + assert (tmp_path / 'remade.txt').read_text().strip() == 'content: some_string' diff --git a/datalad_remake/commands/__init__.py b/datalad_remake/commands/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datalad_remake/commands/make_cmd.py b/datalad_remake/commands/make_cmd.py new file mode 100644 index 0000000..cb7a5c3 --- /dev/null +++ b/datalad_remake/commands/make_cmd.py @@ -0,0 +1,441 @@ +"""DataLad make command""" + +from __future__ import annotations + +import contextlib +import hashlib +import json +import logging +import os +import shutil +from pathlib import Path +from typing import TYPE_CHECKING +from urllib.parse import quote + +from datalad.support.exceptions import IncompleteResultsError +from datalad_next.commands import ( + EnsureCommandParameterization, + Parameter, + ValidatedInterface, + build_doc, + datasetmethod, + eval_results, + get_status_dict, +) +from datalad_next.constraints import ( + DatasetParameter, + EnsureDataset, + EnsureListOf, + EnsurePath, + EnsureStr, +) +from datalad_next.datasets import Dataset +from datalad_next.runners import ( + call_git_oneline, + call_git_success, +) + +from datalad_remake import ( + specification_dir, + template_dir, + url_scheme, +) +from datalad_remake.utils.compute import compute +from datalad_remake.utils.glob import resolve_patterns + +if TYPE_CHECKING: + from collections.abc import ( + Generator, + Iterable, + ) + from typing import ClassVar + +lgr = logging.getLogger('datalad.remake.make_cmd') + + +# decoration auto-generates standard help +@build_doc +# all commands must be derived from Interface +class Make(ValidatedInterface): + # first docstring line is used a short description in the cmdline help + # the rest is put in the verbose help and manpage + """Specify a computation and optionally execute it""" + + _validator_ = EnsureCommandParameterization( + { + 'dataset': EnsureDataset(installed=True), + 'template': EnsureStr(min_len=1), + 'input': EnsureListOf(EnsureStr(min_len=1)), + 'input_list': EnsurePath(), + 'output': EnsureListOf(EnsureStr(min_len=1), min_len=1), + 'output_list': EnsurePath(), + 'parameter': EnsureListOf(EnsureStr(min_len=3)), + 'parameter_list': EnsurePath(), + } + ) + + # parameters of the command, must be exhaustive + _params_: ClassVar[dict[str, Parameter]] = { + 'dataset': Parameter( + args=('-d', '--dataset'), + doc='Dataset to be used as a configuration source. Beyond ' + 'reading configuration items, this command does not interact with ' + 'the dataset.', + ), + 'url_only': Parameter( + args=('-u', '--url-only'), + action='store_true', + doc="Don't perform the computation, register an URL-key " + 'instead. 
A `git annex get ` will trigger the computation', + ), + 'template': Parameter( + args=('template',), + doc='Name of the computing template (template should be present ' + 'in $DATASET/.datalad/remake/methods)', + ), + 'branch': Parameter( + args=( + '-b', + '--branch', + ), + doc='Branch (or commit) that should be used for computation, if ' + 'not specified HEAD will be used', + ), + 'input': Parameter( + args=( + '-i', + '--input', + ), + action='append', + doc='An input file pattern (repeat for multiple inputs, ' + 'file pattern support python globbing, globbing is expanded ' + 'in the source dataset)', + ), + 'input_list': Parameter( + args=( + '-I', + '--input-list', + ), + doc='Name of a file that contains a list of input file patterns. ' + 'Format is one file per line, relative path from `dataset`. ' + 'Empty lines, i.e. lines that contain only newlines, and lines ' + "that start with '#' are ignored. Line content is stripped " + 'before used. This is useful if a large number of input file ' + 'patterns should be provided.', + ), + 'output': Parameter( + args=( + '-o', + '--output', + ), + action='append', + doc='An output file pattern (repeat for multiple outputs)' + 'file pattern support python globbing, globbing is expanded ' + 'in the worktree)', + ), + 'output_list': Parameter( + args=( + '-O', + '--output-list', + ), + doc='Name of a file that contains a list of output patterns. Format ' + 'is one file per line, relative path from `dataset`. Empty ' + 'lines, i.e. lines that contain only newlines, arg ignored. ' + 'This is useful if a large number of output files should be ' + 'provided.', + ), + 'parameter': Parameter( + args=( + '-p', + '--parameter', + ), + action='append', + doc='Input parameter in the form = (repeat for ' + 'multiple parameters)', + ), + 'parameter_list': Parameter( + args=( + '-P', + '--parameter-list', + ), + doc='Name of a file that contains a list of parameters. Format ' + 'is one `=` string per line. ' + 'Empty lines, i.e. lines that contain only newlines, and lines ' + "that start with '#' are ignored. Line content is stripped " + 'before used. This is useful if a large number of parameters ' + 'should be provided.', + ), + } + + @staticmethod + @datasetmethod(name='make') + @eval_results + def __call__( + dataset: DatasetParameter | None = None, + *, + template: str = '', + url_only: bool = False, + branch: str | None = None, + input: list[str] | None = None, # noqa: A002 + input_list: Path | None = None, + output: list[str] | None = None, + output_list: Path | None = None, + parameter: list[str] | None = None, + parameter_list: Path | None = None, + ) -> Generator: + ds: Dataset = dataset.ds if dataset else Dataset('.') + + input_pattern = (input or []) + read_list(input_list) + output_pattern = (output or []) + read_list(output_list) + parameter = (parameter or []) + read_list(parameter_list) + + parameter_dict = {p.split('=', 1)[0]: p.split('=', 1)[1] for p in parameter} + + # We have to get the URL first, because saving the specification to + # the dataset will change the version. 
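+        # `get_url` writes and saves the specification, then returns a URL whose
+        # `root_version` points at the dataset state after that save;
+        # `reset_commit` captures the pre-save commit (or the given branch).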
+ url_base, reset_commit = get_url( + ds, branch, template, parameter_dict, input_pattern, output_pattern + ) + + if not url_only: + with provide_context( + ds, + branch, + input_pattern, + ) as worktree: + execute(worktree, template, parameter_dict, output_pattern) + resolved_output = collect(worktree, ds, output_pattern) + else: + resolved_output = set(output_pattern) + + for out in resolved_output: + url = add_url(ds, out, url_base, url_only=url_only) + yield get_status_dict( + action='make', + path=str(ds.pathobj / out), + status='ok', + message=f'added url: {url!r} to {out!r} in {ds.pathobj}', + ) + + +def read_list(list_file: str | Path | None) -> list[str]: + if list_file is None: + return [] + return list( + filter( + lambda s: s != '' and not s.startswith('#'), + [ + line.strip() + for line in Path(list_file).read_text().splitlines(keepends=False) + ], + ) + ) + + +def get_url( + dataset: Dataset, + branch: str | None, + template_name: str, + parameters: dict[str, str], + input_pattern: list[str], + output_pattern: list[str], +) -> tuple[str, str]: + # If something goes wrong after the make specification was saved, + # the dataset state should be reset to `branch` + reset_branch = branch or dataset.repo.get_hexsha() + + # Write the specification to a file in the dataset + digest = write_spec( + dataset, template_name, input_pattern, output_pattern, parameters + ) + + return ( + f'{url_scheme}:///' + f'?root_version={quote(dataset.repo.get_hexsha())}' + f'&specification={quote(digest)}' + ), reset_branch + + +def write_spec( + dataset: Dataset, + method: str, + input_pattern: list[str], + output_pattern: list[str], + parameters: dict[str, str], +) -> str: + # create the specification and hash it + spec = build_json(method, input_pattern, output_pattern, parameters) + hasher = hashlib.sha256() + hasher.update(spec.encode()) + digest = hasher.hexdigest() + + # write the specification file + spec_dir = dataset.pathobj / specification_dir + spec_dir.mkdir(exist_ok=True) + spec_file = spec_dir / digest + with contextlib.chdir(dataset.pathobj): + call_git_success(['annex', 'unlock', str(spec_file)], capture_output=True) + spec_file.write_text(spec) + dataset.save( + message=f'[DATALAD] saving computation spec\n\nfile name: {digest}', + recursive=True, + result_renderer='disabled', + ) + return digest + + +def build_json( + method: str, inputs: list[str], outputs: list[str], parameters: dict[str, str] +) -> str: + return json.dumps( + {'method': method, 'input': inputs, 'output': outputs, 'parameter': parameters} + ) + + +def add_url(dataset: Dataset, file_path: str, url_base: str, *, url_only: bool) -> str: + lgr.debug('add_url: %s %s %s %s', str(dataset), file_path, url_base, repr(url_only)) + + # Build the file-specific URL and store it in the annex + url = url_base + f'&this={quote(file_path)}' + dataset_path, path = get_file_dataset(dataset.pathobj / file_path) + + # If the file does not exist and speculative computation is requested, we + # can just add the URL. 
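+    # (With `--relaxed`, added below for speculative computation, `git annex addurl`
+    # records the URL without downloading or verifying content, creating a URL-key.)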
+ if not (dataset.pathobj / path).exists() and url_only: + can_add = True + else: + # Check if the file is annexed, otherwise we cannot add a URL + can_add = call_git_success( + ['annex', 'whereis', str(path)], + cwd=dataset_path, + capture_output=True, + ) + + # Add the URL + if can_add: + success = call_git_success( + ['annex', 'addurl', url, '--file', str(path)] + + (['--relaxed'] if url_only else []), + cwd=dataset_path, + capture_output=True, + ) + if not success: + msg = ( + f'\naddurl failed:\ndataset_path: {dataset_path}\n' + f'url: {url!r}\nfile_path: {path!r}' + ) + raise RuntimeError(msg) + return url + + +def get_file_dataset(file: Path) -> tuple[Path, Path]: + """Get dataset of file and relative path of file from the dataset + + Determine the path of the dataset that contains the file and the relative + path of the file in this dataset.""" + top_level = Path( + call_git_oneline(['rev-parse', '--show-toplevel'], cwd=file.parent) + ) + return (Path(top_level), file.absolute().relative_to(top_level)) + + +def provide( + dataset: Dataset, + branch: str | None, + input_patterns: list[str], +) -> Path: + lgr.debug('provide: %s %s %s', dataset, branch, input_patterns) + result = dataset.provision( + input=input_patterns, branch=branch, result_renderer='disabled' + ) + return Path(result[0]['path']) + + +@contextlib.contextmanager +def provide_context( + dataset: Dataset, + branch: str | None, + input_patterns: list[str], +) -> Generator: + worktree = provide(dataset, branch=branch, input_patterns=input_patterns) + try: + yield worktree + finally: + lgr.debug('un_provide: %s %s', dataset, str(worktree)) + dataset.provision(delete=worktree, result_renderer='disabled') + + +def execute( + worktree: Path, + template_name: str, + parameter: dict[str, str], + output_pattern: list[str], +) -> None: + lgr.debug( + 'execute: %s %s %s %s', + str(worktree), + template_name, + repr(parameter), + repr(output_pattern), + ) + + worktree_ds = Dataset(worktree) + + # Determine which outputs already exist + existing_outputs = resolve_patterns(root_dir=worktree, patterns=output_pattern) + + # Get the subdatasets, directories, and files of the existing output space + create_output_space(worktree_ds, existing_outputs) + + # Unlock existing output files in the output space (worktree-directory) + unlock_files(worktree_ds, existing_outputs) + + # Run the computation in the worktree-directory + template_path = Path(template_dir) / template_name + worktree_ds.get(template_path) + compute(worktree, worktree / template_path, parameter) + + +def collect( + worktree: Path, + dataset: Dataset, + output_pattern: Iterable[str], +) -> set[str]: + output = resolve_patterns(root_dir=worktree, patterns=output_pattern) + + # Unlock output files in the dataset-directory and copy the result + unlock_files(dataset, output) + for o in output: + lgr.debug('collect: collecting %s', o) + destination = dataset.pathobj / o + destination.parent.mkdir(parents=True, exist_ok=True) + shutil.copyfile(worktree / o, destination) + + # Save the dataset + dataset.save(recursive=True, result_renderer='disabled') + return output + + +def unlock_files(dataset: Dataset, files: Iterable[str]) -> None: + """Use datalad to resolve subdatasets and unlock files in the dataset.""" + # TODO: for some reason `dataset unlock` does not operate in the + # context of `dataset.pathobj`, so we need to change the working + # directory manually here. 
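+    # note: `contextlib.chdir` needs Python 3.11+, in line with the Python
+    # version bump in the CI configuration above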
+ with contextlib.chdir(dataset.pathobj): + for f in files: + file = dataset.pathobj / f + if not file.exists() and file.is_symlink(): + # `datalad unlock` does not "unlock" dangling symlinks, so we + # mimic the behavior of `git annex unlock` here: + link = os.readlink(file) + file.unlink() + file.write_text('/annex/objects/' + link.split('/')[-1] + '\n') + elif file.is_symlink(): + dataset.unlock(file, result_renderer='disabled') + + +def create_output_space(dataset: Dataset, files: Iterable[str]) -> None: + """Get all files that are part of the output space.""" + for f in files: + with contextlib.suppress(IncompleteResultsError): + dataset.get(f, result_renderer='disabled') diff --git a/datalad_remake/commands/provision_cmd.py b/datalad_remake/commands/provision_cmd.py new file mode 100644 index 0000000..501a3e9 --- /dev/null +++ b/datalad_remake/commands/provision_cmd.py @@ -0,0 +1,418 @@ +""" +A data provisioner that works with local git repositories. +Data is provisioned in a temporary worktree. All subdatasets +are currently also provisioned. +""" + +from __future__ import annotations + +import logging +import os +from contextlib import chdir +from glob import glob +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import ( + TYPE_CHECKING, + ClassVar, +) + +from datalad_next.commands import ( + EnsureCommandParameterization, + Parameter, + ValidatedInterface, + build_doc, + datasetmethod, + eval_results, + get_status_dict, +) +from datalad_next.constraints import ( + AnyOf, + DatasetParameter, + EnsureDataset, + EnsureListOf, + EnsurePath, + EnsureStr, +) +from datalad_next.datasets import Dataset +from datalad_next.runners import call_git_lines, call_git_success + +from datalad_remake.commands.make_cmd import read_list + +if TYPE_CHECKING: + from collections.abc import Generator, Iterable + +lgr = logging.getLogger('datalad.remake.provision_cmd') + + +# decoration auto-generates standard help +@build_doc +# all commands must be derived from Interface +class Provision(ValidatedInterface): + # first docstring line is used a short description in the cmdline help + # the rest is put in the verbose help and manpage + """Provision inputs for a `make` command + + This command provides a temporary, partial copy of the dataset in a separate + tree, called a "worktree". The worktree will contain all files that are + specified by the input patterns. All necessary subdatasets will be + installed. If a subdataset is locally available in the source dataset, it + will be installed from there. Its main purpose is to provide an isolated + environment for `make` commands. + """ + + _validator_ = EnsureCommandParameterization( + { + 'dataset': EnsureDataset(installed=True), + 'input': EnsureListOf(EnsureStr(min_len=1)), + 'input_list': EnsurePath(), + 'delete': EnsureDataset(installed=True), + 'worktree_dir': AnyOf(EnsurePath(), EnsureStr(min_len=1)), + } + ) + + # parameters of the command, must be exhaustive + _params_: ClassVar[dict[str, Parameter]] = { + 'dataset': Parameter( + args=('-d', '--dataset'), + doc='Dataset to be used as a configuration source. 
Beyond ' + 'reading configuration items, this command does not interact with ' + 'the dataset.', + ), + 'branch': Parameter( + args=( + '-b', + '--branch', + ), + doc='Branch (or commit) that should be provisioned, if ' + 'not specified HEAD will be used', + ), + 'delete': Parameter( + args=('--delete',), + doc='Delete the temporary worktree WORKTREE that belongs the the ' + 'dataset (cannot be used with `-b`, `--branch`, `-i`,' + '`--input`, `-I`, or `--input-list`).', + ), + 'input': Parameter( + args=( + '-i', + '--input', + ), + action='append', + doc='An input file pattern (repeat for multiple inputs, ' + 'file pattern support python globbing, globbing is done in the ' + 'worktree and through all matching subdatasets, installing ' + 'if necessary).', + ), + 'input_list': Parameter( + args=( + '-I', + '--input-list', + ), + doc='Name of a file that contains a list of input file patterns. ' + 'Format is one file per line, relative path from `dataset`. ' + 'Empty lines, i.e. lines that contain only newlines, and lines ' + "that start with '#' are ignored. Line content is stripped " + 'before used. This is useful if a large number of input file ' + 'patterns should be provided.', + ), + 'worktree_dir': Parameter( + args=( + '-w', + '--worktree-dir', + ), + doc='Path of the directory that should become the temporary ' + 'worktree, defaults to `tempfile.TemporaryDirectory().name`.', + ), + } + + @staticmethod + @datasetmethod(name='provision') + @eval_results + def __call__( + dataset: DatasetParameter | None = None, + branch: str | None = None, + delete: DatasetParameter | None = None, + input: list[str] | None = None, # noqa: A002 + input_list: Path | None = None, + worktree_dir: str | Path | None = None, + ): + ds: Dataset = dataset.ds if dataset else Dataset('.') + if delete: + if branch or input: + msg = ( + 'Cannot use `-d`, `--delete` with `-b`, `--branch`,' + ' `-i`, or `--input`' + ) + raise ValueError(msg) + + remove(ds, delete.ds) + yield get_status_dict( + action='provision [delete]', + path=delete.ds.path, + status='ok', + message=f'delete workspace: {delete.ds.path!r} from dataset {ds!r}', + ) + return + + resolved_worktree_dir: Path = Path(worktree_dir or TemporaryDirectory().name) + inputs = input or [*read_list(input_list)] + yield from provide(ds, resolved_worktree_dir, inputs, branch) + + +def remove(dataset: Dataset, worktree: Dataset) -> None: + worktree.drop( + what='all', reckless='kill', recursive=True, result_renderer='disabled' + ) + prune_worktrees(dataset) + call_git_success(['branch', '-d', worktree.pathobj.name], cwd=dataset.pathobj) + + +def prune_worktrees(dataset: Dataset) -> None: + call_git_lines(['worktree', 'prune'], cwd=dataset.pathobj) + + +def provide( + dataset: Dataset, + worktree_dir: Path, + input_patterns: list[str], + source_branch: str | None = None, +) -> Generator: + """Provide paths defined by input_patterns in a temporary worktree + + Parameters + ---------- + dataset: Dataset + Dataset that should be provisioned + worktree_dir: Path + Path to a directory that should contain the provisioned worktree + input_patterns: list[str] + List of patterns that describe the input files + source_branch: str | None + Branch that should be provisioned, if None HEAD will be used [optional] + + Returns + ------- + + """ + + lgr.debug('Provisioning dataset %s at %s', dataset, worktree_dir) + + worktree_dir.mkdir(parents=True, exist_ok=True) + + # Create a worktree + args = ( + ['worktree', 'add'] + + [str(worktree_dir)] + + ([source_branch] if source_branch 
else []) + ) + call_git_lines(args, cwd=dataset.pathobj) + + is_dirty = False + for element in get_dirty_elements(dataset): + is_dirty = True + yield get_status_dict( + action='provision', + path=element['path'], + status='error', + state=element['state'], + message=f'cannot provision {element["state"]} input: {element["path"]!r} from dataset {dataset}', + ) + if is_dirty: + return + + worktree_dataset = Dataset(worktree_dir) + + # Get all input files in the worktree + with chdir(worktree_dataset.path): + for path in resolve_patterns(dataset, worktree_dataset, input_patterns): + worktree_dataset.get(path) + + yield get_status_dict( + action='provision', + path=str(worktree_dir), + status='ok', + message=f'provisioned dataset: {dataset} in workspace: {worktree_dir!r}', + ) + + +def resolve_patterns( + dataset: Dataset, worktree: Dataset, pattern_list: list[str] +) -> set[Path]: + """Resolve file patterns in the dataset + + This method will resolve relative path-patterns in the dataset. It will + install all subdatasets that are matched by the patterns. Pattern are + described as outline in `glob.glob`. The method support recursive globbing + of zero or more directories with the pattern: `**`. + + Parameters + ---------- + dataset: Dataset, + Dataset that should be provisioned. + worktree : Dataset + Worktree dataset, in which the patterns should be resolved. + pattern_list : list[str] + List of patterns thatThat should be resolved. + + Returns + ------- + set[Path] + Set of paths that match the patterns. + """ + matches = set() + for pattern in pattern_list: + pattern_parts = pattern.split(os.sep) + + if pattern_parts[0] == '': + lgr.warning('Ignoring absolute input pattern %s', pattern) + continue + + matches.update( + glob_pattern( + worktree, + Path(), + pattern_parts, + get_uninstalled_subdatasets(worktree), + get_installed_subdatasets(dataset), + ) + ) + return matches + + +def get_uninstalled_subdatasets(dataset: Dataset) -> set[Path]: + """Get a list of the paths of all visible, non-installed subdatasets""" + return { + Path(result['path']).relative_to(dataset.pathobj) + for result in dataset.subdatasets(recursive=True, result_renderer='disabled') + if result['state'] == 'absent' + } + + +def glob_pattern( + root: Dataset, + position: Path, + pattern: list[str], + uninstalled_subdatasets: set[Path], + locally_available_subdatasets: Iterable[tuple[Path, Path, Path]], +) -> set[Path]: + """Glob a pattern in a dataset installing subdatasets if necessary + + Parameters + ---------- + root: Dataset + The dataset in which the pattern should be resolved. + position: Path + A relative path that denotes the position in the dataset from which a + pattern is matched. + pattern: list[str] + The path-elements of the pattern. For example `['*', 'a', '*.txt']` + represents the pattern `'*/a/*.txt'`. + uninstalled_subdatasets: set[Path] + A set that contains all currently known uninstalled subdatasets. This + set will be updated in the method. + locally_available_subdatasets: set[Path] + A set that contains all datasets that are available in the dataset for + which the worktree is created. + + Returns + ------- + set[Path] + A set that contains all paths that match the pattern. + """ + if not pattern: + return {position} + + # If the pattern starts with `**` we have to glob the remainder of the + # pattern from this position. 
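+    # (`**` may match zero directories, so the current position itself is also
+    # matched against the remainder of the pattern)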
+ if pattern[0] == '**': + result = glob_pattern( + root, + position, + pattern[1:], + uninstalled_subdatasets, + locally_available_subdatasets, + ) + else: + result = set() + + # Match all elements at the current position with the first part of the + # pattern. + for rec_match in glob( + '*' if pattern[0] == '**' else pattern[0], root_dir=root.pathobj / position + ): + match = position / rec_match + + # If the match is a directory that is in uninstalled subdatasets, + # install the dataset and updated uninstalled datasets before proceeding + # with matching the pattern. + if match.is_dir() and match in uninstalled_subdatasets: + lgr.info('Installing subdataset %s to glob input', match) + install_subdataset( + root, match, uninstalled_subdatasets, locally_available_subdatasets + ) + + # We have a match, try to match the remainder of the pattern. + submatch_pattern = pattern if pattern[0] == '**' else pattern[1:] + result.update( + glob_pattern( + root, + match, + submatch_pattern, + uninstalled_subdatasets, + locally_available_subdatasets, + ) + ) + + return result + + +def get_dirty_elements(dataset: Dataset) -> Generator: + """Get all dirty elements in the dataset""" + for result in dataset.status(recursive=True): + if result['type'] == 'file' and result['state'] != 'clean': + yield result + + +def install_subdataset( + worktree: Dataset, + subdataset_path: Path, + uninstalled_subdatasets: set[Path], + locally_available_datasets: Iterable[tuple[Path, Path, Path]], +) -> None: + """Install a subdataset, prefer locally available subdatasets""" + local_subdataset = [ + dataset_info + for dataset_info in locally_available_datasets + if dataset_info[2] == subdataset_path + ] + + if local_subdataset: + absolute_path, parent_ds_path, path_from_root = local_subdataset[0] + # Set the URL to the full source path + args = [ + '-C', + str(worktree.pathobj / parent_ds_path), + 'submodule', + 'set-url', + '--', + str(path_from_root.relative_to(parent_ds_path)), + 'file://' + str(absolute_path), + ] + call_git_lines(args) + worktree.get(str(subdataset_path), get_data=False, result_renderer='disabled') + uninstalled_subdatasets.remove(subdataset_path) + uninstalled_subdatasets.update(get_uninstalled_subdatasets(worktree)) + + +def get_installed_subdatasets(dataset: Dataset) -> Iterable[tuple[Path, Path, Path]]: + results = dataset.subdatasets(recursive=True, result_renderer='disabled') + return [ + ( + Path(result['path']), + Path(result['parentds']).relative_to(dataset.pathobj), + Path(result['path']).relative_to(dataset.pathobj), + ) + for result in results + if result['state'] == 'present' + ] diff --git a/datalad_remake/commands/tests/__init__.py b/datalad_remake/commands/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datalad_remake/commands/tests/create_datasets.py b/datalad_remake/commands/tests/create_datasets.py new file mode 100644 index 0000000..06e6a0b --- /dev/null +++ b/datalad_remake/commands/tests/create_datasets.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +from pathlib import Path + +from datalad_next.datasets import Dataset +from datalad_next.runners import call_git_success + +from datalad_remake import template_dir + + +def update_config_for_remake(dataset: Dataset): + # set annex security related variables to allow remake-URLs + dataset.configuration( + action='set', + scope='local', + recursive=True, + spec=[('annex.security.allow-unverified-downloads', 'ACKTHPPT')], + result_renderer='disabled', + ) + + +def add_remake_remote(dataset: 
Dataset): + call_git_success( + [ + '-C', + dataset.path, + 'annex', + 'initremote', + 'remake', + 'type=external', + 'externaltype=datalad-remake', + 'encryption=none', + ], + capture_output=True, + ) + + +def create_ds_hierarchy( + tmp_path: Path, name: str, subdataset_levels: int = 2 +) -> list[tuple[str, Path, Dataset]]: + # Create root dataset + root_dataset = Dataset(tmp_path / name) + root_dataset.create(force=True, result_renderer='disabled') + (root_dataset.pathobj / 'a.txt').write_text('a\n') + (root_dataset.pathobj / 'b.txt').write_text('b\n') + root_dataset.save(result_renderer='disabled') + datasets = [(name, tmp_path / name, root_dataset)] + + # Create subdatasets + for level in range(subdataset_levels): + subdataset_path = tmp_path / f'{name}_subds{level}' + subdataset = Dataset(subdataset_path) + subdataset.create(force=True, result_renderer='disabled') + (subdataset.pathobj / f'a{level}.txt').write_text(f'a{level}\n') + (subdataset.pathobj / f'b{level}.txt').write_text(f'b{level}\n') + subdataset.save(result_renderer='disabled') + datasets.append((f'{name}_subds{level}', subdataset_path, subdataset)) + + # Link the datasets + for index in range(len(datasets) - 2, -1, -1): + dataset, subdataset = datasets[index : index + 2] + dataset[2].install( + path=subdataset[0], + source='file://' + subdataset[2].path, + result_renderer='disabled', + ) + dataset[2].save(result_renderer='disabled') + + root_dataset.get(recursive=True, result_renderer='disabled') + update_config_for_remake(root_dataset) + + # Add datalad-remake remotes to the root dataset and all subdatasets + add_remake_remote(root_dataset) + subdataset_path = Path() + for index in range(subdataset_levels): + subdataset_path /= f'{name}_subds{index}' + add_remake_remote(Dataset(root_dataset.pathobj / subdataset_path)) + + return datasets + + +def create_simple_computation_dataset( + tmp_path: Path, + dataset_name: str, + subdataset_levels: int, + test_method: str, +) -> Dataset: + datasets = create_ds_hierarchy(tmp_path, dataset_name, subdataset_levels) + root_dataset = datasets[0][2] + + # add method template + template_path = root_dataset.pathobj / template_dir + template_path.mkdir(parents=True) + (template_path / 'test_method').write_text(test_method) + root_dataset.save(result_renderer='disabled') + + return root_dataset diff --git a/datalad_remake/commands/tests/test_collection.py b/datalad_remake/commands/tests/test_collection.py new file mode 100644 index 0000000..e0c1590 --- /dev/null +++ b/datalad_remake/commands/tests/test_collection.py @@ -0,0 +1,29 @@ +from pathlib import Path + +from ..make_cmd import collect +from .create_datasets import create_ds_hierarchy +from .test_provision import get_file_list + + +def test_collect(tmp_path): + dataset = create_ds_hierarchy(tmp_path, 'ds1', 1)[0][2] + + worktree_dir = tmp_path / 'ds1_worktree' + worktree_dir.mkdir(parents=True, exist_ok=False) + worktree = dataset.provision(worktree_dir=worktree_dir, result_renderer='disabled') + + result_dir = worktree_dir / 'results' / 'sub-01' + result_dir.mkdir(parents=True) + (result_dir / 'a.txt').write_text('content: a\n') + (result_dir / 'b.txt').write_text('content: b\n') + + result = collect( + worktree=Path(worktree[0]['path']), + dataset=dataset, + output_pattern=['results/**'], + ) + assert result == {'results/sub-01/a.txt', 'results/sub-01/b.txt'} + assert set(get_file_list(dataset.pathobj / 'results')) == { + 'sub-01/a.txt', + 'sub-01/b.txt', + } diff --git a/datalad_remake/commands/tests/test_compute.py 
b/datalad_remake/commands/tests/test_compute.py new file mode 100644 index 0000000..6b763da --- /dev/null +++ b/datalad_remake/commands/tests/test_compute.py @@ -0,0 +1,56 @@ +from datalad_next.datasets import Dataset + +from datalad_remake.commands.tests.create_datasets import ( + create_simple_computation_dataset, +) + +test_method = """ +inputs = ['name', 'file'] +use_shell = 'true' +executable = 'echo' +arguments = ["Hello {name} > {file}"] +""" + +output_pattern = ['a.txt'] + + +def test_duplicated_computation(tmp_path): + root_dataset = create_simple_computation_dataset(tmp_path, 'ds1', 0, test_method) + + # run the same command twice + _run_simple_computation(root_dataset) + _run_simple_computation(root_dataset) + + +def test_speculative_computation(tmp_path, datalad_cfg): + root_dataset = create_simple_computation_dataset(tmp_path, 'ds1', 0, test_method) + + root_dataset.make( + template='test_method', + parameter=['name=Robert', 'file=spec.txt'], + output=['spec.txt'], + url_only=True, + result_renderer='disabled', + ) + + # set annex security related variables to allow datalad-remake-URLs + # in speculative make commands + datalad_cfg.set( + 'annex.security.allow-unverified-downloads', 'ACKTHPPT', scope='global' + ) + + # Perform the speculative computation + root_dataset.get('spec.txt') + assert (root_dataset.pathobj / 'spec.txt').read_text() == 'Hello Robert\n' + + +def _run_simple_computation(root_dataset: Dataset): + root_dataset.make( + template='test_method', + parameter=['name=Robert', 'file=a.txt'], + output=['a.txt'], + result_renderer='disabled', + ) + + # check that the output is correct + assert (root_dataset.pathobj / 'a.txt').read_text() == 'Hello Robert\n' diff --git a/datalad_remake/commands/tests/test_listhandling.py b/datalad_remake/commands/tests/test_listhandling.py new file mode 100644 index 0000000..dd18095 --- /dev/null +++ b/datalad_remake/commands/tests/test_listhandling.py @@ -0,0 +1,45 @@ +import tempfile +from pathlib import Path + +from hypothesis import given +from hypothesis.strategies import lists, text + +from datalad_remake.commands.make_cmd import read_list + + +def test_empty_list_reading(): + assert read_list(None) == [] + + +@given(lists(text('abcdefghijklmnopqrstuvwxyz_', min_size=1))) +def test_list_reading_basic(word_list): + with tempfile.TemporaryDirectory() as temp_dir: + _test_wordlist(Path(temp_dir), word_list) + + +def test_list_reading_comments(tmp_path: Path): + list_file = _write_list(tmp_path, ['# a', 'a', ' # b']) + assert read_list(str(list_file)) == ['a'] + + +def test_list_reading_strip(tmp_path: Path): + list_file = _write_list(tmp_path, [' a', 'b ', ' c ']) + assert read_list(str(list_file)) == ['a', 'b', 'c'] + + +def _test_wordlist( + tmp_path: Path, + word_list: list[str], +) -> None: + list_file = _write_list(tmp_path, word_list) + assert read_list(str(list_file)) == word_list + assert read_list(list_file) == word_list + + +def _write_list( + tmp_path: Path, + word_list: list[str], +) -> Path: + list_file = tmp_path / 'list.txt' + list_file.write_text('\n'.join(word_list)) + return list_file diff --git a/datalad_remake/commands/tests/test_provision.py b/datalad_remake/commands/tests/test_provision.py new file mode 100644 index 0000000..3fcb8f7 --- /dev/null +++ b/datalad_remake/commands/tests/test_provision.py @@ -0,0 +1,209 @@ +from __future__ import annotations + +import contextlib +from contextlib import chdir +from pathlib import Path +from typing import TYPE_CHECKING + +from datalad_next.datasets import Dataset 
+from datalad_next.runners import call_git_lines + +from ..make_cmd import provide_context +from .create_datasets import create_ds_hierarchy + +if TYPE_CHECKING: + from collections.abc import Iterable + +file_path_templates = [ + '{file}.txt', + '{{ds_name}}_subds0/{file}0.txt', + '{{ds_name}}_subds0/{{ds_name}}_subds1/{file}1.txt', + '{{ds_name}}_subds0/{{ds_name}}_subds1/{{ds_name}}_subds2/{file}2.txt', +] + + +all_paths = [ + template.format(file=f) for template in file_path_templates for f in ['a', 'b'] +] + +a_paths = [path.format(file='a') for path in file_path_templates] + +b_paths = [path.format(file='b') for path in file_path_templates] + + +def test_worktree_basic(tmp_path): + dataset = create_ds_hierarchy(tmp_path, 'ds1', 3)[0][2] + inputs = [ + 'a.txt', + 'b.txt', + 'ds1_subds0/a0.txt', + 'ds1_subds0/b0.txt', + 'ds1_subds0/ds1_subds1/a1.txt', + 'ds1_subds0/ds1_subds1/b1.txt', + ] + provision_result = dataset.provision( + worktree_dir=tmp_path / 'ds1_worktree1', + input=inputs, + result_renderer='disabled', + )[0] + + worktree = Dataset(provision_result['path']) + # Check input availability + assert all((worktree.pathobj / path).exists() for path in inputs) + + dataset.provision(delete=worktree.path, result_renderer='disabled') + + def check_deleted_worktrees(ds: Dataset): + with chdir(ds.path): + for line in call_git_lines(['worktree', 'list']): + directory = line.split()[0] + assert directory == ds.path + for sub_ds in ds.subdatasets(result_renderer='disabled'): + check_deleted_worktrees(Dataset(sub_ds['path'])) + + check_deleted_worktrees(dataset) + dataset.drop( + what='all', reckless='kill', recursive=True, result_renderer='disabled' + ) + + +def test_worktree_globbing(tmp_path): + dataset = create_ds_hierarchy(tmp_path, 'ds1', 3)[0][2] + result = dataset.provision( + worktree_dir=tmp_path / 'ds1_worktree2', + input=[ + '*.txt', + '*_subds0/*.txt', + '*_subds0/*_subds1/*.txt', + '*_subds0/*_subds1/*_subds2/*.txt', + ], + result_renderer='disabled', + )[0] + + worktree = Path(result['path']) + worktree_set = set(get_file_list(worktree)) + assert worktree_set == {path.format(ds_name='ds1') for path in all_paths} + dataset.provision(delete=worktree, result_renderer='disabled') + + result = dataset.provision( + worktree_dir=tmp_path / 'ds1_worktree2', + input=[ + 'b*txt', + '*_subds0/b*txt', + '*_subds0/*_subds1/b*txt', + '*_subds0/*_subds1/*_subds2/b*txt', + ], + result_renderer='disabled', + )[0] + + worktree = Path(result['path']) + worktree_set = set(get_file_list(worktree)) + assert {path.format(ds_name='ds1') for path in b_paths}.issubset(worktree_set) + dataset.provision(delete=worktree, result_renderer='disabled') + + dataset.drop( + what='all', reckless='kill', recursive=True, result_renderer='disabled' + ) + + +def get_file_list( + root: Path, path: Path | None = None, prefix: Path | None = None +) -> Iterable[str]: + prefix = prefix or Path('') + path = path or root + for child in path.iterdir(): + if not child.name.startswith('.'): + if child.is_dir(): + yield from get_file_list(root, child, prefix=prefix / child) + else: + yield str((prefix / child).relative_to(root)) + + +def test_provision_context(tmp_path): + dataset = create_ds_hierarchy(tmp_path, 'ds1')[0][2] + with provide_context(dataset, branch=None, input_patterns=['**']) as worktree: + files = set(get_file_list(worktree)) + assert files + assert not worktree.exists() + + +def test_unclean_dataset(tmp_path): + dataset = Dataset(tmp_path / 'ds1') + dataset.create(cfg_proc='text2git', 
result_renderer='disabled') + (dataset.pathobj / 'a.txt').write_text('content') + dataset.save() + (dataset.pathobj / 'a.txt').write_text('changed content') + (dataset.pathobj / 'b.txt').write_text('untracked content') + + # Check that provision of unclean input results in errors + input_pattern = ['a.txt', 'b.txt'] + results = dataset.provision( + input=input_pattern, + worktree_dir=tmp_path / 'ds1_worktree1', + on_failure='ignore', + result_renderer='disabled', + ) + assert {(result['status'], result['state']) for result in results} == { + ('error', 'modified'), + ('error', 'untracked'), + } + + # Check that a saved dataset can be provisioned + dataset.save() + dataset.provision( + input=input_pattern, + worktree_dir=tmp_path / 'ds1_worktree2', + result_renderer='disabled', + ) + + +def test_branch_deletion_after_provision(tmp_path): + dataset = create_ds_hierarchy(tmp_path, 'ds1', 3)[0][2] + with provide_context( + dataset=dataset, branch=None, input_patterns=['a.txt'] + ) as worktree: + assert worktree.exists() + assert not worktree.exists() + with contextlib.chdir(dataset.path): + branches = [line.strip() for line in call_git_lines(['branch'])] + assert worktree.name not in branches + + +def test_not_present_local_datasets(tmp_path): + root_ds = Dataset(tmp_path / 'ds1') + root_ds.create(cfg_proc='text2git', result_renderer='disabled') + root_ds.clone( + 'https://github.com/OpenNeuroDatasets/ds000102', result_renderer='disabled' + ) + provisioned_dataset = Dataset( + root_ds.provision(input=['ds000102/README'], result_renderer='disabled')[0][ + 'path' + ] + ) + url = _get_submodule_url(provisioned_dataset, 'ds000102') + assert url.startswith(f'file://{root_ds.path}') + + root_ds.drop( + 'ds000102', what='all', reckless='availability', result_renderer='disabled' + ) + + provisioned_dataset_2 = Dataset( + root_ds.provision( + input=['ds000102/README'], on_failure='ignore', result_renderer='disabled' + )[0]['path'] + ) + url_2 = _get_submodule_url(provisioned_dataset_2, 'ds000102') + assert url_2 == 'https://github.com/OpenNeuroDatasets/ds000102' + + +def _get_submodule_url(dataset: Dataset, submodule_path: str) -> str: + x = call_git_lines( + [ + 'config', + '-f', + str(dataset.pathobj / '.gitmodules'), + '--get', + f'submodule.{submodule_path}.url', + ] + ) + return x[0].strip() diff --git a/datalad_remake/tests/test_dummy.py b/datalad_remake/tests/test_dummy.py deleted file mode 100644 index ebdfde4..0000000 --- a/datalad_remake/tests/test_dummy.py +++ /dev/null @@ -1,6 +0,0 @@ -import datalad_remake # noqa: F401 - - -def test_dummy(): - # nothing but a placeholder - pass diff --git a/datalad_remake/tests/test_register.py b/datalad_remake/tests/test_register.py new file mode 100644 index 0000000..89f3a7f --- /dev/null +++ b/datalad_remake/tests/test_register.py @@ -0,0 +1,5 @@ +def test_register(): + import datalad.api as da + + assert hasattr(da, 'make') + assert hasattr(da, 'provision') diff --git a/datalad_remake/utils/__init__.py b/datalad_remake/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datalad_remake/utils/compute.py b/datalad_remake/utils/compute.py new file mode 100644 index 0000000..19b474e --- /dev/null +++ b/datalad_remake/utils/compute.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +import contextlib +import logging +import subprocess +import tomllib +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from pathlib import Path + +lgr = logging.getLogger('datalad.compute') + + +def substitute_string( + format_str: str, + 
replacements: dict[str, str], +) -> str: + for variable_name, replacement in replacements.items(): + place_holder = '{' + variable_name + '}' + if place_holder in format_str: + format_str = format_str.replace(place_holder, replacement) + return format_str + + +def substitute_arguments( + spec: dict[str, Any], + replacements: dict[str, str], + format_list_id: str, +) -> list[str]: + return [ + substitute_string(str(format_str), replacements) + for format_str in spec[format_list_id] + ] + + +def get_substitutions( + template: dict[str, Any], + arguments: dict[str, str], +) -> dict[str, str]: + # Check the user specified inputs + inputs = template['inputs'] + if len(inputs) != len(arguments.keys()): + msg = 'Template inputs and arguments have different lengths' + raise ValueError(msg) + if not all(input_name in arguments for input_name in inputs): + msg = ( + f'Template inputs and arguments have different names: ' + f'inputs: {inputs}, arguments: {arguments}' + ) + raise ValueError(msg) + + if len(inputs) != len(set(inputs)): + msg = 'Template inputs contain duplicates' + raise ValueError(msg) + + return {input_name: arguments[input_name] for input_name in inputs} + + +def compute( + root_directory: Path, + template_path: Path, + compute_arguments: dict[str, str], +) -> None: + with template_path.open('rb') as f: + template = tomllib.load(f) + + substitutions = get_substitutions(template, compute_arguments) + substitutions['root_directory'] = str(root_directory) + + substituted_executable = substitute_string(template['executable'], substitutions) + substituted_arguments = substitute_arguments(template, substitutions, 'arguments') + + with contextlib.chdir(root_directory): + if template.get('use_shell', 'false') == 'true': + cmd = ' '.join([substituted_executable, *substituted_arguments]) + lgr.debug(f'compute: RUNNING: with shell=True: {cmd}') + subprocess.run(cmd, shell=True, check=True) # noqa: S602 + else: + cmd_list = [substituted_executable, *substituted_arguments] + lgr.debug(f'compute: RUNNING: {cmd_list}') + subprocess.run(cmd_list, check=True) diff --git a/datalad_remake/utils/glob.py b/datalad_remake/utils/glob.py new file mode 100644 index 0000000..3ffbdbb --- /dev/null +++ b/datalad_remake/utils/glob.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from glob import glob +from itertools import chain +from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Iterable + + +# Resolve input file patterns in the original dataset +def resolve_patterns(root_dir: str | Path, patterns: Iterable[str]) -> set[str]: + return set( + filter( + lambda p: not (Path(root_dir) / p).is_dir(), + chain.from_iterable( + glob(pattern, root_dir=str(root_dir), recursive=True) + for pattern in patterns + ), + ) + ) diff --git a/datalad_remake/utils/tests/__init__.py b/datalad_remake/utils/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datalad_remake/utils/tests/test_substitution.py b/datalad_remake/utils/tests/test_substitution.py new file mode 100644 index 0000000..86d4ff9 --- /dev/null +++ b/datalad_remake/utils/tests/test_substitution.py @@ -0,0 +1,34 @@ +from ..compute import ( + substitute_arguments, + substitute_string, +) + + +def test_multiple_substitutions(): + assert ( + substitute_string( + 'This is a {test} with {multiple} substitutions', + {'test': 'string', 'multiple': 'multiple'}, + ) + == 'This is a string with multiple substitutions' + ) + + +def test_argument_substitution(): + arguments = [ + 
'{root_directory}/{input_dir}', + '{root_directory}/{output_dir}', + ] + s = substitute_arguments( + {'arguments': arguments}, + { + 'root_directory': '/path/to/root', + 'input_dir': 'input', + 'output_dir': 'output', + }, + 'arguments', + ) + assert s == [ + '/path/to/root/input', + '/path/to/root/output', + ] diff --git a/docs/source/cli_reference.rst b/docs/source/cli_reference.rst new file mode 100644 index 0000000..2fe6ba9 --- /dev/null +++ b/docs/source/cli_reference.rst @@ -0,0 +1,7 @@ +Command line reference +====================== + +.. toctree:: + :maxdepth: 1 + + generated/man/datalad-remake diff --git a/docs/source/python_reference.rst b/docs/source/python_reference.rst new file mode 100644 index 0000000..7655dfe --- /dev/null +++ b/docs/source/python_reference.rst @@ -0,0 +1,8 @@ +High-level API commands +======================= + +.. currentmodule:: datalad.api +.. autosummary:: + :toctree: generated + + make diff --git a/examples/fmriprep docker/fmriprep-docker b/examples/fmriprep docker/fmriprep-docker new file mode 100644 index 0000000..c889700 --- /dev/null +++ b/examples/fmriprep docker/fmriprep-docker @@ -0,0 +1,38 @@ +# This is a very simple template to run fmriprep-docker on a single subject +# of a BIDS dataset. +# It needs an environment with `fmriprep-docker` installed (e.g. a virtualenv +# in which `pip install fmriprep-docker` has been executed). +# +# The template takes the following inputs: +# - input_dir: the path to the BIDS dataset +# - output_dir: the path to the output directory, typically a directory called +# `derivatives` in `{input_dir}`. +# - participant_label: the label of the participant to be processed, e.g. `01`. +# - license_file: the path to the FreeSurfer license file. +# +# The template assumes that the BIDS dataset referenced in `input_dir` is +# a subdataset of the dataset in which the computation is started, as outlined +# in the fairly-big-follow-up document. +# +# Input files, output files, and parameters for the computation are defined in +# the lists `input.txt`, `output.txt`, and `parameter.txt` to keep the command +# line short. +# +# `datalad compute -I input.txt -O output.txt -P parameter.txt fmriprep-docker` + +inputs = ['input_dir', 'output_dir', 'participant_label', 'license_file'] + +use_shell = 'false' +executable = 'fmriprep-docker' + +# Note: `{root_directory}` resolves to the directory of the dataset in which the +# computation was started with `datalad compute`.
+arguments = [ + '{root_directory}/{input_dir}', + '{root_directory}/{output_dir}', + 'participant', + '--participant-label', '{participant_label}', + '--fs-license-file', '{license_file}', + '--skip-bids-validation', + '--ignore', 'slicetiming', +] diff --git a/examples/fmriprep docker/input.txt b/examples/fmriprep docker/input.txt new file mode 100644 index 0000000..108c779 --- /dev/null +++ b/examples/fmriprep docker/input.txt @@ -0,0 +1,7 @@ +# Paths are relative to the dataset in which `datalad compute` was executed +datasets/ds000102/dataset_description.json +datasets/ds000102/participants.tsv +datasets/ds000102/T1w.json +datasets/ds000102/task-flanker_bold.json + +datasets/ds000102/sub-01/** diff --git a/examples/fmriprep docker/output.txt b/examples/fmriprep docker/output.txt new file mode 100644 index 0000000..829130f --- /dev/null +++ b/examples/fmriprep docker/output.txt @@ -0,0 +1,2 @@ +# Paths are relative to the dataset in which `datalad compute` was executed +derivatives/ds000102/** diff --git a/examples/fmriprep docker/parameter.txt b/examples/fmriprep docker/parameter.txt new file mode 100644 index 0000000..421e49b --- /dev/null +++ b/examples/fmriprep docker/parameter.txt @@ -0,0 +1,4 @@ +input_dir=datasets/ds000102 +output_dir=derivatives/ds000102 +participant_label=01 +license_file=license.txt diff --git a/examples/fmriprep docker/readme.md b/examples/fmriprep docker/readme.md new file mode 100644 index 0000000..414b42c --- /dev/null +++ b/examples/fmriprep docker/readme.md @@ -0,0 +1,30 @@ +This directory contains a simple example for running `fmriprep-docker` on a single subject of a BIDS dataset. The template is `fmriprep-docker`; input, output, and parameter files are defined in `input.txt`, `output.txt`, and `parameter.txt`, respectively. + +The example assumes that the BIDS dataset referenced in `input_dir` is a subdataset of the dataset in which the computation is started (the root-dataset), as outlined in the fairly-big-follow-up document (https://hackmd.io/7oRB8qwuRtCm6BkV44Ubww). In contrast to the fairly-big-follow-up document, the example uses another subdataset that collects the results of the computation. The dataset layout is therefore as follows: +``` +root-dataset +├── datasets +│ ├── ds000102 +│ +├── derivatives + ├── ds000102 +``` + +Executing the computation requires installation of this extension (see https://github.com/christian-monch/datalad-compute/tree/main/README.md) and installation of the Python package `fmriprep-docker`. The template, i.e. `fmriprep-docker`, has to be placed in the folder `.datalad/compute/methods` of the root-dataset (and the dataset has to be saved). + +To keep the command line short, input files, output files, and parameters for the computation are defined in the lists: +- `input.txt` +- `output.txt` +- `parameter.txt` + +Be sure to add a compute special remote to the dataset that contains the folder `derivatives/ds000102`. +This can be done with the following command: +```bash +> git annex initremote compute type=external externaltype=compute encryption=none +``` + +The computation can be executed with the following command: + +```bash +> datalad compute -I input.txt -O output.txt -P parameter.txt fmriprep-docker +``` diff --git a/examples/one-to-many b/examples/one-to-many new file mode 100644 index 0000000..82c1139 --- /dev/null +++ b/examples/one-to-many @@ -0,0 +1,41 @@ +# This is a computing template that demonstrates a computation with +# multiple output files. +# +# Templates are addressed by their name.
They should be stored in +# `$DATASET_ROOT/.datalad/compute/methods` +# +# Each template must define the following variables: +# - `inputs`: a list of strings that define the input variables +# - `use_shell`: a string ('true' or 'false') that defines whether to use a shell to interpret executable and arguments +# - `executable`: the name of the executable +# - `arguments`: a list of strings that define the arguments to the executable +# +# During execution `subprocess.run([executable] + arguments, shell=use_shell, ...)` +# will be invoked. +# Variable placeholders, i.e. `{}` in `arguments`, will be +# replaced with the values provided in the parameter arguments of +# `datalad compute`. + +# An invocation of `datalad compute` has to provide a parameter argument for +# each input variable. In this case the invocation could look like this: +# `datalad compute -p first=bob -p second=alice -p output=name ... one-to-many` +# +inputs = ['first', 'second', 'output'] + +# Use a shell to interpret `arguments`. By default, `use_shell` is 'false'. +# +use_shell = 'true' + +# The name of the executable. This will be prepended to the argument list +# given in `arguments`. +# +executable = 'echo' + +# Arguments to the executable. The curly braces are placeholders for the +# input variables that were defined above. They will be replaced with the +# values provided in the parameter arguments of `datalad compute`. +# +arguments = [ + "content: {first} > '{output}-1.txt';", + "echo content: {second} > '{output}-2.txt'", +] diff --git a/pyproject.toml b/pyproject.toml index caa7f0f..0653f5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ name = "datalad-remake" dynamic = ["version"] description = '' readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.11" license = "MIT" keywords = [ "datalad", @@ -18,7 +18,7 @@ keywords = [ "git-annex", ] authors = [ -# { name = "Michael Hanke", email = "michael.hanke@gmail.com" }, + { name = "The DataLad Team and Contributors", email = "team@datalad.org" }, ] maintainers = [ # { name = "Michael Hanke", email = "michael.hanke@gmail.com" }, @@ -35,15 +35,16 @@ classifiers = [ "Topic :: Software Development :: Version Control :: Git", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ + "annexremote", + "datalad", "datalad_next", + "datasalad", ] [project.urls] @@ -53,6 +54,12 @@ Issues = "https://github.com/datalad/datalad-remake/issues" Source = "https://github.com/datalad/datalad-remake" Changelog = "https://github.com/datalad/datalad-remake/blob/main/CHANGELOG.md" +[project.scripts] +git-annex-remote-datalad-remake = "datalad_remake.annexremotes.remake_remote:main" + +[project.entry-points."datalad.extensions"] +remake = "datalad_remake:command_suite" + [tool.hatch.version] source = "vcs" @@ -62,6 +69,7 @@ version-file = "datalad_remake/_version.py" [tool.hatch.envs.hatch-test] default-args = ["datalad_remake"] extra-dependencies = [ + "hypothesis", "pytest", # if you come here, because coverage combination crashed for you # run `hatch test --cover` and/or see @@ -69,13 +77,15 @@ extra-dependencies = [ "pytest-cov", ] +[tool.hatch.envs.hatch-test.env-vars] +DATALAD_EXTENSIONS_LOAD = "next" [tool.hatch.envs.tests]
description = "run tests across Python versions" template = "hatch-test" [[tool.hatch.envs.tests.matrix]] -python = ["3.9", "3.10", "3.11", "3.12"] +python = ["3.11", "3.12"] [tool.hatch.envs.tests.scripts] run = 'python -m pytest {args}' @@ -86,9 +96,10 @@ extra-dependencies = [ "mypy>=1.0.0", "pytest", ] + [tool.hatch.envs.types.scripts] check = [ - "mypy --install-types --non-interactive --python-version 3.9 --pretty --show-error-context datalad_remake", + "mypy --install-types --non-interactive --python-version 3.11 --pretty --show-error-context datalad_remake", ] [tool.hatch.envs.docs] @@ -96,6 +107,7 @@ description = "build Sphinx-based docs" extra-dependencies = [ "sphinx", ] + [tool.hatch.envs.docs.scripts] build = [ "make -C docs html", @@ -111,6 +123,7 @@ detached = true extra-dependencies = [ "commitizen", ] + [tool.hatch.envs.cz.scripts] check-commits = [ # check all commit messages since the (before) beginning @@ -131,6 +144,7 @@ detached = true extra-dependencies = [ "codespell", ] + [tool.hatch.envs.codespell.scripts] check = "codespell" fix = "codespell --write-changes" @@ -149,7 +163,7 @@ data_file = "${COVERAGE_ROOT-.}/.coverage" [tool.coverage.paths] datalad_remake = ["src/datalad_remake", "*/datalad_remake/src/datalad_remake"] -tests = ["tests", "*/datalad_remake/tests"] +tests = ["tests", "*/datalad_remake/*/tests"] [tool.coverage.report] show_missing = true @@ -167,7 +181,7 @@ exclude = [ ] line-length = 88 indent-width = 4 -target-version = "py39" +target-version = "py311" [tool.ruff.format] # Prefer single quotes over double quotes. quote-style = "single" diff --git a/requirements-devel.txt b/requirements-devel.txt new file mode 100644 index 0000000..dfab594 --- /dev/null +++ b/requirements-devel.txt @@ -0,0 +1,16 @@ +# requirements for a development environment +annexremote +coverage +datalad +datalad-next +datasalad +hatch +hatch-vcs +hypothesis +pytest +pytest-cov + +# requirements for documentation building +sphinx +sphinx_rtd_theme +sphinx_copybutton
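
For orientation, the following is a minimal, illustrative sketch (not part of the patch) of how the substitution helpers added in `datalad_remake/utils/compute.py` expand a template such as `examples/one-to-many`. The parameter values and the root directory below are made up, and the snippet assumes it is run from a checkout of this repository with the package installed (Python >= 3.11, for `tomllib`).

```python
# Illustrative sketch only: expand a compute template with the helpers
# from datalad_remake/utils/compute.py. Paths and values are assumptions.
import tomllib
from pathlib import Path

from datalad_remake.utils.compute import (
    get_substitutions,
    substitute_arguments,
    substitute_string,
)

# parse the template (TOML) shipped in this repository
with Path('examples/one-to-many').open('rb') as f:
    template = tomllib.load(f)

# parameters as they would be passed via `-p`/`--parameter`
substitutions = get_substitutions(
    template, {'first': 'bob', 'second': 'alice', 'output': 'name'}
)
# the `compute()` helper adds this entry itself before substituting
substitutions['root_directory'] = '/tmp/compute-test-1'

executable = substitute_string(template['executable'], substitutions)
arguments = substitute_arguments(template, substitutions, 'arguments')
print(executable, arguments)
# -> echo ["content: bob > 'name-1.txt';", "echo content: alice > 'name-2.txt'"]
# With use_shell = 'true', compute() joins executable and arguments into a
# single command line and runs it via subprocess.run(..., shell=True).
```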