From 5b35399fb2ecc03c925edc94785342164902effa Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sat, 26 Oct 2024 14:20:51 +0200 Subject: [PATCH] test: add trusted computation test --- datalad_remake/annexremotes/remake_remote.py | 25 +++--- .../annexremotes/tests/test_remake_remote.py | 86 +++++++++++++++++-- datalad_remake/commands/make_cmd.py | 19 ++-- .../commands/tests/create_datasets.py | 25 ++++-- datalad_remake/utils/verify.py | 15 +--- 5 files changed, 121 insertions(+), 49 deletions(-) diff --git a/datalad_remake/annexremotes/remake_remote.py b/datalad_remake/annexremotes/remake_remote.py index 11d8428..0ead4f2 100644 --- a/datalad_remake/annexremotes/remake_remote.py +++ b/datalad_remake/annexremotes/remake_remote.py @@ -43,10 +43,9 @@ class RemakeRemote(SpecialRemote): def __init__(self, annex: Master): super().__init__(annex) self.configs = { - 'allow_untrusted_execution': - 'Allow execution of untrusted code with untrusted parameters. ' - 'set to "true" to enable. THIS IS DANGEROUS and might lead to ' - 'remote code execution.', + 'allow_untrusted_execution': 'Allow execution of untrusted code with untrusted parameters. ' + 'set to "true" to enable. THIS IS DANGEROUS and might lead to ' + 'remote code execution.', } def __del__(self): @@ -92,12 +91,9 @@ def get_url_for_key(self, key: str) -> str: self.annex.debug(f'get_url_for_key: key: {key!r}, urls: {urls!r}') return urls[0] - def get_compute_info(self, - key: str, - *, - allow_untrusted_specs: bool = False - ) -> tuple[dict[str, Any], Dataset]: - + def get_compute_info( + self, key: str, *, allow_untrusted_specs: bool = False + ) -> tuple[dict[str, Any], Dataset]: def get_assigned_value(assignment: str) -> str: return assignment.split('=', 1)[1] @@ -122,13 +118,12 @@ def get_assigned_value(assignment: str) -> str: def transfer_retrieve(self, key: str, file_name: str) -> None: self.annex.debug(f'TRANSFER RETRIEVE key: {key!r}, file_name: {file_name!r}') - allow_untrusted_execution = self.annex.getconfig( - 'allow_untrusted_execution' - ) == 'true' + allow_untrusted_execution = ( + self.annex.getconfig('allow_untrusted_execution') == 'true' + ) compute_info, dataset = self.get_compute_info( - key, - allow_untrusted_specs=allow_untrusted_execution + key, allow_untrusted_specs=allow_untrusted_execution ) self.annex.debug(f'TRANSFER RETRIEVE compute_info: {compute_info!r}') diff --git a/datalad_remake/annexremotes/tests/test_remake_remote.py b/datalad_remake/annexremotes/tests/test_remake_remote.py index 7fd5f3d..ccd3c41 100644 --- a/datalad_remake/annexremotes/tests/test_remake_remote.py +++ b/datalad_remake/annexremotes/tests/test_remake_remote.py @@ -1,14 +1,20 @@ +import re import subprocess from io import TextIOBase +from pathlib import Path from queue import Queue from typing import cast +import pytest from annexremote import Master from datalad_next.tests import skip_if_on_windows from datalad_remake.commands.tests.create_datasets import create_ds_hierarchy -from ... import specification_dir +from ... import ( + specification_dir, + template_dir, +) from ...commands.make_cmd import build_json from ..remake_remote import RemakeRemote @@ -64,11 +70,24 @@ def send(self, value): @skip_if_on_windows -def test_compute_remote_main(tmp_path, monkeypatch): - dataset = create_ds_hierarchy(tmp_path, 'ds1', 0)[0][2] +@pytest.mark.parametrize('trusted', [True, False]) +def test_compute_remote_main(tmp_path, monkeypatch, trusted): + if trusted: + gpg_homedir = tmp_path / 'tmp_gpg_dir' + + # make sure that the users keystore is not overwritten + monkeypatch.setenv('HOME', '/dev/null') + monkeypatch.setenv('GNUPGHOME', str(gpg_homedir)) + + # Generate a keypair + keyid = _create_keypair(gpg_homedir) + else: + keyid = None + + dataset = create_ds_hierarchy(tmp_path, 'ds1', 0, keyid)[0][2] monkeypatch.chdir(dataset.path) - template_path = dataset.pathobj / '.datalad' / 'make' / 'methods' + template_path = dataset.pathobj / template_dir template_path.mkdir(parents=True) (template_path / 'echo').write_text(template) dataset.save() @@ -84,10 +103,13 @@ def test_compute_remote_main(tmp_path, monkeypatch): ) ).split(b': ')[1] - (dataset.pathobj / specification_dir).mkdir(parents=True, exist_ok=True) - (dataset.pathobj / specification_dir / '000001111122222').write_text( + specification_path = dataset.pathobj / specification_dir + spec_name = '000001111122222' + specification_path.mkdir(parents=True, exist_ok=True) + (specification_path / spec_name).write_text( build_json('echo', [], ['a.txt'], {'content': 'some_string'}) ) + dataset.save() input_ = MockedInput() @@ -97,7 +119,7 @@ def test_compute_remote_main(tmp_path, monkeypatch): input_.send('PREPARE\n') input_.send(f'TRANSFER RETRIEVE {key.decode()} {tmp_path / "remade.txt"!s}\n') # The next line is the answer to `GETCONFIG allow_untrusted_execution` - input_.send('VALUE true\n') + input_.send(f'VALUE {"false" if trusted else "true"}\n') url = ( 'datalad-make:///?' f'root_version={dataset.repo.get_hexsha()}' @@ -121,3 +143,53 @@ def test_compute_remote_main(tmp_path, monkeypatch): # At this point the datalad-remake remote should have executed the # computation and written the result. assert (tmp_path / 'remade.txt').read_text().strip() == 'content: some_string' + + +def _create_keypair(gpg_dir: Path): + gpg_dir.mkdir(parents=True) + gpg_dir.chmod(0o700) + private_keys_dir = gpg_dir / 'private-keys-v1.d' + private_keys_dir.mkdir() + private_keys_dir.chmod(0o700) + script = b""" + Key-Type: RSA + Key-Length: 4096 + Subkey-Type: RSA + Subkey-Length: 4096 + Name-Real: Test User + Name-Email: test@example.com + Expire-Date: 0 + %no-protection + #%transient-key + %commit + """ + environment = {'HOME': '/dev/null'} + result = subprocess.run( + [ # noqa: S607 + 'gpg', + '--batch', + '--homedir', + str(gpg_dir), + '--gen-key', + '--keyid-format', + 'long', + ], + input=script, + capture_output=True, + check=True, + env=environment, + ) + result = subprocess.run( + [ # noqa: S607 + 'gpg', + '--homedir', + str(gpg_dir), + '--list-secret-keys', + '--keyid-format', + 'long', + ], + capture_output=True, + check=True, + env=environment, + ) + return re.findall('sec.*rsa4096/([A-Z0-9]+)', result.stdout.decode())[0] diff --git a/datalad_remake/commands/make_cmd.py b/datalad_remake/commands/make_cmd.py index 2347287..34530f0 100644 --- a/datalad_remake/commands/make_cmd.py +++ b/datalad_remake/commands/make_cmd.py @@ -34,7 +34,6 @@ call_git_oneline, call_git_success, ) -from hypothesis.reporting import default from datalad_remake import ( specification_dir, @@ -168,19 +167,17 @@ class Make(ValidatedInterface): 'should be provided.', ), 'allow_untrusted_code': Parameter( - args=( - '--allow-untrusted-code', - ), + args=('--allow-untrusted-code',), action='store_true', default=False, doc='Skip commit signature verification before executing code. This ' - 'should only be used in a strictly controlled environment with ' - 'fully trusted datasets. Trusted dataset means: every commit ' - 'stems from a trusted entity. ' - 'DO NOT USE THIS OPTION, unless you are sure to understand the ' - 'consequences. One of which is that arbitrary parties can ' - 'execute arbitrary code under your account on your ' - 'infrastructure.', + 'should only be used in a strictly controlled environment with ' + 'fully trusted datasets. Trusted dataset means: every commit ' + 'stems from a trusted entity. ' + 'DO NOT USE THIS OPTION, unless you are sure to understand the ' + 'consequences. One of which is that arbitrary parties can ' + 'execute arbitrary code under your account on your ' + 'infrastructure.', ), } diff --git a/datalad_remake/commands/tests/create_datasets.py b/datalad_remake/commands/tests/create_datasets.py index bc28d12..5c375d1 100644 --- a/datalad_remake/commands/tests/create_datasets.py +++ b/datalad_remake/commands/tests/create_datasets.py @@ -19,7 +19,8 @@ def update_config_for_remake(dataset: Dataset): ) -def add_remake_remote(dataset: Dataset): +def add_remake_remote(dataset: Dataset, signing_key: str | None = None): + aue = 'false' if signing_key else 'true' call_git_success( [ '-C', @@ -30,20 +31,24 @@ def add_remake_remote(dataset: Dataset): 'type=external', 'externaltype=datalad-remake', 'encryption=none', - 'allow_untrusted_execution=true', + f'allow_untrusted_execution={aue}', ], capture_output=True, ) def create_ds_hierarchy( - tmp_path: Path, name: str, subdataset_levels: int = 2 + tmp_path: Path, + name: str, + subdataset_levels: int = 2, + signing_key: str | None = None, ) -> list[tuple[str, Path, Dataset]]: # Create root dataset root_dataset = Dataset(tmp_path / name) root_dataset.create(force=True, result_renderer='disabled') (root_dataset.pathobj / 'a.txt').write_text('a\n') (root_dataset.pathobj / 'b.txt').write_text('b\n') + _enable_signing(root_dataset, signing_key) root_dataset.save(result_renderer='disabled') datasets = [(name, tmp_path / name, root_dataset)] @@ -55,6 +60,7 @@ def create_ds_hierarchy( (subdataset.pathobj / f'a{level}.txt').write_text(f'a{level}\n') (subdataset.pathobj / f'b{level}.txt').write_text(f'b{level}\n') subdataset.save(result_renderer='disabled') + _enable_signing(subdataset, signing_key) datasets.append((f'{name}_subds{level}', subdataset_path, subdataset)) # Link the datasets @@ -71,15 +77,24 @@ def create_ds_hierarchy( update_config_for_remake(root_dataset) # Add datalad-remake remotes to the root dataset and all subdatasets - add_remake_remote(root_dataset) + add_remake_remote(root_dataset, signing_key) subdataset_path = Path() for index in range(subdataset_levels): subdataset_path /= f'{name}_subds{index}' - add_remake_remote(Dataset(root_dataset.pathobj / subdataset_path)) + add_remake_remote( + Dataset(root_dataset.pathobj / subdataset_path), + signing_key, + ) return datasets +def _enable_signing(dataset: Dataset, key: str | None): + if key is not None: + dataset.config.set('commit.gpgsign', 'true', scope='local') + dataset.config.set('user.signingkey', key, scope='local') + + def create_simple_computation_dataset( tmp_path: Path, dataset_name: str, diff --git a/datalad_remake/utils/verify.py b/datalad_remake/utils/verify.py index 874d634..b80f992 100644 --- a/datalad_remake/utils/verify.py +++ b/datalad_remake/utils/verify.py @@ -8,19 +8,12 @@ def verify_file(root_directory: Path, file: Path): # Get the latest commit of `file` - commit = call_git_oneline([ - '-C', str(root_directory), - 'log', '-1', '--follow', - '--pretty=%H', - str(file) - ]) + commit = call_git_oneline( + ['-C', str(root_directory), 'log', '-1', '--follow', '--pretty=%H', str(file)] + ) # Let git do the verification of the commit - result = call_git_success([ - '-C', str(root_directory), - 'verify-commit', - commit - ]) + result = call_git_success(['-C', str(root_directory), 'verify-commit', commit]) if not result: msg = f'Signature validation of {file} failed' raise ValueError(msg)