diff --git a/src/runcrate/convert.py b/src/runcrate/convert.py index 9d85fb5..0dd842b 100644 --- a/src/runcrate/convert.py +++ b/src/runcrate/convert.py @@ -19,6 +19,7 @@ import hashlib import json import re +from io import StringIO from pathlib import Path import networkx as nx @@ -187,6 +188,8 @@ def __init__(self, root, workflow_name=None, license=None, readme=None): # index collections by their main entity's id self.collections = {} self.hashes = {} + # map source files to destination files + self.file_map = {} @staticmethod def _get_step_maps(cwl_defs): @@ -531,6 +534,11 @@ def convert_param(self, prov_param, crate, convert_secondary=True, parent=None): "sha1": hash_, }) self._set_alternate_name(prov_param, action_p, parent=parent) + try: + source_k = str(source.resolve(strict=False)) + except RuntimeError: + source_k = str(source) + self.file_map[source_k] = dest return action_p if "ro:Folder" in type_names: hash_ = self.hashes[prov_param.id.localpart] @@ -647,10 +655,33 @@ def patch_workflow_input_collection(self, crate, wf=None): if "ComputationalWorkflow" in as_list(tool.type): self.patch_workflow_input_collection(crate, wf=tool) + def _map_input_data(self, data): + if isinstance(data, list): + return [self._map_input_data(_) for _ in data] + if isinstance(data, dict): + rval = {} + for k, v in data.items(): + if k == "location": + source = self.root / "workflow" / v + try: + source_k = str(source.resolve(strict=False)) + except RuntimeError: + source_k = str(source) + dest = self.file_map.get(source_k) + rval[k] = str(dest) if dest else v + else: + rval[k] = self._map_input_data(v) + return rval + return data + def add_inputs_file(self, crate): path = self.root / "workflow" / INPUTS_FILE_BASENAME if path.is_file(): - crate.add_file(path, properties={ + with open(path) as f: + data = json.load(f) + data = self._map_input_data(data) + source = StringIO(json.dumps(data, indent=4)) + crate.add_file(source, path.name, properties={ "name": "input object document", "encodingFormat": "application/json", }) diff --git a/tests/test_cwlprov_crate_builder.py b/tests/test_cwlprov_crate_builder.py index 1d53ec5..23489ec 100644 --- a/tests/test_cwlprov_crate_builder.py +++ b/tests/test_cwlprov_crate_builder.py @@ -32,7 +32,7 @@ def test_revsort(data_dir, tmpdir): output = tmpdir / "revsort-run-1-crate" license = "Apache-2.0" readme = data_dir / "README.md" - inputs_file = data_dir / "workflow" / "primary-job.json" + inputs_file = root / "workflow" / "primary-job.json" workflow_name = "RevSort" builder = ProvCrateBuilder(root, workflow_name=workflow_name, license=license, readme=readme) crate = builder.build() @@ -145,6 +145,17 @@ def test_revsort(data_dir, tmpdir): assert inputs_f assert inputs_f.type == "File" assert inputs_f["encodingFormat"] == "application/json" + with open(inputs_file) as f: + ro_json = json.load(f) + with open(output / inputs_file.name) as f: + crate_json = json.load(f) + assert set(crate_json) == {"input", "reverse_sort"} + assert set(crate_json["input"]) == set(ro_json["input"]) + for k, v in crate_json["input"].items(): + if k == "location": + assert v == ro_json["input"][k].rsplit("/", 1)[-1] + else: + assert v == ro_json["input"][k] # file contents in_text = (root / "data/32/327fc7aedf4f6b69a42a7c8b808dc5a7aff61376").read_text()