diff --git a/src/runcrate/convert.py b/src/runcrate/convert.py
index 9bdc3e1..0dd842b 100644
--- a/src/runcrate/convert.py
+++ b/src/runcrate/convert.py
@@ -19,6 +19,7 @@
 import hashlib
 import json
 import re
+from io import StringIO
 from pathlib import Path
 
 import networkx as nx
@@ -37,6 +38,7 @@
 
 WORKFLOW_BASENAME = "packed.cwl"
+INPUTS_FILE_BASENAME = "primary-job.json"
 
 CWL_TYPE_MAP = {
     "string": "Text",
@@ -186,6 +188,8 @@     def __init__(self, root, workflow_name=None, license=None, readme=None):
         # index collections by their main entity's id
         self.collections = {}
         self.hashes = {}
+        # map source files to destination files
+        self.file_map = {}
 
     @staticmethod
     def _get_step_maps(cwl_defs):
@@ -256,6 +260,7 @@     def build(self):
         self.add_engine_run(crate)
         self.add_action(crate, self.workflow_run)
         self.patch_workflow_input_collection(crate)
+        self.add_inputs_file(crate)
         return crate
 
     def add_root_metadata(self, crate):
@@ -529,6 +534,11 @@     def convert_param(self, prov_param, crate, convert_secondary=True, parent=None):
                     "sha1": hash_,
                 })
                 self._set_alternate_name(prov_param, action_p, parent=parent)
+                try:
+                    source_k = str(source.resolve(strict=False))
+                except RuntimeError:
+                    source_k = str(source)
+                self.file_map[source_k] = dest
                 return action_p
             if "ro:Folder" in type_names:
                 hash_ = self.hashes[prov_param.id.localpart]
@@ -644,3 +654,34 @@     def patch_workflow_input_collection(self, crate, wf=None):
         for tool in wf.get("hasPart", []):
             if "ComputationalWorkflow" in as_list(tool.type):
                 self.patch_workflow_input_collection(crate, wf=tool)
+
+    def _map_input_data(self, data):
+        if isinstance(data, list):
+            return [self._map_input_data(_) for _ in data]
+        if isinstance(data, dict):
+            rval = {}
+            for k, v in data.items():
+                if k == "location":
+                    source = self.root / "workflow" / v
+                    try:
+                        source_k = str(source.resolve(strict=False))
+                    except RuntimeError:
+                        source_k = str(source)
+                    dest = self.file_map.get(source_k)
+                    rval[k] = str(dest) if dest else v
+                else:
+                    rval[k] = self._map_input_data(v)
+            return rval
+        return data
+
+    def add_inputs_file(self, crate):
+        path = self.root / "workflow" / INPUTS_FILE_BASENAME
+        if path.is_file():
+            with open(path) as f:
+                data = json.load(f)
+            data = self._map_input_data(data)
+            source = StringIO(json.dumps(data, indent=4))
+            crate.add_file(source, path.name, properties={
+                "name": "input object document",
+                "encodingFormat": "application/json",
+            })
diff --git a/tests/test_cwlprov_crate_builder.py b/tests/test_cwlprov_crate_builder.py
index f3234e7..23489ec 100644
--- a/tests/test_cwlprov_crate_builder.py
+++ b/tests/test_cwlprov_crate_builder.py
@@ -32,6 +32,7 @@ def test_revsort(data_dir, tmpdir):
     output = tmpdir / "revsort-run-1-crate"
     license = "Apache-2.0"
     readme = data_dir / "README.md"
+    inputs_file = root / "workflow" / "primary-job.json"
     workflow_name = "RevSort"
     builder = ProvCrateBuilder(root, workflow_name=workflow_name, license=license, readme=readme)
     crate = builder.build()
@@ -140,6 +141,22 @@
     assert set(_connected(workflow)) == set([
         ("packed.cwl#sorttool.cwl/output", "packed.cwl#main/output"),
     ])
+    inputs_f = crate.get(inputs_file.name)
+    assert inputs_f
+    assert inputs_f.type == "File"
+    assert inputs_f["encodingFormat"] == "application/json"
+    with open(inputs_file) as f:
+        ro_json = json.load(f)
+    with open(output / inputs_file.name) as f:
+        crate_json = json.load(f)
+    assert set(crate_json) == {"input", "reverse_sort"}
+    assert set(crate_json["input"]) == set(ro_json["input"])
+    for k, v in crate_json["input"].items():
+        if k == "location":
+            assert v == ro_json["input"][k].rsplit("/", 1)[-1]
+        else:
+            assert v == ro_json["input"][k]
+
     # file contents
     in_text = (root / "data/32/327fc7aedf4f6b69a42a7c8b808dc5a7aff61376").read_text()
     assert (output / wf_input_file.id).read_text() == in_text
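
Reviewer note: a minimal usage sketch of what this change does end to end. The import path of ProvCrateBuilder, the example RO directory name, and the crate.write() call are assumptions based on the test above and on the ro-crate-py API, not part of this diff.

from pathlib import Path

from runcrate.convert import ProvCrateBuilder  # assumed import location

# Hypothetical CWLProv research object directory, mirroring the revsort test data.
root = Path("revsort-run-1")
builder = ProvCrateBuilder(root, workflow_name="RevSort", license="Apache-2.0")
crate = builder.build()  # now also packs workflow/primary-job.json into the crate
crate.write("revsort-run-1-crate")  # assuming build() returns an ro-crate-py ROCrate

# In the written crate, primary-job.json has each "location" value rewritten via
# file_map to the destination of the corresponding data file copied into the crate;
# per the test above, for the revsort example the rewritten location equals the
# basename of the original RO location.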