From 8db4f9caf33d2be1d7e4aa1da17c08f19c5fafc8 Mon Sep 17 00:00:00 2001 From: Alex Hambley <33315205+alexhambley@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:57:16 +0100 Subject: [PATCH] Update authors and readme.md - Addresses #40, #39, #38, #36 - Added newer diagram to readme. - Removed references to ghcr, this will be added back later - Added proper authors and orcids - Replaced Dockerfile encodingFormat with text/plain - Added created_files.json to RO Crate and added to data_entity["isBasedOn"]. --- README.md | 51 +++++++++-------- workflowhub_graph/create_ro_crate.py | 86 ++++++++++++++++++++-------- 2 files changed, 87 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index f253ba7..94dd7f0 100644 --- a/README.md +++ b/README.md @@ -1,37 +1,42 @@ -# WorkflowHub Knowledge Graph +# WorkflowHub Knowledge Graph -## Getting started +A tool to generate a knowledge graph from a source of RO Crates. By default, this tool sources and generates an RDF graph of crates from [WorkflowHub](https://workflowhub.eu/). -### Obtaining workflowhub-graph +## Getting Started -workflowhub-graph is available packaged as a Docker container. You can pull the latest version of the container by running: +This tool is run as a Snakemake workflow. We recommend building a Docker container to run the workflow: -```bash -docker pull ghcr.io/uomresearchit/workflowhub-graph:latest +```bash +docker build -t knowledgegraph . ``` -This provides the a wrapper for the executable `workflowhub-graph` which can be used to run the various tools provided by the package. +Then, you can run the workflow using the following command: -### Running workflowhub-graph +```bash +docker run --rm -v $(pwd):/app -w /app knowledgegraph --cores 4 -s /app/Snakefile +``` -There are several tools provided by the `workflowhub-graph` package. These are: -- 'help': Display help information. -- 'source-crates': Download ROCrates from the WorkflowHub API. -- 'absolutize': Make all paths in an ROCrate absolute. -- 'upload': Upload an ROCrate to Zenodo. -- 'merge': Merge multiple ROCrates into an RDF graph. +This command runs a Docker container using the `knowledgegraph` image. It mounts the working directory to `/app` +inside the container, sets `/app` as the working directory, and then runs the workflow. Once the workflow completes, +the container is automatically removed. -To run any of these tools, you can use the following command: +## Structure -```bash -docker run ghcr.io/uomresearchit/workflowhub-graph:latest +```mermaid +flowchart TD + A[Source RO Crates] --> B[Check Outputs]; + B[Check Outputs] --> C[Report Downloaded RO Crates]; + B[Check Outputs]-->D[Merge RO Crates]; + D[Merge RO Crates]-->E[Create Merged Workflow Run RO Crate] ``` -For example, to download ROCrates from the WorkflowHub API, you can run: +- **`source_ro_crates`**: This rule sources RO crates from the WorkflowHub API (`source_crates.py`) and then checks +the output (`check_outputs.py`). This generates a list of expected file paths based on the workflow IDs and versions to +facilitate the workflow. -```bash -docker run ghcr.io/uomresearchit/workflowhub-graph:latest source-crates -``` +- **`report_created_files`**: Optional. This rule reports the downloaded RO crates to the user. +- **`merge_files`**: This rule merges the downloaded RO crates into a single RDF graph (`merge_ro_crates.py`). +- **`create_ro_crate`**: This rule creates a merged workflow run RO crate from the merged RDF graph (`create_ro_crate.py`). ## Contributing @@ -46,10 +51,6 @@ docker run ghcr.io/uomresearchit/workflowhub-graph:latest source-crates - **Development Branch**: The `develop` branch is currently our main integration branch. Features and fixes should target `develop` through PRs. - **Feature Branches**: These feature branches should be short-lived and focused. Once done, please create a pull request to merge it into `develop`. -## Overview - -![arch_diagram.png](./docs/images/arch_diagram.png) - ## License [BSD 2-Clause License](https://opensource.org/license/bsd-2-clause) diff --git a/workflowhub_graph/create_ro_crate.py b/workflowhub_graph/create_ro_crate.py index 65874c1..3709cfd 100644 --- a/workflowhub_graph/create_ro_crate.py +++ b/workflowhub_graph/create_ro_crate.py @@ -12,7 +12,6 @@ def create_ro_crate(input_file: str, workflow_file: str, output_dir: str) -> Non :param input_file: The input file provided by the Snakemake workflow (e.g., merged data file). :param workflow_file: Reference to the Snakemake workflow. :param output_dir: The output directory to store the RO-Crate metadata file. - :return: """ crate = ROCrate() @@ -23,18 +22,54 @@ def create_ro_crate(input_file: str, workflow_file: str, output_dir: str) -> Non ) # Add authors: - alice = crate.add( + auth_1 = crate.add( Person( crate, "https://orcid.org/0000-0000-0000-0000", - properties={"name": "Alice Doe", "affiliation": "University of Flatland"}, + properties={ + "name": "Alexander Hambley", + "affiliation": "University of Manchester", + }, + ) + ) + auth_2 = crate.add( + Person( + crate, + "https://orcid.org/0000-0002-0035-6475", + properties={ + "name": "Eli Chadwick", + "affiliation": "University of Manchester", + }, + ) + ) + auth_3 = crate.add( + Person( + crate, + "https://orcid.org/0000-0002-4565-9760", + properties={ + "name": "Oliver Woolland", + "affiliation": "University of Manchester", + }, ) ) - bob = crate.add( + auth_4 = crate.add( Person( crate, - "https://orcid.org/0000-0000-0000-0001", - properties={"name": "Bob Doe", "affiliation": "University of Flatland"}, + "https://orcid.org/0000-0001-9842-9718", + properties={ + "name": "Stian Soiland-Reyes", + "affiliation": "University of Manchester", + }, + ) + ) + auth_5 = crate.add( + Person( + crate, + "https://orcid.org/0000-0001-6353-0808", + properties={ + "name": "Volodymyr Savchenko", + "affiliation": "University of Geneva", + }, ) ) @@ -52,12 +87,23 @@ def create_ro_crate(input_file: str, workflow_file: str, output_dir: str) -> Non properties={ "@type": "File", "name": "Dockerfile", - "encodingFormat": "application/yaml", + "encodingFormat": "text/plain", "description": "The Dockerfile used to build the Docker images for the workflow.", "conformsTo": {"@id": "https://docs.docker.com/reference/dockerfile/"}, }, ) + created_files = crate.add_file( + "./created_files.json", + properties={ + "@type": "File", + "name": "created_files.json", + "encodingFormat": "application/json", + "description": "A JSON file containing the list of files sourced by the workflow.", + "conformsTo": {"@id": "https://docs.docker.com/reference/dockerfile/"}, + }, + ) + crate.add_file("./poetry.lock") crate.add_file("./README.md") @@ -68,42 +114,32 @@ def create_ro_crate(input_file: str, workflow_file: str, output_dir: str) -> Non "name": "Merged Data File", "description": "This file contains merged RDF triples from multiple RO-Crates sourced from WorkflowHub.", "encodingFormat": "text/turtle", - "author": [alice["@id"], bob["@id"]], }, ) + data_entity["author"] = [auth_1, auth_2, auth_3, auth_4, auth_5] + data_entity["isBasedOn"] = created_files + workflow_entity = crate.add_workflow( source=workflow_file, properties={ "name": "Snakemake Workflow", "description": "This is the Snakemake workflow used to generate the merged RDF triples.", - "author": [alice["@id"], bob["@id"]], - "output": data_entity["@id"], }, main=True, lang="snakemake", ) + workflow_entity["author"] = [auth_1, auth_2, auth_3, auth_4] + workflow_entity["output"] = data_entity + if "conformsTo" not in crate.root_dataset: crate.root_dataset.append_to( "conformsTo", {"@id": "https://w3id.org/ro/wfrun/workflow/0.5"} ) - crate.add( - ContextEntity( - crate, - identifier=str(uuid.uuid4()), - properties={ - "@type": "CreateAction", - "name": "Merge RDF Triples", - "description": "Merging RDF triples from sourced crates.", - "agent": [alice["@id"], bob["@id"]], - "endTime": datetime.now().time().isoformat(), - "instrument": workflow_entity["@id"], - "result": data_entity["@id"], - }, - ) - ) + # Add license: + crate.license = "https://opensource.org/license/bsd-2-clause" # Writing the RO-Crate metadata: crate.write(output_dir)