Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactored Snakemake workflow and generate example output #41

Closed
wants to merge 12 commits into from
Closed
51 changes: 33 additions & 18 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,29 +1,44 @@
FROM python:3.11-slim
# Stage 1: Build environment
# The stage is named so that a later stage can copy from it with
# `COPY --from=build-stage`.
FROM python:3.11-slim AS build-stage

RUN pip install poetry
# Install build tools and Poetry
# NOTE(review): Poetry is installed without a version pin, the apt package
# lists are not removed in this layer, and `--no-install-recommends` is not
# passed - all three make the image larger or less reproducible.
RUN apt-get update && apt-get install -y build-essential \
&& pip install poetry

# Set the working directory
WORKDIR /app

# Install build tools for Snakemake (gcc, make, etc.)
RUN apt-get update && apt-get install -y build-essential
# Copy dependency files and install dependencies
# Only the two manifests are copied first so this layer can stay cached while
# the application sources change.
COPY pyproject.toml poetry.lock /app/
# `virtualenvs.create false` tells Poetry not to create a project virtualenv.
# NOTE(review): the sources have not been copied yet and `--no-root` is not
# passed here - confirm Poetry can complete this step without the package
# sources present.
RUN poetry config virtualenvs.create false \
&& poetry install --no-interaction --no-ansi

# Copy the pyproject.toml file
COPY pyproject.toml /app/
# Copy and install the application
# NOTE(review): `COPY . /app` copies the whole build context; presumably a
# .dockerignore keeps unwanted files out - verify.
COPY . /app
RUN poetry install

# Install the dependencies
RUN poetry install --no-root
# Stage 2: Snakemake runtime environment
# NOTE(review): `:latest` is a moving tag, so rebuilding later can silently pick
# up a different Snakemake release; pin a version tag or digest.
FROM snakemake/snakemake:latest

# Copy the rest of the application files
COPY . /app
# Install Poetry
RUN pip install poetry

# Install the package
RUN poetry install
WORKDIR /app

# Copy the application from the build stage
# Only the /app directory is taken from build-stage.
# NOTE(review): anything that stage installed outside /app is not carried over,
# which is presumably why dependencies are installed again below - confirm.
COPY --from=build-stage /app /app

# Install dependencies
# NOTE(review): `<( ... )` is bash process substitution. RUN uses /bin/sh unless
# a SHELL instruction says otherwise, and none appears in this Dockerfile -
# confirm the shell in this base image accepts it. `poetry export` may also need
# the export plugin, depending on the Poetry version installed above.
RUN pip install -r <(poetry export --format requirements.txt --without-hashes) \
&& pip install -e .

# Set up non-root user
# Creates a system group and user named `snakemake` and hands /app over to them;
# this must run before the USER switch because it needs root.
RUN groupadd -r snakemake && useradd -r -g snakemake snakemake \
&& chown -R snakemake:snakemake /app

# Install Snakemake using Poetry
RUN poetry add snakemake
USER snakemake

# Set the entry point for the container
ENTRYPOINT ["poetry", "run"]
# Configure Python path
# NOTE(review): PYTHONPATH is not defined earlier in this Dockerfile; if the
# base image does not define it either, the resulting value is "/app:" with a
# trailing colon - confirm that is acceptable.
ENV PYTHONPATH="/app:${PYTHONPATH}"

CMD ["help"]
# Set the entry point
# Exec (JSON-array) form: arguments given to `docker run` are appended to it.
ENTRYPOINT ["snakemake"]
66 changes: 50 additions & 16 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -1,20 +1,13 @@
# TODO: accept these values as input arguments to the Snakefile instead of
# hard-coding them here.
# Workflow ids 1 through 10 (the upper bound of range() is exclusive).
WORKFLOW_IDS = range(1,11)
from snakemake.io import directory

# Versions list; interpolated into the --versions option of check_outputs.py.
VERSIONS = ['1']
# Directory created with `mkdir -p` and passed as --output-dir to check_outputs.py.
OUTPUT_DIRS = "data"
# Path listed as an input of `rule all` and of `rule create_ro_crate`.
MERGED_FILE = "merged.ttl"


def list_expected_files():
    """Return the expected metadata file paths.

    One path is produced for every (workflow id, version) pair, iterating
    over WORKFLOW_IDS in the outer position and VERSIONS in the inner one.
    """
    return [
        f"{OUTPUT_DIRS}/{wf_id}_{ver}_ro-crate-metadata.json"
        for wf_id in WORKFLOW_IDS
        for ver in VERSIONS
    ]
# Directory name for the generated RO-Crate metadata.
# NOTE(review): none of the visible rules read this variable - the same path is
# spelled out as a literal where it is used - confirm it is still needed.
ro_crate_metadata_dir = "ro-crate-metadata/"

# Target rule: its inputs name what the workflow is asked to produce.
# NOTE(review): the literal below has no trailing slash while the output of
# create_ro_crate is declared as "ro-crate-metadata/" - presumably Snakemake
# treats both as the same path; verify.
rule all:
input:
MERGED_FILE
"ro-crate-metadata"

rule source_ro_crates:
output:
Expand All @@ -23,12 +16,20 @@ rule source_ro_crates:
"""
# Create the output directory if it doesn't exist:
mkdir -p {OUTPUT_DIRS}

# Add the current directory to PYTHONPATH, creating it if it doesn't exist
export PYTHONPATH="${{PYTHONPATH:+$PYTHONPATH:}}$(pwd)"

# Run the source_crates script to download the RO Crate metadata:
python workflowhub_graph/source_crates.py --workflow-ids 1-10 --prod --all-versions

# After sourcing, check which files were actually created:
python workflowhub_graph/check_outputs.py --workflow-ids 1-10 --versions {VERSIONS} --output-dir {OUTPUT_DIRS}
# Run the source_crates script to download the RO Crate metadata,
# then check the output files and generate created_files.json:

# - all versions of all workflows:
python workflowhub_graph/source_crates.py --prod --all-versions
python workflowhub_graph/check_outputs.py --versions {VERSIONS} --output-dir {OUTPUT_DIRS}

# - all versions of the first 20 workflows:
# python workflowhub_graph/source_crates.py --workflow-ids 1-20 --prod --all-versions
# python workflowhub_graph/check_outputs.py --workflow-ids 1-20 --versions {VERSIONS} --output-dir {OUTPUT_DIRS}
"""

rule report_created_files:
Expand Down Expand Up @@ -65,3 +66,36 @@ rule merge_files:
shell(f"""
python workflowhub_graph/merge.py {output[0]} -p "data/*.json"
""")

# Build an RO-Crate directory from the merged graph file.
#
# input:  MERGED_FILE, passed as the first argument to create_ro_crate.py.
# params: workflow_file, the workflow definition passed as the second argument.
# output: the "ro-crate-metadata/" directory, passed as the third argument.
#
# The script runs inside a throwaway virtual environment (rocrate_env) created
# in the working directory, so its packages are not installed into the
# environment that runs Snakemake.
rule create_ro_crate:
    input:
        MERGED_FILE
    params:
        workflow_file = "Snakefile"
    output:
        directory("ro-crate-metadata/")
    shell:
        """
        # Remove the temporary virtual environment on every exit path, so a
        # failing command below does not leave rocrate_env behind.
        trap 'rm -rf rocrate_env' EXIT

        # Create and activate a new virtual environment
        python -m venv rocrate_env
        source rocrate_env/bin/activate

        # Upgrade pip to avoid any potential issues
        pip install --upgrade pip

        # pip uninstall urllib3

        # Install required packages
        pip install requests urllib3 rocrate rocrate-zenodo

        # Run the create_ro_crate script
        python workflowhub_graph/create_ro_crate.py {input} {params.workflow_file} {output}

        # Deactivate the virtual environment; the EXIT trap then deletes it
        deactivate
        """
Loading
Loading