From 0721e36194fe053515951729cee2567eec922a03 Mon Sep 17 00:00:00 2001
From: Roey Ben Chaim
Date: Thu, 19 Sep 2024 13:01:02 -0700
Subject: [PATCH] Dev containers for: analyzer, analyzer+transformers,
 anonymizer and image redaction (#1450)

---
 .../devcontainer.json                         | 24 +++++++++++++++++
 .../presidio-analyzer/devcontainer.json       | 22 ++++++++++++++++
 .../presidio-anonymizer/devcontainer.json     | 23 ++++++++++++++++
 .../presidio-image-redactor/devcontainer.json | 23 ++++++++++++++++
 presidio-analyzer/Dockerfile.dev              | 26 +++++++++++++++++++
 presidio-analyzer/install_dependencies.sh     |  9 +++++++
 presidio-analyzer/pyproject.toml              |  7 ++++-
 presidio-anonymizer/Dockerfile.dev            | 11 ++++++++
 presidio-image-redactor/Dockerfile.dev        | 20 ++++++++++++++
 9 files changed, 164 insertions(+), 1 deletion(-)
 create mode 100644 .devcontainer/presidio-analyzer-transformers/devcontainer.json
 create mode 100644 .devcontainer/presidio-analyzer/devcontainer.json
 create mode 100644 .devcontainer/presidio-anonymizer/devcontainer.json
 create mode 100644 .devcontainer/presidio-image-redactor/devcontainer.json
 create mode 100644 presidio-analyzer/Dockerfile.dev
 create mode 100755 presidio-analyzer/install_dependencies.sh
 create mode 100644 presidio-anonymizer/Dockerfile.dev
 create mode 100644 presidio-image-redactor/Dockerfile.dev

diff --git a/.devcontainer/presidio-analyzer-transformers/devcontainer.json b/.devcontainer/presidio-analyzer-transformers/devcontainer.json
new file mode 100644
--- /dev/null
+++ b/.devcontainer/presidio-analyzer-transformers/devcontainer.json
@@ -0,0 +1,24 @@
+{
+    "name": "Presidio Analyzer Transformers",
+    "build": {
+        "dockerfile": "../../presidio-analyzer/Dockerfile.dev",
+        "context": "../../presidio-analyzer",
+        "args": {
+            "DEV_MODE": "transformers",
+            "NLP_CONF_FILE": "presidio_analyzer/conf/transformers.yaml",
+            "POETRY_EXTRAS": "-E transformers"
+        }
+    },
+    "workspaceMount": "source=${localWorkspaceFolder}/presidio-analyzer,target=/workspace,type=bind",
+    "workspaceFolder": "/workspace",
+    "postCreateCommand": "chmod +x ./install_dependencies.sh && ./install_dependencies.sh",
+    "postAttachCommand": "poetry shell",
+    // VS Code only honors extensions listed under "customizations.vscode".
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "ms-python.python"
+            ]
+        }
+    }
+}
diff --git a/.devcontainer/presidio-analyzer/devcontainer.json b/.devcontainer/presidio-analyzer/devcontainer.json
new file mode 100644
--- /dev/null
+++ b/.devcontainer/presidio-analyzer/devcontainer.json
@@ -0,0 +1,22 @@
+{
+    "name": "Presidio Analyzer",
+    "build": {
+        "dockerfile": "../../presidio-analyzer/Dockerfile.dev",
+        "context": "../../presidio-analyzer",
+        "args": {
+            "DEV_MODE": "dev"
+        }
+    },
+    "workspaceMount": "source=${localWorkspaceFolder}/presidio-analyzer,target=/workspace,type=bind",
+    "workspaceFolder": "/workspace",
+    "postCreateCommand": "chmod +x ./install_dependencies.sh && ./install_dependencies.sh",
+    "postAttachCommand": "poetry shell",
+    // VS Code only honors extensions listed under "customizations.vscode".
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "ms-python.python"
+            ]
+        }
+    }
+}
diff --git a/.devcontainer/presidio-anonymizer/devcontainer.json b/.devcontainer/presidio-anonymizer/devcontainer.json
new file mode 100644
--- /dev/null
+++ b/.devcontainer/presidio-anonymizer/devcontainer.json
@@ -0,0 +1,23 @@
+{
+    "name": "Presidio Anonymizer",
+    "build": {
+        "dockerfile": "../../presidio-anonymizer/Dockerfile.dev",
+        "context": "../../presidio-anonymizer"
+    },
+    "workspaceMount": "source=${localWorkspaceFolder}/presidio-anonymizer,target=/workspace,type=bind",
+    "workspaceFolder": "/workspace",
+    "onCreateCommand": [
+        "poetry",
+        "install",
+        "--no-interaction"
+    ],
+    "postAttachCommand": "poetry shell",
+    // VS Code only honors extensions listed under "customizations.vscode".
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "ms-python.python"
+            ]
+        }
+    }
+}
diff --git a/.devcontainer/presidio-image-redactor/devcontainer.json b/.devcontainer/presidio-image-redactor/devcontainer.json
new file mode 100644
--- /dev/null
+++ b/.devcontainer/presidio-image-redactor/devcontainer.json
@@ -0,0 +1,23 @@
+{
+    "name": "Presidio Image Redactor",
+    "build": {
+        "dockerfile": "../../presidio-image-redactor/Dockerfile.dev",
+        "context": "../../presidio-image-redactor"
+    },
+    "workspaceMount": "source=${localWorkspaceFolder}/presidio-image-redactor,target=/workspace,type=bind",
+    "workspaceFolder": "/workspace",
+    "onCreateCommand": [
+        "poetry",
+        "install",
+        "--no-interaction"
+    ],
+    "postAttachCommand": "poetry shell",
+    // VS Code only honors extensions listed under "customizations.vscode".
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "ms-python.python"
+            ]
+        }
+    }
+}
diff --git a/presidio-analyzer/Dockerfile.dev b/presidio-analyzer/Dockerfile.dev
new file mode 100644
--- /dev/null
+++ b/presidio-analyzer/Dockerfile.dev
@@ -0,0 +1,26 @@
+# Development image for the Presidio Analyzer dev containers.
+FROM python:3.9-slim
+
+ARG DEV_MODE=dev
+ARG POETRY_EXTRAS=""
+ARG NLP_CONF_FILE=presidio_analyzer/conf/default.yaml
+ARG ANALYZER_CONF_FILE=presidio_analyzer/conf/default_analyzer.yaml
+ARG RECOGNIZER_REGISTRY_CONF_FILE=presidio_analyzer/conf/default_recognizers.yaml
+
+# Persisted as ENV because install_dependencies.sh reads POETRY_EXTRAS and
+# NLP_CONF_FILE inside the running container.
+ENV DEV_MODE=${DEV_MODE}
+ENV PIP_NO_CACHE_DIR=1
+ENV ANALYZER_CONF_FILE=${ANALYZER_CONF_FILE}
+ENV RECOGNIZER_REGISTRY_CONF_FILE=${RECOGNIZER_REGISTRY_CONF_FILE}
+ENV NLP_CONF_FILE=${NLP_CONF_FILE}
+ENV POETRY_EXTRAS=${POETRY_EXTRAS}
+
+# Install essential build tools; drop the apt lists in the same layer.
+RUN apt-get update \
+    && apt-get install -y build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Stay on Poetry 1.x: the dev containers run `poetry shell` on attach, and
+# Poetry 2.0 no longer ships that command in core.
+RUN pip install "poetry>=1.8,<2.0"
diff --git a/presidio-analyzer/install_dependencies.sh b/presidio-analyzer/install_dependencies.sh
new file mode 100755
--- /dev/null
+++ b/presidio-analyzer/install_dependencies.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Install the analyzer's dependencies with Poetry, then run install_nlp_models.py.
+set -euo pipefail
+
+# POETRY_EXTRAS is intentionally unquoted: it carries flag/value pairs such as
+# "-E transformers" that must be split into separate arguments.
+poetry install -E server ${POETRY_EXTRAS:-} --no-interaction
+
+poetry run python install_nlp_models.py --conf_file "${NLP_CONF_FILE:-presidio_analyzer/conf/default.yaml}"
diff --git a/presidio-analyzer/pyproject.toml b/presidio-analyzer/pyproject.toml
index 198759632..d6811e659 100644
--- a/presidio-analyzer/pyproject.toml
+++ b/presidio-analyzer/pyproject.toml
@@ -34,10 +34,15 @@ stanza = { version = "*", optional = true }
 spacy_stanza = { version = "*", optional = true }
 azure-ai-textanalytics = { version = "*", optional = true }
 azure-core = { version = "*", optional = true }
+transformers = { version = "*", optional = true }
+huggingface_hub = { version = "*", optional = true }
 
 [tool.poetry.extras]
 server = ["flask"]
-transformers = ["spacy_huggingface_pipelines"]
+transformers = [
+    "transformers",
+    "huggingface_hub",
+    "spacy_huggingface_pipelines"]
 stanza = [
     "stanza",
     "spacy_stanza",
diff --git a/presidio-anonymizer/Dockerfile.dev b/presidio-anonymizer/Dockerfile.dev
new file mode 100644
--- /dev/null
+++ b/presidio-anonymizer/Dockerfile.dev
@@ -0,0 +1,11 @@
+# Development image for the Presidio Anonymizer dev container.
+FROM python:3.9-slim
+
+# Install essential build tools; drop the apt lists in the same layer.
+RUN apt-get update \
+    && apt-get install -y build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Stay on Poetry 1.x: the dev containers run `poetry shell` on attach, and
+# Poetry 2.0 no longer ships that command in core.
+RUN pip install --no-cache-dir "poetry>=1.8,<2.0"
diff --git a/presidio-image-redactor/Dockerfile.dev b/presidio-image-redactor/Dockerfile.dev
new file mode 100644
--- /dev/null
+++ b/presidio-image-redactor/Dockerfile.dev
@@ -0,0 +1,20 @@
+# Development image for the Presidio Image Redactor dev container.
+FROM python:3.9-slim
+
+# One apt layer for everything: build tools, Tesseract OCR, and the
+# ffmpeg/libsm6/libxext6 libraries. The package lists are removed in the same
+# layer, and `tesseract -v` fails the build early if the OCR binary cannot
+# run.
+RUN apt-get update \
+    && apt-get install -y \
+        build-essential \
+        ffmpeg \
+        libsm6 \
+        libxext6 \
+        tesseract-ocr \
+    && rm -rf /var/lib/apt/lists/* \
+    && tesseract -v
+
+# Stay on Poetry 1.x: the dev containers run `poetry shell` on attach, and
+# Poetry 2.0 no longer ships that command in core.
+RUN pip install --no-cache-dir "poetry>=1.8,<2.0"