From 9695e33d2831df068ef1898bcc9bcfc70a094b4d Mon Sep 17 00:00:00 2001 From: Gord Lueck Date: Fri, 6 Oct 2023 16:47:51 -0700 Subject: [PATCH 01/10] document intelligence support --- CHANGELOG.md | 3 + presidio-image-redactor/Pipfile | 1 + .../presidio_image_redactor/__init__.py | 2 + .../document_intelligence_ocr.py | 115 ++++++++++++++++++ presidio-image-redactor/tests/conftest.py | 8 ++ ...dicom_image_redactor_engine_integration.py | 36 ++++-- .../tests/test_document_intelligence_ocr.py | 78 ++++++++++++ 7 files changed, 233 insertions(+), 10 deletions(-) create mode 100755 presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py create mode 100644 presidio-image-redactor/tests/test_document_intelligence_ocr.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 02be21136..82db07006 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,9 @@ All notable changes to this project will be documented in this file. ## [Unreleased] +### Changed +#### Image redactor +* Added support for Microsoft's document intelligence OCR ## [2.2.33] - June 1st 2023 ### Added diff --git a/presidio-image-redactor/Pipfile b/presidio-image-redactor/Pipfile index 4fa362a02..f503109a1 100644 --- a/presidio-image-redactor/Pipfile +++ b/presidio-image-redactor/Pipfile @@ -14,6 +14,7 @@ python-gdcm = ">=3.0.22,<4.0.0" matplotlib = ">=3.6.2,<4.0.0" opencv-python = ">=4.8.0" typing-extensions = "*" +azure-ai-formrecognizer = "*" [dev-packages] pytest = "*" diff --git a/presidio-image-redactor/presidio_image_redactor/__init__.py b/presidio-image-redactor/presidio_image_redactor/__init__.py index a706f763b..96d40c1cf 100644 --- a/presidio-image-redactor/presidio_image_redactor/__init__.py +++ b/presidio-image-redactor/presidio_image_redactor/__init__.py @@ -3,6 +3,7 @@ from .ocr import OCR from .tesseract_ocr import TesseractOCR +from .document_intelligence_ocr import DocumentIntelligenceOCR from .bbox import BboxProcessor from .image_processing_engine import ImagePreprocessor from .image_analyzer_engine import ImageAnalyzerEngine @@ -23,6 +24,7 @@ __all__ = [ "OCR", "TesseractOCR", + "DocumentIntelligenceOCR", "BboxProcessor", "ImageAnalyzerEngine", "ImageRedactorEngine", diff --git a/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py b/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py new file mode 100755 index 000000000..164421f5e --- /dev/null +++ b/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py @@ -0,0 +1,115 @@ +import os +from io import BytesIO + +import numpy as np +from PIL import Image + +from presidio_image_redactor import OCR + +from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzedDocument +from azure.core.credentials import AzureKeyCredential + +class DocumentIntelligenceOCR(OCR): + """OCR class that uses Microsoft's Document Intelligence OCR engine""" + + SUPPORTED_MODELS=[ + "prebuilt-document", + "prebuilt-read", + "prebuilt-layout", + "prebuilt-contract", + "prebuilt-healthInsuranceCard.us", + "prebuilt-invoice", + "prebuilt-receipt", + "prebuilt-idDocument", + "prebuilt-businessCard" + ] + + def __init__(self, endpoint = None, key = None, model_id = "prebuilt-document"): + if model_id not in DocumentIntelligenceOCR.SUPPORTED_MODELS: + raise ValueError("Unsupported model id: %s" % model_id) + + #If endpoint and/or key are not passed, attempt to get from environment variables + if not endpoint: + endpoint = os.getenv("DOCUMENT_INTELLIGENCE_ENDPOINT") + + if not key: + key = 
os.getenv("DOCUMENT_INTELLIGENCE_KEY") + + if not key or not endpoint: + raise ValueError("Endpoint and key must be passed or set in environment variables") + + self.client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key)) + self.model_id = model_id + + @staticmethod + def _polygon_to_bbox(polygon): + #Returns an tuple of left/top/width/height according to expected which covers the passed polygon + + #We need at least two points for a valid bounding box + if len(polygon) < 2: + return (0,0,0,0) + + left = min([int(p.x) for p in polygon]) + top = min([int(p.y) for p in polygon]) + right = max([int(p.x) for p in polygon]) + bottom = max([int(p.y) for p in polygon]) + width = right - left + height = bottom - top + return (left, top, width, height) + + @staticmethod + def _page_to_bboxes(page): + """Presidio supports tesseract format of output only, so we format in the same way""" + #Expected format looks like: + #{ + # "left": [123, 345], + # "top": [0, 15], + # "width": [100, 75], + # "height": [25, 30], + # "conf": ["1", "0.87"], + # "text": ["JOHN", "DOE"], + # } + bounds = [DocumentIntelligenceOCR._polygon_to_bbox(word.polygon) for word in page.words] + + return { + "left": [box[0] for box in bounds], + "top": [box[1] for box in bounds], + "width": [box[2] for box in bounds], + "height": [box[3] for box in bounds], + "conf": [w.confidence for w in page.words], + "text": [w.content for w in page.words] + } + + def get_imgbytes(self, image:object, **kwargs) -> bytes: + if isinstance(image, bytes): + return image + if isinstance(image, np.ndarray): + image = Image.fromarray(image) + #Fallthrough to process PIL image + if isinstance(image, Image.Image): + #Image is a PIL image, write to bytes stream + ostream = BytesIO() + image.save(ostream, 'PNG') + imgbytes = ostream.getvalue() + elif isinstance(image, str): + #image is a filename + imgbytes = open(image, "rb") + else: + raise ValueError("Unsupported image type: %s" % type(image)) + return imgbytes + + def analyze_document(self, imgbytes) -> AnalyzedDocument: + """Analyze the document and return the result""" + poller = self.client.begin_analyze_document(self.model_id, imgbytes) + return poller.result() + + def perform_ocr(self, image:object, **kwargs) -> dict: + """Perform OCR on the image""" + imgbytes = self.get_imgbytes(image) + result = self.analyze_document(imgbytes) + + #Currently cannot handle more than one page. 
+ if not (len(result.pages) == 1): + raise ValueError("DocumentIntelligenceOCR currently only supports single page documents") + + return DocumentIntelligenceOCR._page_to_bboxes(result.pages[0]) diff --git a/presidio-image-redactor/tests/conftest.py b/presidio-image-redactor/tests/conftest.py index b4ad6d4ee..c23b4f837 100644 --- a/presidio-image-redactor/tests/conftest.py +++ b/presidio-image-redactor/tests/conftest.py @@ -1,6 +1,8 @@ import pydicom import json import os + +from PIL import Image from presidio_analyzer.recognizer_result import RecognizerResult from presidio_image_redactor import ImageAnalyzerEngine @@ -73,3 +75,9 @@ def get_mock_dicom_verify_results(): results_json = json.load(json_file) return results_json + + +@pytest.fixture(scope="module") +def get_mock_png(): + filepath = f"{SCRIPT_DIR}/test_data/png_images/0_ORIGINAL.png" + return Image.open(filepath) \ No newline at end of file diff --git a/presidio-image-redactor/tests/integration/test_dicom_image_redactor_engine_integration.py b/presidio-image-redactor/tests/integration/test_dicom_image_redactor_engine_integration.py index 3d2aabb60..c8a190d4f 100644 --- a/presidio-image-redactor/tests/integration/test_dicom_image_redactor_engine_integration.py +++ b/presidio-image-redactor/tests/integration/test_dicom_image_redactor_engine_integration.py @@ -9,6 +9,8 @@ import os import numpy as np from presidio_image_redactor.dicom_image_redactor_engine import DicomImageRedactorEngine +from presidio_image_redactor.document_intelligence_ocr import DocumentIntelligenceOCR +from presidio_image_redactor.image_analyzer_engine import ImageAnalyzerEngine import pytest SCRIPT_DIR = os.path.dirname(__file__) @@ -17,13 +19,26 @@ RESOURCES_DIR2 = f"{SCRIPT_DIR}/resources/dir1/dir2" -@pytest.fixture(scope="module") +#These are not fixtures, because depending on the setup, some of the object +#instantiations may fail +@pytest.fixture def mock_engine(): """Instance of the DicomImageRedactorEngine""" dicom_image_redactor_engine = DicomImageRedactorEngine() - return dicom_image_redactor_engine +def mock_tesseract_engine(): + return DicomImageRedactorEngine() + +def mock_di_engine(): + di_ocr = DocumentIntelligenceOCR() + ia_engine = ImageAnalyzerEngine(ocr=di_ocr) + return DicomImageRedactorEngine(image_analyzer_engine=ia_engine) + +def all_engines_required(): + """Returns engine, must_pass flag for tests using these engines""" + return [(mock_tesseract_engine), + pytest.param(mock_di_engine, marks=pytest.mark.xfail(reason="This engine may fail if environment variables are not set"))] @pytest.mark.parametrize( "dcm_filepath", @@ -33,33 +48,34 @@ def mock_engine(): (Path(RESOURCES_DIR2, "2_ORIGINAL.dcm")), ], ) -def test_redact_image_correctly( - mock_engine: DicomImageRedactorEngine, dcm_filepath: Path -): +@pytest.mark.parametrize("engine_builder", all_engines_required()) +def test_redact_image_correctly(engine_builder, dcm_filepath: Path): """Test the redact function Args: - mock_engine (DicomImageRedactorEngine): Mock instance. + engine (DicomImageRedactorEngine): Mock instance. dcm_filepath (Path): Path to DICOM file to load. 
""" test_image = pydicom.dcmread(dcm_filepath) - test_redacted_image = mock_engine.redact(test_image, use_metadata=True) + test_redacted_image = engine_builder().redact(test_image, use_metadata=True) assert ( np.array_equal(test_image.pixel_array, test_redacted_image.pixel_array) is False ) -def test_redact_from_single_file_correctly(mock_engine: DicomImageRedactorEngine): + +@pytest.mark.parametrize("engine_builder", all_engines_required()) +def test_redact_from_single_file_correctly(engine_builder): """Test the redact_from_file function with single file case Args: - mock_engine (DicomImageRedactorEngine): Mock instance. + engine (DicomImageRedactorEngine): Mock instance. """ with tempfile.TemporaryDirectory() as tmpdirname: # Set file paths and redact PII input_path = Path(RESOURCES_PARENT_DIR, "0_ORIGINAL.dcm") - mock_engine.redact_from_file( + engine_builder().redact_from_file( input_dicom_path=str(input_path), output_dir=tmpdirname, fill="contrast", diff --git a/presidio-image-redactor/tests/test_document_intelligence_ocr.py b/presidio-image-redactor/tests/test_document_intelligence_ocr.py new file mode 100644 index 000000000..8eeea818a --- /dev/null +++ b/presidio-image-redactor/tests/test_document_intelligence_ocr.py @@ -0,0 +1,78 @@ +import pytest +from unittest import mock +from presidio_image_redactor.document_intelligence_ocr import DocumentIntelligenceOCR +from azure.ai.formrecognizer import AnalyzeResult + +@pytest.fixture +def ocr_response(request): + return AnalyzeResult.from_dict(request.param) + +@pytest.mark.parametrize( "ocr_response, expected", +[ + #Base Case + ({"pages":[{"words":[]}]}, + {"left": [], "top": [], "width": [], "height": [], "conf": [], "text": []}), + #Polygon of sequence 0 are invalid + ({"pages":[{"words":[{"content":"Happy", "confidence": 3.14, "polygon":[]}]}]}, + {"left": [0], "top": [0], "width": [0], "height": [0], "conf": [3.14], "text": ["Happy"]}), + #Polygon of sequence 1 are invalid + ({"pages":[{"words":[{"content":"Happy", "confidence": 3.14, "polygon":[{"x":1, "y":2}]}]}]}, + {"left": [0], "top": [0], "width": [0], "height": [0], "conf": [3.14], "text": ["Happy"]}), + #Regular two point polygon + ({"pages":[{"words":[{"content":"Happy", "confidence": 3.14, "polygon":[{"x":1, "y":2},{"x":3, "y":42}]}]}]}, + {"left": [1], "top": [2], "width": [2], "height": [40], "conf": [3.14], "text": ["Happy"]}), + #Order doesn't matter + ({"pages":[{"words":[{"content":"Happy", "confidence": 3.14, "polygon":[{"x":3, "y":42},{"x":1, "y":2}]}]}]}, + {"left": [1], "top": [2], "width": [2], "height": [40], "conf": [3.14], "text": ["Happy"]}), + #Can specify other corners + ({"pages":[{"words":[{"content":"Happy", "confidence": 3.14, "polygon":[{"x":3, "y":2},{"x":1, "y":42}]}]}]}, + {"left": [1], "top": [2], "width": [2], "height": [40], "conf": [3.14], "text": ["Happy"]}), +], +indirect=["ocr_response"]) +def test_given_da_response_then_get_bboxes_matches(ocr_response, expected): + result = DocumentIntelligenceOCR._page_to_bboxes(ocr_response.pages[0]) + assert expected == result + +@pytest.mark.parametrize("ocr_response", +[ + #word is incorrect + ({"pages":[{"word":[]}]}) +]) +def test_given_wrong_keys_in_response_then_parsing_fails_returns_exception(ocr_response): + with pytest.raises(AttributeError): + DocumentIntelligenceOCR._page_to_bboxes(ocr_response.pages[0]) + +def test_model_id_wrong_then_raises_exception(): + with pytest.raises(ValueError): + DocumentIntelligenceOCR(key="fake_key", endpoint="fake_endpoint", model_id = "fake_model_id") + +def 
test_model_id_correct_then_raises_no_exception(): + DocumentIntelligenceOCR(key="fake_key", endpoint="fake_endpoint", model_id = "prebuilt-document") + +@pytest.mark.parametrize("result, ok", + [ + ({"pages": []}, False), + ({"pages": [{"words": []}]}, True), + ({"pages": [{"words": []}, {"words": []}]}, False), + ] +) +@mock.patch("presidio_image_redactor.document_intelligence_ocr.DocumentIntelligenceOCR.analyze_document") +def test_pages_not_one_then_raises_exception(analyze_document, result, ok: bool): + ocr_result = AnalyzeResult.from_dict(result) + diOCR = DocumentIntelligenceOCR(endpoint="fake_endpoint", key="fake_key") + diOCR.analyze_document.return_value = ocr_result + if not ok: + with pytest.raises(ValueError): + diOCR.perform_ocr(b"") + else: + diOCR.perform_ocr(b"") + + +#Mark this test as optionally failing +@pytest.mark.xfail(reason="This test is expected to fail unless the environment variables are set") +def test_ocr_endpoint_via_environment_vars_then_valid_response(get_mock_png): + diOCR = DocumentIntelligenceOCR() + result = diOCR.perform_ocr(get_mock_png) + assert type(result) == dict + assert "text" in result + assert "DAVIDSON" in result["text"] \ No newline at end of file From 1dc5f38f6e60b77d10c4058a4b6957309867b965 Mon Sep 17 00:00:00 2001 From: Gord Lueck Date: Fri, 6 Oct 2023 17:33:15 -0700 Subject: [PATCH 02/10] Added some documentation --- docs/image-redactor/index.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/docs/image-redactor/index.md b/docs/image-redactor/index.md index 47fc8c7ba..a99baeb10 100644 --- a/docs/image-redactor/index.md +++ b/docs/image-redactor/index.md @@ -145,6 +145,39 @@ Python script example can be found under: ocr_kwargs = {"ocr_threshold": 50} engine.redact_from_directory("path/to/your/dicom", output_dir, fill="background", save_bboxes=True, ocr_kwargs=ocr_kwargs) ``` +## Getting started using the document intelligence OCR engine + +You will need to register with Azure get an API key and endpoint. Perform the steps in the "Prerequisites" section of [this page](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api). Once your resource deploys, copy your endpoint and key values and save them for the next step. + +The most basic usage of the engine can be setup like the following in python +``` + diOCR = DocumentIntelligenceOCR(endpoint="", key="") +``` + +The DocumentIntelligenceOCR can also attempt to pull your endpoint and key values from environment variables. +``` +$ export DOCUMENT_INTELLIGENCE_ENDPOINT= +$ export DOCUMENT_INTELLIGENCE_KEY= +``` + +##### Creating an image redactor engine in Python: +``` +diOCR = DocumentIntelligenceOCR() +ia_engine = ImageAnalyzerEngine(ocr=di_ocr) +my_engine = DicomImageRedactorEngine(image_analyzer_engine=ia_engine) +``` + +#### Testing Document Inteligence + +Follow the steps of [running the tests](../development.md#running-tests) + +The test suite has a series of tests which are only exercised when the appropriate environment variables are populated. 
To run the test suite, to test the DocumentIntelligenceOCR engine, call the tests like this: + +``` +$ export DOCUMENT_INTELLIGENCE_ENDPOINT= +$ export DOCUMENT_INTELLIGENCE_KEY= +$ pytest +``` ### Evaluating de-identification performance From f140ea265e50c0a7ca834b2c9fffdfba36680b54 Mon Sep 17 00:00:00 2001 From: Gord Lueck Date: Wed, 11 Oct 2023 12:40:17 -0700 Subject: [PATCH 03/10] Fix document alignment --- docs/image-redactor/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/image-redactor/index.md b/docs/image-redactor/index.md index a99baeb10..a458399b0 100644 --- a/docs/image-redactor/index.md +++ b/docs/image-redactor/index.md @@ -151,7 +151,7 @@ You will need to register with Azure get an API key and endpoint. Perform the s The most basic usage of the engine can be setup like the following in python ``` - diOCR = DocumentIntelligenceOCR(endpoint="", key="") +diOCR = DocumentIntelligenceOCR(endpoint="", key="") ``` The DocumentIntelligenceOCR can also attempt to pull your endpoint and key values from environment variables. From d718c6232fe0377d379ec8c3e582307fc595c7a2 Mon Sep 17 00:00:00 2001 From: Gord Lueck Date: Wed, 11 Oct 2023 13:33:30 -0700 Subject: [PATCH 04/10] Linting changes and type hints put LICENSE in the NOTICE file --- NOTICE | 25 ++++++ docs/image-redactor/index.md | 4 +- presidio-image-redactor/Pipfile | 2 +- .../document_intelligence_ocr.py | 84 +++++++++++-------- ...com_image_pii_verify_engine_integration.py | 23 ++--- .../test_image_processing_engine.py | 4 +- 6 files changed, 94 insertions(+), 48 deletions(-) diff --git a/NOTICE b/NOTICE index e43b57db8..ba22239e8 100644 --- a/NOTICE +++ b/NOTICE @@ -3,6 +3,31 @@ Do Not Translate or Localize This project incorporates components from the projects listed below. The original copyright notices and the licenses under which Microsoft received such components are set forth below. Microsoft reserves all rights not expressly granted herein, whether by implication, estoppel or otherwise. +******* +azure-ai-formrecognizer + +Copyright (c) Microsoft Corporation. + +MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + ******* opencv-python diff --git a/docs/image-redactor/index.md b/docs/image-redactor/index.md index a458399b0..39e8dcb46 100644 --- a/docs/image-redactor/index.md +++ b/docs/image-redactor/index.md @@ -147,6 +147,8 @@ Python script example can be found under: ``` ## Getting started using the document intelligence OCR engine +Presidio offers two engines for OCR based PHI removal. 
The first is the default engine which uses Tesseract OCR. The second is the Document Intelligence OCR engine which uses Azure's Document Intelligence service, which requires an Azure subscription. The following sections describe how to setup and use the Document Intelligence OCR engine. + You will need to register with Azure get an API key and endpoint. Perform the steps in the "Prerequisites" section of [this page](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api). Once your resource deploys, copy your endpoint and key values and save them for the next step. The most basic usage of the engine can be setup like the following in python @@ -164,7 +166,7 @@ $ export DOCUMENT_INTELLIGENCE_KEY= ``` diOCR = DocumentIntelligenceOCR() ia_engine = ImageAnalyzerEngine(ocr=di_ocr) -my_engine = DicomImageRedactorEngine(image_analyzer_engine=ia_engine) +my_engine = ImageRedactorEngine(image_analyzer_engine=ia_engine) ``` #### Testing Document Inteligence diff --git a/presidio-image-redactor/Pipfile b/presidio-image-redactor/Pipfile index f503109a1..6be656a4b 100644 --- a/presidio-image-redactor/Pipfile +++ b/presidio-image-redactor/Pipfile @@ -14,7 +14,7 @@ python-gdcm = ">=3.0.22,<4.0.0" matplotlib = ">=3.6.2,<4.0.0" opencv-python = ">=4.8.0" typing-extensions = "*" -azure-ai-formrecognizer = "*" +azure-ai-formrecognizer = ">=3.3.0,<4.0.0" [dev-packages] pytest = "*" diff --git a/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py b/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py index 164421f5e..0acd2eaff 100755 --- a/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py +++ b/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py @@ -1,6 +1,8 @@ import os from io import BytesIO +from typing import Optional + import numpy as np from PIL import Image @@ -9,10 +11,11 @@ from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzedDocument from azure.core.credentials import AzureKeyCredential + class DocumentIntelligenceOCR(OCR): - """OCR class that uses Microsoft's Document Intelligence OCR engine""" + """OCR class that uses Microsoft's Document Intelligence OCR engine.""" - SUPPORTED_MODELS=[ + SUPPORTED_MODELS = [ "prebuilt-document", "prebuilt-read", "prebuilt-layout", @@ -24,31 +27,39 @@ class DocumentIntelligenceOCR(OCR): "prebuilt-businessCard" ] - def __init__(self, endpoint = None, key = None, model_id = "prebuilt-document"): + def __init__(self, + endpoint: Optional[str] = None, + key: Optional[str] = None, + model_id: Optional[str] = "prebuilt-document"): if model_id not in DocumentIntelligenceOCR.SUPPORTED_MODELS: raise ValueError("Unsupported model id: %s" % model_id) - - #If endpoint and/or key are not passed, attempt to get from environment variables + + # If endpoint and/or key are not passed, attempt to get from environment + # variables if not endpoint: endpoint = os.getenv("DOCUMENT_INTELLIGENCE_ENDPOINT") - if not key: + if not key: key = os.getenv("DOCUMENT_INTELLIGENCE_KEY") if not key or not endpoint: - raise ValueError("Endpoint and key must be passed or set in environment variables") + raise ValueError("Endpoint and key must be specified") - self.client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key)) + self.client = DocumentAnalysisClient( + endpoint=endpoint, + credential=AzureKeyCredential(key) + ) self.model_id = model_id @staticmethod def _polygon_to_bbox(polygon): - #Returns 
an tuple of left/top/width/height according to expected which covers the passed polygon + # Returns a tuple of left/top/width/height according to expected which covers + # the passed polygon. - #We need at least two points for a valid bounding box + # We need at least two points for a valid bounding box. if len(polygon) < 2: - return (0,0,0,0) - + return (0, 0, 0, 0) + left = min([int(p.x) for p in polygon]) top = min([int(p.y) for p in polygon]) right = max([int(p.x) for p in polygon]) @@ -58,18 +69,22 @@ def _polygon_to_bbox(polygon): return (left, top, width, height) @staticmethod - def _page_to_bboxes(page): - """Presidio supports tesseract format of output only, so we format in the same way""" - #Expected format looks like: - #{ - # "left": [123, 345], - # "top": [0, 15], - # "width": [100, 75], - # "height": [25, 30], - # "conf": ["1", "0.87"], - # "text": ["JOHN", "DOE"], - # } - bounds = [DocumentIntelligenceOCR._polygon_to_bbox(word.polygon) for word in page.words] + def _page_to_bboxes(page) -> dict: + """Convert bounding boxes to uniform format.""" + # Presidio supports tesseract format of output only, so we format in the same + # way. + # + # Expected format looks like: + # { + # "left": [123, 345], + # "top": [0, 15], + # "width": [100, 75], + # "height": [25, 30], + # "conf": ["1", "0.87"], + # "text": ["JOHN", "DOE"], + # } + bounds = [DocumentIntelligenceOCR._polygon_to_bbox(word.polygon) + for word in page.words] return { "left": [box[0] for box in bounds], @@ -80,36 +95,37 @@ def _page_to_bboxes(page): "text": [w.content for w in page.words] } - def get_imgbytes(self, image:object, **kwargs) -> bytes: + def get_imgbytes(self, image: object, **kwargs) -> bytes: + """Get the image bytes from the image object.""" if isinstance(image, bytes): return image if isinstance(image, np.ndarray): image = Image.fromarray(image) - #Fallthrough to process PIL image + # Fallthrough to process PIL image if isinstance(image, Image.Image): - #Image is a PIL image, write to bytes stream + # Image is a PIL image, write to bytes stream ostream = BytesIO() image.save(ostream, 'PNG') imgbytes = ostream.getvalue() elif isinstance(image, str): - #image is a filename + # image is a filename imgbytes = open(image, "rb") else: raise ValueError("Unsupported image type: %s" % type(image)) return imgbytes - def analyze_document(self, imgbytes) -> AnalyzedDocument: - """Analyze the document and return the result""" + def analyze_document(self, imgbytes : bytes) -> AnalyzedDocument: + """Analyze the document and return the result.""" poller = self.client.begin_analyze_document(self.model_id, imgbytes) return poller.result() - def perform_ocr(self, image:object, **kwargs) -> dict: - """Perform OCR on the image""" + def perform_ocr(self, image: object, **kwargs) -> dict: + """Perform OCR on the image.""" imgbytes = self.get_imgbytes(image) result = self.analyze_document(imgbytes) - #Currently cannot handle more than one page. + # Currently cannot handle more than one page. 
if not (len(result.pages) == 1): - raise ValueError("DocumentIntelligenceOCR currently only supports single page documents") + raise ValueError("DocumentIntelligenceOCR only supports 1 page documents") return DocumentIntelligenceOCR._page_to_bboxes(result.pages[0]) diff --git a/presidio-image-redactor/tests/integration/test_dicom_image_pii_verify_engine_integration.py b/presidio-image-redactor/tests/integration/test_dicom_image_pii_verify_engine_integration.py index 66581e7ff..7f57ca8d0 100644 --- a/presidio-image-redactor/tests/integration/test_dicom_image_pii_verify_engine_integration.py +++ b/presidio-image-redactor/tests/integration/test_dicom_image_pii_verify_engine_integration.py @@ -1,4 +1,4 @@ -"""Integration test for dicom_image_pii_verify_engine +"""Integration test for dicom_image_pii_verify_engine. Note we are not checking exact pixel data for the returned image because that is covered by testing of the "verify" function in @@ -7,10 +7,11 @@ import PIL import pydicom -from presidio_image_redactor import DicomImagePiiVerifyEngine, BboxProcessor +from presidio_image_redactor import DicomImagePiiVerifyEngine PADDING_WIDTH = 25 + def test_verify_correctly( get_mock_dicom_instance: pydicom.dataset.FileDataset, get_mock_dicom_verify_results: dict, @@ -27,12 +28,13 @@ def test_verify_correctly( expected_ocr_results_labels.append(item["label"]) # Act - test_image_verify, test_ocr_results_formatted, _ = DicomImagePiiVerifyEngine().verify_dicom_instance( - instance=get_mock_dicom_instance, - padding_width=PADDING_WIDTH, - display_image=True, - ocr_kwargs=None - ) + test_image_verify, test_ocr_results_formatted, _ = \ + DicomImagePiiVerifyEngine().verify_dicom_instance( + instance=get_mock_dicom_instance, + padding_width=PADDING_WIDTH, + display_image=True, + ocr_kwargs=None + ) # Check most OCR results (labels) are the same # Don't worry about position since that is implied in analyzer results @@ -42,8 +44,9 @@ def test_verify_correctly( test_common_labels = set(expected_ocr_results_labels).intersection( set(test_ocr_results_labels) ) - test_all_labels = set(expected_ocr_results_labels).union(set(test_ocr_results_labels)) + test_all_labels = \ + set(expected_ocr_results_labels).union(set(test_ocr_results_labels)) # Assert - assert type(test_image_verify) == PIL.Image.Image + assert isinstance(test_image_verify, PIL.Image.Image) assert len(test_common_labels) / len(test_all_labels) >= 0.5 diff --git a/presidio-image-redactor/tests/integration/test_image_processing_engine.py b/presidio-image-redactor/tests/integration/test_image_processing_engine.py index afc3ed9c8..616bdc6b6 100644 --- a/presidio-image-redactor/tests/integration/test_image_processing_engine.py +++ b/presidio-image-redactor/tests/integration/test_image_processing_engine.py @@ -101,6 +101,6 @@ def test_contrast_segmented_image_enhancer__improve_contrast(): result = preprocessor._improve_contrast(image) assert len(result) == 3 assert isinstance(result[0], PIL.Image.Image) - assert type(result[1]) == np.float64 - assert type(result[2]) == np.float64 + assert isinstance(result[1], np.float64) + assert isinstance(result[2], np.float64) assert result[1] <= result[2] From f8399db2e5d7c086a3b52f9b4024d7d6aec622d6 Mon Sep 17 00:00:00 2001 From: Gord Lueck Date: Wed, 11 Oct 2023 17:43:45 -0700 Subject: [PATCH 05/10] Linting changes Optional skip changes based on engine constructor --- .../tests/engine_test_utils.py | 18 +++++ .../tests/integration/methods.py | 24 ++++--- ...dicom_image_redactor_engine_integration.py | 34 +++++---- 
.../integration/test_image_redactor_engine.py | 72 +++++++++++++------ .../tests/test_document_intelligence_ocr.py | 45 ++++++------ 5 files changed, 130 insertions(+), 63 deletions(-) create mode 100644 presidio-image-redactor/tests/engine_test_utils.py diff --git a/presidio-image-redactor/tests/engine_test_utils.py b/presidio-image-redactor/tests/engine_test_utils.py new file mode 100644 index 000000000..3df22045c --- /dev/null +++ b/presidio-image-redactor/tests/engine_test_utils.py @@ -0,0 +1,18 @@ +from typing import Callable + +import pytest + +def must_succeed(engine_builder: Callable) -> Callable: + def _must_succeed(): + engine = engine_builder() + return engine + return _must_succeed + +def allow_failure(engine_builder: Callable) -> Callable: + def _allow_failure(): + try: + engine = engine_builder() + except ValueError as e: + pytest.skip(reason="Could not set up engine, skipping test") + return engine + return _allow_failure \ No newline at end of file diff --git a/presidio-image-redactor/tests/integration/methods.py b/presidio-image-redactor/tests/integration/methods.py index 2a8ba3a3b..0f20968ad 100644 --- a/presidio-image-redactor/tests/integration/methods.py +++ b/presidio-image-redactor/tests/integration/methods.py @@ -3,17 +3,25 @@ import os from functools import reduce -from PIL import Image +import numpy as np +from PIL import Image, ImageChops -def compare_images(image_one: Image, image_two: Image): - i1 = image_one.histogram() - i2 = image_two.histogram() +IMAGE_SIMILARITY_PROPORTION=0.95 - result = math.sqrt( - reduce(operator.add, map(lambda a, b: (a - b) ** 2, i1, i2)) / len(i1) - ) - return result == 0 +def image_sim(image_one: Image, image_two: Image) -> float: + # Compare if two images are similar, by thresholding + delta = ImageChops.difference(image_one, image_two).convert('L') + # Count number of black pixels, those that are exactly the same + num_zero = (np.array(delta.getdata()) == 0).sum() + num_nonzero = (np.array(delta.getdata()) != 0).sum() + # If the number of black pixels is above a threshold, the images are not similar + print(num_zero, num_nonzero, num_zero / (num_zero + num_nonzero)) + return num_zero / (num_zero + num_nonzero) + + +def compare_images(image_one: Image, image_two: Image) -> bool: + return image_sim(image_one, image_two) >= IMAGE_SIMILARITY_PROPORTION def get_resource_image(file_name: str) -> Image: diff --git a/presidio-image-redactor/tests/integration/test_dicom_image_redactor_engine_integration.py b/presidio-image-redactor/tests/integration/test_dicom_image_redactor_engine_integration.py index c8a190d4f..f2212354b 100644 --- a/presidio-image-redactor/tests/integration/test_dicom_image_redactor_engine_integration.py +++ b/presidio-image-redactor/tests/integration/test_dicom_image_redactor_engine_integration.py @@ -1,13 +1,16 @@ -"""Integration test for the DicomImageRedactorEngine class +"""Integration test for the DicomImageRedactorEngine class. -Note that we are not asserting every pixel is equal when comparing original to redacted images -to account for differences in performance with different versions of Tesseract OCR. +Note that we are not asserting every pixel is equal when comparing original to redacted +images to account for differences in performance with different versions of +OCR. 
""" import tempfile import pydicom from pathlib import Path import os import numpy as np + +from tests.engine_test_utils import must_succeed, allow_failure from presidio_image_redactor.dicom_image_redactor_engine import DicomImageRedactorEngine from presidio_image_redactor.document_intelligence_ocr import DocumentIntelligenceOCR from presidio_image_redactor.image_analyzer_engine import ImageAnalyzerEngine @@ -19,26 +22,32 @@ RESOURCES_DIR2 = f"{SCRIPT_DIR}/resources/dir1/dir2" -#These are not fixtures, because depending on the setup, some of the object -#instantiations may fail @pytest.fixture def mock_engine(): - """Instance of the DicomImageRedactorEngine""" + """Instance of the DicomImageRedactorEngine.""" dicom_image_redactor_engine = DicomImageRedactorEngine() return dicom_image_redactor_engine + +# These are not fixtures, because depending on the setup, some of the object +# instantiations may fail def mock_tesseract_engine(): + """Get the Dicom Redactor Engine.""" return DicomImageRedactorEngine() + def mock_di_engine(): + """Build the Dicom Redactor Engine with Document Intelligence OCR.""" di_ocr = DocumentIntelligenceOCR() ia_engine = ImageAnalyzerEngine(ocr=di_ocr) return DicomImageRedactorEngine(image_analyzer_engine=ia_engine) + def all_engines_required(): - """Returns engine, must_pass flag for tests using these engines""" - return [(mock_tesseract_engine), - pytest.param(mock_di_engine, marks=pytest.mark.xfail(reason="This engine may fail if environment variables are not set"))] + """Return all required engines and their must_pass flag for tests.""" + return [(must_succeed(mock_tesseract_engine)), + (allow_failure(mock_di_engine))] + @pytest.mark.parametrize( "dcm_filepath", @@ -50,7 +59,7 @@ def all_engines_required(): ) @pytest.mark.parametrize("engine_builder", all_engines_required()) def test_redact_image_correctly(engine_builder, dcm_filepath: Path): - """Test the redact function + """Test the redact function. Args: engine (DicomImageRedactorEngine): Mock instance. @@ -64,10 +73,9 @@ def test_redact_image_correctly(engine_builder, dcm_filepath: Path): ) - @pytest.mark.parametrize("engine_builder", all_engines_required()) def test_redact_from_single_file_correctly(engine_builder): - """Test the redact_from_file function with single file case + """Test the redact_from_file function with single file case. Args: engine (DicomImageRedactorEngine): Mock instance. @@ -110,7 +118,7 @@ def test_redact_from_single_file_correctly(engine_builder): def test_redact_from_directory_correctly(mock_engine: DicomImageRedactorEngine): - """Test the redact_from_file function with multiple files case + """Test the redact_from_file function with multiple files case. Args: mock_engine (DicomImageRedactorEngine): Mock instance. 
diff --git a/presidio-image-redactor/tests/integration/test_image_redactor_engine.py b/presidio-image-redactor/tests/integration/test_image_redactor_engine.py index cbc8a6449..794f26423 100644 --- a/presidio-image-redactor/tests/integration/test_image_redactor_engine.py +++ b/presidio-image-redactor/tests/integration/test_image_redactor_engine.py @@ -1,41 +1,73 @@ -from presidio_image_redactor import ImageRedactorEngine -from tests.integration.methods import get_resource_image, compare_images +from typing import Callable + +import pytest + +from presidio_image_redactor import ImageRedactorEngine, \ + ImageAnalyzerEngine, \ + DocumentIntelligenceOCR + +from tests.integration.methods import get_resource_image, compare_images, image_sim red_fill = (255, 0, 0) -def test_given_image_with_text_and_fill_then_text_is_greyed_out(): +def mock_tesseract_engine(): + """Get the Dicom Redactor Engine.""" + return ImageRedactorEngine() + + +def mock_di_engine(): + """Build the Dicom Redactor Engine with Document Intelligence OCR.""" + di_ocr = DocumentIntelligenceOCR() + ia_engine = ImageAnalyzerEngine(ocr=di_ocr) + return ImageRedactorEngine(image_analyzer_engine=ia_engine) + +from tests.engine_test_utils import must_succeed, allow_failure + +def all_engines_required(): + """Return all required engines and their must_pass flag for tests.""" + return [(must_succeed(mock_tesseract_engine)), + (allow_failure(mock_di_engine))] + +@pytest.mark.parametrize("engine_builder", all_engines_required()) +def test_given_image_with_text_and_fill_then_text_is_greyed_out( + engine_builder: Callable): # Image with PII entities image = get_resource_image("ocr_test.png") result_image = get_resource_image("ocr_test_redacted.png") - redacted_image = ImageRedactorEngine().redact(image, 1) + redacted_image = engine_builder().redact(image, 1) assert compare_images(redacted_image, result_image) -def test_given_image_with_text_and_matrix_fill_then_text_is_colored_out(): +@pytest.mark.parametrize("engine_builder", all_engines_required()) +def test_given_image_with_text_and_matrix_fill_then_text_is_colored_out( + engine_builder: Callable): # Image with PII entities image = get_resource_image("ocr_test.png") - redacted_image = ImageRedactorEngine().redact(image, red_fill) + redacted_image = engine_builder().redact(image, red_fill) expected_result_image = get_resource_image("ocr_test_redacted_matrix.png") - assert compare_images(redacted_image, expected_result_image) - assert not compare_images(redacted_image, image) - + # The redacted image is closer to the expected result than the original image + assert image_sim(redacted_image, expected_result_image) > image_sim(redacted_image, image) -def test_given_image_without_text_and_fill_then_image_does_not_change(): +@pytest.mark.parametrize("engine_builder", all_engines_required()) +def test_given_image_without_text_and_fill_then_image_does_not_change( + engine_builder: Callable): # Image without PII entities image = get_resource_image("no_ocr.jpg") - redacted_image = ImageRedactorEngine().redact(image, red_fill) + redacted_image = engine_builder().redact(image, red_fill) assert compare_images(redacted_image, image) -def test_given_two_word_entity_then_no_extra_bounding_box_appears(): +@pytest.mark.parametrize("engine_builder", all_engines_required()) +def test_given_two_word_entity_then_no_extra_bounding_box_appears( + engine_builder: Callable): """Tests bounding boxes for multiword entities. 
Given a PII entity is identified, has two or more words, and second word is longer than first, then: - no extra bounding box should be created created. + no extra bounding box should be created. """ @@ -43,25 +75,23 @@ def test_given_two_word_entity_then_no_extra_bounding_box_appears(): image = get_resource_image("ocr_bounding_box.png") expected_image = get_resource_image("ocr_bounding_box_redacted.png") - redacted_image = ImageRedactorEngine().redact(image, red_fill) - assert compare_images(expected_image, redacted_image) + redacted_image = engine_builder().redact(image, red_fill) + assert image_sim(expected_image, redacted_image) > 0.90 -def test_given_analzyer_kwargs_then_different_entities_are_redacted(): +@pytest.mark.parametrize("engine_builder", all_engines_required()) +def test_given_analzyer_kwargs_then_different_entities_are_redacted(engine_builder: Callable): """ Tests that kwargs such as entities and score_threshold are available for redact method """ # Image with PII entities image = get_resource_image("kwargs_test.jpg") - redacted_image_no_args = ImageRedactorEngine().redact(image) + redacted_image_no_args = engine_builder().redact(image) redacted_image_entities_args = ImageRedactorEngine().redact( image, entities=["PERSON", "LOCATION"] ) redacted_image_score_args = ImageRedactorEngine().redact(image, score_threshold=1) assert not compare_images(redacted_image_no_args, redacted_image_entities_args) assert not compare_images(redacted_image_no_args, redacted_image_score_args) - assert not compare_images(redacted_image_entities_args, redacted_image_score_args) - - -test_given_image_with_text_and_matrix_fill_then_text_is_colored_out() + assert not compare_images(redacted_image_entities_args, redacted_image_score_args) \ No newline at end of file diff --git a/presidio-image-redactor/tests/test_document_intelligence_ocr.py b/presidio-image-redactor/tests/test_document_intelligence_ocr.py index 8eeea818a..fb69f1280 100644 --- a/presidio-image-redactor/tests/test_document_intelligence_ocr.py +++ b/presidio-image-redactor/tests/test_document_intelligence_ocr.py @@ -3,29 +3,31 @@ from presidio_image_redactor.document_intelligence_ocr import DocumentIntelligenceOCR from azure.ai.formrecognizer import AnalyzeResult + @pytest.fixture def ocr_response(request): return AnalyzeResult.from_dict(request.param) -@pytest.mark.parametrize( "ocr_response, expected", + +@pytest.mark.parametrize("ocr_response, expected", [ - #Base Case - ({"pages":[{"words":[]}]}, + # Base Case + ({"pages": [{"words": []}]}, {"left": [], "top": [], "width": [], "height": [], "conf": [], "text": []}), - #Polygon of sequence 0 are invalid - ({"pages":[{"words":[{"content":"Happy", "confidence": 3.14, "polygon":[]}]}]}, + # Polygon of sequence 0 are invalid + ({"pages": [{"words": [{"content": "Happy", "confidence": 3.14, "polygon": []}]}]}, {"left": [0], "top": [0], "width": [0], "height": [0], "conf": [3.14], "text": ["Happy"]}), - #Polygon of sequence 1 are invalid - ({"pages":[{"words":[{"content":"Happy", "confidence": 3.14, "polygon":[{"x":1, "y":2}]}]}]}, + # Polygon of sequence 1 are invalid + ({"pages": [{"words": [{"content": "Happy", "confidence": 3.14, "polygon": [{"x": 1, "y": 2}]}]}]}, {"left": [0], "top": [0], "width": [0], "height": [0], "conf": [3.14], "text": ["Happy"]}), - #Regular two point polygon - ({"pages":[{"words":[{"content":"Happy", "confidence": 3.14, "polygon":[{"x":1, "y":2},{"x":3, "y":42}]}]}]}, + # Regular two point polygon + ({"pages": [{"words": [{"content": "Happy", "confidence": 
3.14, "polygon": [{"x": 1, "y": 2}, {"x": 3, "y": 42}]}]}]}, {"left": [1], "top": [2], "width": [2], "height": [40], "conf": [3.14], "text": ["Happy"]}), - #Order doesn't matter - ({"pages":[{"words":[{"content":"Happy", "confidence": 3.14, "polygon":[{"x":3, "y":42},{"x":1, "y":2}]}]}]}, + # Order doesn't matter + ({"pages": [{"words": [{"content": "Happy", "confidence": 3.14, "polygon": [{"x": 3, "y": 42}, {"x": 1, "y": 2}]}]}]}, {"left": [1], "top": [2], "width": [2], "height": [40], "conf": [3.14], "text": ["Happy"]}), - #Can specify other corners - ({"pages":[{"words":[{"content":"Happy", "confidence": 3.14, "polygon":[{"x":3, "y":2},{"x":1, "y":42}]}]}]}, + # Can specify other corners + ({"pages": [{"words": [{"content": "Happy", "confidence": 3.14, "polygon": [{"x": 3, "y": 2}, {"x": 1, "y": 42}]}]}]}, {"left": [1], "top": [2], "width": [2], "height": [40], "conf": [3.14], "text": ["Happy"]}), ], indirect=["ocr_response"]) @@ -35,8 +37,8 @@ def test_given_da_response_then_get_bboxes_matches(ocr_response, expected): @pytest.mark.parametrize("ocr_response", [ - #word is incorrect - ({"pages":[{"word":[]}]}) + # word is incorrect + ({"pages": [{"word": []}]}) ]) def test_given_wrong_keys_in_response_then_parsing_fails_returns_exception(ocr_response): with pytest.raises(AttributeError): @@ -49,7 +51,7 @@ def test_model_id_wrong_then_raises_exception(): def test_model_id_correct_then_raises_no_exception(): DocumentIntelligenceOCR(key="fake_key", endpoint="fake_endpoint", model_id = "prebuilt-document") -@pytest.mark.parametrize("result, ok", +@pytest.mark.parametrize("result, ok", [ ({"pages": []}, False), ({"pages": [{"words": []}]}, True), @@ -68,11 +70,12 @@ def test_pages_not_one_then_raises_exception(analyze_document, result, ok: bool) diOCR.perform_ocr(b"") -#Mark this test as optionally failing -@pytest.mark.xfail(reason="This test is expected to fail unless the environment variables are set") def test_ocr_endpoint_via_environment_vars_then_valid_response(get_mock_png): - diOCR = DocumentIntelligenceOCR() - result = diOCR.perform_ocr(get_mock_png) - assert type(result) == dict + try: + di_ocr = DocumentIntelligenceOCR() + except Exception: + + result = di_ocr.perform_ocr(get_mock_png) + assert isinstance(result, dict) assert "text" in result assert "DAVIDSON" in result["text"] \ No newline at end of file From f9663729138709019027adf46f12706239e94c21 Mon Sep 17 00:00:00 2001 From: Gord Lueck Date: Wed, 11 Oct 2023 17:58:21 -0700 Subject: [PATCH 06/10] Pass kwargs to di client Fix one missing line --- .../document_intelligence_ocr.py | 14 ++++++++++---- .../tests/test_document_intelligence_ocr.py | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py b/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py index 0acd2eaff..b2592f930 100755 --- a/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py +++ b/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py @@ -114,15 +114,21 @@ def get_imgbytes(self, image: object, **kwargs) -> bytes: raise ValueError("Unsupported image type: %s" % type(image)) return imgbytes - def analyze_document(self, imgbytes : bytes) -> AnalyzedDocument: + def analyze_document(self, imgbytes : bytes, **kwargs) -> AnalyzedDocument: """Analyze the document and return the result.""" - poller = self.client.begin_analyze_document(self.model_id, imgbytes) + poller = 
self.client.begin_analyze_document(self.model_id, imgbytes, **kwargs) return poller.result() def perform_ocr(self, image: object, **kwargs) -> dict: - """Perform OCR on the image.""" + """Perform OCR on the image. + + :param image: PIL Image/numpy array or file path(str) to be processed + :param kwargs: Additional values for begin_analyze_document + + :return: results dictionary containing bboxes and text for each detected word + """ imgbytes = self.get_imgbytes(image) - result = self.analyze_document(imgbytes) + result = self.analyze_document(imgbytes, **kwargs) # Currently cannot handle more than one page. if not (len(result.pages) == 1): diff --git a/presidio-image-redactor/tests/test_document_intelligence_ocr.py b/presidio-image-redactor/tests/test_document_intelligence_ocr.py index fb69f1280..0dda68cb7 100644 --- a/presidio-image-redactor/tests/test_document_intelligence_ocr.py +++ b/presidio-image-redactor/tests/test_document_intelligence_ocr.py @@ -74,7 +74,7 @@ def test_ocr_endpoint_via_environment_vars_then_valid_response(get_mock_png): try: di_ocr = DocumentIntelligenceOCR() except Exception: - + pytest.skip("Environment variables not set") result = di_ocr.perform_ocr(get_mock_png) assert isinstance(result, dict) assert "text" in result From 55ddc258033bf1410188641b62bac5f4b034e1c5 Mon Sep 17 00:00:00 2001 From: Gord Lueck Date: Wed, 11 Oct 2023 19:02:32 -0700 Subject: [PATCH 07/10] Type hints --- .../presidio_image_redactor/document_intelligence_ocr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py b/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py index b2592f930..8509657ad 100755 --- a/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py +++ b/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py @@ -8,7 +8,7 @@ from presidio_image_redactor import OCR -from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzedDocument +from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzedDocument, DocumentPage from azure.core.credentials import AzureKeyCredential @@ -69,7 +69,7 @@ def _polygon_to_bbox(polygon): return (left, top, width, height) @staticmethod - def _page_to_bboxes(page) -> dict: + def _page_to_bboxes(page: DocumentPage) -> dict: """Convert bounding boxes to uniform format.""" # Presidio supports tesseract format of output only, so we format in the same # way. 
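At this point in the series the new OCR class is usable end to end. A minimal usage sketch of the API added above, assuming the `DOCUMENT_INTELLIGENCE_ENDPOINT` and `DOCUMENT_INTELLIGENCE_KEY` environment variables are set and that `sample.png` stands in for a local single-page image:

```
from PIL import Image

from presidio_image_redactor import DocumentIntelligenceOCR

# Endpoint and key are read from DOCUMENT_INTELLIGENCE_ENDPOINT and
# DOCUMENT_INTELLIGENCE_KEY when they are not passed explicitly
di_ocr = DocumentIntelligenceOCR()

# perform_ocr accepts raw bytes, a numpy array, a PIL image or a file path;
# results with more than one page raise ValueError, so the input must be a single page
ocr_results = di_ocr.perform_ocr(Image.open("sample.png"))

# The result is the same Tesseract-style dictionary presidio uses elsewhere
print(ocr_results["text"])
print(ocr_results["left"], ocr_results["top"], ocr_results["width"], ocr_results["height"], ocr_results["conf"])
```

Because the output format matches the Tesseract-based OCR class, the same ImageAnalyzerEngine and redactor engines work with either OCR backend, which is exactly what the parametrized integration tests above exercise.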
From 8e7ee855409177c6a9b408232887236f32d2807d Mon Sep 17 00:00:00 2001 From: Gord Lueck Date: Wed, 11 Oct 2023 19:09:49 -0700 Subject: [PATCH 08/10] Type hints --- .../presidio_image_redactor/document_intelligence_ocr.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py b/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py index 8509657ad..8381976e9 100755 --- a/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py +++ b/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py @@ -1,14 +1,17 @@ import os from io import BytesIO -from typing import Optional +from typing import Optional, Sequence import numpy as np from PIL import Image from presidio_image_redactor import OCR -from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzedDocument, DocumentPage +from azure.ai.formrecognizer import DocumentAnalysisClient, \ + AnalyzedDocument, \ + DocumentPage, \ + Point from azure.core.credentials import AzureKeyCredential @@ -52,7 +55,7 @@ def __init__(self, self.model_id = model_id @staticmethod - def _polygon_to_bbox(polygon): + def _polygon_to_bbox(polygon : Sequence[Point]) -> tuple: # Returns a tuple of left/top/width/height according to expected which covers # the passed polygon. From 363f266a05066d796377b95b08047ad03c161428 Mon Sep 17 00:00:00 2001 From: Gord Lueck Date: Mon, 16 Oct 2023 09:52:48 -0700 Subject: [PATCH 09/10] Additional notes on enhanced features --- docs/image-redactor/index.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/image-redactor/index.md b/docs/image-redactor/index.md index 39e8dcb46..38c707f32 100644 --- a/docs/image-redactor/index.md +++ b/docs/image-redactor/index.md @@ -161,6 +161,11 @@ The DocumentIntelligenceOCR can also attempt to pull your endpoint and key value $ export DOCUMENT_INTELLIGENCE_ENDPOINT= $ export DOCUMENT_INTELLIGENCE_KEY= ``` +### Document Intelligence Model Support + +There are numerous document processing models available, and currently we only support the most basic usage of the model. For an overview of the functionalities offered by Document Intelligence, see [this page](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-model-overview). Presidio offers only word-level processing on the result for PHI redaction purposes, as all prebuilt document models support this interface. Different models support additional structured support for tables, paragraphs, key-value pairs, fields and other types of metadata in the response. + +Additional metadata can be sent to the Document Intelligence API call, such as pages, locale, and features, which are documented [here](https://learn.microsoft.com/en-us/python/api/azure-ai-formrecognizer/azure.ai.formrecognizer.documentanalysisclient?view=azure-python#azure-ai-formrecognizer-documentanalysisclient-begin-analyze-document). You are encouraged to test each model to see which fits best to your use case. 
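For example, those extra options can be passed straight through `perform_ocr`, which forwards any additional keyword arguments via `analyze_document` to `begin_analyze_document`. An illustrative sketch, assuming `path/to/your/image.png` is a placeholder for your own file and that the chosen model accepts these options:

```
from PIL import Image
from presidio_image_redactor import DocumentIntelligenceOCR

diOCR = DocumentIntelligenceOCR()
image = Image.open("path/to/your/image.png")

# "pages" and "locale" are forwarded to begin_analyze_document;
# the values below are placeholders for your own document
results = diOCR.perform_ocr(image, pages="1", locale="en-US")
```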
##### Creating an image redactor engine in Python: ``` From 5694b9c02ebc43790f70178018d7d4f2dc557536 Mon Sep 17 00:00:00 2001 From: Gord Lueck Date: Tue, 17 Oct 2023 11:54:08 -0700 Subject: [PATCH 10/10] Improving the documentation --- docs/image-redactor/index.md | 10 +-- .../document_intelligence_ocr.py | 70 +++++++++++++------ ...dicom_image_redactor_engine_integration.py | 10 +-- .../tests/test_document_intelligence_ocr.py | 33 +++++++-- 4 files changed, 88 insertions(+), 35 deletions(-) diff --git a/docs/image-redactor/index.md b/docs/image-redactor/index.md index 38c707f32..5045d67a1 100644 --- a/docs/image-redactor/index.md +++ b/docs/image-redactor/index.md @@ -12,8 +12,8 @@ This module may also be used on medical DICOM images. The `DicomImageRedactorEng ![img.png](../assets/dicom-image-redactor-design.png) !!! note "Note" - This class only redacts pixel data and does not scrub text PHI which may exist in the DICOM metadata. - We highly recommend using the DICOM image redactor engine to redact text from images BEFORE scrubbing metadata PHI.* + This class only redacts pixel data and does not scrub text PII which may exist in the DICOM metadata. + We highly recommend using the DICOM image redactor engine to redact text from images BEFORE scrubbing metadata PII.* ## Installation @@ -147,9 +147,9 @@ Python script example can be found under: ``` ## Getting started using the document intelligence OCR engine -Presidio offers two engines for OCR based PHI removal. The first is the default engine which uses Tesseract OCR. The second is the Document Intelligence OCR engine which uses Azure's Document Intelligence service, which requires an Azure subscription. The following sections describe how to setup and use the Document Intelligence OCR engine. +Presidio offers two engines for OCR based PII removal. The first is the default engine which uses Tesseract OCR. The second is the Document Intelligence OCR engine which uses Azure's Document Intelligence service, which requires an Azure subscription. The following sections describe how to setup and use the Document Intelligence OCR engine. -You will need to register with Azure get an API key and endpoint. Perform the steps in the "Prerequisites" section of [this page](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api). Once your resource deploys, copy your endpoint and key values and save them for the next step. +You will need to register with Azure to get an API key and endpoint. Perform the steps in the "Prerequisites" section of [this page](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api). Once your resource deploys, copy your endpoint and key values and save them for the next step. The most basic usage of the engine can be setup like the following in python ``` @@ -163,7 +163,7 @@ $ export DOCUMENT_INTELLIGENCE_KEY= ``` ### Document Intelligence Model Support -There are numerous document processing models available, and currently we only support the most basic usage of the model. For an overview of the functionalities offered by Document Intelligence, see [this page](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-model-overview). Presidio offers only word-level processing on the result for PHI redaction purposes, as all prebuilt document models support this interface. 
Different models support additional structured support for tables, paragraphs, key-value pairs, fields and other types of metadata in the response. +There are numerous document processing models available, and currently we only support the most basic usage of the model. For an overview of the functionalities offered by Document Intelligence, see [this page](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-model-overview). Presidio offers only word-level processing on the result for PII redaction purposes, as all prebuilt document models support this interface. Different models support additional structured support for tables, paragraphs, key-value pairs, fields and other types of metadata in the response. Additional metadata can be sent to the Document Intelligence API call, such as pages, locale, and features, which are documented [here](https://learn.microsoft.com/en-us/python/api/azure-ai-formrecognizer/azure.ai.formrecognizer.documentanalysisclient?view=azure-python#azure-ai-formrecognizer-documentanalysisclient-begin-analyze-document). You are encouraged to test each model to see which fits best to your use case. diff --git a/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py b/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py index 8381976e9..a03896586 100755 --- a/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py +++ b/presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py @@ -1,7 +1,7 @@ import os from io import BytesIO -from typing import Optional, Sequence +from typing import Optional, Sequence, Union import numpy as np from PIL import Image @@ -16,7 +16,15 @@ class DocumentIntelligenceOCR(OCR): - """OCR class that uses Microsoft's Document Intelligence OCR engine.""" + """OCR class that uses Azure AI Document Intelligence OCR engine. + + :param key: The API key + :param endpoint: The API endpoint + :param model_id: Which model to use + + For details, see + https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/ + """ SUPPORTED_MODELS = [ "prebuilt-document", @@ -56,9 +64,15 @@ def __init__(self, @staticmethod def _polygon_to_bbox(polygon : Sequence[Point]) -> tuple: - # Returns a tuple of left/top/width/height according to expected which covers - # the passed polygon. + """Convert polygon to a tuple of left/top/width/height. + + The returned bounding box should entirely cover the passed polygon. + + :param polygon: A sequence of points + :return a tuple of left/top/width/height in pixel dimensions + + """ # We need at least two points for a valid bounding box. if len(polygon) < 2: return (0, 0, 0, 0) @@ -73,19 +87,24 @@ def _polygon_to_bbox(polygon : Sequence[Point]) -> tuple: @staticmethod def _page_to_bboxes(page: DocumentPage) -> dict: - """Convert bounding boxes to uniform format.""" - # Presidio supports tesseract format of output only, so we format in the same - # way. - # - # Expected format looks like: - # { - # "left": [123, 345], - # "top": [0, 15], - # "width": [100, 75], - # "height": [25, 30], - # "conf": ["1", "0.87"], - # "text": ["JOHN", "DOE"], - # } + """Convert bounding boxes to uniform format. + + Presidio supports tesseract format of output only, so we format in the same + way. 
@@ -73,19 +87,24 @@
    @staticmethod
    def _page_to_bboxes(page: DocumentPage) -> dict:
-        """Convert bounding boxes to uniform format."""
-        # Presidio supports tesseract format of output only, so we format in the same
-        # way.
-        #
-        # Expected format looks like:
-        # {
-        #     "left": [123, 345],
-        #     "top": [0, 15],
-        #     "width": [100, 75],
-        #     "height": [25, 30],
-        #     "conf": ["1", "0.87"],
-        #     "text": ["JOHN", "DOE"],
-        # }
+        """Convert bounding boxes to uniform format.
+
+        Presidio supports tesseract format of output only, so we format in the same
+        way.
+        Expected format looks like:
+        {
+            "left": [123, 345],
+            "top": [0, 15],
+            "width": [100, 75],
+            "height": [25, 30],
+            "conf": ["1", "0.87"],
+            "text": ["JOHN", "DOE"],
+        }
+
+        :param page: The DocumentPage object from the DI client library
+
+        :return: dictionary in the expected format for Presidio
+        """
        bounds = [DocumentIntelligenceOCR._polygon_to_bbox(word.polygon) for word in page.words]
@@ -98,8 +117,13 @@
            "conf": [w.confidence for w in page.words],
            "text": [w.content for w in page.words]
        }

-    def get_imgbytes(self, image: object, **kwargs) -> bytes:
-        """Get the image bytes from the image object."""
+    def get_imgbytes(self, image: Union[bytes, np.ndarray, Image.Image]) -> bytes:
+        """Retrieve the image bytes from the image object.
+
+        :param image: Any of bytes, numpy array or PIL image object
+
+        :return: raw image bytes
+        """
        if isinstance(image, bytes):
            return image
        if isinstance(image, np.ndarray):
@@ -118,7 +142,13 @@ def get_imgbytes(self, image: object, **kwargs) -> bytes:
        return imgbytes

    def analyze_document(self, imgbytes : bytes, **kwargs) -> AnalyzedDocument:
-        """Analyze the document and return the result."""
+        """Analyze the document and return the result.
+
+        :param imgbytes: The bytes to send to the API endpoint
+        :param kwargs: additional arguments for begin_analyze_document
+
+        :return: the result of the poller, an AnalyzedDocument object.
+        """
        poller = self.client.begin_analyze_document(self.model_id, imgbytes, **kwargs)
        return poller.result()
diff --git a/presidio-image-redactor/tests/integration/test_dicom_image_redactor_engine_integration.py b/presidio-image-redactor/tests/integration/test_dicom_image_redactor_engine_integration.py
index f2212354b..04ca53460 100644
--- a/presidio-image-redactor/tests/integration/test_dicom_image_redactor_engine_integration.py
+++ b/presidio-image-redactor/tests/integration/test_dicom_image_redactor_engine_integration.py
@@ -10,6 +10,8 @@ import os
import numpy as np
+from typing import Callable
+
from tests.engine_test_utils import must_succeed, allow_failure
from presidio_image_redactor.dicom_image_redactor_engine import DicomImageRedactorEngine
from presidio_image_redactor.document_intelligence_ocr import DocumentIntelligenceOCR
@@ -58,11 +60,11 @@ def all_engines_required():
    ],
)
@pytest.mark.parametrize("engine_builder", all_engines_required())
-def test_redact_image_correctly(engine_builder, dcm_filepath: Path):
+def test_redact_image_correctly(engine_builder: Callable, dcm_filepath: Path):
    """Test the redact function.

    Args:
-        engine (DicomImageRedactorEngine): Mock instance.
+        engine_builder: function returning a DicomImageRedactorEngine
        dcm_filepath (Path): Path to DICOM file to load.
    """
    test_image = pydicom.dcmread(dcm_filepath)
@@ -74,11 +76,11 @@ def test_redact_image_correctly(engine_builder, dcm_filepath: Path):
@pytest.mark.parametrize("engine_builder", all_engines_required())
-def test_redact_from_single_file_correctly(engine_builder):
+def test_redact_from_single_file_correctly(engine_builder: Callable):
    """Test the redact_from_file function with single file case.

    Args:
-        engine (DicomImageRedactorEngine): Mock instance.
+ engine_builder: function returning a DicomImageRedactorEngine """ with tempfile.TemporaryDirectory() as tmpdirname: # Set file paths and redact PII diff --git a/presidio-image-redactor/tests/test_document_intelligence_ocr.py b/presidio-image-redactor/tests/test_document_intelligence_ocr.py index 0dda68cb7..ac562013a 100644 --- a/presidio-image-redactor/tests/test_document_intelligence_ocr.py +++ b/presidio-image-redactor/tests/test_document_intelligence_ocr.py @@ -4,7 +4,7 @@ from azure.ai.formrecognizer import AnalyzeResult -@pytest.fixture +@pytest.fixture(scope="module") def ocr_response(request): return AnalyzeResult.from_dict(request.param) @@ -32,23 +32,34 @@ def ocr_response(request): ], indirect=["ocr_response"]) def test_given_da_response_then_get_bboxes_matches(ocr_response, expected): + """Test that the bounding boxes are correctly extracted from the OCR response. + + :param ocr_response: The OCR response from the Document Intelligence client + :param expected: The expected bounding boxes + """ result = DocumentIntelligenceOCR._page_to_bboxes(ocr_response.pages[0]) assert expected == result -@pytest.mark.parametrize("ocr_response", -[ - # word is incorrect - ({"pages": [{"word": []}]}) -]) +@pytest.mark.parametrize("ocr_response", + [ + # word is incorrect + ({"pages": [{"word": []}]}) + ]) def test_given_wrong_keys_in_response_then_parsing_fails_returns_exception(ocr_response): + """Test parsing failures. + + :param ocr_response: The OCR response from the Document Intelligence client + """ with pytest.raises(AttributeError): DocumentIntelligenceOCR._page_to_bboxes(ocr_response.pages[0]) def test_model_id_wrong_then_raises_exception(): + """Test an incorrect model raises an exception""" with pytest.raises(ValueError): DocumentIntelligenceOCR(key="fake_key", endpoint="fake_endpoint", model_id = "fake_model_id") def test_model_id_correct_then_raises_no_exception(): + """Confirm that there's no exception if the model_id is correct""" DocumentIntelligenceOCR(key="fake_key", endpoint="fake_endpoint", model_id = "prebuilt-document") @pytest.mark.parametrize("result, ok", @@ -60,6 +71,12 @@ def test_model_id_correct_then_raises_no_exception(): ) @mock.patch("presidio_image_redactor.document_intelligence_ocr.DocumentIntelligenceOCR.analyze_document") def test_pages_not_one_then_raises_exception(analyze_document, result, ok: bool): + """Test that the number of pages is exactly one. + + :param analyze_document: The mocked analyze_document function + :param result: The result to return from the mocked analyze_document function + :param ok: Whether the test should pass or fail + """ ocr_result = AnalyzeResult.from_dict(result) diOCR = DocumentIntelligenceOCR(endpoint="fake_endpoint", key="fake_key") diOCR.analyze_document.return_value = ocr_result @@ -71,6 +88,10 @@ def test_pages_not_one_then_raises_exception(analyze_document, result, ok: bool) def test_ocr_endpoint_via_environment_vars_then_valid_response(get_mock_png): + """Test that the OCR endpoint returns a valid response. + + :param get_mock_png: The mock PNG image + """ try: di_ocr = DocumentIntelligenceOCR() except Exception: