
Msft document intelligence ocr #1184

Merged 11 commits on Oct 18, 2023
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,9 @@
All notable changes to this project will be documented in this file.

## [Unreleased]
### Changed
#### Image redactor
* Added support for Microsoft's document intelligence OCR

## [2.2.33] - June 1st 2023
### Added
25 changes: 25 additions & 0 deletions NOTICE
@@ -3,6 +3,31 @@ Do Not Translate or Localize

This project incorporates components from the projects listed below. The original copyright notices and the licenses under which Microsoft received such components are set forth below. Microsoft reserves all rights not expressly granted herein, whether by implication, estoppel or otherwise.

*******
azure-ai-formrecognizer

Copyright (c) Microsoft Corporation.

MIT License

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

*******
opencv-python

40 changes: 40 additions & 0 deletions docs/image-redactor/index.md
@@ -145,6 +145,46 @@ Python script example can be found under:
ocr_kwargs = {"ocr_threshold": 50}
engine.redact_from_directory("path/to/your/dicom", output_dir, fill="background", save_bboxes=True, ocr_kwargs=ocr_kwargs)
```
## Getting started using the Document Intelligence OCR engine

Presidio offers two engines for OCR-based PHI removal. The first is the default engine, which uses Tesseract OCR. The second is the Document Intelligence OCR engine, which uses Azure's Document Intelligence service and requires an Azure subscription. The following sections describe how to set up and use the Document Intelligence OCR engine.

You will need to register with Azure to get an API key and endpoint. Perform the steps in the "Prerequisites" section of [this page](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api). Once your resource deploys, copy your endpoint and key values and save them for the next step.

The most basic usage of the engine can be set up in Python as follows:
```
from presidio_image_redactor import DocumentIntelligenceOCR

di_ocr = DocumentIntelligenceOCR(endpoint="<your_endpoint>", key="<your_key>")
```

If the endpoint and/or key are not passed as arguments, DocumentIntelligenceOCR will attempt to read them from environment variables:
```
$ export DOCUMENT_INTELLIGENCE_ENDPOINT=<your_endpoint>
$ export DOCUMENT_INTELLIGENCE_KEY=<your_key>
```
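
With these variables exported, the engine can be constructed without arguments. A minimal sketch, assuming the variables above are set (the constructor raises a ValueError if no credentials are found by either route):
```
from presidio_image_redactor import DocumentIntelligenceOCR

# Endpoint and key fall back to DOCUMENT_INTELLIGENCE_ENDPOINT and
# DOCUMENT_INTELLIGENCE_KEY when not passed explicitly.
di_ocr = DocumentIntelligenceOCR()
```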
### Document Intelligence Model Support

There are numerous document processing models available, and currently we support only the most basic usage of each. For an overview of the functionality offered by Document Intelligence, see [this page](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-model-overview). Presidio performs only word-level processing on the result for PHI redaction purposes, as all prebuilt document models support this interface. Individual models additionally provide structured output for tables, paragraphs, key-value pairs, fields, and other types of metadata in the response.

Additional metadata can be sent with the Document Intelligence API call, such as pages, locale, and features, which are documented [here](https://learn.microsoft.com/en-us/python/api/azure-ai-formrecognizer/azure.ai.formrecognizer.documentanalysisclient?view=azure-python#azure-ai-formrecognizer-documentanalysisclient-begin-analyze-document). You are encouraged to test each model to see which best fits your use case.
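
As a sketch of how these fit together, keyword arguments passed to `perform_ocr` are forwarded through to `begin_analyze_document`; the model choice, file path, and locale below are illustrative values:
```
from presidio_image_redactor import DocumentIntelligenceOCR

di_ocr = DocumentIntelligenceOCR(model_id="prebuilt-read")

# locale is forwarded through to begin_analyze_document;
# "path/to/scan.png" is a hypothetical input file.
results = di_ocr.perform_ocr("path/to/scan.png", locale="en-US")
print(results["text"])
```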

#### Creating an image redactor engine in Python
```
di_ocr = DocumentIntelligenceOCR()
ia_engine = ImageAnalyzerEngine(ocr=di_ocr)
my_engine = ImageRedactorEngine(image_analyzer_engine=ia_engine)
```
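
The resulting engine can then redact images in the usual way. A minimal usage sketch, in which the file paths and fill color are illustrative:
```
from PIL import Image

image = Image.open("path/to/scan.png")  # hypothetical input file
redacted = my_engine.redact(image, fill=(255, 192, 203))
redacted.save("redacted.png")
```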

#### Testing Document Intelligence

Follow the steps in [running the tests](../development.md#running-tests).

The test suite includes a series of tests which are exercised only when the appropriate environment variables are populated. To test the DocumentIntelligenceOCR engine, set the variables and run the tests like this:

```
$ export DOCUMENT_INTELLIGENCE_ENDPOINT=<your_endpoint>
$ export DOCUMENT_INTELLIGENCE_KEY=<your_key>
$ pytest
```

### Evaluating de-identification performance

1 change: 1 addition & 0 deletions presidio-image-redactor/Pipfile
@@ -14,6 +14,7 @@ python-gdcm = ">=3.0.22,<4.0.0"
matplotlib = ">=3.6.2,<4.0.0"
opencv-python = ">=4.8.0"
typing-extensions = "*"
azure-ai-formrecognizer = ">=3.3.0,<4.0.0"

[dev-packages]
pytest = "*"
2 changes: 2 additions & 0 deletions presidio-image-redactor/presidio_image_redactor/__init__.py
@@ -3,6 +3,7 @@

from .ocr import OCR
from .tesseract_ocr import TesseractOCR
from .document_intelligence_ocr import DocumentIntelligenceOCR
from .bbox import BboxProcessor
from .image_processing_engine import ImagePreprocessor
from .image_analyzer_engine import ImageAnalyzerEngine
@@ -23,6 +24,7 @@
__all__ = [
    "OCR",
    "TesseractOCR",
    "DocumentIntelligenceOCR",
    "BboxProcessor",
    "ImageAnalyzerEngine",
    "ImageRedactorEngine",
140 changes: 140 additions & 0 deletions presidio-image-redactor/presidio_image_redactor/document_intelligence_ocr.py
@@ -0,0 +1,140 @@
import os
from io import BytesIO
from typing import Optional, Sequence

import numpy as np
from PIL import Image

from presidio_image_redactor import OCR

from azure.ai.formrecognizer import (
    DocumentAnalysisClient,
    AnalyzedDocument,
    DocumentPage,
    Point,
)
from azure.core.credentials import AzureKeyCredential


class DocumentIntelligenceOCR(OCR):
    """OCR class that uses Microsoft's Document Intelligence OCR engine."""

    SUPPORTED_MODELS = [
        "prebuilt-document",
        "prebuilt-read",
        "prebuilt-layout",
        "prebuilt-contract",
        "prebuilt-healthInsuranceCard.us",
        "prebuilt-invoice",
        "prebuilt-receipt",
        "prebuilt-idDocument",
        "prebuilt-businessCard",
    ]

    def __init__(self,
                 endpoint: Optional[str] = None,
                 key: Optional[str] = None,
                 model_id: Optional[str] = "prebuilt-document"):
        if model_id not in DocumentIntelligenceOCR.SUPPORTED_MODELS:
            raise ValueError("Unsupported model id: %s" % model_id)

        # If endpoint and/or key are not passed, attempt to get from environment
        # variables
        if not endpoint:
            endpoint = os.getenv("DOCUMENT_INTELLIGENCE_ENDPOINT")

        if not key:
            key = os.getenv("DOCUMENT_INTELLIGENCE_KEY")

        if not key or not endpoint:
            raise ValueError("Endpoint and key must be specified")

        self.client = DocumentAnalysisClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(key)
        )
        self.model_id = model_id

    @staticmethod
    def _polygon_to_bbox(polygon: Sequence[Point]) -> tuple:
        """Return a (left, top, width, height) tuple covering the passed polygon."""
        # We need at least two points for a valid bounding box.
        if len(polygon) < 2:
            return (0, 0, 0, 0)

        left = min([int(p.x) for p in polygon])
        top = min([int(p.y) for p in polygon])
        right = max([int(p.x) for p in polygon])
        bottom = max([int(p.y) for p in polygon])
        width = right - left
        height = bottom - top
        return (left, top, width, height)

    @staticmethod
    def _page_to_bboxes(page: DocumentPage) -> dict:
        """Convert bounding boxes to uniform format."""
        # Presidio supports tesseract's output format only, so we format the
        # result in the same way.
        #
        # The expected format looks like:
        # {
        #     "left": [123, 345],
        #     "top": [0, 15],
        #     "width": [100, 75],
        #     "height": [25, 30],
        #     "conf": ["1", "0.87"],
        #     "text": ["JOHN", "DOE"],
        # }
        bounds = [DocumentIntelligenceOCR._polygon_to_bbox(word.polygon)
                  for word in page.words]

        return {
            "left": [box[0] for box in bounds],
            "top": [box[1] for box in bounds],
            "width": [box[2] for box in bounds],
            "height": [box[3] for box in bounds],
            "conf": [w.confidence for w in page.words],
            "text": [w.content for w in page.words],
        }

    def get_imgbytes(self, image: object, **kwargs) -> bytes:
        """Get the image bytes from the image object."""
        if isinstance(image, bytes):
            return image
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
            # Fall through to process the PIL image
        if isinstance(image, Image.Image):
            # Image is a PIL image, write it to a bytes stream
            ostream = BytesIO()
            image.save(ostream, 'PNG')
            imgbytes = ostream.getvalue()
        elif isinstance(image, str):
            # Image is a filename, read its contents as bytes
            with open(image, "rb") as f:
                imgbytes = f.read()
        else:
            raise ValueError("Unsupported image type: %s" % type(image))
        return imgbytes

    def analyze_document(self, imgbytes: bytes, **kwargs) -> AnalyzedDocument:
        """Analyze the document and return the result."""
        poller = self.client.begin_analyze_document(self.model_id, imgbytes, **kwargs)
        return poller.result()

    def perform_ocr(self, image: object, **kwargs) -> dict:
        """Perform OCR on the image.

        :param image: PIL Image/numpy array or file path (str) to be processed
        :param kwargs: Additional values for begin_analyze_document

        :return: results dictionary containing bboxes and text for each detected word
        """
        imgbytes = self.get_imgbytes(image)
        result = self.analyze_document(imgbytes, **kwargs)

        # Currently cannot handle more than one page.
        if len(result.pages) != 1:
            raise ValueError("DocumentIntelligenceOCR only supports 1 page documents")

        return DocumentIntelligenceOCR._page_to_bboxes(result.pages[0])
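
Taken together, a minimal end-to-end sketch of the class above, assuming credentials are set in the environment and `scan.png` is a hypothetical single-page image:
```
from presidio_image_redactor import DocumentIntelligenceOCR

di_ocr = DocumentIntelligenceOCR()  # credentials read from the environment
ocr_results = di_ocr.perform_ocr("scan.png")

# The result mirrors tesseract's format: parallel per-word lists.
for text, conf in zip(ocr_results["text"], ocr_results["conf"]):
    print(text, conf)
```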
8 changes: 8 additions & 0 deletions presidio-image-redactor/tests/conftest.py
@@ -1,6 +1,8 @@
import pydicom
import json
import os

from PIL import Image
from presidio_analyzer.recognizer_result import RecognizerResult

from presidio_image_redactor import ImageAnalyzerEngine
@@ -73,3 +75,9 @@ def get_mock_dicom_verify_results():
        results_json = json.load(json_file)

    return results_json


@pytest.fixture(scope="module")
def get_mock_png():
    filepath = f"{SCRIPT_DIR}/test_data/png_images/0_ORIGINAL.png"
    return Image.open(filepath)
18 changes: 18 additions & 0 deletions presidio-image-redactor/tests/engine_test_utils.py
@@ -0,0 +1,18 @@
from typing import Callable

import pytest


def must_succeed(engine_builder: Callable) -> Callable:
    def _must_succeed():
        engine = engine_builder()
        return engine
    return _must_succeed


def allow_failure(engine_builder: Callable) -> Callable:
    def _allow_failure():
        try:
            engine = engine_builder()
        except ValueError:
            pytest.skip(reason="Could not set up engine, skipping test")
        return engine
    return _allow_failure
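
A hypothetical sketch of how these helpers could be used in a test module; the test name and assertion are illustrative:
```
from presidio_image_redactor import DocumentIntelligenceOCR

from engine_test_utils import allow_failure

# Skips dependent tests when no credentials are configured, instead of
# failing on the ValueError raised by the constructor.
build_di_ocr = allow_failure(DocumentIntelligenceOCR)


def test_di_ocr_reads_words(get_mock_png):
    ocr = build_di_ocr()
    results = ocr.perform_ocr(get_mock_png)
    assert len(results["text"]) > 0
```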
24 changes: 16 additions & 8 deletions presidio-image-redactor/tests/integration/methods.py
@@ -3,17 +3,25 @@
import os
from functools import reduce

import numpy as np

from PIL import Image, ImageChops

IMAGE_SIMILARITY_PROPORTION = 0.95


def image_sim(image_one: Image, image_two: Image) -> float:
    # Compare whether two images are similar, by thresholding
    delta = ImageChops.difference(image_one, image_two).convert('L')
    # Count the black pixels: those that are exactly the same in both images
    num_zero = (np.array(delta.getdata()) == 0).sum()
    num_nonzero = (np.array(delta.getdata()) != 0).sum()
    # The proportion of identical pixels is the similarity score
    print(num_zero, num_nonzero, num_zero / (num_zero + num_nonzero))
    return num_zero / (num_zero + num_nonzero)


def compare_images(image_one: Image, image_two: Image) -> bool:
    return image_sim(image_one, image_two) >= IMAGE_SIMILARITY_PROPORTION


def get_resource_image(file_name: str) -> Image:
@@ -1,4 +1,4 @@
"""Integration test for dicom_image_pii_verify_engine
"""Integration test for dicom_image_pii_verify_engine.

Note we are not checking exact pixel data for the returned image
because that is covered by testing of the "verify" function in
@@ -7,10 +7,11 @@
import PIL
import pydicom

from presidio_image_redactor import DicomImagePiiVerifyEngine

PADDING_WIDTH = 25


def test_verify_correctly(
    get_mock_dicom_instance: pydicom.dataset.FileDataset,
    get_mock_dicom_verify_results: dict,
@@ -27,12 +28,13 @@ def test_verify_correctly(
        expected_ocr_results_labels.append(item["label"])

    # Act
    test_image_verify, test_ocr_results_formatted, _ = \
        DicomImagePiiVerifyEngine().verify_dicom_instance(
            instance=get_mock_dicom_instance,
            padding_width=PADDING_WIDTH,
            display_image=True,
            ocr_kwargs=None
        )

    # Check most OCR results (labels) are the same
    # Don't worry about position since that is implied in analyzer results
@@ -42,8 +44,9 @@
    test_common_labels = set(expected_ocr_results_labels).intersection(
        set(test_ocr_results_labels)
    )
    test_all_labels = \
        set(expected_ocr_results_labels).union(set(test_ocr_results_labels))

    # Assert
    assert isinstance(test_image_verify, PIL.Image.Image)
    assert len(test_common_labels) / len(test_all_labels) >= 0.5