feat(docx): add pluggable picture sub-partitioner (#3081)

scanny · web-flow · commit 47d28612f789 · 2024-05-23T18:46:30.000Z
**Summary**
Allow registration of a custom sub-partitioner that extracts images from
a DOCX paragraph.

**Additional Context**
- A custom image sub-partitioner must implement the
`PicturePartitionerT` interface defined in this PR. That is, it must
have an `.iter_elements()` classmethod that takes the paragraph and the
partitioner options and generates zero or more `Image` elements from the
paragraph.
- The custom image sub-partitioner must be registered by passing the
class to `register_picture_partitioner()`.
- The default image sub-partitioner is `_NullPicturePartitioner`, which
does nothing (it generates no `Image` elements).
- The registered picture partitioner is called once for each paragraph.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ### Enhancements
 
 * **Move `category` field from Text class to Element class.**
+* **`partition_docx()` now supports pluggable picture sub-partitioners.** A sub-partitioner that accepts a DOCX `Paragraph` and generates elements is now supported. This allows adding a custom sub-partitioner that extracts images and applies OCR or summarization to each image.
 
 ### Features
 
diff --git a/example-docs/contains-pictures.docx b/example-docs/contains-pictures.docx
diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py
@@ -4,15 +4,17 @@
 
 from __future__ import annotations
 
+import hashlib
 import io
 import pathlib
 import re
 import tempfile
-from typing import Any
+from typing import Any, Iterator
 
 import docx
 import pytest
 from docx.document import Document
+from docx.text.paragraph import Paragraph
 from pytest_mock import MockFixture
 
 from test_unstructured.unit_utils import (
@@ -31,6 +33,7 @@
     Element,
     Footer,
     Header,
+    Image,
     ListItem,
     NarrativeText,
     PageBreak,
@@ -39,7 +42,12 @@
     Text,
     Title,
 )
-from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx
+from unstructured.partition.docx import (
+    DocxPartitionerOptions,
+    _DocxPartitioner,
+    partition_docx,
+    register_picture_partitioner,
+)
 from unstructured.partition.utils.constants import (
     UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
     PartitionStrategy,
@@ -622,6 +630,45 @@ def test_it_considers_text_inside_shapes():
     ]
 
 
+# -- image sub-partitioning behaviors ------------------------------------------------------------
+
+
+def test_partition_docx_generates_no_Image_elements_by_default():
+    # -- with no picture partitioner registered, the default null partitioner is in effect --
+    elements = partition_docx(example_doc_path("contains-pictures.docx"))
+    assert not any(isinstance(e, Image) for e in elements)
+
+
+def test_partition_docx_uses_registered_picture_partitioner():
+    class FakeParagraphPicturePartitioner:
+        @classmethod
+        def iter_elements(
+            cls, paragraph: Paragraph, opts: DocxPartitionerOptions
+        ) -> Iterator[Image]:
+            call_hash = hashlib.sha1(f"{paragraph.text}{opts.strategy}".encode()).hexdigest()
+            yield Image(f"Image with hash {call_hash}, strategy: {opts.strategy}")
+
+    register_picture_partitioner(FakeParagraphPicturePartitioner)
+    try:
+        elements = partition_docx(example_doc_path("contains-pictures.docx"))
+    finally:
+        # -- registration has module lifetime, so de-register the fake even when partitioning
+        # -- raises; otherwise it would leak into other tests in the same test run
+        DocxPartitionerOptions._PicturePartitionerCls = None
+
+    assert len(elements) == 11
+    image_elements = [e for e in elements if isinstance(e, Image)]
+    assert len(image_elements) == 6
+    assert [e.text for e in image_elements] == [
+        "Image with hash 429de54e71f1f0fb395b6f6191961a3ea1b64dc0, strategy: hi_res",
+        "Image with hash 5e0cd2c62809377d8ce7422d8ca6b0cf5f4453bc, strategy: hi_res",
+        "Image with hash 429de54e71f1f0fb395b6f6191961a3ea1b64dc0, strategy: hi_res",
+        "Image with hash ccbd34be6096544babc391890cb0849c24cc046c, strategy: hi_res",
+        "Image with hash a41b819c7b4a9750ec0f9198c59c2057d39c653c, strategy: hi_res",
+        "Image with hash ba0dc2a1205af8f6d9e06c8d415df096b0a9c428, strategy: hi_res",
+    ]
+
+
 # -- module-level fixtures -----------------------------------------------------------------------
 
 
diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
@@ -6,7 +6,7 @@
 import io
 import itertools
 import tempfile
-from typing import IO, Any, Iterator, Optional, Type
+from typing import IO, Any, Iterator, Optional, Protocol, Type
 
 # -- CT_* stands for "complex-type", an XML element type in docx parlance --
 import docx
@@ -33,6 +33,7 @@
     EmailAddress,
     Footer,
     Header,
+    Image,
     Link,
     ListItem,
     NarrativeText,
@@ -63,6 +64,43 @@
 BlockItem: TypeAlias = "Paragraph | DocxTable"
 
 
+def register_picture_partitioner(picture_partitioner: PicturePartitionerT) -> None:
+    """Register the sub-partitioner class used to extract pictures in subsequent DOCX runs."""
+    DocxPartitionerOptions.register_picture_partitioner(picture_partitioner)
+
+
+# ================================================================================================
+# DOCX DOMAIN MODEL DEFINITIONS
+# ================================================================================================
+
+
+class PicturePartitionerT(Protocol):
+    """Defines the interface for a pluggable sub-partitioner for DOCX Picture objects.
+
+    In Microsoft Word parlance, an image is a "picture". We use that term here for an image in a
+    DOCX file both for domain consistency and because it conveniently avoids confusion with an
+    `unstructured` `Image` element.
+
+    A picture can be either *inline* or *floating*. An inline picture is treated like a big
+    character in the text of a paragraph, moving with the text. A floating picture can be moved
+    freely and text flows around it.
+
+    Both inline and floating pictures are defined inside a paragraph in the DOCX file. A paragraph
+    can have zero or more pictures. A DOCX picture partitioner takes a `docx` `Paragraph` object
+    and generates an `Image` element for each picture found in that paragraph.
+    """
+
+    @classmethod
+    def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
+        """Generate an `Image` element for each picture in `paragraph`."""
+        ...  # -- stub only; `opts` is the options object of the current partitioning run
+
+
+# ================================================================================================
+# PARTITIONER
+# ================================================================================================
+
+
 @process_metadata()
 @add_metadata_with_filetype(FileType.DOCX)
 @add_chunking_strategy
@@ -142,6 +180,16 @@ def partition_docx(
 class DocxPartitionerOptions:
     """Encapsulates partitioning option validation, computation, and application of defaults."""
 
+    _PicturePartitionerCls = None
+    """Sub-partitioner used to extract pictures from a paragraph as `Image` elements.
+
+    This value has module lifetime and is updated by calling the `register_picture_partitioner()`
+    function defined in this module. The value sent to `register_picture_partitioner()` must be a
+    pluggable sub-partitioner implementing the `PicturePartitionerT` interface. After
+    registration, all paragraphs in subsequently partitioned DOCX documents will be sent to this
+    sub-partitioner to extract images when so configured.
+    """
+
     def __init__(
         self,
         *,
@@ -166,6 +214,11 @@ def __init__(
         # -- options object maintains page-number state --
         self._page_counter = starting_page_number
 
+    @classmethod
+    def register_picture_partitioner(cls, picture_partitioner: PicturePartitionerT) -> None:
+        """Set the class-wide sub-partitioner used to extract images from DOCX paragraphs."""
+        cls._PicturePartitionerCls = picture_partitioner
+
     @lazyproperty
     def document(self) -> Document:
         """The python-docx `Document` object loaded from file or filename."""
@@ -248,6 +301,16 @@ def page_number(self) -> int:
         """
         return self._page_counter
 
+    @lazyproperty
+    def picture_partitioner(self) -> PicturePartitionerT:
+        """The registered picture sub-partitioner, or the no-op default when there is none."""
+        # -- Note this value has partitioning-run scope. An instance of this options class is
+        # -- instantiated once per partitioning run (each document can have different options).
+        # -- Because this is a lazyproperty, it is computed only on the first reference. All
+        # -- subsequent references during the same partitioning run will get the same value. This
+        # -- ensures image extraction is processed consistently within a single document.
+        return self._PicturePartitionerCls or _NullPicturePartitioner
+
     @lazyproperty
     def strategy(self) -> str:
         """The partitioning strategy for this document.
@@ -569,6 +632,7 @@ def iter_paragraph_items(paragraph: Paragraph) -> Iterator[Paragraph | RenderedP
         for item in iter_paragraph_items(paragraph):
             if isinstance(item, Paragraph):
                 yield from self._classify_paragraph_to_element(item)
+                yield from self._iter_paragraph_images(item)
             else:
                 yield from self._opts.increment_page_number()
 
@@ -583,6 +647,13 @@ def _iter_paragraph_emphasis(self, paragraph: Paragraph) -> Iterator[dict[str, s
             if run.italic:
                 yield {"text": text, "tag": "i"}
 
+    def _iter_paragraph_images(self, paragraph: Paragraph) -> Iterator[Image]:
+        """Generate `Image` elements for `paragraph` using the configured picture partitioner."""
+        # -- All picture handling is delegated to the pluggable picture partitioner; the default
+        # -- one generates no elements, so no images are extracted unless one is registered.
+        opts = self._opts
+        yield from opts.picture_partitioner.iter_elements(paragraph, opts)
+
     def _iter_section_footers(self, section: Section) -> Iterator[Footer]:
         """Generate any `Footer` elements defined for this section.
 
@@ -925,3 +996,18 @@ def _table_emphasis(self, table: DocxTable) -> tuple[list[str], list[str]]:
         """[contents, tags] pair describing emphasized text in `table`."""
         iter_tbl_emph, iter_tbl_emph_2 = itertools.tee(self._iter_table_emphasis(table))
         return ([e["text"] for e in iter_tbl_emph], [e["tag"] for e in iter_tbl_emph_2])
+
+
+# ================================================================================================
+# SUB-PARTITIONERS
+# ================================================================================================
+
+
+class _NullPicturePartitioner:
+    """Does not parse the provided paragraph for pictures and generates zero `Image` elements."""
+
+    @classmethod
+    def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
+        """Generate no elements; `paragraph` and `opts` are accepted but never examined."""
+        return  # -- ends the generator immediately, before anything is yielded
+        yield  # -- never reached; its presence is what makes this function a generator