Skip to content

Commit 47d2861

Browse files
authored
feat(docx): add pluggable picture sub-partitioner (#3081)
**Summary** Allow registration of a custom sub-partitioner that extracts images from a DOCX paragraph.

**Additional Context**
- A custom image sub-partitioner must implement the `PicturePartitionerT` interface defined in this PR. In short, it must provide an `.iter_elements()` classmethod that takes the paragraph (and the partitioner options) and generates zero or more `Image` elements from it.
- The custom image sub-partitioner must be registered by passing the class to `register_picture_partitioner()`.
- The default image sub-partitioner is `_NullPicturePartitioner`, which does nothing.
- The registered picture partitioner is called once for each paragraph.
1 parent 171b5df commit 47d2861

File tree

4 files changed

+137
-3
lines changed

4 files changed

+137
-3
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
### Enhancements
44

55
* **Move `category` field from Text class to Element class.**
6+
* **`partition_docx()` now supports pluggable picture sub-partitioners.** A sub-partitioner that accepts a DOCX `Paragraph` and generates `Image` elements can now be registered. This allows adding a custom sub-partitioner that extracts images and applies OCR or summarization to them.
67

78
### Features
89

example-docs/contains-pictures.docx

92.9 KB
Binary file not shown.

test_unstructured/partition/test_docx.py

+49-2
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,17 @@
44

55
from __future__ import annotations
66

7+
import hashlib
78
import io
89
import pathlib
910
import re
1011
import tempfile
11-
from typing import Any
12+
from typing import Any, Iterator
1213

1314
import docx
1415
import pytest
1516
from docx.document import Document
17+
from docx.text.paragraph import Paragraph
1618
from pytest_mock import MockFixture
1719

1820
from test_unstructured.unit_utils import (
@@ -31,6 +33,7 @@
3133
Element,
3234
Footer,
3335
Header,
36+
Image,
3437
ListItem,
3538
NarrativeText,
3639
PageBreak,
@@ -39,7 +42,12 @@
3942
Text,
4043
Title,
4144
)
42-
from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx
45+
from unstructured.partition.docx import (
46+
DocxPartitionerOptions,
47+
_DocxPartitioner,
48+
partition_docx,
49+
register_picture_partitioner,
50+
)
4351
from unstructured.partition.utils.constants import (
4452
UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
4553
PartitionStrategy,
@@ -622,6 +630,45 @@ def test_it_considers_text_inside_shapes():
622630
]
623631

624632

633+
# -- image sub-partitioning behaviors ------------------------------------------------------------
634+
635+
636+
def test_partition_docx_generates_no_Image_elements_by_default():
    """Partitioning a document that contains pictures emits no `Image` element by default."""
    elements = partition_docx(example_doc_path("contains-pictures.docx"))

    assert not any(isinstance(e, Image) for e in elements)
640+
641+
642+
def test_partition_docx_uses_registered_picture_partitioner():
    """`partition_docx()` emits the `Image` elements generated by a registered sub-partitioner."""

    class FakeParagraphPicturePartitioner:
        @classmethod
        def iter_elements(
            cls, paragraph: Paragraph, opts: DocxPartitionerOptions
        ) -> Iterator[Image]:
            # -- the hash identifies which paragraph (and strategy) produced each element --
            call_hash = hashlib.sha1(f"{paragraph.text}{opts.strategy}".encode()).hexdigest()
            yield Image(f"Image with hash {call_hash}, strategy: {opts.strategy}")

    register_picture_partitioner(FakeParagraphPicturePartitioner)
    try:
        elements = partition_docx(example_doc_path("contains-pictures.docx"))
    finally:
        # -- picture-partitioner registration has module-lifetime, so need to de-register this
        # -- fake so other tests in same test-run don't use it. Doing so in `finally` ensures
        # -- the fake is removed even when partitioning raises.
        DocxPartitionerOptions._PicturePartitionerCls = None

    assert len(elements) == 11
    image_elements = [e for e in elements if isinstance(e, Image)]
    assert len(image_elements) == 6
    assert [e.text for e in image_elements] == [
        "Image with hash 429de54e71f1f0fb395b6f6191961a3ea1b64dc0, strategy: hi_res",
        "Image with hash 5e0cd2c62809377d8ce7422d8ca6b0cf5f4453bc, strategy: hi_res",
        "Image with hash 429de54e71f1f0fb395b6f6191961a3ea1b64dc0, strategy: hi_res",
        "Image with hash ccbd34be6096544babc391890cb0849c24cc046c, strategy: hi_res",
        "Image with hash a41b819c7b4a9750ec0f9198c59c2057d39c653c, strategy: hi_res",
        "Image with hash ba0dc2a1205af8f6d9e06c8d415df096b0a9c428, strategy: hi_res",
    ]
670+
671+
625672
# -- module-level fixtures -----------------------------------------------------------------------
626673

627674

unstructured/partition/docx.py

+87-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import io
77
import itertools
88
import tempfile
9-
from typing import IO, Any, Iterator, Optional, Type
9+
from typing import IO, Any, Iterator, Optional, Protocol, Type
1010

1111
# -- CT_* stands for "complex-type", an XML element type in docx parlance --
1212
import docx
@@ -33,6 +33,7 @@
3333
EmailAddress,
3434
Footer,
3535
Header,
36+
Image,
3637
Link,
3738
ListItem,
3839
NarrativeText,
@@ -63,6 +64,43 @@
6364
BlockItem: TypeAlias = "Paragraph | DocxTable"
6465

6566

67+
def register_picture_partitioner(picture_partitioner: PicturePartitionerT) -> None:
    """Specify a pluggable sub-partitioner to be used for partitioning DOCX images.

    `picture_partitioner` is the sub-partitioner class; it must implement the
    `PicturePartitionerT` interface. This is a module-level convenience that delegates to
    `DocxPartitionerOptions.register_picture_partitioner()`.
    """
    DocxPartitionerOptions.register_picture_partitioner(picture_partitioner)
70+
71+
72+
# ================================================================================================
73+
# DOCX DOMAIN MODEL DEFINITIONS
74+
# ================================================================================================
75+
76+
77+
class PicturePartitionerT(Protocol):
    """Defines the interface for a pluggable sub-partitioner for DOCX Picture objects.

    In Microsoft Word parlance, an image is a "picture". We use that term here for an image in a
    DOCX file both for domain consistency and because it conveniently avoids confusion with an
    `unstructured` `Image` element.

    A picture can be either *inline* or *floating*. An inline picture is treated like a big
    character in the text of a paragraph, moving with the text. A floating picture can be moved
    freely and text flows around it.

    Both inline and floating pictures are defined inside a paragraph in the DOCX file. A paragraph
    can have zero or more pictures. A DOCX picture partitioner takes a `docx` `Paragraph` object
    and generates an `Image` element for each picture found in that paragraph.
    """

    @classmethod
    def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
        """Generate an `Image` element for each picture in `paragraph`.

        `opts` is the options object for the current partitioning run.
        """
        ...
97+
98+
99+
# ================================================================================================
100+
# PARTITIONER
101+
# ================================================================================================
102+
103+
66104
@process_metadata()
67105
@add_metadata_with_filetype(FileType.DOCX)
68106
@add_chunking_strategy
@@ -142,6 +180,16 @@ def partition_docx(
142180
class DocxPartitionerOptions:
143181
"""Encapsulates partitioning option validation, computation, and application of defaults."""
144182

183+
_PicturePartitionerCls = None
184+
"""Sub-partitioner used to extract pictures from a paragraph as `Image` elements.
185+
186+
This value has module lifetime and is updated by calling the `register_picture_partitioner()`
187+
function defined in this module. The value sent to `register_picture_partitioner()` must be a
188+
pluggable sub-partitioner implementing the `PicturePartitionerT` interface. After
189+
registration, all paragraphs in subsequently partitioned DOCX documents will be sent to this
190+
sub-partitioner to extract images when so configured.
191+
"""
192+
145193
def __init__(
146194
self,
147195
*,
@@ -166,6 +214,11 @@ def __init__(
166214
# -- options object maintains page-number state --
167215
self._page_counter = starting_page_number
168216

217+
@classmethod
def register_picture_partitioner(cls, picture_partitioner: PicturePartitionerT) -> None:
    """Specify a pluggable sub-partitioner to extract images from DOCX paragraphs.

    The sub-partitioner is stored on the class (not an instance), replacing any previously
    registered one.
    """
    cls._PicturePartitionerCls = picture_partitioner
221+
169222
@lazyproperty
170223
def document(self) -> Document:
171224
"""The python-docx `Document` object loaded from file or filename."""
@@ -248,6 +301,16 @@ def page_number(self) -> int:
248301
"""
249302
return self._page_counter
250303

304+
@lazyproperty
def picture_partitioner(self) -> PicturePartitionerT:
    """The sub-partitioner to use for DOCX image extraction.

    This is the registered picture partitioner when there is one, otherwise the no-op
    `_NullPicturePartitioner`.
    """
    # -- Note this value has partitioning-run scope. An instance of this options class is
    # -- instantiated once per partitioning run (each document can have different options).
    # -- Because this is a lazyproperty, it is computed only on the first reference. All
    # -- subsequent references during the same partitioning run will get the same value. This
    # -- ensures image extraction is processed consistently within a single document.
    return self._PicturePartitionerCls or _NullPicturePartitioner
313+
251314
@lazyproperty
252315
def strategy(self) -> str:
253316
"""The partitioning strategy for this document.
@@ -569,6 +632,7 @@ def iter_paragraph_items(paragraph: Paragraph) -> Iterator[Paragraph | RenderedP
569632
for item in iter_paragraph_items(paragraph):
570633
if isinstance(item, Paragraph):
571634
yield from self._classify_paragraph_to_element(item)
635+
yield from self._iter_paragraph_images(item)
572636
else:
573637
yield from self._opts.increment_page_number()
574638

@@ -583,6 +647,13 @@ def _iter_paragraph_emphasis(self, paragraph: Paragraph) -> Iterator[dict[str, s
583647
if run.italic:
584648
yield {"text": text, "tag": "i"}
585649

650+
def _iter_paragraph_images(self, paragraph: Paragraph) -> Iterator[Image]:
    """Generate `Image` element for each picture shape in `paragraph` when so configured."""
    # -- Delegate this job to the pluggable Picture partitioner. Note the default picture
    # -- partitioner does not extract images.
    PicturePartitionerCls = self._opts.picture_partitioner
    yield from PicturePartitionerCls.iter_elements(paragraph, self._opts)
656+
586657
def _iter_section_footers(self, section: Section) -> Iterator[Footer]:
587658
"""Generate any `Footer` elements defined for this section.
588659
@@ -925,3 +996,18 @@ def _table_emphasis(self, table: DocxTable) -> tuple[list[str], list[str]]:
925996
"""[contents, tags] pair describing emphasized text in `table`."""
926997
iter_tbl_emph, iter_tbl_emph_2 = itertools.tee(self._iter_table_emphasis(table))
927998
return ([e["text"] for e in iter_tbl_emph], [e["tag"] for e in iter_tbl_emph_2])
999+
1000+
1001+
# ================================================================================================
1002+
# SUB-PARTITIONERS
1003+
# ================================================================================================
1004+
1005+
1006+
class _NullPicturePartitioner:
    """Does not parse the provided paragraph for pictures and generates zero `Image` elements."""

    @classmethod
    def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
        """No-op picture partitioner."""
        # -- The unreachable `yield` makes this a generator function, so callers can still
        # -- `yield from` the result; the immediate `return` makes that generator empty.
        return
        yield

0 commit comments

Comments
 (0)