6
6
import io
7
7
import itertools
8
8
import tempfile
9
- from typing import IO , Any , Iterator , Optional , Type
9
+ from typing import IO , Any , Iterator , Optional , Protocol , Type
10
10
11
11
# -- CT_* stands for "complex-type", an XML element type in docx parlance --
12
12
import docx
33
33
EmailAddress ,
34
34
Footer ,
35
35
Header ,
36
+ Image ,
36
37
Link ,
37
38
ListItem ,
38
39
NarrativeText ,
63
64
BlockItem : TypeAlias = "Paragraph | DocxTable"
64
65
65
66
67
def register_picture_partitioner(picture_partitioner: PicturePartitionerT) -> None:
    """Install `picture_partitioner` as the sub-partitioner used for DOCX pictures.

    This is the module-level entry point for registration. It does no work of its own; the
    registration is carried out by `DocxPartitionerOptions.register_picture_partitioner()`.
    """
    DocxPartitionerOptions.register_picture_partitioner(picture_partitioner)
70
+
71
+
72
+ # ================================================================================================
73
+ # DOCX DOMAIN MODEL DEFINITIONS
74
+ # ================================================================================================
75
+
76
+
77
class PicturePartitionerT(Protocol):
    """Interface a pluggable DOCX picture sub-partitioner must implement.

    Terminology: Microsoft Word calls an image a "picture". That term is used here for an image
    stored in a DOCX file, both to stay consistent with the DOCX domain and to keep it distinct
    from the `unstructured` `Image` element.

    Placement: a picture is either *inline* or *floating*. An inline picture behaves like one
    large character in the paragraph text and moves along with that text. A floating picture
    can be positioned freely, with text flowing around it.

    Location in the file: inline and floating pictures alike are defined inside a paragraph, and
    a paragraph may contain zero or more of them. A DOCX picture partitioner is therefore given
    a `docx` `Paragraph` object and generates one `Image` element per picture found in it.
    """

    @classmethod
    def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
        """Generate one `Image` element per picture contained in `paragraph`.

        `opts` is the options object for the current partitioning run.
        """
        ...
97
+
98
+
99
+ # ================================================================================================
100
+ # PARTITIONER
101
+ # ================================================================================================
102
+
103
+
66
104
@process_metadata ()
67
105
@add_metadata_with_filetype (FileType .DOCX )
68
106
@add_chunking_strategy
@@ -142,6 +180,16 @@ def partition_docx(
142
180
class DocxPartitionerOptions :
143
181
"""Encapsulates partitioning option validation, computation, and application of defaults."""
144
182
183
# -- `None` until a picture partitioner is registered via the classmethod
# -- `register_picture_partitioner()`, which assigns this class-level attribute.
_PicturePartitionerCls: Optional[PicturePartitionerT] = None
"""Sub-partitioner used to extract pictures from a paragraph as `Image` elements.

This value has module lifetime and is updated by calling the `register_picture_partitioner()`
function defined in this module. The value sent to `register_picture_partitioner()` must be a
pluggable sub-partitioner implementing the `PicturePartitionerT` interface. After
registration, all paragraphs in subsequently partitioned DOCX documents will be sent to this
sub-partitioner to extract images when so configured.
"""
192
+
145
193
def __init__ (
146
194
self ,
147
195
* ,
@@ -166,6 +214,11 @@ def __init__(
166
214
# -- options object maintains page-number state --
167
215
self ._page_counter = starting_page_number
168
216
217
+ @classmethod
218
+ def register_picture_partitioner (cls , picture_partitioner : PicturePartitionerT ):
219
+ """Specify a pluggable sub-partitioner to extract images from DOCX paragraphs."""
220
+ cls ._PicturePartitionerCls = picture_partitioner
221
+
169
222
@lazyproperty
170
223
def document (self ) -> Document :
171
224
"""The python-docx `Document` object loaded from file or filename."""
@@ -248,6 +301,16 @@ def page_number(self) -> int:
248
301
"""
249
302
return self ._page_counter
250
303
304
@lazyproperty
def picture_partitioner(self) -> PicturePartitionerT:
    """The sub-partitioner this partitioning run uses to extract DOCX images.

    Falls back to `_NullPicturePartitioner` when no picture partitioner has been registered.
    """
    # -- Scope note: one options instance is created per partitioning run (each document can
    # -- have its own options), so this value has partitioning-run scope. As a lazyproperty
    # -- it is computed on first reference only; every later reference in the same run gets
    # -- that same value, which keeps image extraction consistent within a single document.
    registered = self._PicturePartitionerCls
    return registered if registered else _NullPicturePartitioner
313
+
251
314
@lazyproperty
252
315
def strategy (self ) -> str :
253
316
"""The partitioning strategy for this document.
@@ -569,6 +632,7 @@ def iter_paragraph_items(paragraph: Paragraph) -> Iterator[Paragraph | RenderedP
569
632
for item in iter_paragraph_items (paragraph ):
570
633
if isinstance (item , Paragraph ):
571
634
yield from self ._classify_paragraph_to_element (item )
635
+ yield from self ._iter_paragraph_images (item )
572
636
else :
573
637
yield from self ._opts .increment_page_number ()
574
638
@@ -583,6 +647,13 @@ def _iter_paragraph_emphasis(self, paragraph: Paragraph) -> Iterator[dict[str, s
583
647
if run .italic :
584
648
yield {"text" : text , "tag" : "i" }
585
649
650
def _iter_paragraph_images(self, paragraph: Paragraph) -> Iterator[Image]:
    """Generate an `Image` element for each picture shape in `paragraph` when so configured."""
    # -- All of the work is handed to the pluggable picture partitioner selected by the
    # -- options object. The default picture partitioner extracts no images.
    opts = self._opts
    yield from opts.picture_partitioner.iter_elements(paragraph, opts)
656
+
586
657
def _iter_section_footers (self , section : Section ) -> Iterator [Footer ]:
587
658
"""Generate any `Footer` elements defined for this section.
588
659
@@ -925,3 +996,18 @@ def _table_emphasis(self, table: DocxTable) -> tuple[list[str], list[str]]:
925
996
"""[contents, tags] pair describing emphasized text in `table`."""
926
997
iter_tbl_emph , iter_tbl_emph_2 = itertools .tee (self ._iter_table_emphasis (table ))
927
998
return ([e ["text" ] for e in iter_tbl_emph ], [e ["tag" ] for e in iter_tbl_emph_2 ])
999
+
1000
+
1001
+ # ================================================================================================
1002
+ # SUB-PARTITIONERS
1003
+ # ================================================================================================
1004
+
1005
+
1006
class _NullPicturePartitioner:
    """Picture partitioner that ignores the paragraph and generates zero `Image` elements."""

    @classmethod
    def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
        """No-op picture partitioner: a generator that is exhausted immediately."""
        # -- delegating to an empty iterable keeps this a generator function that yields nothing --
        yield from ()
0 commit comments