Skip to content

Commit

Permalink
fix: preserve the order of shapes in partition_pptx output (#193)
Browse files Browse the repository at this point in the history
* order the shapes top to bottom and left to right

* added tests for ordering

* update change log and bump version

* more tests

* don't need enumerate

* n -> on
  • Loading branch information
MthwRobinson authored Feb 3, 2023
1 parent a7ca58e commit 014585e
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 5 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.4.5-dev5
## 0.4.6

* Loosen the default cap threshold to `0.5`.
* Add a `UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling
Expand All @@ -13,6 +13,7 @@
* Checks that titles and narrative text are at least 50% alpha characters.
* Restricts titles to a maximum word length. Adds a `UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`
environment variable for controlling the max number of words in a title.
* Updates `partition_pptx` to order the elements on each slide from top to bottom and left to right.

## 0.4.4

Expand Down
48 changes: 47 additions & 1 deletion test_unstructured/partition/test_pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
import pathlib
import pytest

import pptx

from unstructured.partition.pptx import partition_pptx
from unstructured.documents.elements import ListItem, NarrativeText, Title
from unstructured.documents.elements import ListItem, NarrativeText, Text, Title

DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
Expand Down Expand Up @@ -41,3 +43,47 @@ def test_partition_pptx_raises_with_both_specified():
def test_partition_pptx_raises_with_neither():
with pytest.raises(ValueError):
partition_pptx()


def test_partition_pptx_orders_elements(tmpdir):
    """Check that partition_pptx emits text boxes top to bottom, then left to right.

    The text boxes are added to the slide out of reading order, together with
    one box placed at a negative offset and one box with empty text. Neither of
    those two boxes should produce an element.
    """
    inches = pptx.util.Inches
    # (left, top, side length, text) in inches, listed in insertion order.
    box_specs = [
        (2, 2, 2, "This is lower and should come second"),
        (-10, -10, 1, "This is off the page and shouldn't appear"),
        (2, 2, 2, ""),
        (1, 1, 1, "This is higher and should come first"),
        (0.5, 1, 1, "-------------TOP-------------"),
    ]

    deck = pptx.Presentation()
    blank_layout = deck.slide_layouts[6]
    slide = deck.slides.add_slide(blank_layout)
    for left, top, side, text in box_specs:
        box = slide.shapes.add_textbox(
            inches(left), inches(top), inches(side), inches(side)
        )
        box.text_frame.text = text

    filename = os.path.join(tmpdir, "test-ordering.pptx")
    deck.save(filename)

    expected = [
        Text("-------------TOP-------------"),
        NarrativeText("This is higher and should come first"),
        NarrativeText("This is lower and should come second"),
    ]
    assert partition_pptx(filename=filename) == expected
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.5-dev5" # pragma: no cover
__version__ = "0.4.6" # pragma: no cover
18 changes: 16 additions & 2 deletions unstructured/partition/pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pptx

from unstructured.documents.elements import Element, ListItem, NarrativeText, Title
from unstructured.documents.elements import Element, ListItem, NarrativeText, Text, Title
from unstructured.partition.text_type import (
is_possible_narrative_text,
is_possible_title,
Expand Down Expand Up @@ -35,9 +35,16 @@ def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) ->

elements: List[Element] = list()
for slide in presentation.slides:
for shape in slide.shapes:
for shape in _order_shapes(slide.shapes):
# NOTE(robinson) - we don't deal with tables yet, but so future humans can find
# it again, here are docs on how to deal with tables. The check for tables should
# be `if shape.has_table`
# ref: https://python-pptx.readthedocs.io/en/latest/user/table.html#adding-a-table
if not shape.has_text_frame:
continue
# NOTE(robinson) - avoid processing shapes that are not on the actual slide
if shape.top < 0 or shape.left < 0:
continue
for paragraph in shape.text_frame.paragraphs:
text = paragraph.text
if text.strip() == "":
Expand All @@ -48,10 +55,17 @@ def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) ->
elements.append(NarrativeText(text=text))
elif is_possible_title(text):
elements.append(Title(text=text))
else:
elements.append(Text(text=text))

return elements


def _order_shapes(shapes):
    """Orders the shapes from top to bottom and left to right.

    Args:
        shapes: An iterable of objects exposing ``top`` and ``left`` attributes.

    Returns:
        A new list of the same shapes sorted by ``(top, left)``. The sort is
        stable, so shapes with equal coordinates keep their original order.

    A shape that has no explicit position may report ``None`` for ``top`` or
    ``left``. Comparing ``None`` with an integer raises ``TypeError``, so a
    missing coordinate is treated as ``0`` for ordering purposes only; the
    shape objects themselves are not modified.
    """
    return sorted(shapes, key=lambda shape: (shape.top or 0, shape.left or 0))


def _is_bulleted_paragraph(paragraph) -> bool:
"""Determines if the paragraph is bulleted by looking for a bullet character prefix. Bullet
characters in the openxml schema are represented by buChar"""
Expand Down

0 comments on commit 014585e

Please sign in to comment.