def test_partition_pptx_orders_elements(tmpdir):
    """Elements come back in top-to-bottom, left-to-right order, and the
    off-slide and empty text boxes contribute nothing to the output."""
    inches = pptx.util.Inches

    # (left, top, side length, text) for each text box, listed in insertion
    # order. The insertion order is deliberately different from reading order.
    text_boxes = [
        (2, 2, 2, "This is lower and should come second"),
        (-10, -10, 1, "This is off the page and shouldn't appear"),
        (2, 2, 2, ""),
        (1, 1, 1, "This is higher and should come first"),
        (0.5, 1, 1, "-------------TOP-------------"),
    ]

    presentation = pptx.Presentation()
    blank_layout = presentation.slide_layouts[6]
    slide = presentation.slides.add_slide(blank_layout)

    for left, top, side, text in text_boxes:
        text_box = slide.shapes.add_textbox(
            inches(left),
            inches(top),
            inches(side),
            inches(side),
        )
        text_box.text_frame.text = text

    filename = os.path.join(tmpdir, "test-ordering.pptx")
    presentation.save(filename)

    expected = [
        Text("-------------TOP-------------"),
        NarrativeText("This is higher and should come first"),
        NarrativeText("This is lower and should come second"),
    ]
    assert partition_pptx(filename=filename) == expected
# it again, here are docs on how to deal with tables. The check for tables should + # be `if shape.has_table` + # ref: https://python-pptx.readthedocs.io/en/latest/user/table.html#adding-a-table if not shape.has_text_frame: continue + # NOTE(robinson) - avoid processing shapes that are not on the actual slide + if shape.top < 0 or shape.left < 0: + continue for paragraph in shape.text_frame.paragraphs: text = paragraph.text if text.strip() == "": @@ -48,10 +55,17 @@ def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) -> elements.append(NarrativeText(text=text)) elif is_possible_title(text): elements.append(Title(text=text)) + else: + elements.append(Text(text=text)) return elements +def _order_shapes(shapes): + """Orders the shapes from top to bottom and left to right.""" + return sorted(shapes, key=lambda x: (x.top, x.left)) + + def _is_bulleted_paragraph(paragraph) -> bool: """Determines if the paragraph is bulleted by looking for a bullet character prefix. Bullet characters in the openxml schema are represented by buChar"""