Skip to content

Commit

Permalink
fix: required fixes
Browse files (browse the repository at this point in the history)
  • Loading branch information
scanny committed Dec 14, 2024
1 parent 04f1f2d commit 807e822
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 61 deletions.
48 changes: 25 additions & 23 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,12 +245,14 @@ def _test(result):
_test(result)
else:
with open(filename, "rb") as test_file:
spooled_temp_file = SpooledTemporaryFile()
spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0)
result = pdf.partition_pdf(
file=spooled_temp_file, strategy=strategy, starting_page_number=starting_page_number
)
with SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0)
result = pdf.partition_pdf(
file=spooled_temp_file,
strategy=strategy,
starting_page_number=starting_page_number,
)
_test(result)


Expand Down Expand Up @@ -757,14 +759,14 @@ def test_partition_pdf_metadata_date(
)
else:
with open(filename, "rb") as test_file:
spooled_temp_file = SpooledTemporaryFile()
spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0)
elements = pdf.partition_pdf(
file=spooled_temp_file,
strategy=strategy,
metadata_last_modified=metadata_last_modified,
)
with SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0)
elements = pdf.partition_pdf(
file=spooled_temp_file,
strategy=strategy,
metadata_last_modified=metadata_last_modified,
)

assert {el.metadata.last_modified for el in elements} == {expected_last_modified}

Expand Down Expand Up @@ -1131,15 +1133,15 @@ def test_partition_pdf_with_ocr_only_strategy(
)
else:
with open(filename, "rb") as test_file:
spooled_temp_file = SpooledTemporaryFile()
spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0)
elements = pdf.partition_pdf(
file=spooled_temp_file,
strategy=PartitionStrategy.OCR_ONLY,
languages=["eng"],
is_image=is_image,
)
with SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0)
elements = pdf.partition_pdf(
file=spooled_temp_file,
strategy=PartitionStrategy.OCR_ONLY,
languages=["eng"],
is_image=is_image,
)

assert elements[0].metadata.languages == ["eng"]
# check pages
Expand Down
16 changes: 8 additions & 8 deletions test_unstructured/partition/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -921,16 +921,16 @@ def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(
def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
self, opts_args: dict[str, Any]
):
spooled_temp_file = tempfile.SpooledTemporaryFile()
spooled_temp_file.write(b"abcdefg")
opts_args["file"] = spooled_temp_file
opts = DocxPartitionerOptions(**opts_args)
with tempfile.SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file.write(b"abcdefg")
opts_args["file"] = spooled_temp_file
opts = DocxPartitionerOptions(**opts_args)

docx_file = opts._docx_file
docx_file = opts._docx_file

assert docx_file is not spooled_temp_file
assert isinstance(docx_file, io.BytesIO)
assert docx_file.getvalue() == b"abcdefg"
assert docx_file is not spooled_temp_file
assert isinstance(docx_file, io.BytesIO)
assert docx_file.getvalue() == b"abcdefg"

def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
self, opts_args: dict[str, Any]
Expand Down
24 changes: 13 additions & 11 deletions test_unstructured/partition/test_pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,12 @@ def test_partition_pptx_with_spooled_file():
Including one that does not have its read-pointer set to the start.
"""
with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file:
spooled_temp_file = tempfile.SpooledTemporaryFile()
spooled_temp_file.write(test_file.read())
with tempfile.SpooledTemporaryFile() as spooled_temp_file:
with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file:
spooled_temp_file.write(test_file.read())

elements = partition_pptx(file=spooled_temp_file)

assert elements == EXPECTED_PPTX_OUTPUT
for element in elements:
assert element.metadata.filename is None
Expand Down Expand Up @@ -701,16 +703,16 @@ def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(
def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
self, opts_args: dict[str, Any]
):
spooled_temp_file = tempfile.SpooledTemporaryFile()
spooled_temp_file.write(b"abcdefg")
opts_args["file"] = spooled_temp_file
opts = PptxPartitionerOptions(**opts_args)
with tempfile.SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file.write(b"abcdefg")
opts_args["file"] = spooled_temp_file
opts = PptxPartitionerOptions(**opts_args)

pptx_file = opts.pptx_file
pptx_file = opts.pptx_file

assert pptx_file is not spooled_temp_file
assert isinstance(pptx_file, io.BytesIO)
assert pptx_file.getvalue() == b"abcdefg"
assert pptx_file is not spooled_temp_file
assert isinstance(pptx_file, io.BytesIO)
assert pptx_file.getvalue() == b"abcdefg"

def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
self, opts_args: dict[str, Any]
Expand Down
10 changes: 6 additions & 4 deletions test_unstructured/partition/test_xlsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,12 @@ def test_partition_xlsx_from_filename():


def test_partition_xlsx_from_SpooledTemporaryFile_with_emoji():
f = tempfile.SpooledTemporaryFile()
with open("example-docs/emoji.xlsx", "rb") as g:
f.write(g.read())
elements = partition_xlsx(file=f, include_header=False)
with tempfile.SpooledTemporaryFile() as f:
with open("example-docs/emoji.xlsx", "rb") as g:
f.write(g.read())

elements = partition_xlsx(file=f, include_header=False)

assert sum(isinstance(element, Text) for element in elements) == 1
assert len(elements) == 1
assert clean_extra_whitespace(elements[0].text) == "🤠😅"
Expand Down
21 changes: 6 additions & 15 deletions unstructured/partition/html/transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ def ontology_to_unstructured_elements(
"""
elements_to_return = []
if ontology_element.elementType == ontology.ElementTypeEnum.layout and depth <= RECURSION_LIMIT:

if page_number is None and isinstance(ontology_element, ontology.Page):
page_number = ontology_element.page_number

Expand Down Expand Up @@ -200,10 +199,7 @@ def is_text_element(ontology_element: ontology.OntologyElement) -> bool:
if any(isinstance(ontology_element, class_) for class_ in text_classes):
return True

if any(ontology_element.elementType == category for category in text_categories):
return True

return False
return any(ontology_element.elementType == category for category in text_categories)


def is_inline_element(ontology_element: ontology.OntologyElement) -> bool:
Expand All @@ -218,10 +214,7 @@ def is_inline_element(ontology_element: ontology.OntologyElement) -> bool:
if any(isinstance(ontology_element, class_) for class_ in inline_classes):
return True

if any(ontology_element.elementType == category for category in inline_categories):
return True

return False
return any(ontology_element.elementType == category for category in inline_categories)


def unstructured_elements_to_ontology(
Expand Down Expand Up @@ -327,10 +320,7 @@ def is_empty(tag):
if tag.attrs:
return False

if not tag.get_text(strip=True):
return True

return False
return bool(not tag.get_text(strip=True))

def remove_empty_tags(soup):
for tag in soup.find_all():
Expand Down Expand Up @@ -419,8 +409,9 @@ def extract_tag_and_ontology_class_from_tag(

# Scenario 1: Valid Ontology Element
if soup.attrs.get("class"):
html_tag, element_class = soup.name, HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP.get(
(soup.name, soup.attrs["class"][0])
html_tag, element_class = (
soup.name,
HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP.get((soup.name, soup.attrs["class"][0])),
)

# Scenario 2: HTML tag incorrect, CSS class correct
Expand Down

0 comments on commit 807e822

Please sign in to comment.