Skip to content

Commit

Permalink
Add max recursion limit and fix to_text() method (#3773)
Browse files Browse the repository at this point in the history
  • Loading branch information
plutasnyy authored Nov 7, 2024
1 parent df156eb commit 66d1e5a
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 21 deletions.
12 changes: 11 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
## 0.16.5-dev0

### Enhancements

### Features

### Fixes
- **Fix parsing in the HTML v2 parser.** A maximum recursion limit is now enforced, and the `value` attribute is now correctly extracted from ontology elements.


## 0.16.4

### Enhancements
Expand All @@ -9,7 +19,7 @@

### Features

* **Add support for link extraction in pdf hi_res strategy.** The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents more effectively.
* **Add support for link extraction in pdf hi_res strategy.** The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents more effectively.

### Fixes

Expand Down
53 changes: 51 additions & 2 deletions test_unstructured/partition/html/test_html_to_ontology_parsing.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from bs4 import BeautifulSoup

from unstructured.documents.ontology import OntologyElement
from unstructured.documents.ontology import Form, FormFieldValue, OntologyElement, Page
from unstructured.partition.html.html_utils import indent_html
from unstructured.partition.html.transformations import parse_html_to_ontology
from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology


def _wrap_with_body(html: str) -> str:
Expand Down Expand Up @@ -605,3 +605,52 @@ def test_text_in_form_field_value():
form_field_value = page.children[0]
assert form_field_value.text == ""
assert form_field_value.to_text() == "Random Input Value"


def test_to_text_when_form_field():
    """Check that `Page.to_text()` yields the input value for a form wrapping a field value.

    Both the form and its nested field value carry the same `value` attribute; the page
    text is expected to equal that value.
    """
    expected_text = "Random Input Value"

    field_value = FormFieldValue(
        tag="input",
        additional_attributes={"value": expected_text},
    )
    form = Form(
        tag="input",
        additional_attributes={"value": expected_text},
        children=[field_value],
    )
    page = Page(children=[form])

    assert page.to_text(add_children=True) == expected_text


def test_recursion_limit_is_limiting_parsing():
    """Check that parsing stops descending after `RECURSION_LIMIT` nested levels.

    Paragraphs are nested deeper than the limit, the first-child chain of the parsed
    ontology is walked to its end, and the number of levels visited must equal
    `RECURSION_LIMIT`.
    """
    # Nest twice as deep as the limit so the cut-off is exercised whatever its value is.
    nesting_depth = RECURSION_LIMIT * 2

    # language=HTML
    broken_html = "some text"
    for _ in range(nesting_depth):
        broken_html = f"<p class='Paragraph'>{broken_html}</p>"
    ontology = parse_html_to_ontology(_wrap_with_body(broken_html))

    visited_levels = 1
    last_child = ontology.children[0]
    while last_child.children:
        last_child = last_child.children[0]
        visited_levels += 1

    # The deepest parsed element keeps the remaining markup as raw HTML in its `text`.
    assert last_child.text.startswith('<p class="Paragraph">')
    assert visited_levels == RECURSION_LIMIT


def test_get_text_when_recursion_limit_activated():
    """Check that `to_text()` on the deepest parsed element returns only the plain text.

    Paragraphs are nested deeper than `RECURSION_LIMIT`, the first-child chain of the
    parsed ontology is followed to its end, and that last element's `to_text()` must
    equal the innermost text.
    """
    innermost_text = "some text"
    # Nest twice as deep as the limit so the limit is always hit, even if its value changes.
    nesting_depth = RECURSION_LIMIT * 2

    broken_html = innermost_text
    for _ in range(nesting_depth):
        broken_html = f"<p class='Paragraph'>{broken_html}</p>"
    ontology = parse_html_to_ontology(_wrap_with_body(broken_html))

    last_child = ontology.children[0]
    while last_child.children:
        last_child = last_child.children[0]

    assert last_child.to_text() == innermost_text
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ def test_forms():
assert expected_html == parsed_html
expected_elements = _page_elements + [
Text(
text="Option 1 (Checked)",
text="2 Option 1 (Checked)",
element_id="2",
detection_origin="vlm_partitioner",
metadata=ElementMetadata(
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.4" # pragma: no cover
__version__ = "0.16.5-dev0" # pragma: no cover
19 changes: 7 additions & 12 deletions unstructured/documents/ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class ElementTypeEnum(str, Enum):


class OntologyElement(BaseModel):
text: Optional[str] = Field(None, description="Text content of the element")
text: Optional[str] = Field("", description="Text content of the element")
css_class_name: Optional[str] = Field(
default_factory=lambda: "", description="CSS class associated with the element"
)
Expand Down Expand Up @@ -90,7 +90,10 @@ def to_html(self, add_children=True) -> str:
return result_html

def to_text(self, add_children=True) -> str:
return " ".join(BeautifulSoup(self.to_html(add_children), "html.parser").stripped_strings)
if self.children and add_children:
children_text = " ".join(child.to_text().strip() for child in self.children)
return children_text
return BeautifulSoup(self.to_html()).get_text().strip()

def _construct_attribute_string(self, attributes: dict) -> str:
return " ".join(
Expand Down Expand Up @@ -450,15 +453,6 @@ class Form(OntologyElement):
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
allowed_tags: List[str] = Field(["form"], frozen=True)

def to_text(self, add_children=True) -> str:
texts = [self.text] if self.text else []

if add_children:
for child in self.children:
texts.append(child.to_text(add_children=True))

return " ".join(filter(None, texts)).strip()


class FormField(OntologyElement):
description: str = Field("A property value of a form", frozen=True)
Expand All @@ -472,7 +466,8 @@ class FormFieldValue(OntologyElement):
allowed_tags: List[str] = Field(["input"], frozen=True)

def to_text(self, add_children=True) -> str:
return super().to_text() + self.additional_attributes.get("value", "")
text = super().to_text() + self.additional_attributes.get("value", "")
return text.strip()


class Checkbox(OntologyElement):
Expand Down
12 changes: 8 additions & 4 deletions unstructured/partition/html/transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
UncategorizedText,
)

RECURSION_LIMIT = 50


def ontology_to_unstructured_elements(
ontology_element: OntologyElement,
Expand Down Expand Up @@ -68,7 +70,7 @@ def ontology_to_unstructured_elements(
list[Element]: A list of unstructured Element objects.
"""
elements_to_return = []
if ontology_element.elementType == ElementTypeEnum.layout:
if ontology_element.elementType == ElementTypeEnum.layout and depth <= RECURSION_LIMIT:

if page_number is None and isinstance(ontology_element, Page):
page_number = ontology_element.page_number
Expand Down Expand Up @@ -354,7 +356,7 @@ def remove_empty_tags(soup):
return str(soup)


def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> OntologyElement | None:
"""
Converts a BeautifulSoup Tag object into an OntologyElement object. This function is recursive.
First tries to recognize a class from Unstructured Ontology, then if class is matched tries
Expand All @@ -364,6 +366,7 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
Args:
soup (Tag): The BeautifulSoup Tag object to be converted.
recursion_depth (int): Flag to control limit of recursion depth.
Returns:
OntologyElement: The converted OntologyElement object.
Expand All @@ -384,12 +387,13 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
and any(isinstance(content, Tag) for content in soup.contents)
or ontology_class().elementType == ElementTypeEnum.layout
)
should_unwrap_html = has_children and recursion_depth <= RECURSION_LIMIT

if has_children:
if should_unwrap_html:
text = ""
children = [
(
parse_html_to_ontology_element(child)
parse_html_to_ontology_element(child, recursion_depth=recursion_depth + 1)
if isinstance(child, Tag)
else Paragraph(text=str(child).strip())
)
Expand Down

0 comments on commit 66d1e5a

Please sign in to comment.