Skip to content

Commit

Permalink
Fix layout parsing (#3754)
Browse files Browse the repository at this point in the history
  • Loading branch information
plutasnyy authored Oct 25, 2024
1 parent 2417f8e commit 5a91f0c
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 9 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.16.3-dev1
## 0.16.3-dev2

### Enhancements

Expand All @@ -7,6 +7,7 @@
### Fixes

* **V2 elements without first parent ID can be parsed**
* **Fix missing elements when a layout element is parsed in V2 ontology**


## 0.16.2
Expand Down
35 changes: 33 additions & 2 deletions test_unstructured/partition/html/test_html_to_ontology_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def test_when_class_is_missing_it_can_be_inferred_from_type():
expected_html = _wrap_with_body(
"""
<div class="Page">
<aside class='Sidebar'>Some text</aside>
<aside class='Sidebar'><p class='Paragraph'>Some text</p></aside>
</div>
"""
)
Expand Down Expand Up @@ -87,7 +87,7 @@ def test_when_class_is_wrong_tag_name_is_overwritten():
expected_html = _wrap_with_body(
"""
<div class="Page">
<aside class='Sidebar'>Some text</aside>
<aside class='Sidebar'><p class='Paragraph'>Some text</p></aside>
</div>
"""
)
Expand Down Expand Up @@ -535,6 +535,8 @@ def test_malformed_html():
# language=HTML
expected_html = """
<body class="Document">
<p class="Paragraph">
Unclosed comment
<div class="">
<p>
Expand All @@ -554,6 +556,7 @@ def test_malformed_html():
<p>
Paragraph with invalid characters: � � �
</p>
</p>
</body>
"""

Expand All @@ -563,3 +566,31 @@ def test_malformed_html():
parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))

assert parsed_ontology == expected_html


def test_text_is_wrapped_inside_layout_element():
    """Check that bare text placed directly inside a ``Page`` div is wrapped.

    The input holds the text node ``Text`` as a direct child of
    ``<div class="Page">``; after parsing to the ontology and rendering back
    to HTML, that text is expected inside ``<p class='Paragraph'>``.
    """
    # language=HTML
    source_markup = indent_html(
        _wrap_with_body(
            """
<div class="Page">
Text
</div>
"""
        )
    )

    # language=HTML
    wanted_markup = indent_html(
        _wrap_with_body(
            """
<div class="Page">
<p class='Paragraph'>Text</p>
</div>
"""
        )
    )

    root: OntologyElement = parse_html_to_ontology(source_markup)
    # IDs are stripped before comparing; both sides go through indent_html
    # so the comparison is made on the same normalized form.
    actual_markup = indent_html(remove_all_ids(root.to_html()))

    assert actual_markup == wanted_markup
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.3-dev1" # pragma: no cover
__version__ = "0.16.3-dev2" # pragma: no cover
13 changes: 8 additions & 5 deletions unstructured/partition/html/transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,17 @@ def ontology_to_unstructured_elements(
),
)
]

childreen = []
for child in ontology_element.children:
elements_to_return += ontology_to_unstructured_elements(
childreen += ontology_to_unstructured_elements(
child,
parent_id=ontology_element.id,
page_number=page_number,
depth=0 if isinstance(ontology_element, Document) else depth + 1,
filename=filename,
)

elements_to_return += childreen
else:
unstructured_element_class_name = ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME[
ontology_element.__class__.__name__
Expand All @@ -98,7 +100,6 @@ def ontology_to_unstructured_elements(
BeautifulSoup(html_code_of_ontology_element, "html.parser").get_text().strip()
)
# TODO value attribute from form input should be added to the text

unstructured_element = element_class(
text=element_text,
element_id=ontology_element.id,
Expand Down Expand Up @@ -255,8 +256,10 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
additional_attributes=escaped_attrs,
)

has_children = (ontology_class != UncategorizedText) and any(
isinstance(content, Tag) for content in soup.contents
has_children = (
(ontology_class != UncategorizedText)
and any(isinstance(content, Tag) for content in soup.contents)
or ontology_class().elementType == ElementTypeEnum.layout
)

if has_children:
Expand Down

0 comments on commit 5a91f0c

Please sign in to comment.