fix: html incorrectly categorizing text (#3841)

Fixes #3666

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: scanny <scanny@users.noreply.github.com>
Unstructured-IO · Dec 18, 2024 · b3a2dd4
1 parent 9ece0b5
commit b3a2dd4
Show file tree

Hide file tree

Showing 39 changed files with 187 additions and 13,557 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.16.12-dev4
+## 0.16.12-dev5
 
 ### Enhancements
 
@@ -11,6 +11,7 @@
 - **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
 - **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
 - **Improve element-type mapping for Chinese text.** Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements.
+- **Improve element-type mapping for HTML.** Fixes bug where certain non-title elements were classified as `Title`.
 
 ## 0.16.11
 

diff --git a/test_unstructured/metrics/test_element_type.py b/test_unstructured/metrics/test_element_type.py
@@ -19,7 +19,7 @@
             "fake-email.txt",
             {
                 ("NarrativeText", None): 1,
-                ("Title", 0): 1,
+                ("UncategorizedText", None): 1,
                 ("ListItem", 1): 2,
             },
         ),
@@ -50,7 +50,7 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in
         (
             "fake-email.txt",
             {
-                ("Title", 0): 1,
+                ("UncategorizedText", None): 1,
                 ("ListItem", 1): 2,
                 ("NarrativeText", None): 2,
             },

diff --git a/test_unstructured/partition/html/test_parser.py b/test_unstructured/partition/html/test_parser.py
@@ -384,29 +384,26 @@ def it_generates_the_document_elements_from_the_Flow_element(self):
         elements = div.iter_elements()
 
         e = next(elements)
-        assert e == Title("Text of div with hierarchical phrasing content before first block item")
+        assert e == Text("Text of div with hierarchical phrasing content before first block item")
         assert e.metadata.to_dict() == {
-            "category_depth": 0,
             "emphasized_text_contents": ["with", "hierarchical", "phrasing"],
             "emphasized_text_tags": ["b", "bi", "b"],
         }
         e = next(elements)
         assert e == NarrativeText("Click here to see the blurb for this block item.")
         assert e.metadata.to_dict() == {"link_texts": ["here"], "link_urls": ["http://blurb.io"]}
         e = next(elements)
-        assert e == Title("tail of block item with hierarchical phrasing content")
+        assert e == Text("tail of block item with hierarchical phrasing content")
         assert e.metadata.to_dict() == {
-            "category_depth": 0,
             "emphasized_text_contents": ["with", "hierarchical", "phrasing"],
             "emphasized_text_tags": ["b", "bi", "b"],
         }
         e = next(elements)
-        assert e == Title("second block item")
-        assert e.metadata.to_dict() == {"category_depth": 0}
+        assert e == Text("second block item")
+        assert e.metadata.to_dict() == {}
         e = next(elements)
-        assert e == Title("tail of block item with hierarchical phrasing content")
+        assert e == Text("tail of block item with hierarchical phrasing content")
         assert e.metadata.to_dict() == {
-            "category_depth": 0,
             "emphasized_text_contents": ["with", "hierarchical"],
             "emphasized_text_tags": ["b", "bi"],
         }
@@ -664,22 +661,22 @@ def it_generates_text_segments_for_its_text_and_children_and_tail(
         ("html_text", "expected_value"),
         [
             # -- Phrasing with nested block but no text or tail produces only element for block --
-            ("<strong><p>aaa</p></strong>", [Title("aaa")]),
+            ("<strong><p>aaa</p></strong>", [Text("aaa")]),
             # -- Phrasing with text produces annotated text-segment for the text --
             (
                 "<strong>aaa<p>bbb</p></strong>",
                 [
                     TextSegment(
                         "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
                     ),
-                    Title("bbb"),
+                    Text("bbb"),
                 ],
             ),
             # -- Phrasing with tail produces annotated text-segment for the tail --
             (
                 "<strong><p>aaa</p>bbb</strong>",
                 [
-                    Title("aaa"),
+                    Text("aaa"),
                     TextSegment(
                         "bbb", {"emphasized_text_contents": "bbb", "emphasized_text_tags": "b"}
                     ),
@@ -692,7 +689,7 @@ def it_generates_text_segments_for_its_text_and_children_and_tail(
                     TextSegment(
                         "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
                     ),
-                    Title("bbb"),
+                    Text("bbb"),
                     TextSegment(
                         "ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "b"}
                     ),
@@ -776,15 +773,15 @@ def it_generates_text_segments_for_its_children_and_their_tails(
             # -- a phrasing element with no block children produces no elements --
             ("<dfn></dfn>", "", []),
             # -- a child block element produces an element --
-            ("<kbd><p>aaa</p></kbd>", "", [Title("aaa")]),
+            ("<kbd><p>aaa</p></kbd>", "", [Text("aaa")]),
             # -- a child block element with a tail also produces a text-segment for the tail --
-            ("<kbd><p>aaa</p>bbb</kbd>", "", [Title("aaa"), TextSegment("bbb", {})]),
+            ("<kbd><p>aaa</p>bbb</kbd>", "", [Text("aaa"), TextSegment("bbb", {})]),
             # -- and also text-segments for phrasing following the tail --
             (
                 "<kbd><p>aaa</p>bbb<mark>ccc</mark>ddd</kbd>",
                 "",
                 [
-                    Title("aaa"),
+                    Text("aaa"),
                     TextSegment("bbb", {}),
                     TextSegment("ccc", {}),
                     TextSegment("ddd", {}),
@@ -798,7 +795,7 @@ def it_generates_text_segments_for_its_children_and_their_tails(
                     TextSegment(
                         "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
                     ),
-                    Title("bbb"),
+                    Text("bbb"),
                     TextSegment(
                         "ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "b"}
                     ),
@@ -872,7 +869,7 @@ def and_it_generates_elements_for_its_block_children(
                 [
                     TextSegment("aaa", {}),
                     TextSegment("bbb", {}),
-                    Title("ccc"),
+                    Text("ccc"),
                     TextSegment("ddd", {}),
                     TextSegment("eee", {}),
                 ],
@@ -996,7 +993,7 @@ def it_generates_enclosed_block_items_as_separate_elements(self):
                     "link_urls": ["http://eie.io"],
                 },
             ),
-            Title("one with"),
+            Text("one with"),
             TextSegment(
                 " the Force.",
                 {

diff --git a/test_unstructured/partition/html/test_partition.py b/test_unstructured/partition/html/test_partition.py
@@ -72,7 +72,7 @@ def test_partition_html_accepts_a_file_path(tmp_path: pathlib.Path):
     assert elements == [
         Title("A Great and Glorious Section"),
         NarrativeText("Dear Leader is the best. He is such a wonderful engineer!"),
-        Title("Another Magnificent paragraph"),
+        Text("Another Magnificent paragraph"),
         NarrativeText("The prior element is a title based on its capitalization patterns!"),
         Table("I'm in a table"),
         Title("A New Beginning"),
@@ -201,7 +201,7 @@ def test_partition_html_processes_chinese_chracters():
 
 def test_emoji_appears_with_emoji_utf8_code():
     assert partition_html(text='<html charset="utf-8"><p>Hello &#128512;</p></html>') == [
-        Title("Hello 😀")
+        Text("Hello 😀")
     ]
 
 
@@ -575,10 +575,10 @@ def test_pre_tag_parsing_respects_order():
             "<div>The Big Blue Bear</div>\n"
         )
     ) == [
-        Title("The Big Brown Bear"),
+        Text("The Big Brown Bear"),
         NarrativeText("The big brown bear is growling."),
         NarrativeText("The big brown bear is sleeping."),
-        Title("The Big Blue Bear"),
+        Text("The Big Blue Bear"),
     ]
 
 
@@ -604,7 +604,7 @@ def test_partition_html_br_tag_parsing():
 
     assert elements == [
         Title("Header 1"),
-        Title("Text"),
+        Text("Text"),
         Title("Header 2"),
         Text(
             "    Param1 = Y\nParam2 = 1\nParam3 = 2\nParam4 = A\n    \nParam5 = A,B,C,D,E\n"
@@ -640,7 +640,7 @@ def test_partition_html_tag_tail_parsing():
 
     elements = partition_html(text=html_text)
 
-    assert elements == [Title("Head"), Title("Nested"), Title("Tail")]
+    assert elements == [Text("Head"), Text("Nested"), Text("Tail")]
 
 
 # -- parsing edge cases --------------------------------------------------------------------------
@@ -731,11 +731,11 @@ def test_containers_with_text_are_processed():
     assert elements == [
         Text("Hi All,"),
         NarrativeText("Get excited for our first annual family day!"),
-        Title("Best."),
+        Text("Best."),
         Text("--"),
-        Title("Dino the Datasaur"),
-        Title("Unstructured Technologies"),
-        Title("Data Scientist"),
+        Text("Dino the Datasaur"),
+        Text("Unstructured Technologies"),
+        Text("Data Scientist"),
         Address("Doylestown, PA 18901"),
         NarrativeText("See you there!"),
     ]
@@ -786,7 +786,7 @@ def test_html_grabs_bulleted_text_in_paras():
 
 def test_joins_tag_text_correctly():
     elements = partition_html(text="<p>Hello again peet mag<i>ic</i>al</p>")
-    assert elements == [Title("Hello again peet magical")]
+    assert elements == [Text("Hello again peet magical")]
 
 
 def test_sample_doc_with_emoji():
@@ -796,17 +796,17 @@ def test_sample_doc_with_emoji():
 
 def test_only_text_and_no_elements_in_body():
     elements = partition_html(text="<body>Hello</body>")
-    assert elements == [Title("Hello")]
+    assert elements == [Text("Hello")]
 
 
 def test_text_before_elements_in_body():
     elements = partition_html(text="<body>Hello<p>World</p></body>")
-    assert elements == [Title("Hello"), Title("World")]
+    assert elements == [Text("Hello"), Text("World")]
 
 
 def test_line_break_in_container():
     elements = partition_html(text="<div>Hello<br/>World</div>")
-    assert elements == [Title("Hello World")]
+    assert elements == [Text("Hello World")]
 
 
 @pytest.mark.parametrize("tag", ["del", "form", "noscript"])
@@ -963,7 +963,7 @@ def test_partition_html_grabs_emphasized_texts():
     assert e.metadata.emphasized_text_contents is None
     assert e.metadata.emphasized_text_tags is None
     e = elements[4]
-    assert e == Title("A lone span text!")
+    assert e == Text("A lone span text!")
     assert e.metadata.emphasized_text_contents is None
     assert e.metadata.emphasized_text_tags is None
 
@@ -1078,7 +1078,7 @@ def test_partition_html_grabs_links():
     assert e.metadata.link_urls is None
     assert e.metadata.link_texts is None
     e = elements[4]
-    assert e == Title("A lone link!")
+    assert e == Text("A lone link!")
     assert e.metadata.link_urls == ["/loner"]
     assert e.metadata.link_texts == ["A lone link!"]
 

diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -175,7 +175,7 @@ def fake_iter_document_elements(self: _DocxPartitioner) -> Iterator[Element]:
 
 EXPECTED_EMAIL_OUTPUT = [
     NarrativeText(text="This is a test email to use for unit tests."),
-    Title(text="Important points:"),
+    Text(text="Important points:"),
     ListItem(text="Roses are red"),
     ListItem(text="Violets are blue"),
 ]
@@ -440,7 +440,7 @@ def test_partition_md_from_url_works_with_embedded_html():
 def test_auto_partition_msg_from_filename():
     assert partition(example_doc_path("fake-email.msg"), strategy=PartitionStrategy.HI_RES) == [
         NarrativeText(text="This is a test email to use for unit tests."),
-        Title(text="Important points:"),
+        Text(text="Important points:"),
         ListItem(text="Roses are red"),
         ListItem(text="Violets are blue"),
     ]

diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py
@@ -30,7 +30,7 @@
 
 EXPECTED_OUTPUT = [
     NarrativeText(text="This is a test email to use for unit tests."),
-    Title(text="Important points:"),
+    Text(text="Important points:"),
     ListItem(text="Roses are red"),
     ListItem(text="Violets are blue"),
 ]
@@ -88,9 +88,9 @@ def test_extract_email_from_text_plain_matches_elements_extracted_from_text_html
     elements_from_text = partition_email(file_path, content_source="text/plain")
     elements_from_html = partition_email(file_path, content_source="text/html")
 
-    assert elements_from_text == EXPECTED_OUTPUT
+    assert all(e.text == eo.text for e, eo in zip(elements_from_text, EXPECTED_OUTPUT))
     assert elements_from_html == EXPECTED_OUTPUT
-    assert elements_from_html == elements_from_text
+    assert all(eh.text == et.text for eh, et in zip(elements_from_html, elements_from_text))
 
 
 def test_partition_email_round_trips_via_json():
@@ -354,14 +354,14 @@ def test_partition_email_can_process_attachments():
     )
 
     assert elements == [
-        Title("Hello!"),
+        Text("Hello!"),
         NarrativeText("Here's the attachments!"),
         NarrativeText("It includes:"),
         ListItem("Lots of whitespace"),
         ListItem("Little to no content"),
         ListItem("and is a quick read"),
         Text("Best,"),
-        Title("Mallori"),
+        Text("Mallori"),
         NarrativeText("Hey this is a fake attachment!"),
     ]
     assert all(e.metadata.last_modified == "2022-12-23T18:08:48+00:00" for e in elements)

diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py
@@ -23,14 +23,13 @@
     ListItem,
     NarrativeText,
     Text,
-    Title,
 )
 from unstructured.partition.common import UnsupportedFileFormatError
 from unstructured.partition.msg import MsgPartitionerOptions, partition_msg
 
 EXPECTED_MSG_OUTPUT = [
     NarrativeText(text="This is a test email to use for unit tests."),
-    Title(text="Important points:"),
+    Text(text="Important points:"),
     ListItem(text="Roses are red"),
     ListItem(text="Violets are blue"),
 ]
@@ -138,9 +137,9 @@ def test_partition_msg_can_process_attachments():
     assert [type(e).__name__ for e in elements][:10] == [
         "NarrativeText",
         "Text",
-        "Title",
-        "Title",
-        "Title",
+        "Text",
+        "Text",
+        "Text",
         "Image",
         "Title",
         "Text",
@@ -175,9 +174,9 @@ def test_partition_msg_silently_skips_attachments_it_cannot_partition(request: F
         # -- the email body is partitioned --
         NarrativeText("Here are those documents."),
         Text("--"),
-        Title("Mallori Harrell"),
-        Title("Unstructured Technologies"),
-        Title("Data Scientist"),
+        Text("Mallori Harrell"),
+        Text("Unstructured Technologies"),
+        Text("Data Scientist"),
         # -- no elements appear for the attachment(s) --
     ]