Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: improve false-positive Title elements on Chinese text #3836

Merged
merged 6 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.16.12-dev3
## 0.16.12-dev4

### Enhancements

Expand All @@ -10,6 +10,7 @@

- **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
- **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
- **Improve element-type mapping for Chinese text.** Fixes a bug where Chinese text would produce large numbers of false-positive `Title` elements.

## 0.16.11

Expand Down
10 changes: 4 additions & 6 deletions test_unstructured/metrics/test_element_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,20 +74,18 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in
"handbook-1p.docx",
{
("Header", None): 1,
("Title", 0): 1,
("Title", 1): 1,
("Title", 2): 1,
("UncategorizedText", 0): 6,
("ListItem", 3): 3,
("NarrativeText", 4): 7,
("NarrativeText", 0): 7,
("Footer", None): 1,
},
(0.43, 0.07, 0.65),
(0.78, 0.72, 0.81),
),
(
"handbook-1p.docx",
{
("Header", None): 1,
("Title", 0): 6,
("UncategorizedText", 0): 6,
("NarrativeText", 0): 7,
("PageBreak", None): 1,
("Footer", None): 1,
Expand Down
2 changes: 1 addition & 1 deletion test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -1286,7 +1286,7 @@ def expected_docx_elements():
Title("These are a few of my favorite things:"),
ListItem("Parrots"),
ListItem("Hockey"),
Title("Analysis"),
Text("Analysis"),
NarrativeText("This is my first thought. This is my second thought."),
NarrativeText("This is my third thought."),
Text("2023"),
Expand Down
2 changes: 1 addition & 1 deletion test_unstructured/partition/test_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ def expected_elements() -> list[Element]:
Title("These are a few of my favorite things:"),
ListItem("Parrots"),
ListItem("Hockey"),
Title("Analysis"),
Text("Analysis"),
NarrativeText("This is my first thought. This is my second thought."),
NarrativeText("This is my third thought."),
Text("2023"),
Expand Down
16 changes: 8 additions & 8 deletions test_unstructured/partition/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,7 @@ def expected_elements() -> list[Text]:
Title("These are a few of my favorite things:"),
ListItem("Parrots"),
ListItem("Hockey"),
Title("Analysis"),
Text("Analysis"),
NarrativeText("This is my first thought. This is my second thought."),
NarrativeText("This is my third thought."),
Text("2023"),
Expand Down Expand Up @@ -1210,7 +1210,7 @@ def str_repr(e: Element) -> str:
opts_args["file_path"] = example_doc_path("page-breaks.docx")
opts = DocxPartitionerOptions(**opts_args)
expected = [
# NOTE(scanny) - -- page 1 --
# -- page 1 --
NarrativeText(
"First page, tab here:\t"
"followed by line-break here:\n"
Expand All @@ -1220,28 +1220,28 @@ def str_repr(e: Element) -> str:
"and hard page-break here>>"
),
PageBreak(""),
# NOTE(scanny) - -- page 2 --
# -- page 2 --
NarrativeText(
"<<Text on second page. The font is big so it breaks onto third page--"
"------------------here-->> <<but break falls inside link so text stays"
" together."
),
PageBreak(""),
# NOTE(scanny) - -- page 3 --
# -- page 3 --
NarrativeText("Continuous section break here>>"),
NarrativeText("<<followed by text on same page"),
NarrativeText("Odd-page section break here>>"),
PageBreak(""),
# NOTE(scanny) - -- page 4 --
# -- page 4 --
PageBreak(""),
# NOTE(scanny) - -- page 5 --
# -- page 5 --
NarrativeText("<<producing two page-breaks to get from page-3 to page-5."),
NarrativeText(
'Then text gets big again so a "natural" rendered page break happens again here>> '
),
PageBreak(""),
# NOTE(scanny) - -- page 6 --
Title("<<and then more text proceeds."),
# -- page 6 --
Text("<<and then more text proceeds."),
]

elements = _DocxPartitioner.iter_document_elements(opts)
Expand Down
5 changes: 2 additions & 3 deletions test_unstructured/partition/test_odt.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
Table,
TableChunk,
Text,
Title,
)
from unstructured.partition.docx import partition_docx
from unstructured.partition.odt import partition_odt
Expand All @@ -44,7 +43,7 @@ def test_partition_odt_from_filename():
elements = partition_odt(example_doc_path("fake.odt"))

assert elements == [
Title("Lorem ipsum dolor sit amet."),
Text("Lorem ipsum dolor sit amet."),
Table(
"Header row Mon Wed Fri"
" Color Blue Red Green"
Expand All @@ -63,7 +62,7 @@ def test_partition_odt_from_file():
elements = partition_odt(file=f)

assert elements == [
Title("Lorem ipsum dolor sit amet."),
Text("Lorem ipsum dolor sit amet."),
Table(
"Header row Mon Wed Fri"
" Color Blue Red Green"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "5209312022a75a31d95385fdccff68fa",
"text": "CHAPTER 1",
"metadata": {
Expand Down Expand Up @@ -51,7 +51,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "22a23e29022f32945965002cd734a8f0",
"text": "INTRODUCTION",
"metadata": {
Expand Down Expand Up @@ -79,7 +79,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "4c175cf543957acc4420221de28d3fca",
"text": "CHAPTER 1 \u2013 INTRODUCTION",
"metadata": {
Expand All @@ -101,7 +101,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "77022a5264f552b223538977cd40f640",
"text": "A.\tPURPOSE",
"metadata": {
Expand Down Expand Up @@ -189,7 +189,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "e341ffc123dd2827638aba18149c4175",
"text": "B.\tROLE OF THE UNITED STATES TRUSTEE",
"metadata": {
Expand Down Expand Up @@ -255,7 +255,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "1b11ebe52652656e0ed8c12e5969de9b",
"text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t",
"metadata": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "5209312022a75a31d95385fdccff68fa",
"text": "CHAPTER 1",
"metadata": {
Expand Down Expand Up @@ -51,7 +51,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "22a23e29022f32945965002cd734a8f0",
"text": "INTRODUCTION",
"metadata": {
Expand Down Expand Up @@ -79,7 +79,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "4c175cf543957acc4420221de28d3fca",
"text": "CHAPTER 1 \u2013 INTRODUCTION",
"metadata": {
Expand All @@ -101,7 +101,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "77022a5264f552b223538977cd40f640",
"text": "A.\tPURPOSE",
"metadata": {
Expand Down Expand Up @@ -189,7 +189,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "e341ffc123dd2827638aba18149c4175",
"text": "B.\tROLE OF THE UNITED STATES TRUSTEE",
"metadata": {
Expand Down Expand Up @@ -255,7 +255,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "1b11ebe52652656e0ed8c12e5969de9b",
"text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t",
"metadata": {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "56d531394823d81787d77a04462ed096",
"text": "Lorem ipsum dolor sit amet.",
"metadata": {
Expand All @@ -17,6 +17,13 @@
"date_created": "1686809759.687",
"date_modified": "1686809743.0",
"permissions_data": [
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{
"id": "18298851591250030956",
"displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
Expand All @@ -29,31 +36,24 @@
"pendingOwner": false
},
{
"id": "09147371668407854156",
"displayName": "roman",
"id": "04774006893477068632",
"displayName": "ryan",
"type": "user",
"kind": "drive#permission",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
"emailAddress": "roman@unstructured.io",
"role": "writer",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
"emailAddress": "ryan@unstructured.io",
"role": "owner",
"deleted": false,
"pendingOwner": false
},
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{
"id": "04774006893477068632",
"displayName": "ryan",
"id": "09147371668407854156",
"displayName": "roman",
"type": "user",
"kind": "drive#permission",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
"emailAddress": "ryan@unstructured.io",
"role": "owner",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
"emailAddress": "roman@unstructured.io",
"role": "writer",
"deleted": false,
"pendingOwner": false
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "56d531394823d81787d77a04462ed096",
"text": "Lorem ipsum dolor sit amet.",
"metadata": {
Expand All @@ -17,6 +17,13 @@
"date_created": "1718722775.76",
"date_modified": "1718722788.018",
"permissions_data": [
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{
"id": "18298851591250030956",
"displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
Expand All @@ -39,13 +46,6 @@
"deleted": false,
"pendingOwner": false
},
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{
"id": "09147371668407854156",
"displayName": "roman",
Expand Down
Loading
Loading