Skip to content

Commit

Permalink
fix: default to None for the languages metadata field (#1743)
Browse files Browse the repository at this point in the history
### Summary
Closes #1714
Changes the default value for `languages` to `None` for elements that
don't have text or whose language can't be detected.

### Testing
```
from unstructured.partition.auto import partition
filename = "example-docs/handbook-1p.docx"
elements = partition(filename=filename, detect_language_per_element=True)

# PageBreak elements don't have text and will be collected here
none_langs = [element for element in elements if element.metadata.languages is None]
none_langs[0].text
```

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
  • Loading branch information
4 people authored Oct 14, 2023
1 parent d0c84d6 commit 6d7fe3a
Show file tree
Hide file tree
Showing 8 changed files with 63 additions and 121 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

* **Cleans up temporary files after conversion** Previously a file conversion utility was leaving temporary files behind on the filesystem without removing them when no longer needed. This fix helps prevent an accumulation of temporary files taking up excessive disk space.
* **Fixes `under_non_alpha_ratio` dividing by zero** Although this function guarded against a specific cause of division by zero, there were edge cases slipping through like strings with only whitespace. This update more generally prevents the function from performing a division by zero.
* **Fix languages default** Previously the default language was being set to English when elements didn't have text or if langdetect could not detect the language. It now defaults to None so that no misleading language information is reported.
* **Fixes recursion limit error that was being raised when partitioning Excel documents of a certain size** Previously we used a recursive method to find subtables within an Excel sheet. However this would run afoul of Python's recursion depth limit when there was a contiguous block of more than 1000 cells within a sheet. This function has been updated to use the NetworkX library which avoids Python recursion issues.

## 0.10.22
Expand Down
2 changes: 1 addition & 1 deletion test_unstructured/partition/pptx/test_ppt.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,6 @@ def test_partition_ppt_respects_detect_language_per_element():
langs = [element.metadata.languages for element in elements]
# languages other than English and Spanish are detected by this partitioner,
# so this test is slightly different from the other partition tests
langs = {element.metadata.languages[0] for element in elements}
langs = {element.metadata.languages[0] for element in elements if element.metadata.languages}
assert "eng" in langs
assert "spa" in langs
2 changes: 1 addition & 1 deletion test_unstructured/partition/pptx/test_pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,7 @@ def test_partition_pptx_respects_detect_language_per_element():
langs = [element.metadata.languages for element in elements]
# languages other than English and Spanish are detected by this partitioner,
# so this test is slightly different from the other partition tests
langs = {element.metadata.languages[0] for element in elements}
langs = {element.metadata.languages[0] for element in elements if element.metadata.languages}
assert "eng" in langs
assert "spa" in langs

Expand Down
19 changes: 19 additions & 0 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -1123,3 +1123,22 @@ def test_partition_default_does_not_overwrite_other_defaults():
auto_elements = partition(filename)
assert auto_elements[0].metadata.languages != ["eng"]
assert auto_elements[0].metadata.languages == text_elements[0].metadata.languages


def test_partition_languages_default_to_None():
    """Elements without text get `None` for `metadata.languages`."""
    elements = partition(
        filename="example-docs/handbook-1p.docx",
        detect_language_per_element=True,
    )
    # Text-less elements such as PageBreak have no detectable language, so they
    # are the ones collected here.
    elements_without_language = [el for el in elements if el.metadata.languages is None]
    first_without_language = elements_without_language[0]
    assert first_without_language.text == ""


def test_partition_languages_incorrectly_defaults_to_English(tmpdir):
    """Short German text is currently labeled as English."""
    # langdetect is not fully relied upon for short text, so a German sentence
    # like the one written below ends up tagged "eng".
    short_german_path = os.path.join(tmpdir, "short-german.txt")
    with open(short_german_path, "w") as out:
        out.write("Ein kurzer Satz.")
    elements = partition(short_german_path)
    first_languages = elements[0].metadata.languages
    assert first_languages == ["eng"]
4 changes: 3 additions & 1 deletion test_unstructured/partition/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,5 +542,7 @@ def test_partition_text_element_metadata_raises_TypeError():
def test_partition_text_detects_more_than_3_languages():
filename = "example-docs/language-docs/UDHR_first_article_all.txt"
elements = partition_text(filename=filename, detect_language_per_element=True)
langs = list({element.metadata.languages[0] for element in elements})
langs = list(
{element.metadata.languages[0] for element in elements if element.metadata.languages},
)
assert len(langs) > 10

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"last_modified": "2023-07-15T08:35:51-07:00",
"filetype": "message/rfc822",
"languages": [
"fra"
"eng"
],
"sent_from": [
"David Potter <potterdavidm@gmail.com>"
Expand Down
16 changes: 9 additions & 7 deletions unstructured/partition/lang.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def _convert_to_standard_langcode(lang: str) -> str:
def detect_languages(
text: str,
languages: Optional[List[str]] = ["auto"],
) -> List[str]:
) -> Optional[List[str]]:
"""
Detects the list of languages present in the text (in the default "auto" mode),
or formats and passes through the user inputted document languages if provided.
Expand All @@ -234,12 +234,14 @@ def detect_languages(
# For example, partition_msg relies on partition_html and partition_text, but the metadata
# gets overwritten after elements have been returned by _html and _text,
# so `languages` would be detected twice.
if languages[0] == "":
return [""]
# Also return None if there is no text.
if languages[0] == "" or text.strip() == "":
return None

# Default to "eng" if text is empty or it has only ascii characters and is short
if text.strip() == "" or (re.match(r"^[\x00-\x7F]+$", text) and len(text) < 20):
return ["eng"] # english as default
# If text contains special characters (like ñ, å, or Korean/Mandarin/etc.) it will NOT default
# to English. It defaults to English only if the text is ASCII-only and has fewer than 5 words.
if re.match(r"^[\x00-\x7F]+$", text) and len(text.split()) < 5:
return ["eng"]

# set seed for deterministic langdetect outputs
DetectorFactory.seed = 0
Expand All @@ -264,7 +266,7 @@ def detect_languages(
langdetect_result = detect_langs(text)
except lang_detect_exception.LangDetectException as e:
logger.warning(e)
return ["eng"] # english as default
return None # None as default

# NOTE(robinson) - Chinese gets detected with codes zh-cn, zh-tw, zh-hk for various
# Chinese variants. We normalize these because there is a single model for Chinese
Expand Down

0 comments on commit 6d7fe3a

Please sign in to comment.