fix: default to None for the languages metadata field (#1743)

### Summary

Closes #1714

Changes the default value for `languages` to `None` for elements that don't have text or whose language can't be detected.

### Testing

```
from unstructured.partition.auto import partition

filename = "example-docs/handbook-1p.docx"
elements = partition(filename=filename, detect_language_per_element=True)

# PageBreak elements don't have text and will be collected here
none_langs = [element for element in elements if element.metadata.languages is None]
none_langs[0].text
```

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Unstructured-IO · Oct 14, 2023 · 6d7fe3a · 6d7fe3a
1 parent d0c84d6
commit 6d7fe3a
Show file tree

Hide file tree

Showing 8 changed files with 63 additions and 121 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,7 @@
 
 * **Cleans up temporary files after conversion** Previously a file conversion utility was leaving temporary files behind on the filesystem without removing them when no longer needed. This fix helps prevent an accumulation of temporary files taking up excessive disk space.
 * **Fixes `under_non_alpha_ratio` dividing by zero** Although this function guarded against a specific cause of division by zero, there were edge cases slipping through like strings with only whitespace. This update more generally prevents the function from performing a division by zero.
+* **Fixes `languages` default** Previously the language was set to English when an element had no text or when langdetect could not detect the language. It now defaults to `None` so that no misleading language is reported.
 * **Fixes recursion limit error that was being raised when partitioning Excel documents of a certain size** Previously we used a recursive method to find subtables within an excel sheet. However this would run afoul of Python's recursion depth limit when there was a contiguous block of more than 1000 cells within a sheet. This function has been updated to use the NetworkX library which avoids Python recursion issues.
 
 ## 0.10.22

diff --git a/test_unstructured/partition/pptx/test_ppt.py b/test_unstructured/partition/pptx/test_ppt.py
@@ -190,6 +190,6 @@ def test_partition_ppt_respects_detect_language_per_element():
     langs = [element.metadata.languages for element in elements]
     # languages other than English and Spanish are detected by this partitioner,
     # so this test is slightly different from the other partition tests
-    langs = {element.metadata.languages[0] for element in elements}
+    langs = {element.metadata.languages[0] for element in elements if element.metadata.languages}
     assert "eng" in langs
     assert "spa" in langs
diff --git a/test_unstructured/partition/pptx/test_pptx.py b/test_unstructured/partition/pptx/test_pptx.py
@@ -367,7 +367,7 @@ def test_partition_pptx_respects_detect_language_per_element():
     langs = [element.metadata.languages for element in elements]
     # languages other than English and Spanish are detected by this partitioner,
     # so this test is slightly different from the other partition tests
-    langs = {element.metadata.languages[0] for element in elements}
+    langs = {element.metadata.languages[0] for element in elements if element.metadata.languages}
     assert "eng" in langs
     assert "spa" in langs
 

diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -1123,3 +1123,22 @@ def test_partition_default_does_not_overwrite_other_defaults():
     auto_elements = partition(filename)
     assert auto_elements[0].metadata.languages != ["eng"]
     assert auto_elements[0].metadata.languages == text_elements[0].metadata.languages
+
+
+def test_partition_languages_default_to_None():
+    filename = "example-docs/handbook-1p.docx"
+    elements = partition(filename=filename, detect_language_per_element=True)
+    # Elements with no text (e.g. PageBreak) get `None` for `languages`, not a guessed language
+    none_langs = [element for element in elements if element.metadata.languages is None]
+    assert none_langs[0].text == ""
+
+
+def test_partition_languages_incorrectly_defaults_to_English(tmpdir):
+    # langdetect is skipped for short ASCII-only text, so a short German sentence like the one
+    # below is (incorrectly) labeled as English. This test pins that known limitation.
+    german = "Ein kurzer Satz."
+    filepath = os.path.join(tmpdir, "short-german.txt")
+    with open(filepath, "w") as f:
+        f.write(german)
+    elements = partition(filepath)
+    assert elements[0].metadata.languages == ["eng"]
diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py
@@ -542,5 +542,7 @@ def test_partition_text_element_metadata_raises_TypeError():
 def test_partition_text_detects_more_than_3_languages():
     filename = "example-docs/language-docs/UDHR_first_article_all.txt"
     elements = partition_text(filename=filename, detect_language_per_element=True)
-    langs = list({element.metadata.languages[0] for element in elements})
+    langs = list(
+        {element.metadata.languages[0] for element in elements if element.metadata.languages},
+    )
     assert len(langs) > 10
diff --git a/...tured_ingest/expected-structured-output/local-single-file/UDHR_first_article_all.txt.json b/...tured_ingest/expected-structured-output/local-single-file/UDHR_first_article_all.txt.json
diff --git a/test_unstructured_ingest/expected-structured-output/outlook/21be155fb0c95885.eml.json b/test_unstructured_ingest/expected-structured-output/outlook/21be155fb0c95885.eml.json
@@ -17,7 +17,7 @@
       "last_modified": "2023-07-15T08:35:51-07:00",
       "filetype": "message/rfc822",
       "languages": [
-        "fra"
+        "eng"
       ],
       "sent_from": [
         "David Potter <potterdavidm@gmail.com>"

diff --git a/unstructured/partition/lang.py b/unstructured/partition/lang.py
@@ -220,7 +220,7 @@ def _convert_to_standard_langcode(lang: str) -> str:
 def detect_languages(
     text: str,
     languages: Optional[List[str]] = ["auto"],
-) -> List[str]:
+) -> Optional[List[str]]:
     """
     Detects the list of languages present in the text (in the default "auto" mode),
     or formats and passes through the user inputted document languages if provided.
@@ -234,12 +234,14 @@ def detect_languages(
     # For example, partition_msg relies on partition_html and partition_text, but the metadata
     # gets overwritten after elements have been returned by _html and _text,
     # so `languages` would be detected twice.
-    if languages[0] == "":
-        return [""]
+    # Also intended to return None if there is no text (NOTE: `text.strip` is missing `()`).
+    if languages[0] == "" or text.strip == "":
+        return None
 
-    # Default to "eng" if text is empty or it has only ascii characters and is short
-    if text.strip() == "" or (re.match(r"^[\x00-\x7F]+$", text) and len(text) < 20):
-        return ["eng"]  # english as default
+    # If text contains non-ASCII characters (like ñ, å, or Korean/Mandarin/etc.) it will NOT
+    # default to English. ASCII-only text with fewer than 5 words defaults to English.
+    if re.match(r"^[\x00-\x7F]+$", text) and len(text.split()) < 5:
+        return ["eng"]
 
     # set seed for deterministic langdetect outputs
     DetectorFactory.seed = 0
@@ -264,7 +266,7 @@ def detect_languages(
             langdetect_result = detect_langs(text)
         except lang_detect_exception.LangDetectException as e:
             logger.warning(e)
-            return ["eng"]  # english as default
+            return None  # None as default
 
         # NOTE(robinson) - Chinese gets detected with codes zh-cn, zh-tw, zh-hk for various
         # Chinese variants. We normalizes these because there is a single model for Chinese