Skip to content

Commit

Permalink
Merge branch 'main' into praktiskt/fix-ipv4-regex
Browse files Browse the repository at this point in the history
  • Loading branch information
praktiskt authored Dec 8, 2024
2 parents ce5bf74 + 2f06d5a commit 9ee149c
Show file tree
Hide file tree
Showing 8 changed files with 328 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ tags
# Persistent undo
[._]*.un~

.DS_Store
*.DS_Store

# Ruff cache
.ruff_cache/
Expand Down
6 changes: 4 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
## 0.16.10
## 0.16.10-dev0

### Enhancements

- **Enhance quote standardization tests** with additional Unicode scenarios.

### Features

### Fixes

- Fix ipv4 regex to correctly include up to three digit octets.
- **Fix original file doctype detection** from cct converted file paths for metrics calculation.

## 0.16.9

Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,10 @@ test-extra-pypandoc:
test-extra-xlsx:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_xlsx.py

.PHONY: test-text-extraction-evaluate
test-text-extraction-evaluate:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/metrics/test_text_extraction.py

## check: runs linters (includes tests)
.PHONY: check
check: check-ruff check-black check-flake8 check-version
Expand Down
155 changes: 155 additions & 0 deletions test_unstructured/metrics/test_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pathlib
import shutil
from pathlib import Path
from unittest.mock import MagicMock, patch

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -57,6 +58,52 @@
)


@pytest.fixture
def mock_dependencies():
    """Patch the evaluate-module collaborators and yield the configured mocks keyed by name.

    All patches are active for the duration of the test; teardown is automatic
    when the ``with`` block exits after the yield.
    """
    with patch(
        "unstructured.metrics.evaluate.calculate_accuracy"
    ) as calculate_accuracy, patch(
        "unstructured.metrics.evaluate.calculate_percent_missing_text"
    ) as calculate_percent_missing_text, patch.object(
        TextExtractionMetricsCalculator, "_get_ccts"
    ) as get_ccts, patch(
        "unstructured.metrics.evaluate.get_element_type_frequency"
    ) as get_element_type_frequency, patch(
        "unstructured.metrics.evaluate.calculate_element_type_percent_match"
    ) as calculate_element_type_percent_match, patch(
        "unstructured.metrics.evaluate._read_text_file"
    ) as read_text_file, patch.object(
        Path, "exists"
    ) as path_exists, patch(
        "unstructured.metrics.evaluate.TableEvalProcessor.from_json_files"
    ) as table_eval_from_json_files, patch.object(
        TableStructureMetricsCalculator, "supported_metric_names"
    ) as supported_metric_names:
        # Configure canned results before handing the mocks to the test.
        calculate_accuracy.return_value = 0.5
        calculate_percent_missing_text.return_value = 0.5
        get_ccts.return_value = ["output_cct", "source_cct"]
        get_element_type_frequency.side_effect = [{"ele1": 1}, {"ele2": 3}]
        calculate_element_type_percent_match.return_value = 0.5
        supported_metric_names.return_value = ["table_level_acc"]
        path_exists.return_value = True
        read_text_file.side_effect = ["output_text", "source_text"]

        yield {
            "mock_calculate_accuracy": calculate_accuracy,
            "mock_calculate_percent_missing_text": calculate_percent_missing_text,
            "mock_get_ccts": get_ccts,
            "mock_get_element_type_frequency": get_element_type_frequency,
            "mock_read_text_file": read_text_file,
            "mock_calculate_element_type_percent_match": calculate_element_type_percent_match,
            "mock_table_eval_processor_from_json_files": table_eval_from_json_files,
            "mock_supported_metric_names": supported_metric_names,
            "mock_path_exists": path_exists,
        }


@pytest.fixture()
def _cleanup_after_test():
"""Fixture for removing side-effects of running tests in this file."""
Expand Down Expand Up @@ -139,6 +186,114 @@ def test_process_document_returns_the_correct_amount_of_values(
assert len(output_list) == expected_length


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test", "mock_dependencies")
@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "kwargs"),
    [
        (
            TextExtractionMetricsCalculator,
            UNSTRUCTURED_CCT_DIRNAME,
            GOLD_CCT_DIRNAME,
            Path("2310.03502text_to_image_synthesis1-7.pdf.txt"),
            {"document_type": "txt"},
        ),
    ],
)
def test_TextExtractionMetricsCalculator_process_document_returns_the_correct_doctype(
    mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
):
    """The doctype column of the output row should be the original file suffix ('.pdf'),
    and each mocked helper should be invoked exactly once."""
    calculator = calculator_class(
        documents_dir=Path(TESTING_FILE_DIR) / output_dirname,
        ground_truths_dir=Path(TESTING_FILE_DIR) / source_dirname,
        **kwargs,
    )

    row = calculator._process_document(path)

    assert row[1] == ".pdf"
    assert mock_dependencies["mock_calculate_accuracy"].call_count == 1
    assert mock_dependencies["mock_calculate_percent_missing_text"].call_count == 1
    assert mock_dependencies["mock_get_ccts"].call_count == 1


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test", "mock_dependencies")
@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "kwargs"),
    [
        (
            TableStructureMetricsCalculator,
            UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME,
            GOLD_TABLE_STRUCTURE_DIRNAME,
            Path("tablib-627mTABLES-2310.07875-p7.pdf.json"),
            {},
        ),
    ],
)
def test_TableStructureMetricsCalculator_process_document_returns_the_correct_doctype(
    mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
):
    """The doctype column of the output row should be the original file suffix ('.pdf').

    ``TableEvalProcessor.from_json_files`` is mocked so no real table evaluation runs;
    the test only checks doctype extraction and that the processor is created once.
    """
    output_dir = Path(TESTING_FILE_DIR) / output_dirname
    source_dir = Path(TESTING_FILE_DIR) / source_dirname
    calculator = calculator_class(documents_dir=output_dir, ground_truths_dir=source_dir, **kwargs)
    # Pin the internal paths directly so _process_document resolves against the
    # test fixtures regardless of any discovery the constructor performed.
    calculator._ground_truths_dir = source_dir
    calculator._documents_dir = output_dir
    calculator._ground_truth_paths = [source_dir / path]

    mock_report = MagicMock()
    mock_report.total_predicted_tables = 3
    # Fixed typo: was `table_evel_acc`, which only went unnoticed because
    # MagicMock absorbs any attribute name.
    mock_report.table_level_acc = 0.83
    mock_table_eval_processor_from_json_files = mock_dependencies[
        "mock_table_eval_processor_from_json_files"
    ]
    mock_table_eval_processor_from_json_files.return_value.process_file.return_value = mock_report

    output_list = calculator._process_document(path)
    assert output_list[1] == ".pdf"
    assert mock_table_eval_processor_from_json_files.call_count == 1


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test", "mock_dependencies")
@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "kwargs"),
    [
        (
            ElementTypeMetricsCalculator,
            UNSTRUCTURED_OUTPUT_DIRNAME,
            GOLD_ELEMENT_TYPE_DIRNAME,
            Path("IRS-form.1987.pdf.json"),
            {},
        ),
    ],
)
def test_ElementTypeMetricsCalculator_process_document_returns_the_correct_doctype(
    mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
):
    """The doctype column of the output row should be the original file suffix ('.pdf'),
    with both the output and ground-truth files read and compared exactly once."""
    calculator = calculator_class(
        documents_dir=Path(TESTING_FILE_DIR) / output_dirname,
        ground_truths_dir=Path(TESTING_FILE_DIR) / source_dirname,
        **kwargs,
    )

    row = calculator._process_document(path)

    assert row[1] == ".pdf"
    assert mock_dependencies["mock_read_text_file"].call_count == 2
    assert mock_dependencies["mock_get_element_type_frequency"].call_count == 2
    assert mock_dependencies["mock_calculate_element_type_percent_match"].call_count == 1


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_evaluation_type_txt():
Expand Down
68 changes: 68 additions & 0 deletions test_unstructured/metrics/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,74 @@ def test_prepare_string(text, expected):
assert text_extraction.prepare_str(text) == text


@pytest.mark.parametrize(
    ("input_text", "expected_output"),
    [
        # Plain ASCII double and single quotes pass through untouched
        (
            "She said \"Hello\" and then whispered 'Goodbye' before leaving.",
            "She said \"Hello\" and then whispered 'Goodbye' before leaving.",
        ),
        # German-style low-9 opening quote normalized to ASCII double quote
        (
            "„To be, or not to be, that is the question\" - Shakespeare's famous quote.",
            '"To be, or not to be, that is the question" - Shakespeare\'s famous quote.',
        ),
        # Guillemets normalized, inner ASCII quotes preserved
        (
            '«When he said "life is beautiful," I believed him» wrote Maria.',
            '"When he said "life is beautiful," I believed him" wrote Maria.',
        ),
        # Heavy ornament double quotes
        (
            "❝Do you remember when we first met?❞ she asked with a smile.",
            '"Do you remember when we first met?" she asked with a smile.',
        ),
        # Reversed prime quotation marks, apostrophe inside preserved
        (
            "〝The meeting starts at 10:00, don't be late!〟 announced the manager.",
            '"The meeting starts at 10:00, don\'t be late!" announced the manager.',
        ),
        # CJK corner brackets map to single quotes; nested double quotes kept
        (
            '「He told me "This is important" yesterday」, she explained.',
            "'He told me \"This is important\" yesterday', she explained.",
        ),
        # White corner brackets spanning several sentences
        (
            "『The sun was setting. The birds were singing. It was peaceful.』",
            "'The sun was setting. The birds were singing. It was peaceful.'",
        ),
        # Half-width corner brackets around digits and punctuation
        ("﹂Meeting #123 @ 15:00 - Don't forget!﹁", "'Meeting #123 @ 15:00 - Don't forget!'"),
        # Several quote families mixed in one string
        (
            '「Hello」, ❝World❞, "Test", \'Example\', „Quote", «Final»',
            '\'Hello\', "World", "Test", \'Example\', "Quote", "Final"',
        ),
        # Apostrophes are not treated as quotes
        ("It's John's book, isn't it?", "It's John's book, isn't it?"),
        # Single guillemets with nested ASCII double quotes
        (
            '‹Testing the system\'s capability for "quoted" text›',
            "'Testing the system's capability for \"quoted\" text'",
        ),
        # Heavy single ornament quotes over multiple sentences
        (
            "❛First sentence. Second sentence. Third sentence.❜",
            "'First sentence. Second sentence. Third sentence.'",
        ),
        # Kitchen sink: corner brackets, ornaments, low-9, and guillemets together
        (
            '「Chapter 1」: ❝The Beginning❞ - „A new story" begins «today».',
            '\'Chapter 1\': "The Beginning" - "A new story" begins "today".',
        ),
    ],
)
def test_standardize_quotes(input_text, expected_output):
    """Unicode quote characters should be normalized to ASCII quotes."""
    standardized = text_extraction.standardize_quotes(input_text)
    assert standardized == expected_output


@pytest.mark.parametrize(
("output_text", "source_text", "expected_percentage"),
[
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.10" # pragma: no cover
__version__ = "0.16.10-dev0" # pragma: no cover
6 changes: 3 additions & 3 deletions unstructured/metrics/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ def default_agg_tsv_name(self):
def _process_document(self, doc: Path) -> Optional[list]:
doc_path = Path(doc)
out_filename = doc_path.stem
doctype = Path(out_filename).suffix[1:]
doctype = Path(out_filename).suffix
src_gt_filename = out_filename + ".json"
connector = doc_path.parts[-2] if len(doc_path.parts) > 1 else None

Expand Down Expand Up @@ -407,7 +407,7 @@ def _validate_inputs(self):

def _process_document(self, doc: Path) -> Optional[list]:
filename = doc.stem
doctype = doc.suffixes[0]
doctype = doc.suffixes[-2]
connector = doc.parts[0] if len(doc.parts) > 1 else None

output_cct, source_cct = self._get_ccts(doc)
Expand Down Expand Up @@ -482,7 +482,7 @@ def default_agg_tsv_name(self) -> str:

def _process_document(self, doc: Path) -> Optional[list]:
filename = doc.stem
doctype = doc.suffixes[0]
doctype = doc.suffixes[-2]
connector = doc.parts[0] if len(doc.parts) > 1 else None

output = get_element_type_frequency(_read_text_file(self.documents_dir / doc))
Expand Down
Loading

0 comments on commit 9ee149c

Please sign in to comment.