Skip to content

Commit

Permalink
Merge branch 'main' into praktiskt/fix-ipv4-regex
Browse files Browse the repository at this point in the history
  • Loading branch information
praktiskt authored Dec 8, 2024
2 parents ce5bf74 + 2f06d5a commit 9ee149c
Show file tree
Hide file tree
Showing 8 changed files with 328 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ tags
# Persistent undo
[._]*.un~

.DS_Store
*.DS_Store

# Ruff cache
.ruff_cache/
Expand Down
6 changes: 4 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
## 0.16.10
## 0.16.10-dev0

### Enhancements

- **Enhance quote standardization tests** with additional Unicode scenarios.

### Features

### Fixes

- Fix ipv4 regex to correctly include up to three digit octets.
- **Fix original file doctype detection** from cct converted file paths for metrics calculation.

## 0.16.9

Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,10 @@ test-extra-pypandoc:
test-extra-xlsx:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_xlsx.py

.PHONY: test-text-extraction-evaluate
test-text-extraction-evaluate:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/metrics/test_text_extraction.py

## check: runs linters (includes tests)
.PHONY: check
check: check-ruff check-black check-flake8 check-version
Expand Down
155 changes: 155 additions & 0 deletions test_unstructured/metrics/test_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pathlib
import shutil
from pathlib import Path
from unittest.mock import MagicMock, patch

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -57,6 +58,52 @@
)


@pytest.fixture
def mock_dependencies():
    """Patch the evaluate-module collaborators and yield the configured mocks keyed by name.

    All patches are active for the duration of the test; teardown is automatic
    when the ``with`` block exits after the yield.
    """
    with patch(
        "unstructured.metrics.evaluate.calculate_accuracy"
    ) as calculate_accuracy, patch(
        "unstructured.metrics.evaluate.calculate_percent_missing_text"
    ) as calculate_percent_missing_text, patch.object(
        TextExtractionMetricsCalculator, "_get_ccts"
    ) as get_ccts, patch(
        "unstructured.metrics.evaluate.get_element_type_frequency"
    ) as get_element_type_frequency, patch(
        "unstructured.metrics.evaluate.calculate_element_type_percent_match"
    ) as calculate_element_type_percent_match, patch(
        "unstructured.metrics.evaluate._read_text_file"
    ) as read_text_file, patch.object(
        Path, "exists"
    ) as path_exists, patch(
        "unstructured.metrics.evaluate.TableEvalProcessor.from_json_files"
    ) as table_eval_from_json_files, patch.object(
        TableStructureMetricsCalculator, "supported_metric_names"
    ) as supported_metric_names:
        # Configure canned results before handing the mocks to the test.
        calculate_accuracy.return_value = 0.5
        calculate_percent_missing_text.return_value = 0.5
        get_ccts.return_value = ["output_cct", "source_cct"]
        get_element_type_frequency.side_effect = [{"ele1": 1}, {"ele2": 3}]
        calculate_element_type_percent_match.return_value = 0.5
        supported_metric_names.return_value = ["table_level_acc"]
        path_exists.return_value = True
        read_text_file.side_effect = ["output_text", "source_text"]

        yield {
            "mock_calculate_accuracy": calculate_accuracy,
            "mock_calculate_percent_missing_text": calculate_percent_missing_text,
            "mock_get_ccts": get_ccts,
            "mock_get_element_type_frequency": get_element_type_frequency,
            "mock_read_text_file": read_text_file,
            "mock_calculate_element_type_percent_match": calculate_element_type_percent_match,
            "mock_table_eval_processor_from_json_files": table_eval_from_json_files,
            "mock_supported_metric_names": supported_metric_names,
            "mock_path_exists": path_exists,
        }


@pytest.fixture()
def _cleanup_after_test():
"""Fixture for removing side-effects of running tests in this file."""
Expand Down Expand Up @@ -139,6 +186,114 @@ def test_process_document_returns_the_correct_amount_of_values(
assert len(output_list) == expected_length


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test", "mock_dependencies")
@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "kwargs"),
    [
        (
            TextExtractionMetricsCalculator,
            UNSTRUCTURED_CCT_DIRNAME,
            GOLD_CCT_DIRNAME,
            Path("2310.03502text_to_image_synthesis1-7.pdf.txt"),
            {"document_type": "txt"},
        ),
    ],
)
def test_TextExtractionMetricsCalculator_process_document_returns_the_correct_doctype(
    mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
):
    """The doctype column of the output row should be the original file suffix ('.pdf'),
    and each mocked helper should be invoked exactly once."""
    calculator = calculator_class(
        documents_dir=Path(TESTING_FILE_DIR) / output_dirname,
        ground_truths_dir=Path(TESTING_FILE_DIR) / source_dirname,
        **kwargs,
    )

    row = calculator._process_document(path)

    assert row[1] == ".pdf"
    assert mock_dependencies["mock_calculate_accuracy"].call_count == 1
    assert mock_dependencies["mock_calculate_percent_missing_text"].call_count == 1
    assert mock_dependencies["mock_get_ccts"].call_count == 1


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test", "mock_dependencies")
@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "kwargs"),
    [
        (
            TableStructureMetricsCalculator,
            UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME,
            GOLD_TABLE_STRUCTURE_DIRNAME,
            Path("tablib-627mTABLES-2310.07875-p7.pdf.json"),
            {},
        ),
    ],
)
def test_TableStructureMetricsCalculator_process_document_returns_the_correct_doctype(
    mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
):
    """The doctype column of the output row should be the original file suffix ('.pdf').

    ``TableEvalProcessor.from_json_files`` is mocked so no real table evaluation runs;
    the test only checks doctype extraction and that the processor is created once.
    """
    output_dir = Path(TESTING_FILE_DIR) / output_dirname
    source_dir = Path(TESTING_FILE_DIR) / source_dirname
    calculator = calculator_class(documents_dir=output_dir, ground_truths_dir=source_dir, **kwargs)
    # Pin the internal paths directly so _process_document resolves against the
    # test fixtures regardless of any discovery the constructor performed.
    calculator._ground_truths_dir = source_dir
    calculator._documents_dir = output_dir
    calculator._ground_truth_paths = [source_dir / path]

    mock_report = MagicMock()
    mock_report.total_predicted_tables = 3
    # Fixed typo: was `table_evel_acc`, which only went unnoticed because
    # MagicMock absorbs any attribute name.
    mock_report.table_level_acc = 0.83
    mock_table_eval_processor_from_json_files = mock_dependencies[
        "mock_table_eval_processor_from_json_files"
    ]
    mock_table_eval_processor_from_json_files.return_value.process_file.return_value = mock_report

    output_list = calculator._process_document(path)
    assert output_list[1] == ".pdf"
    assert mock_table_eval_processor_from_json_files.call_count == 1


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test", "mock_dependencies")
@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "kwargs"),
    [
        (
            ElementTypeMetricsCalculator,
            UNSTRUCTURED_OUTPUT_DIRNAME,
            GOLD_ELEMENT_TYPE_DIRNAME,
            Path("IRS-form.1987.pdf.json"),
            {},
        ),
    ],
)
def test_ElementTypeMetricsCalculator_process_document_returns_the_correct_doctype(
    mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
):
    """The doctype column of the output row should be the original file suffix ('.pdf'),
    with both the output and ground-truth files read and compared exactly once."""
    calculator = calculator_class(
        documents_dir=Path(TESTING_FILE_DIR) / output_dirname,
        ground_truths_dir=Path(TESTING_FILE_DIR) / source_dirname,
        **kwargs,
    )

    row = calculator._process_document(path)

    assert row[1] == ".pdf"
    assert mock_dependencies["mock_read_text_file"].call_count == 2
    assert mock_dependencies["mock_get_element_type_frequency"].call_count == 2
    assert mock_dependencies["mock_calculate_element_type_percent_match"].call_count == 1


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_evaluation_type_txt():
Expand Down
68 changes: 68 additions & 0 deletions test_unstructured/metrics/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,74 @@ def test_prepare_string(text, expected):
assert text_extraction.prepare_str(text) == text


@pytest.mark.parametrize(
    ("input_text", "expected_output"),
    [
        # Plain ASCII double and single quotes pass through untouched
        (
            "She said \"Hello\" and then whispered 'Goodbye' before leaving.",
            "She said \"Hello\" and then whispered 'Goodbye' before leaving.",
        ),
        # German-style low-9 opening quote normalized to ASCII double quote
        (
            "„To be, or not to be, that is the question\" - Shakespeare's famous quote.",
            '"To be, or not to be, that is the question" - Shakespeare\'s famous quote.',
        ),
        # Guillemets normalized, inner ASCII quotes preserved
        (
            '«When he said "life is beautiful," I believed him» wrote Maria.',
            '"When he said "life is beautiful," I believed him" wrote Maria.',
        ),
        # Heavy ornament double quotes
        (
            "❝Do you remember when we first met?❞ she asked with a smile.",
            '"Do you remember when we first met?" she asked with a smile.',
        ),
        # Reversed prime quotation marks, apostrophe inside preserved
        (
            "〝The meeting starts at 10:00, don't be late!〟 announced the manager.",
            '"The meeting starts at 10:00, don\'t be late!" announced the manager.',
        ),
        # CJK corner brackets map to single quotes; nested double quotes kept
        (
            '「He told me "This is important" yesterday」, she explained.',
            "'He told me \"This is important\" yesterday', she explained.",
        ),
        # White corner brackets spanning several sentences
        (
            "『The sun was setting. The birds were singing. It was peaceful.』",
            "'The sun was setting. The birds were singing. It was peaceful.'",
        ),
        # Half-width corner brackets around digits and punctuation
        ("﹂Meeting #123 @ 15:00 - Don't forget!﹁", "'Meeting #123 @ 15:00 - Don't forget!'"),
        # Several quote families mixed in one string
        (
            '「Hello」, ❝World❞, "Test", \'Example\', „Quote", «Final»',
            '\'Hello\', "World", "Test", \'Example\', "Quote", "Final"',
        ),
        # Apostrophes are not treated as quotes
        ("It's John's book, isn't it?", "It's John's book, isn't it?"),
        # Single guillemets with nested ASCII double quotes
        (
            '‹Testing the system\'s capability for "quoted" text›',
            "'Testing the system's capability for \"quoted\" text'",
        ),
        # Heavy single ornament quotes over multiple sentences
        (
            "❛First sentence. Second sentence. Third sentence.❜",
            "'First sentence. Second sentence. Third sentence.'",
        ),
        # Kitchen sink: corner brackets, ornaments, low-9, and guillemets together
        (
            '「Chapter 1」: ❝The Beginning❞ - „A new story" begins «today».',
            '\'Chapter 1\': "The Beginning" - "A new story" begins "today".',
        ),
    ],
)
def test_standardize_quotes(input_text, expected_output):
    """Unicode quote characters should be normalized to ASCII quotes."""
    standardized = text_extraction.standardize_quotes(input_text)
    assert standardized == expected_output


@pytest.mark.parametrize(
("output_text", "source_text", "expected_percentage"),
[
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.10" # pragma: no cover
__version__ = "0.16.10-dev0" # pragma: no cover
6 changes: 3 additions & 3 deletions unstructured/metrics/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ def default_agg_tsv_name(self):
def _process_document(self, doc: Path) -> Optional[list]:
doc_path = Path(doc)
out_filename = doc_path.stem
doctype = Path(out_filename).suffix[1:]
doctype = Path(out_filename).suffix
src_gt_filename = out_filename + ".json"
connector = doc_path.parts[-2] if len(doc_path.parts) > 1 else None

Expand Down Expand Up @@ -407,7 +407,7 @@ def _validate_inputs(self):

def _process_document(self, doc: Path) -> Optional[list]:
filename = doc.stem
doctype = doc.suffixes[0]
doctype = doc.suffixes[-2]
connector = doc.parts[0] if len(doc.parts) > 1 else None

output_cct, source_cct = self._get_ccts(doc)
Expand Down Expand Up @@ -482,7 +482,7 @@ def default_agg_tsv_name(self) -> str:

def _process_document(self, doc: Path) -> Optional[list]:
filename = doc.stem
doctype = doc.suffixes[0]
doctype = doc.suffixes[-2]
connector = doc.parts[0] if len(doc.parts) > 1 else None

output = get_element_type_frequency(_read_text_file(self.documents_dir / doc))
Expand Down
Loading

0 comments on commit 9ee149c

Please sign in to comment.