feat: redefine table level acc (#2620)
This PR redefines the `table_level_acc` metric as follows:
- for each predicted table, use the sequence matching ratio against its matched ground truth table as its accuracy
- as a prerequisite for the sequence matching, sort the table cells by row then column for both the predicted and ground truth tables so they are ordered the same
- average the accuracy over all predicted tables
- any prediction without a matching ground truth table (a false positive) lowers the score
- a prediction that splits a ground truth table into smaller tables also gets a low score, with perfectly equal splits scoring lowest (see the sketch below)

This new definition makes the metric a value between 0 and 1 per file. It replaces the existing definition, where the metric was the ratio of (the number of predicted tables with a match in the ground truth) to (the number of ground truth tables). That metric actually gives higher values to predictions that split tables and can exceed 1. The new definition prefers predictions that do not split ground truth tables.
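
As an illustration of the last point, here is a minimal sketch (not the library code; the real metric flattens row/column-sorted cell data via `TableAlignment.get_content_in_tables`, while plain strings stand in for that here):

```python
# Minimal sketch (assumed behavior, not the library implementation): each
# predicted table is scored by difflib.SequenceMatcher.ratio() against its
# matched ground truth text, and the scores are averaged over predictions.
import difflib


def ratio(a: str, b: str) -> float:
    return difflib.SequenceMatcher(None, a, b).ratio()


ground_truth = "r1c1 r1c2 r2c1 r2c2 r3c1 r3c2 r4c1 r4c2"

# One prediction covering the whole table scores 1.0.
whole = "r1c1 r1c2 r2c1 r2c2 r3c1 r3c2 r4c1 r4c2"
print(ratio(whole, ground_truth))  # 1.0

# The same content split into two predicted tables: each half is matched to the
# same ground truth table but only covers part of its text, so the average
# drops well below 1.0.
halves = ["r1c1 r1c2 r2c1 r2c2", "r3c1 r3c2 r4c1 r4c2"]
print(sum(ratio(h, ground_truth) for h in halves) / len(halves))  # ~0.66
```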
badGarnet authored Mar 8, 2024
1 parent 3853840 commit 911f998
Showing 5 changed files with 44 additions and 13 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -1,9 +1,10 @@
## 0.12.6-dev8
## 0.12.6-dev9

### Enhancements

* **Improve ability to capture embedded links in `partition_pdf()` for `fast` strategy** Previously, a threshold value that affects the capture of embedded links was fixed by default. This change allows users to specify the threshold value, improving link capture.
* **Refactor `add_chunking_strategy` decorator to dispatch by name.** Add `chunk()` function to be used by the `add_chunking_strategy` decorator to dispatch chunking call based on a chunking-strategy name (that can be dynamic at runtime). This decouples chunking dispatch from only those chunkers known at "compile" time and enables runtime registration of custom chunkers.
* **Redefine `table_level_acc` metric for table evaluation.** `table_level_acc` is now the average of the individual predicted tables' accuracies. A predicted table's accuracy is defined as the sequence-matching ratio between it and its corresponding ground truth table.

### Features
* **Added Unstructured Platform Documentation** The Unstructured Platform is currently in beta. The documentation provides how-to guides for setting up workflow automation, job scheduling, and configuring source and destination connectors.
2 changes: 1 addition & 1 deletion test_unstructured/metrics/test_table_structure.py
@@ -33,7 +33,7 @@ def test_table_eval_processor_simple():
{
"type": "Table",
"metadata": {
"text_as_html": """<table><thead><<th>r1c1</th><th>r1c2</th></thead>
"text_as_html": """<table><thead><th>r1c1</th><th>r1c2</th></thead>
<tbody><tr><td>r2c1</td><td>r2c2</td></tr></tbody></table>"""
},
}
2 changes: 1 addition & 1 deletion unstructured/__version__.py
@@ -1 +1 @@
__version__ = "0.12.6-dev8" # pragma: no cover
__version__ = "0.12.6-dev9" # pragma: no cover
42 changes: 34 additions & 8 deletions unstructured/metrics/table/table_eval.py
Expand Up @@ -20,6 +20,7 @@
--ground_truth_file "ground_truth.pdf.json"
"""

import difflib
import json
from dataclasses import dataclass
from pathlib import Path
@@ -47,6 +48,27 @@ class TableEvaluation:
    element_row_level_content_acc: float


def table_level_acc(predicted_table_data, ground_truth_table_data, matched_indices):
"""computes for each predicted table its accurary compared to ground truth.
The accuracy is defined as the SequenceMatcher.ratio() between those two strings. If a
prediction does not have a matched ground truth its accuracy is 0
"""
score = np.zeros((len(matched_indices),))
ground_truth_text = TableAlignment.get_content_in_tables(ground_truth_table_data)
for idx, predicted in enumerate(predicted_table_data):
matched_idx = matched_indices[idx]
if matched_idx == -1:
# false positive; default score 0
continue
score[idx] = difflib.SequenceMatcher(
None,
TableAlignment.get_content_in_tables([predicted])[0],
ground_truth_text[matched_idx],
).ratio()
return score


def _count_predicted_tables(matched_indices: List[int]) -> int:
"""Counts the number of predicted tables that have a corresponding match in the ground truth.
@@ -141,8 +163,6 @@ def process_file(self) -> TableEvaluation:
        Returns:
            TableEvaluation: A dataclass object containing the computed metrics.
        """
        total_predicted_tables = 0
        total_tables = 0

        predicted_table_data = extract_and_convert_tables_from_prediction(
            self.prediction,
@@ -155,8 +175,16 @@
            predicted_table_data,
            ground_truth_table_data,
        )
        total_predicted_tables += _count_predicted_tables(matched_indices)
        total_tables += len(ground_truth_table_data)
        if matched_indices:
            predicted_table_acc = np.mean(
                table_level_acc(predicted_table_data, ground_truth_table_data, matched_indices)
            )
        elif ground_truth_table_data:
            # no prediction matched but ground truth tables exist -> total failure
            predicted_table_acc = 0
        else:
            # no predicted tables and no ground truth tables -> perfect score
            predicted_table_acc = 1

        metrics = TableAlignment.get_element_level_alignment(
            predicted_table_data,
@@ -166,10 +194,8 @@
        )

        return TableEvaluation(
            total_tables=total_tables,
            table_level_acc=(
                round(total_predicted_tables / total_tables, 2) if total_tables else np.nan
            ),
            total_tables=len(ground_truth_table_data),
            table_level_acc=predicted_table_acc,
            element_col_level_index_acc=metrics.get("col_index_acc", np.nan),
            element_row_level_index_acc=metrics.get("row_index_acc", np.nan),
            element_col_level_content_acc=metrics.get("col_content_acc", np.nan),
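
For reference, a condensed sketch of the per-file aggregation added above; the helper names and the use of plain strings in place of the library's table-data structures are illustrative assumptions:

```python
# Condensed sketch of the aggregation in process_file(): average the per-table
# SequenceMatcher ratios; a false positive contributes 0; if ground truth
# tables exist but nothing was predicted the file scores 0; if there are no
# tables at all the file scores 1. Names here are illustrative only.
import difflib
from typing import List, Optional


def score_table(predicted: str, matched_ground_truth: Optional[str]) -> float:
    if matched_ground_truth is None:  # false positive: no matching ground truth table
        return 0.0
    return difflib.SequenceMatcher(None, predicted, matched_ground_truth).ratio()


def file_table_level_acc(
    predicted: List[str], matched: List[Optional[str]], ground_truth: List[str]
) -> float:
    if predicted:
        scores = [score_table(p, gt) for p, gt in zip(predicted, matched)]
        return sum(scores) / len(scores)
    if ground_truth:
        return 0.0  # tables exist but none were predicted -> total failure
    return 1.0  # no tables expected and none predicted -> perfect


# One correct table plus one false positive halves the file-level score.
print(file_table_level_acc(["a b c", "x y z"], ["a b c", None], ["a b c"]))  # 0.5
```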
8 changes: 6 additions & 2 deletions unstructured/metrics/table/table_extraction.py
@@ -72,6 +72,10 @@ def _convert_table_from_deckerd(content: List[Dict[str, Any]]) -> List[Dict[str,
    return table_data


def _sort_table_cells(table_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    return sorted(table_data, key=lambda cell: (cell["row_index"], cell["col_index"]))


def extract_and_convert_tables_from_ground_truth(
    file_elements: List[Dict[str, Any]],
) -> List[List[Dict[str, Any]]]:
@@ -91,7 +95,7 @@ def extract_and_convert_tables_from_ground_truth(
converted_data = _convert_table_from_deckerd(
element["text"],
)
ground_truth_table_data.append(converted_data)
ground_truth_table_data.append(_sort_table_cells(converted_data))
except Exception as e:
print(f"Error converting ground truth data: {e}")
ground_truth_table_data.append({})
@@ -121,7 +125,7 @@ def extract_and_convert_tables_from_prediction(
continue
try:
converted_data = _convert_table_from_html(val)
predicted_table_data.append(converted_data)
predicted_table_data.append(_sort_table_cells(converted_data))
except Exception as e:
print(f"Error converting Unstructured table data: {e}")
predicted_table_data.append({})
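
A small sketch of what the cell sorting added above buys: two orderings of the same cells compare equal once sorted by `(row_index, col_index)`. The `content` key and the `flatten` helper are assumptions for illustration, not the library's API:

```python
# Sketch of the cell-sorting prerequisite: sorting by (row_index, col_index)
# makes two orderings of the same table flatten to identical text, so the
# sequence matching compares like with like.
from typing import Any, Dict, List


def sort_table_cells(cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    return sorted(cells, key=lambda cell: (cell["row_index"], cell["col_index"]))


def flatten(cells: List[Dict[str, Any]]) -> str:
    # "content" is an assumed cell key used only for this illustration.
    return " ".join(str(cell["content"]) for cell in cells)


shuffled = [
    {"row_index": 1, "col_index": 0, "content": "r2c1"},
    {"row_index": 0, "col_index": 1, "content": "r1c2"},
    {"row_index": 1, "col_index": 1, "content": "r2c2"},
    {"row_index": 0, "col_index": 0, "content": "r1c1"},
]
ordered = [
    {"row_index": 0, "col_index": 0, "content": "r1c1"},
    {"row_index": 0, "col_index": 1, "content": "r1c2"},
    {"row_index": 1, "col_index": 0, "content": "r2c1"},
    {"row_index": 1, "col_index": 1, "content": "r2c2"},
]

assert flatten(sort_table_cells(shuffled)) == flatten(sort_table_cells(ordered))
```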
