feat: redefine table level acc (#2620)
This PR redefines the `table_level_acc` metric as follows:
- for each predicted table, use the sequence matching ratio against its matched ground truth table as its accuracy
- as a prerequisite for the sequence matching, sort the table cells by row then column for both the predicted and ground truth tables so they are ordered the same
- average the accuracy over all predicted tables
- any prediction without a matching ground truth table (a false positive) lowers the score
- a prediction that splits a ground truth table into smaller tables also gets a low score, with perfectly equal splits scoring lowest (see the sketch below)

This new definition makes the metric a value between 0 and 1 per file. It replaces the existing definition, where the metric was the ratio of (the number of predicted tables with a match in the ground truth) to (the number of ground truth tables). That metric actually gives higher values to predictions that split tables and can exceed 1. The new definition prefers predictions that do not split ground truth tables.
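
As an illustration of the last point, here is a minimal sketch (not the library code; the real metric flattens row/column-sorted cell data via `TableAlignment.get_content_in_tables`, while plain strings stand in for that here):

```python
# Minimal sketch (assumed behavior, not the library implementation): each
# predicted table is scored by difflib.SequenceMatcher.ratio() against its
# matched ground truth text, and the scores are averaged over predictions.
import difflib


def ratio(a: str, b: str) -> float:
    return difflib.SequenceMatcher(None, a, b).ratio()


ground_truth = "r1c1 r1c2 r2c1 r2c2 r3c1 r3c2 r4c1 r4c2"

# One prediction covering the whole table scores 1.0.
whole = "r1c1 r1c2 r2c1 r2c2 r3c1 r3c2 r4c1 r4c2"
print(ratio(whole, ground_truth))  # 1.0

# The same content split into two predicted tables: each half is matched to the
# same ground truth table but only covers part of its text, so the average
# drops well below 1.0.
halves = ["r1c1 r1c2 r2c1 r2c2", "r3c1 r3c2 r4c1 r4c2"]
print(sum(ratio(h, ground_truth) for h in halves) / len(halves))  # ~0.66
```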
badGarnet authored Mar 8, 2024
1 parent 3853840 commit 911f998
Showing 5 changed files with 44 additions and 13 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -1,9 +1,10 @@
## 0.12.6-dev8
## 0.12.6-dev9

### Enhancements

* **Improve ability to capture embedded links in `partition_pdf()` for `fast` strategy** Previously, a threshold value that affects the capture of embedded links was fixed by default. This change allows users to specify the threshold value, improving link capture.
* **Refactor `add_chunking_strategy` decorator to dispatch by name.** Add `chunk()` function to be used by the `add_chunking_strategy` decorator to dispatch chunking call based on a chunking-strategy name (that can be dynamic at runtime). This decouples chunking dispatch from only those chunkers known at "compile" time and enables runtime registration of custom chunkers.
* **Redefine `table_level_acc` metric for table evaluation.** `table_level_acc` is now the average of the individual predicted tables' accuracies. A predicted table's accuracy is defined as the sequence-matching ratio between it and its corresponding ground truth table.

### Features
* **Added Unstructured Platform Documentation** The Unstructured Platform is currently in beta. The documentation provides how-to guides for setting up workflow automation, job scheduling, and configuring source and destination connectors.
2 changes: 1 addition & 1 deletion test_unstructured/metrics/test_table_structure.py
@@ -33,7 +33,7 @@ def test_table_eval_processor_simple():
{
"type": "Table",
"metadata": {
"text_as_html": """<table><thead><<th>r1c1</th><th>r1c2</th></thead>
"text_as_html": """<table><thead><th>r1c1</th><th>r1c2</th></thead>
<tbody><tr><td>r2c1</td><td>r2c2</td></tr></tbody></table>"""
},
}
2 changes: 1 addition & 1 deletion unstructured/__version__.py
@@ -1 +1 @@
__version__ = "0.12.6-dev8" # pragma: no cover
__version__ = "0.12.6-dev9" # pragma: no cover
42 changes: 34 additions & 8 deletions unstructured/metrics/table/table_eval.py
Expand Up @@ -20,6 +20,7 @@
--ground_truth_file "ground_truth.pdf.json"
"""

import difflib
import json
from dataclasses import dataclass
from pathlib import Path
@@ -47,6 +48,27 @@ class TableEvaluation:
    element_row_level_content_acc: float


def table_level_acc(predicted_table_data, ground_truth_table_data, matched_indices):
"""computes for each predicted table its accurary compared to ground truth.
The accuracy is defined as the SequenceMatcher.ratio() between those two strings. If a
prediction does not have a matched ground truth its accuracy is 0
"""
score = np.zeros((len(matched_indices),))
ground_truth_text = TableAlignment.get_content_in_tables(ground_truth_table_data)
for idx, predicted in enumerate(predicted_table_data):
matched_idx = matched_indices[idx]
if matched_idx == -1:
# false positive; default score 0
continue
score[idx] = difflib.SequenceMatcher(
None,
TableAlignment.get_content_in_tables([predicted])[0],
ground_truth_text[matched_idx],
).ratio()
return score


def _count_predicted_tables(matched_indices: List[int]) -> int:
"""Counts the number of predicted tables that have a corresponding match in the ground truth.
@@ -141,8 +163,6 @@ def process_file(self) -> TableEvaluation:
        Returns:
            TableEvaluation: A dataclass object containing the computed metrics.
        """
        total_predicted_tables = 0
        total_tables = 0

        predicted_table_data = extract_and_convert_tables_from_prediction(
            self.prediction,
@@ -155,8 +175,16 @@
            predicted_table_data,
            ground_truth_table_data,
        )
        total_predicted_tables += _count_predicted_tables(matched_indices)
        total_tables += len(ground_truth_table_data)
        if matched_indices:
            predicted_table_acc = np.mean(
                table_level_acc(predicted_table_data, ground_truth_table_data, matched_indices)
            )
        elif ground_truth_table_data:
            # no prediction matched but ground truth tables exist -> total failure
            predicted_table_acc = 0
        else:
            # no predicted tables and no ground truth tables -> perfect score
            predicted_table_acc = 1

        metrics = TableAlignment.get_element_level_alignment(
            predicted_table_data,
@@ -166,10 +194,8 @@
        )

        return TableEvaluation(
            total_tables=total_tables,
            table_level_acc=(
                round(total_predicted_tables / total_tables, 2) if total_tables else np.nan
            ),
            total_tables=len(ground_truth_table_data),
            table_level_acc=predicted_table_acc,
            element_col_level_index_acc=metrics.get("col_index_acc", np.nan),
            element_row_level_index_acc=metrics.get("row_index_acc", np.nan),
            element_col_level_content_acc=metrics.get("col_content_acc", np.nan),
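
For reference, a condensed sketch of the per-file aggregation added above; the helper names and the use of plain strings in place of the library's table-data structures are illustrative assumptions:

```python
# Condensed sketch of the aggregation in process_file(): average the per-table
# SequenceMatcher ratios; a false positive contributes 0; if ground truth
# tables exist but nothing was predicted the file scores 0; if there are no
# tables at all the file scores 1. Names here are illustrative only.
import difflib
from typing import List, Optional


def score_table(predicted: str, matched_ground_truth: Optional[str]) -> float:
    if matched_ground_truth is None:  # false positive: no matching ground truth table
        return 0.0
    return difflib.SequenceMatcher(None, predicted, matched_ground_truth).ratio()


def file_table_level_acc(
    predicted: List[str], matched: List[Optional[str]], ground_truth: List[str]
) -> float:
    if predicted:
        scores = [score_table(p, gt) for p, gt in zip(predicted, matched)]
        return sum(scores) / len(scores)
    if ground_truth:
        return 0.0  # tables exist but none were predicted -> total failure
    return 1.0  # no tables expected and none predicted -> perfect


# One correct table plus one false positive halves the file-level score.
print(file_table_level_acc(["a b c", "x y z"], ["a b c", None], ["a b c"]))  # 0.5
```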
8 changes: 6 additions & 2 deletions unstructured/metrics/table/table_extraction.py
@@ -72,6 +72,10 @@ def _convert_table_from_deckerd(content: List[Dict[str, Any]]) -> List[Dict[str,
    return table_data


def _sort_table_cells(table_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    return sorted(table_data, key=lambda cell: (cell["row_index"], cell["col_index"]))


def extract_and_convert_tables_from_ground_truth(
    file_elements: List[Dict[str, Any]],
) -> List[List[Dict[str, Any]]]:
@@ -91,7 +95,7 @@ def extract_and_convert_tables_from_ground_truth(
converted_data = _convert_table_from_deckerd(
element["text"],
)
ground_truth_table_data.append(converted_data)
ground_truth_table_data.append(_sort_table_cells(converted_data))
except Exception as e:
print(f"Error converting ground truth data: {e}")
ground_truth_table_data.append({})
@@ -121,7 +125,7 @@ def extract_and_convert_tables_from_prediction(
continue
try:
converted_data = _convert_table_from_html(val)
predicted_table_data.append(converted_data)
predicted_table_data.append(_sort_table_cells(converted_data))
except Exception as e:
print(f"Error converting Unstructured table data: {e}")
predicted_table_data.append({})
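
A small sketch of what the cell sorting added above buys: two orderings of the same cells compare equal once sorted by `(row_index, col_index)`. The `content` key and the `flatten` helper are assumptions for illustration, not the library's API:

```python
# Sketch of the cell-sorting prerequisite: sorting by (row_index, col_index)
# makes two orderings of the same table flatten to identical text, so the
# sequence matching compares like with like.
from typing import Any, Dict, List


def sort_table_cells(cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    return sorted(cells, key=lambda cell: (cell["row_index"], cell["col_index"]))


def flatten(cells: List[Dict[str, Any]]) -> str:
    # "content" is an assumed cell key used only for this illustration.
    return " ".join(str(cell["content"]) for cell in cells)


shuffled = [
    {"row_index": 1, "col_index": 0, "content": "r2c1"},
    {"row_index": 0, "col_index": 1, "content": "r1c2"},
    {"row_index": 1, "col_index": 1, "content": "r2c2"},
    {"row_index": 0, "col_index": 0, "content": "r1c1"},
]
ordered = [
    {"row_index": 0, "col_index": 0, "content": "r1c1"},
    {"row_index": 0, "col_index": 1, "content": "r1c2"},
    {"row_index": 1, "col_index": 0, "content": "r2c1"},
    {"row_index": 1, "col_index": 1, "content": "r2c2"},
]

assert flatten(sort_table_cells(shuffled)) == flatten(sort_table_cells(ordered))
```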
