From 5782305122e576c9db09af01aa84e7706b321b6c Mon Sep 17 00:00:00 2001 From: Soeb Hussain Date: Tue, 26 Nov 2024 18:51:30 +0000 Subject: [PATCH] more fixes for font sizes --- lib/sycamore/sycamore/transforms/detr_partitioner.py | 2 +- lib/sycamore/sycamore/transforms/text_extraction/ocr_models.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/sycamore/sycamore/transforms/detr_partitioner.py b/lib/sycamore/sycamore/transforms/detr_partitioner.py index 55bff72c8..e9ae6e37a 100644 --- a/lib/sycamore/sycamore/transforms/detr_partitioner.py +++ b/lib/sycamore/sycamore/transforms/detr_partitioner.py @@ -135,7 +135,7 @@ def _supplement_text(inferred: list[Element], text: list[Element], threshold: fl i.tokens = [{"text": elem.text_representation, "bbox": elem.bbox} for elem in matches] i.data["text_representation"] = " ".join(full_text) - i.properties["font_size"] = sum(font_sizes) / len(font_sizes) if font_sizes else 0.0 + i.properties["font_size"] = sum(font_sizes) / len(font_sizes) if font_sizes else None return inferred + unmatched def partition_pdf( diff --git a/lib/sycamore/sycamore/transforms/text_extraction/ocr_models.py b/lib/sycamore/sycamore/transforms/text_extraction/ocr_models.py index 446926add..08e5159ab 100644 --- a/lib/sycamore/sycamore/transforms/text_extraction/ocr_models.py +++ b/lib/sycamore/sycamore/transforms/text_extraction/ocr_models.py @@ -83,7 +83,7 @@ def get_text(self, image: Image.Image) -> tuple[str, Optional[float]]: out_list.append(text) font_sizes.append(res[0][2][1] - res[0][0][1]) val = " ".join(out_list) - avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 0.0 + avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else None return val, avg_font_size def get_boxes_and_text(self, image: Image.Image) -> list[dict[str, Any]]: