From ae3b0a1e603cad3ab0a3b4d19d3d0d4404943505 Mon Sep 17 00:00:00 2001 From: myhloli Date: Fri, 22 Nov 2024 00:01:18 +0800 Subject: [PATCH] fix(pdf_parse): improve line stop flag detection accuracy - Add an additional condition to the line stop flag check - Ensure the character's horizontal center is to the right of the span's left boundary - This change helps reduce false positives in line stop detection --- magic_pdf/pdf_parse_union_core_v2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/magic_pdf/pdf_parse_union_core_v2.py b/magic_pdf/pdf_parse_union_core_v2.py index dd2ded00..8b35a26c 100644 --- a/magic_pdf/pdf_parse_union_core_v2.py +++ b/magic_pdf/pdf_parse_union_core_v2.py @@ -151,6 +151,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag): if char_is_line_stop_flag: if ( (span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2] + and char_center_x > span_bbox[0] and span_bbox[1] < char_center_y < span_bbox[3] and abs(char_center_y - span_center_y) < span_height / 4 ):