From 0e4492619feac1354d84c18c5ff6038d61133d08 Mon Sep 17 00:00:00 2001
From: Kamil Plucinski
Date: Wed, 8 Jan 2025 11:38:42 +0100
Subject: [PATCH] Do not return None

---
 unstructured/partition/utils/ocr_models/tesseract_ocr.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index 64ba58e073..17589df06d 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -136,14 +136,12 @@ def hocr_to_dataframe(
         return ocr_df
 
     @staticmethod
-    def extract_word_from_hocr(
-        word: Tag, character_confidence_threshold: float = 0.0
-    ) -> str | None:
+    def extract_word_from_hocr(word: Tag, character_confidence_threshold: float = 0.0) -> str:
         """Extracts a word from an hOCR word tag, filtering out characters with low confidence."""
 
         character_spans = word.find_all("span", class_="ocrx_cinfo")
         if len(character_spans) == 0:
-            return None
+            return ""
 
         word_text = ""
         for character_span in character_spans: