Skip to content

Commit

Permalink
Added a file that is able to run OCR on all of the TIF files within the SCC
Browse files Browse the repository at this point in the history
  • Loading branch information
njquisel committed Oct 31, 2024
1 parent fbf5a88 commit 929526a
Showing 1 changed file with 37 additions and 0 deletions.
37 changes: 37 additions & 0 deletions modules/deed_preprocessing/read_all_tiffs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import os
import zipfile
import importlib.util
from spellcheck import correct_spelling

# Load the OCR helper directly from its source file rather than through a
# normal package import. The path is relative, so it is resolved against the
# current working directory of the process running this script.
_OCR_MODULE_NAME = "google_cloud_ocr"
_OCR_MODULE_PATH = "../google_cloud_ocr/google_cloud_ocr.py"

spec = importlib.util.spec_from_file_location(
    _OCR_MODULE_NAME,
    _OCR_MODULE_PATH,
)
google_cloud_ocr_module = importlib.util.module_from_spec(spec)
# Execute the module body so that its functions become available as
# attributes of ``google_cloud_ocr_module``.
spec.loader.exec_module(google_cloud_ocr_module)

# Directory that receives one ``<tiff basename>.txt`` file per processed image.
# Relative path: resolved against the current working directory.
output_dir = './outputs'

os.makedirs(output_dir, exist_ok=True)

# Recursively walk the deed-book tree and run OCR + spell correction on every
# TIFF image found, writing the resulting text into ``output_dir``.
for root, dirs, files in os.walk(r'../../../../mass-sec-state-deeds-data/Books 547-1849/'):
    for file in files:
        # Compare the extension case-insensitively and accept both common
        # TIFF spellings so that ``.tif`` / ``.tiff`` files are not skipped.
        if not file.lower().endswith(('.tif', '.tiff')):
            continue

        tiff_file_path = os.path.join(root, file)
        print(tiff_file_path)

        # Per-file error boundary: a failure on one image (unreadable file,
        # OCR error, write error) is reported and the batch keeps going.
        try:
            # The OCR helper is handed the open binary file object.
            with open(tiff_file_path, 'rb') as tiff_file:
                extracted_text = google_cloud_ocr_module.google_cloud_ocr(tiff_file)

            # Spell check the extracted text.
            # NOTE(review): assumes correct_spelling returns the corrected
            # text as a str -- confirm against the spellcheck module.
            corrected_text = correct_spelling(extracted_text)

            # NOTE(review): the output name is built from the basename only,
            # so identically named TIFFs in different sub-folders overwrite
            # each other's output. Left as-is to keep output names stable.
            output_file_name = f"{os.path.splitext(file)[0]}.txt"
            output_file_path = os.path.join(output_dir, output_file_name)

            # Persist the spell-corrected text; previously the raw OCR text
            # was written and the correction result was thrown away.
            with open(output_file_path, 'w', encoding='utf-8') as output_txt:
                output_txt.write(corrected_text)

        except Exception as e:
            print(f"Error processing {file}: {str(e)}")

print("OCR processing complete. Text files are saved in:", output_dir)

0 comments on commit 929526a

Please sign in to comment.