-
Notifications
You must be signed in to change notification settings - Fork 1
/
dataextraction.py
28 lines (22 loc) · 897 Bytes
/
dataextraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from pdf2image import convert_from_path
import os
import pytesseract
from PIL import Image
import pandas as pd
class text_extract:
    """Run Tesseract OCR on field images and collect the recognised text.

    Each successful ``retrieve_text`` call appends a ``[field_name, text]``
    pair to ``fields``; ``save_fields`` writes the collected pairs to a CSV
    file.
    """

    # Runs once, when the class body is evaluated: tells pytesseract where
    # the Tesseract executable lives (a Windows install path).
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    # NOTE(review): this list is defined on the class, so every instance
    # appends to the same object. Left unchanged because callers outside
    # this file may rely on the shared list -- confirm before making it
    # per-instance.
    fields = []

    def retrieve_text(self, field_name: str, image_path) -> None:
        """OCR one image and record its text under ``field_name``.

        The image is read with the ``eng_layer`` language data, with the
        tessdata directory set to ``./ocr-layers`` and ``--psm 6``. On
        success the result is printed as ``<field_name>:<text>`` and
        appended to ``fields`` as ``[field_name, text]``.

        This is best-effort: if anything in the process raises, the
        method prints ``<field_name> not found``, records nothing, and
        returns normally.

        Args:
            field_name: Label stored alongside the recognised text.
            image_path: Image to open with ``PIL.Image.open``.
        """
        tessdata_dir_config = r'--tessdata-dir "./ocr-layers" --psm 6'
        try:
            # The context manager closes the image's file handle as soon as
            # OCR is done instead of leaving it open until garbage collection.
            with Image.open(image_path) as img:
                img_text = pytesseract.image_to_string(
                    img, lang='eng_layer', config=tessdata_dir_config)
            print(field_name + ':' + img_text)
            self.fields.append([field_name, img_text])
        except Exception:
            # Deliberately broad: a failure on one field must not abort the
            # caller's run over the remaining fields.
            print(field_name + ' not found')

    def save_fields(self, file_path: str) -> None:
        """Write the collected fields to ``fields.csv`` inside ``file_path``.

        The CSV has the columns ``Field`` and ``Value``; the DataFrame
        index is written as well (pandas' ``to_csv`` default).

        Args:
            file_path: Directory in which ``fields.csv`` is created.
        """
        df = pd.DataFrame(self.fields, columns=["Field", "Value"])
        # os.path.join uses the platform's separator; the previous
        # hard-coded "\\" only produced a valid path on Windows.
        df.to_csv(os.path.join(file_path, "fields.csv"))