-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextrai_cpf_cnpj.py
121 lines (99 loc) · 4.35 KB
/
extrai_cpf_cnpj.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# -*- coding: utf-8 -*-
from modules.utils import procura_cnpj, procura_cpf, select_files, choose_type, to_table, show_exception_and_exit
from pdfminer.high_level import extract_text
import docx2txt
import os
import re
import sys
from tqdm import tqdm
import win32com.client
from time import sleep
class Doc:
    """A source file (PDF, .docx or .doc) whose text is searched for CPFs/CNPJs.

    ``text`` is empty until ``get_text`` has been called.
    """

    def __init__(self, file_path, file_type):
        """Store where the file lives and which reader it needs.

        Args:
            file_path: Path of the file to read.
            file_type: ``'*.pdf'``, ``'*.docx'`` or ``'*.doc'``; selects the
                extraction backend used by ``get_text``.
        """
        self.type = file_type
        self.path = file_path
        self.filename = os.path.basename(file_path)
        self.text = ''

    @staticmethod
    def _one_line(text):
        """Return *text* with every run of CR/LF characters replaced by one space."""
        return re.sub(r'[\r\n]+', ' ', text)

    def get_text(self):
        """Load the file content into ``self.text`` as a single line.

        A ``self.type`` other than the three supported patterns leaves
        ``self.text`` unchanged.

        Raises:
            TypeError: When reading or flattening the content raises
                ``TypeError``; re-raised with a message naming the file.
        """
        if self.type == '*.pdf':
            try:
                self.text = self._one_line(extract_text(self.path))
            except TypeError as err:
                # Raise a real exception (a bare str cannot be raised) and
                # interpolate the file name into the message.
                raise TypeError(
                    f"Erro ao ler o arquivo {self.filename}. É um 'pdf' pesquisável?"
                ) from err
        elif self.type == '*.docx':
            try:
                self.text = self._one_line(docx2txt.process(self.path))
            except TypeError as err:
                raise TypeError(
                    f"Erro ao ler o arquivo {self.filename}. É um '.docx'?"
                ) from err
        elif self.type == '*.doc':
            try:
                word = win32com.client.Dispatch("Word.Application")
                try:
                    word.visible = False
                    word.Documents.Open(self.path)
                    self.text = self._one_line(word.ActiveDocument.Range().Text)
                finally:
                    # Quit even when reading fails; otherwise the invisible
                    # Word instance started above is never shut down.
                    word.Application.Quit()
            except TypeError as err:
                raise TypeError(
                    f"Erro ao ler o arquivo {self.filename}. É um '.doc'?"
                ) from err

    def get_cpfs(self):
        """Return ``(cpf, filename)`` pairs for each CPF ``procura_cpf`` finds in the text."""
        return [(cpf, self.filename) for cpf in procura_cpf(self.text)]

    def get_cnpjs(self):
        """Return ``(cnpj, filename)`` pairs for each CNPJ ``procura_cnpj`` finds in the text."""
        # Previously this searched for CPFs by mistake.
        return [(cnpj, self.filename) for cnpj in procura_cnpj(self.text)]
if __name__ == '__main__':
    # Install the hook once, before any prompt, so every uncaught exception is
    # routed through show_exception_and_exit (intended to keep the console
    # from closing on errors).
    sys.excepthook = show_exception_and_exit

    # Menu option -> file pattern handed to the file picker and to Doc.
    file_type = {'1': '*.pdf', '2': '*.docx', '3': '*.doc'}

    input_type = choose_type()
    while True:
        # Keep asking until the answer is a known option; the previous code
        # discarded the retry's answer and then failed with KeyError.
        while input_type not in file_type:
            input_type = choose_type()
        pattern = file_type[input_type]

        # Open the GUI window so the user can pick the source files.
        files = select_files(pattern)
        if not files:
            # Assumes select_files returns an empty/None value when the user
            # picks nothing - TODO confirm. Without this guard files[0] fails.
            print('\nNenhum arquivo selecionado.\n')
            input_type = choose_type()
            continue

        # Output files are written next to the first selected source file.
        BASE_PATH = os.path.dirname(files[0])
        cpf_path = os.path.join(BASE_PATH, "cpfs_encontrados.txt")
        cnpj_path = os.path.join(BASE_PATH, "cnpjs_encontrados.txt")

        # Results of this round only; the output files themselves are appended to.
        cpf_results = []
        cnpj_results = []

        print('\nLendo arquivos...\n')
        for file_path in tqdm(files):
            doc = Doc(os.path.abspath(file_path), pattern)
            # Store the file's text in one line.
            doc.get_text()
            cpfs = procura_cpf(doc.text)
            cnpjs = procura_cnpj(doc.text)
            # One record per hit: number|file name|full path|"whole text".
            quoted_text = f"\"{doc.text}\""
            for cpf in cpfs:
                cpf_results.append('|'.join([cpf, doc.filename, doc.path, quoted_text]))
            for cnpj in cnpjs:
                cnpj_results.append('|'.join([cnpj, doc.filename, doc.path, quoted_text]))

        # Append the CPF records to the .txt file (mode "a": earlier runs are kept).
        print('\nGravando CPFs em .txt ...')
        with open(cpf_path, mode="a", newline='\n', encoding="utf-8") as new_file:
            for result in tqdm(cpf_results):
                new_file.write(f'{result}\n')

        # Append the CNPJ records to the .txt file.
        print('\nGravando CNPJs ...')
        with open(cnpj_path, mode="a", newline='\n', encoding="utf-8") as new_file:
            for result in tqdm(cnpj_results):
                new_file.write(f'{result}\n')

        # NOTE(review): the reason for this pause is not visible here - confirm
        # whether it is still needed before the tables are generated.
        sleep(3)

        # Generate html tables from the two result files.
        print('\nGerando tabelas...')
        to_table(cpf_path)
        to_table(cnpj_path)

        # Ask what to process next; the loop has no exit condition of its own.
        input_type = choose_type()