-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathservice.py
153 lines (127 loc) · 5.04 KB
/
service.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""
Content extractor service
Uses Apache Tika and Tesseract OCR libraries to extract content from different types of files
"""
import os
import io
import json
import tempfile
from pathlib import Path
import cherrypy
import requests
from flask import Flask, request, Response
from werkzeug.exceptions import BadRequest, InternalServerError
from utils import logging, file_utils, exceptions, config
from service import processor
# Module-wide logger. Note that `logging` here is the project's utils.logging
# module (see the imports), not the standard library one; it is given the
# name "main" and the level from config.LOG_LEVEL.
LOGGER = logging.get_logger("main", config.LOG_LEVEL)
# Flask application object that the route decorators in this module attach to.
APP = Flask(__name__)
# NOTE(review): presumably installs JSON-formatted error handlers on APP —
# confirm against utils.exceptions.JSONExceptionHandler.
exceptions.JSONExceptionHandler(APP)
@APP.route("/post_file_list", methods=["POST"])
def post_file_list():
    """
    Upload one or more files and return the parsed results as a JSON array.

    Each uploaded file is written to a temporary file on disk, handed to
    ``processor.process_file`` by path, and the temporary file is removed
    afterwards, even when writing or processing fails.

    :return: ``Response`` with mimetype ``application/json`` whose body is the
        list of per-file results, in the iteration order of ``request.files``
    :raises BadRequest: if the request carries no files, or if any file is
        empty or rejected by ``file_utils.allowed_file``
    """
    files = request.files
    if not files:
        LOGGER.info("No file found in request")
        raise BadRequest("No file found in request")

    result = []
    for file in files:
        upload = files[file]
        if not upload or not file_utils.allowed_file(upload.filename):
            LOGGER.info("No file or file not allowed")
            raise BadRequest("No file or file not allowed")

        temp_path = None
        try:
            # delete=False: the file has to outlive the ``with`` block so the
            # processor can reopen it by name once it is closed.
            with tempfile.NamedTemporaryFile(mode='r+b', delete=False) as temp_file_ptr:
                temp_path = temp_file_ptr.name
                temp_file_ptr.write(upload.read())
                temp_file_ptr.flush()
                os.fsync(temp_file_ptr.fileno())
            result.append(processor.process_file(temp_path))
        finally:
            # Always clean up, otherwise a failing parse leaks the temp file.
            if temp_path is not None:
                os.remove(temp_path)

    return Response(
        json.dumps(result), mimetype='application/json')
@APP.route("/post_json_list", methods=["POST"])
def post_json_list():
    """
    Download, parse and optionally re-upload the files listed in a JSON array.

    The request body is a JSON array of objects. Each object must carry the
    keys named by ``config.FILE_URL`` and ``config.FILE_NAME`` and may carry
    ``local_path``. For every object the file is downloaded, parsed with
    ``processor.process_file`` and then:

    * if ``config.UPLOAD_URL`` is set, the extracted text is posted there as a
      ``.txt`` file and the object gets ``transfer_service`` set to
      ``"PARSED AND TRANSFERRED"``;
    * otherwise the parse result is stored in the object under ``parsed_data``.

    When an entry fails, the object gets ``transfer_service`` set to
    ``"ERROR: <message>"`` and processing continues with the next entry,
    unless ``config.FAIL_ON_ERROR`` is truthy, in which case the request
    is aborted.

    :return: ``Response`` with content type ``application/json`` holding the
        annotated input array
    :raises BadRequest: if the request body is not a JSON array
    :raises InternalServerError: on the first failing entry when
        ``config.FAIL_ON_ERROR`` is truthy
    """
    input_data = request.get_json()
    if not isinstance(input_data, list):
        # Covers a missing/non-JSON body (None) as well as a JSON object or
        # scalar, which would otherwise fail later with a TypeError (HTTP 500).
        LOGGER.info("Request body is not a JSON array")
        raise BadRequest("Request body must be a JSON array")

    for input_entity in input_data:
        file_url = input_entity[config.FILE_URL]
        file_name = input_entity[config.FILE_NAME]
        local_path = input_entity.get('local_path')
        LOGGER.info("processing request for %s", file_name)
        file_path = None
        try:
            LOGGER.info("download file %s", file_name)
            # NOTE(review): file_url is taken from the request body and fetched
            # as-is; consider restricting allowed hosts if callers are untrusted.
            # The ``with`` releases the streamed connection once the download
            # helper has returned.
            with requests.get(file_url, stream=True) as res:
                res.raise_for_status()
                file_path = file_utils.download_file(res)

            parsed_file = processor.process_file(file_path)
            if parsed_file["status"] == 200:
                LOGGER.info("Successfully parsed %s", file_name)
            else:
                raise Exception("Parsed file status not ok: {}".format(parsed_file))

            if config.UPLOAD_URL:
                headers = {}
                if local_path:
                    headers["local_path"] = local_path
                LOGGER.debug("Starting upload file %s to %s", file_name, config.UPLOAD_URL)
                if file_utils.allowed_file(file_name):
                    file_like_obj = io.StringIO(parsed_file['content'])
                else:
                    LOGGER.warning("file %s not allowed, upload empty txt instead of it.",
                                   file_name)
                    file_like_obj = io.StringIO()
                if config.PRESERVE_FILE_TYPE:
                    # Keep the original extension visible: "a.pdf" -> "a.pdf.txt".
                    file_name += ".txt"
                else:
                    # Replace the extension: "a.pdf" -> "a.txt".
                    file_name = str(Path(file_name).with_suffix('.txt'))
                upload_response = requests.post(config.UPLOAD_URL,
                                                files={file_name: (file_name, file_like_obj)},
                                                headers=headers)
                # Do not report a transfer the upload endpoint rejected.
                upload_response.raise_for_status()
                LOGGER.debug("File %s uploaded", file_path)
                input_entity['transfer_service'] = "PARSED AND TRANSFERRED"
            else:
                input_entity['parsed_data'] = parsed_file
        except Exception as exc:
            # Boundary handler: any per-entry failure is either reported in the
            # response or, with FAIL_ON_ERROR, turned into an HTTP 500.
            LOGGER.warning("Error occurred: %s", exc)
            if config.FAIL_ON_ERROR:
                raise InternalServerError(exc)
            input_entity['transfer_service'] = "ERROR: {}".format(str(exc))
        finally:
            if file_path:
                LOGGER.debug("Deleting temporary file %s", file_path)
                os.remove(file_path)

    return Response(json.dumps(input_data), content_type='application/json')
def clean_temp_folder():
    """
    Remove temp files left behind by PDFBox from the system temp directory.

    Every entry of ``tempfile.gettempdir()`` whose name starts with ``PDFBox``
    is unlinked. Cleanup is best-effort: an entry that cannot be removed is
    logged as a warning and skipped.
    """
    temp_dir = tempfile.gettempdir()
    for entry in os.listdir(temp_dir):
        if not entry.startswith('PDFBox'):
            continue
        try:
            # os.listdir yields bare names; join with temp_dir so the unlink
            # targets the temp directory instead of the current working dir.
            os.unlink(os.path.join(temp_dir, entry))
        except OSError as exc:
            LOGGER.warning("couldn't unlink %s due to %s", entry, exc)
if __name__ == '__main__':
    # Sweep stale PDFBox temp files before the server starts.
    clean_temp_folder()

    # Mount the Flask WSGI application at the root of CherryPy's tree.
    cherrypy.tree.graft(APP, '/')

    server_settings = {
        'environment': 'production',
        'engine.autoreload_on': True,
        'log.screen': True,
        'server.socket_port': config.PORT,
        'server.socket_host': '0.0.0.0',
        'server.thread_pool': 10,
        # 0 disables CherryPy's request body size limit (see CherryPy docs).
        'server.max_request_body_size': 0,
    }
    cherrypy.config.update(server_settings)

    # Start the engine and keep the main thread parked until shutdown.
    cherrypy.engine.start()
    cherrypy.engine.block()