From 485bfd6c0891bf5bfdd092dc767b85ce6774c261 Mon Sep 17 00:00:00 2001 From: chongchuanbing Date: Thu, 10 Oct 2024 09:09:29 +0800 Subject: [PATCH] fix: Large document thumbnail display failed (#2763) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? In MySQL, when the thumbnail base64 of a document is relatively large, the display of the document's thumbnail fails. Now, I put the document thumbnail into MinIO storage. ### Type of change - [✓] Bug Fix (non-breaking change which fixes an issue) --------- Co-authored-by: chongchuanbing --- api/apps/document_app.py | 6 ++++++ api/contants.py | 4 +++- api/db/services/file_service.py | 13 ++++++++++--- api/utils/file_utils.py | 18 ++++++++++-------- 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/api/apps/document_app.py b/api/apps/document_app.py index 0f69dc5ee1e..83f7b363232 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -51,6 +51,7 @@ from rag.utils.storage_factory import STORAGE_IMPL from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory from api.utils.web_utils import html2pdf, is_valid_url +from api.contants import IMG_BASE64_PREFIX @manager.route('/upload', methods=['POST']) @@ -209,6 +210,11 @@ def list_docs(): try: docs, tol = DocumentService.get_by_kb_id( kb_id, page_number, items_per_page, orderby, desc, keywords) + + for doc_item in docs: + if doc_item['thumbnail'] and not doc_item['thumbnail'].startswith(IMG_BASE64_PREFIX): + doc_item['thumbnail'] = f'/v1/document/image/{kb_id}-{doc_item['thumbnail']}' + return get_json_result(data={"total": tol, "docs": docs}) except Exception as e: return server_error_response(e) diff --git a/api/contants.py b/api/contants.py index 61c13ec9ec3..636c246bcfb 100644 --- a/api/contants.py +++ b/api/contants.py @@ -13,4 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-NAME_LENGTH_LIMIT = 2 ** 10 \ No newline at end of file +NAME_LENGTH_LIMIT = 2 ** 10 + +IMG_BASE64_PREFIX = 'data:image/png;base64,' \ No newline at end of file diff --git a/api/db/services/file_service.py b/api/db/services/file_service.py index 48796f6b38c..a63643e13c8 100644 --- a/api/db/services/file_service.py +++ b/api/db/services/file_service.py @@ -26,7 +26,7 @@ from api.db.services.document_service import DocumentService from api.db.services.file2document_service import File2DocumentService from api.utils import get_uuid -from api.utils.file_utils import filename_type, thumbnail +from api.utils.file_utils import filename_type, thumbnail_img from rag.utils.storage_factory import STORAGE_IMPL @@ -354,8 +354,15 @@ def upload_document(self, kb, file_objs, user_id): location += "_" blob = file.read() STORAGE_IMPL.put(kb.id, location, blob) + + doc_id = get_uuid() + + img = thumbnail_img(filename, blob) + thumbnail_location = f'thumbnail_{doc_id}.png' + STORAGE_IMPL.put(kb.id, thumbnail_location, img) + doc = { - "id": get_uuid(), + "id": doc_id, "kb_id": kb.id, "parser_id": self.get_parser(filetype, filename, kb.parser_id), "parser_config": kb.parser_config, @@ -364,7 +371,7 @@ def upload_document(self, kb, file_objs, user_id): "name": filename, "location": location, "size": len(blob), - "thumbnail": thumbnail(filename, blob) + "thumbnail": thumbnail_location } DocumentService.insert(doc) diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py index 807d5274120..2428f3f21e3 100644 --- a/api/utils/file_utils.py +++ b/api/utils/file_utils.py @@ -25,6 +25,7 @@ from ruamel.yaml import YAML from api.db import FileType +from api.contants import IMG_BASE64_PREFIX PROJECT_BASE = os.getenv("RAG_PROJECT_BASE") or os.getenv("RAG_DEPLOY_BASE") RAG_BASE = os.getenv("RAG_BASE") @@ -168,23 +169,20 @@ def filename_type(filename): return FileType.OTHER.value - -def thumbnail(filename, blob): +def thumbnail_img(filename, blob): filename = filename.lower() if 
re.match(r".*\.pdf$", filename): pdf = pdfplumber.open(BytesIO(blob)) buffered = BytesIO() pdf.pages[0].to_image(resolution=32).annotated.save(buffered, format="png") - return "data:image/png;base64," + \ - base64.b64encode(buffered.getvalue()).decode("utf-8") + return buffered.getvalue() if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename): image = Image.open(BytesIO(blob)) image.thumbnail((30, 30)) buffered = BytesIO() image.save(buffered, format="png") - return "data:image/png;base64," + \ - base64.b64encode(buffered.getvalue()).decode("utf-8") + return buffered.getvalue() if re.match(r".*\.(ppt|pptx)$", filename): import aspose.slides as slides @@ -194,11 +192,15 @@ def thumbnail(filename, blob): buffered = BytesIO() presentation.slides[0].get_thumbnail(0.03, 0.03).save( buffered, drawing.imaging.ImageFormat.png) - return "data:image/png;base64," + \ - base64.b64encode(buffered.getvalue()).decode("utf-8") + return buffered.getvalue() except Exception as e: pass + return None +def thumbnail(filename, blob): + img = thumbnail_img(filename, blob) + return IMG_BASE64_PREFIX + \ + base64.b64encode(img).decode("utf-8") def traversal_files(base): for root, ds, fs in os.walk(base):