Skip to content

Commit

Permalink
Fix the bug causing garbled text (#3640)
Browse files Browse the repository at this point in the history
### What problem does this PR solve?

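Fix the bug causing garbled text (#3613): detect the text encoding with `chardet` before falling back to trial decoding, and add the comma that was missing between `'utf_32_le'` and `'utf_16_be'` in the codec list.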

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn>
  • Loading branch information
Feiue and liuhua authored Nov 26, 2024
1 parent f6c3d7c commit 5c59651
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 6 deletions.
4 changes: 2 additions & 2 deletions api/apps/kb_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,9 @@ def rm():
message="Database error (Document removal)!")
f2d = File2DocumentService.get_by_document_id(doc.id)
FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])
File2DocumentService.delete_by_document_id(doc.id)

FileService.filter_delete(
[File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])
if not KnowledgebaseService.delete_by_id(req["kb_id"]):
return get_data_error_result(
message="Database error (Knowledgebase removal)!")
Expand Down
4 changes: 2 additions & 2 deletions api/apps/sdk/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,9 +252,9 @@ def delete(tenant_id):
File.id == f2d[0].file_id,
]
)
FileService.filter_delete(
[File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])
File2DocumentService.delete_by_document_id(doc.id)
FileService.filter_delete(
[File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])
if not KnowledgebaseService.delete_by_id(id):
return get_error_data_result(message="Delete dataset error.(Database error)")
return get_result(code=settings.RetCode.SUCCESS)
Expand Down
11 changes: 9 additions & 2 deletions rag/nlp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
from PIL import Image
import json

import chardet

all_codecs = [
'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
'cp037', 'cp273', 'cp424', 'cp437',
Expand All @@ -43,12 +45,17 @@
'iso8859_14', 'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u',
'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2', 'mac_roman',
'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004', 'shift_jisx0213',
'utf_32', 'utf_32_be', 'utf_32_le''utf_16_be', 'utf_16_le', 'utf_7'
'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16_be', 'utf_16_le', 'utf_7', 'windows-1250', 'windows-1251',
'windows-1252', 'windows-1253', 'windows-1254', 'windows-1255', 'windows-1256',
'windows-1257', 'windows-1258', 'latin-2'
]


def find_codec(blob):
global all_codecs
detected = chardet.detect(blob[:1024])
if detected['confidence'] > 0.5:
return detected['encoding']

for c in all_codecs:
try:
blob[:1024].decode(c)
Expand Down

0 comments on commit 5c59651

Please sign in to comment.