Skip to content

Commit

Permalink
fix gb2312 encoding issue (infiniflow#394)
Browse files — browse the repository at this point in the history
### What problem does this PR solve?

Issue link: infiniflow#384
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
  • Loading branch information
KevinHuSh authored Apr 16, 2024
1 parent d396052 commit 657cd75
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 4 deletions.
8 changes: 5 additions & 3 deletions rag/app/naive.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@
from docx import Document
import re
from deepdoc.parser.pdf_parser import PlainParser
from rag.app import laws
from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks
from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
from rag.settings import cron_logger

Expand Down Expand Up @@ -140,7 +139,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.1, "Start to parse.")
txt = ""
if binary:
txt = binary.decode("utf-8")
try:
txt = binary.decode("utf-8")
except Exception as e:
txt = binary.decode("gb2312")
else:
with open(filename, "r") as f:
while True:
Expand Down
2 changes: 1 addition & 1 deletion rag/nlp/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ def insert_citations(self, answer, chunks, chunk_v,
pieces_.append(t)
es_logger.info("{} => {}".format(answer, pieces_))
if not pieces_:
return answer
return answer, set([])

ans_v, _ = embd_mdl.encode(pieces_)
assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
Expand Down

0 comments on commit 657cd75

Please sign in to comment.