-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathknowledge_service.py
108 lines (96 loc) · 4.47 KB
/
knowledge_service.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
vector service
"""
import os
import nltk
# Workspace root; hard-coded to one specific machine layout.
work_dir = '/home/ma-user/work'
# Add <work_dir>/nltk_data to NLTK's resource search path. This runs before the
# imports that follow -- presumably so they can locate locally stored NLTK
# data; confirm before moving this line.
nltk.data.path.append(os.path.join(work_dir, 'nltk_data'))
from service.config import LangChainCFG
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter,MarkdownTextSplitter
from langchain.document_loaders import UnstructuredFileLoader,UnstructuredMarkdownLoader,UnstructuredPDFLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from rapidocr_onnxruntime import RapidOCR
class KnowledgeService(object):
    """Build, extend and load a FAISS vector index over local documents.

    Input files are selected by suffix: ``.txt``, ``.md`` and ``.pdf`` are read
    through the Unstructured loaders, ``.jpg`` images are converted to text
    with RapidOCR. Every source is cut into chunks before it is embedded.
    """

    # Chunking parameters shared by every splitter this service creates.
    CHUNK_SIZE = 200
    CHUNK_OVERLAP = 20
    # Suffixes (case-sensitive) that the service knows how to index.
    SUPPORTED_SUFFIXES = ('.txt', '.md', '.pdf', '.jpg')

    def __init__(self, config):
        """Store the configuration and create the embedding model.

        Args:
            config: Settings object providing ``docs_path``,
                ``knowledge_base_path`` and ``embedding_model_path``.
        """
        self.config = config
        # FAISS index; stays None until it is built or loaded.
        self.knowledge_base = None
        self.docs_path = self.config.docs_path
        self.knowledge_base_path = self.config.knowledge_base_path
        self.embeddings = HuggingFaceEmbeddings(
            model_name=self.config.embedding_model_path
        )
        # OCR engine, created on first use and then reused for every image.
        self._ocr = None

    def _text_splitter(self):
        """Return a plain character splitter using the class chunk settings."""
        return CharacterTextSplitter(
            chunk_size=self.CHUNK_SIZE, chunk_overlap=self.CHUNK_OVERLAP
        )

    def _markdown_splitter(self):
        """Return a Markdown-aware splitter using the class chunk settings."""
        return MarkdownTextSplitter(
            chunk_size=self.CHUNK_SIZE, chunk_overlap=self.CHUNK_OVERLAP
        )

    def _ocr_text(self, image_path):
        """Return the text RapidOCR finds in ``image_path``, one entry per line.

        An empty string is returned when the engine reports no result.
        """
        # getattr keeps this usable on instances created without __init__.
        ocr = getattr(self, '_ocr', None)
        if ocr is None:
            ocr = self._ocr = RapidOCR()
        result, _ = ocr(image_path)
        if not result:
            return ""
        # Element 1 of each result entry is the recognised text.
        return "\n".join(line[1] for line in result)

    def _split_file(self, file_path):
        """Load one file and return its chunks as a list of documents.

        The loader and splitter are chosen from the file suffix. A path with an
        unsupported suffix yields an empty list.
        """
        if file_path.endswith('.jpg'):
            # Images have no loader: OCR the picture, then chunk the raw text.
            return self._text_splitter().create_documents(
                [self._ocr_text(file_path)]
            )
        if file_path.endswith('.txt'):
            loader = UnstructuredFileLoader(file_path, mode="elements")
            splitter = self._text_splitter()
        elif file_path.endswith('.md'):
            loader = UnstructuredMarkdownLoader(file_path, mode="elements")
            splitter = self._markdown_splitter()
        elif file_path.endswith('.pdf'):
            loader = UnstructuredPDFLoader(file_path, mode="elements")
            # NOTE(review): PDFs are chunked with the Markdown splitter, which
            # is what the directory scan has always done; confirm this is the
            # intended splitter for PDF content.
            splitter = self._markdown_splitter()
        else:
            return []
        return splitter.split_documents(loader.load())

    def init_knowledge_base(self):
        """Initialize the local knowledge-base vectors.

        Scans the direct entries of ``docs_path`` (no recursion), chunks every
        file with a supported suffix and builds a new FAISS index from the
        result. When nothing indexable is found the current index is left
        unchanged.
        """
        print('\n#####init_knowledge_base#####\n')
        docs = []
        for name in os.listdir(self.docs_path):
            if not name.endswith(self.SUPPORTED_SUFFIXES):
                continue
            print(name)
            docs.extend(self._split_file(os.path.join(self.docs_path, name)))
        if not docs:
            # Nothing to embed; FAISS is expected to reject an empty document
            # list, so skip index construction instead of crashing.
            return
        self.knowledge_base = FAISS.from_documents(docs, self.embeddings)

    def add_document(self, document_path):
        """Chunk a single file and add it to the knowledge base.

        A new index is created when none exists yet; otherwise the chunks are
        appended to the existing one. Files with an unsupported suffix, or
        files that produce no chunks, leave the index untouched.

        Args:
            document_path: Path of the file to index.
        """
        if not document_path.endswith(self.SUPPORTED_SUFFIXES):
            print(f'unsupported document type, skipped: {document_path}')
            return
        print(document_path)
        chunks = self._split_file(document_path)
        if not chunks:
            return
        if self.knowledge_base is None:
            self.knowledge_base = FAISS.from_documents(chunks, self.embeddings)
        else:
            self.knowledge_base.add_documents(chunks)

    def load_knowledge_base(self, path=None):
        """Load a saved FAISS index and make it the active knowledge base.

        Args:
            path: Directory of the saved index. When None, the configured
                ``knowledge_base_path`` is used.

        Returns:
            The loaded FAISS index (also stored in ``self.knowledge_base``).
        """
        target = self.knowledge_base_path if path is None else path
        self.knowledge_base = FAISS.load_local(target, self.embeddings)
        return self.knowledge_base