diff --git a/.gitignore b/.gitignore index 0d96928..46e76e1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ .idea /data_base /knowledge_db -/.env \ No newline at end of file +/.env diff --git a/docs/sources.txt b/docs/sources.txt new file mode 100644 index 0000000..a4b9f9b --- /dev/null +++ b/docs/sources.txt @@ -0,0 +1,30 @@ +药理学 +https://zhuanlan.zhihu.com/p/164961617 +https://zhuanlan.zhihu.com/p/165216521 +医疗器械 +http://www.nxgy.gov.cn/zwgk/zfxxgkml/spyp/spypjg3/201708/t20170808_386803.html +http://www.nxgy.gov.cn/zwgk/zfxxgkml/spyp/spypjg3/201708/t20170808_386804.html +心内科 +https://www.sohu.com/a/342578283_456057 +癌症 +https://wjw.hunan.gov.cn/wjw/ztzl/knowledge/zthd/jljh/201605/t20160530_4044693.html +http://www.caca.org.cn/system/2021/04/30/030005405.shtml +https://www.cancer.org/content/dam/cancer-org/cancer-control/zh/booklets-flyers/what-is-cancer.pdf +https://blog.dana-farber.org/insight/zh-hans/%E7%99%8C%E7%97%87%E5%92%8C%E8%82%BF%E7%98%A4%E7%9A%84%E5%8C%BA%E5%88%AB/ +眼科 +http://cnszgx.cn/read.asp?ID=86 +https://zhuanlan.zhihu.com/p/359339233 +https://www.sohu.com/a/654565938_120960539 +https://www.sohu.com/a/415154851_487694 +妇产科 +https://www.medsci.cn/article/show_article.do?id=5fbb11600e0a +https://zhuanlan.zhihu.com/p/267976684 +http://wsjkw.jl.gov.cn/zdzt/jkkp/202108/t20210805_8170018.html +疾病 +https://hospital.nwu.edu.cn/info/1036/2087.htm +http://www.ynlf.gov.cn/info/3132/68031.htm +https://www.kepuchina.cn/qykj/yxqy/201712/t20171208_291650.shtml +http://jk.anhuinews.com/newjklm/jkdjt/202310/t20231003_7104539.html +中医体质 +http://www.sdgwlc.com/health_detail/879.html +https://finance.sina.cn/2021-07-27/detail-ikqciyzk7865390.d.html \ No newline at end of file diff --git a/server/api/__pycache__/hello.cpython-39.pyc b/server/api/__pycache__/hello.cpython-39.pyc index 518a8db..a0a72c3 100644 Binary files a/server/api/__pycache__/hello.cpython-39.pyc and b/server/api/__pycache__/hello.cpython-39.pyc differ diff --git a/server/service/__pycache__/chain.cpython-39.pyc b/server/service/__pycache__/chain.cpython-39.pyc index 86e3f6e..eee5df5 100644 Binary files a/server/service/__pycache__/chain.cpython-39.pyc and b/server/service/__pycache__/chain.cpython-39.pyc differ diff --git a/server/service/__pycache__/load.cpython-39.pyc b/server/service/__pycache__/load.cpython-39.pyc index c1d3cf4..8a35325 100644 Binary files a/server/service/__pycache__/load.cpython-39.pyc and b/server/service/__pycache__/load.cpython-39.pyc differ diff --git a/server/service/__pycache__/split.cpython-39.pyc b/server/service/__pycache__/split.cpython-39.pyc index 536c698..ce4d9ad 100644 Binary files a/server/service/__pycache__/split.cpython-39.pyc and b/server/service/__pycache__/split.cpython-39.pyc differ diff --git a/server/service/load.py b/server/service/load.py index bf11b80..c14ab60 100644 --- a/server/service/load.py +++ b/server/service/load.py @@ -74,7 +74,7 @@ def load_docs(): def load_db(category: str): - base_directory = '../../data_base/chroma/' + base_directory = '../../data_base/' persist_directory = os.path.join(base_directory, category) # 定义 Embeddings embedding = QianfanEmbeddingsEndpoint( diff --git a/server/service/persist_cmd.py b/server/service/persist_cmd.py index 9438e74..a640c4b 100644 --- a/server/service/persist_cmd.py +++ b/server/service/persist_cmd.py @@ -1,10 +1,12 @@ import os +import time from langchain_community.embeddings import QianfanEmbeddingsEndpoint from langchain_community.vectorstores.chroma import Chroma from server.service.split import split_docs from server.service.load import load_docs +from dotenv import load_dotenv, find_dotenv def persist_vector_db(category: str, s_docs): @@ -12,8 +14,9 @@ def persist_vector_db(category: str, s_docs): embedding = QianfanEmbeddingsEndpoint( streaming=True, model="Embedding-V1", + chunk_size=16, ) - base_directory = '../../data_base/chroma/' + base_directory = '../../data_base/' # 定义持久化路径 persist_directory = os.path.join(base_directory, category) @@ -29,6 +32,9 @@ def persist_vector_db(category: str, s_docs): if __name__ == '__main__': + _ = load_dotenv(find_dotenv()) docs_dict = load_docs() for category, docs in docs_dict.items(): + # if category not in ['Chinese_medicine_physique','device','medicine','cancer','cardiology',]: + # print(docs) persist_vector_db(category, docs)