-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathupdate_database.py
76 lines (54 loc) · 2.8 KB
/
update_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores.chroma import Chroma
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
import uuid
# Directory passed to Chroma as ``persist_directory`` when the store is written.
CHROMA_PATH = "chroma"
# Directory handed to DirectoryLoader as the source of ``*.pdf`` files.
DATA_PATH = "data"
def main():
    """Script entry point; delegates all work to ``generate_data_store()``."""
    generate_data_store()
def generate_data_store():
    """Run the ingestion pipeline end to end.

    The steps are, in order: ``load_documents()`` reads the source files,
    ``split_text()`` cuts them into chunks, and ``aggiorna()`` writes the
    chunks to the vector store.
    """
    aggiorna(split_text(load_documents()))
def load_documents():
    """Load the documents in ``DATA_PATH`` that match the ``*.pdf`` glob.

    Returns:
        The result of ``DirectoryLoader.load()`` for that directory.
    """
    pdf_loader = DirectoryLoader(DATA_PATH, glob="*.pdf")
    return pdf_loader.load()
def split_text(documents: list[Document]):
    """Split documents into overlapping chunks and print a summary line.

    Args:
        documents: The documents to split.

    Returns:
        The chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # Sizes are measured with ``len``, i.e. in characters of text.
    splitter_options = {
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks
def _dedupe_by_content(chunks: list[Document]) -> tuple[list[Document], list[str]]:
    """Return the first chunk for each distinct ``page_content`` plus its ID.

    The ID is the string form of a UUIDv5 (namespace ``uuid.NAMESPACE_DNS``)
    of the chunk text, so identical text always yields the same ID.

    Returns:
        ``(docs, ids)`` where both lists are in first-seen order and
        index-aligned: ``ids[i]`` is the ID of ``docs[i]``.
    """
    first_by_id: dict[str, Document] = {}
    for chunk in chunks:
        chunk_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content))
        # setdefault keeps the first chunk seen for an ID and ignores later
        # duplicates; dicts preserve insertion order, so keys and values
        # come back aligned with each other.
        first_by_id.setdefault(chunk_id, chunk)
    return list(first_by_id.values()), list(first_by_id)


def aggiorna(chunks: list[Document]):
    """Write content-deduplicated chunks to the Chroma store at ``CHROMA_PATH``.

    ("aggiorna" is Italian for "update".)

    Every chunk is identified by a UUIDv5 of its text, so chunks with
    identical text collapse to a single entry (the first occurrence wins)
    and the same text gets the same ID on every run. The IDs handed to
    Chroma are built in the same order as the documents they belong to;
    deriving them from an unordered ``set`` would pair documents with the
    wrong IDs.

    Args:
        chunks: Documents to store. Only ``page_content`` is used to
            detect duplicates.
    """
    unique_docs, unique_ids = _dedupe_by_content(chunks)

    if not unique_docs:
        # Nothing to embed: skip loading the embedding model and the store.
        print(f"No chunks to save to {CHROMA_PATH}.")
        return

    # NOTE(review): all-MiniLM-L6-v2 is a sentence-transformers model rather
    # than an INSTRUCTOR model; confirm HuggingFaceInstructEmbeddings is the
    # intended wrapper. Left unchanged so the embeddings written here stay
    # consistent with whatever code queries this store.
    embeddings = HuggingFaceInstructEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    db = Chroma.from_documents(
        unique_docs,
        embeddings,
        ids=unique_ids,
        persist_directory=CHROMA_PATH,
    )
    db.persist()

    # Report what was actually written, not the pre-deduplication count.
    print(f"Saved {len(unique_docs)} chunks to {CHROMA_PATH}.")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()