diff --git a/example.env b/example.env
index b52912b..ff91e9a 100644
--- a/example.env
+++ b/example.env
@@ -1,6 +1,7 @@
 # Generic
 MODEL_N_CTX=1024
 LLAMA_EMBEDDINGS_MODEL=models/ggml-model-q4_0.bin
+USE_MLOCK=true

 # Ingestion
 PERSIST_DIRECTORY=db
@@ -12,4 +13,4 @@ INGEST_CHUNK_OVERLAP=50
 MODEL_TYPE=LlamaCpp # GPT4All or LlamaCpp
 MODEL_PATH=models/ggjt-v1-vic7b-uncensored-q4_0.bin
 MODEL_TEMP=0.8
-MODEL_STOP=###,\n
\ No newline at end of file
+MODEL_STOP=###,\n
diff --git a/ingest.py b/ingest.py
index 96d1fc0..743bfa7 100644
--- a/ingest.py
+++ b/ingest.py
@@ -2,17 +2,24 @@ import os
 import shutil
 import sys
+from hashlib import md5
 from pathlib import Path

 from langchain.docstore.document import Document
-from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredEPubLoader, \
-    UnstructuredHTMLLoader, Docx2txtLoader, UnstructuredPowerPointLoader
+from langchain.document_loaders import (
+    CSVLoader,
+    Docx2txtLoader,
+    PDFMinerLoader,
+    TextLoader,
+    UnstructuredEPubLoader,
+    UnstructuredHTMLLoader,
+    UnstructuredPowerPointLoader,
+)
 from langchain.embeddings import LlamaCppEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.vectorstores import Qdrant
+from qdrant_client import QdrantClient, models

-from load_env import persist_directory, chunk_size, chunk_overlap, llama_embeddings_model, model_n_ctx, \
-    documents_directory
+from load_env import chunk_overlap, chunk_size, documents_directory, llama_embeddings_model, model_n_ctx, persist_directory, use_mlock

 file_loaders = {  # extension -> loader
     "txt": lambda path: TextLoader(path, encoding="utf8"),
@@ -34,6 +41,17 @@ def load_one_doc(filepath: Path) -> list[Document]:
     return file_loaders[filepath.suffix[1:]](str(filepath)).load()


+def embed_documents_with_progress(embedding_model: LlamaCppEmbeddings, texts: list[str]) -> list[list[float]]:
+    """Wrapper around embed_documents that prints per-chunk progress."""
+    embeddings = []
+    n_chunks = len(texts)
+    for i, text in enumerate(texts):
+        print(f"embedding chunk {i + 1}/{n_chunks}")
+        embeddings.append(embedding_model.client.embed(text))
+
+    return [list(map(float, e)) for e in embeddings]
+
+
 def main(sources_directory: str, cleandb: str) -> None:
     """enables 'python ingest.py random_path/' to ingest, or 'python ingest.py random_path/ y' to purge the existing db first"""
     db_dir = persist_directory  # can be changed to ":memory:" but is not persistent
@@ -49,12 +67,46 @@ def main(sources_directory: str, cleandb: str) -> None:
         for file in files:
             documents += load_one_doc(Path(root) / file)

+    # Split text
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-    texts = text_splitter.split_documents(documents)
-    print(f"Found {len(texts)} chunks from {len(documents)} documents to index")
-    llama = LlamaCppEmbeddings(model_path=llama_embeddings_model, n_ctx=model_n_ctx)
-    Qdrant.from_documents(texts, llama, path=db_dir, collection_name="test")
-    print(f"Indexed {len(texts)} chunks from {len(documents)} documents in Qdrant")
+    split_documents = text_splitter.split_documents(documents)
+    texts = [d.page_content for d in split_documents]
+    metadatas = [d.metadata for d in split_documents]
+    print(f"Found {len(split_documents)} chunks from {len(documents)} documents to index")
+
+    # Generate embeddings
+    print("Generating embeddings...")
+    embedding_model = LlamaCppEmbeddings(model_path=llama_embeddings_model, n_ctx=model_n_ctx, use_mlock=use_mlock)
+    embeddings = embed_documents_with_progress(embedding_model, texts)
+
+    # Store embeddings
+    print("Storing embeddings...")
+    client = QdrantClient(path=db_dir)  # using Qdrant.from_documents recreates the db each time
+    try:
+        collection = client.get_collection("test")
+    except ValueError:  # collection doesn't exist yet
+        print("Creating a new store")
+        # Infer the vector size from the embeddings we just computed
+        vector_size = max(len(e) for e in embeddings)
+        client.recreate_collection(
+            collection_name="test",
+            vectors_config=models.VectorParams(
+                size=vector_size,
+                distance=models.Distance["COSINE"],
+            ),
+        )
+        collection = client.get_collection("test")
+    print(f"Loaded collection has {collection.points_count} data points")
+    client.upsert(
+        collection_name="test",
+        points=models.Batch.construct(
+            ids=[md5(text.encode("utf-8")).hexdigest() for text in texts],
+            vectors=embeddings,
+            payloads=[{"page_content": text, "metadata": metadatas[i]} for i, text in enumerate(texts)],
+        ),
+    )
+    collection = client.get_collection("test")
+    print(f"Indexed {len(split_documents)} chunks from {len(documents)} documents in Qdrant. Total points: {collection.points_count}")


 if __name__ == "__main__":
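A note on the storage change above: `Qdrant.from_documents` rebuilt the collection from scratch on every run, whereas the raw `qdrant_client` path upserts into the existing collection, so re-ingesting merges new chunks instead of wiping the store. A minimal sketch of reading the resulting store back with a nearest-neighbour query, reusing the PR's own `load_env` values (illustrative only, not part of the diff):

    # sketch: query the local Qdrant store that ingest.py writes (assumes the
    # collection name "test" and the env settings from this PR)
    from langchain.embeddings import LlamaCppEmbeddings
    from qdrant_client import QdrantClient

    from load_env import llama_embeddings_model, model_n_ctx, persist_directory, use_mlock

    embedding_model = LlamaCppEmbeddings(model_path=llama_embeddings_model, n_ctx=model_n_ctx, use_mlock=use_mlock)
    query_vector = embedding_model.embed_query("What causes muscle spasms?")

    client = QdrantClient(path=persist_directory)  # same on-disk store ingest.py writes to
    for hit in client.search(collection_name="test", query_vector=query_vector, limit=3):
        print(round(hit.score, 3), hit.payload["page_content"][:80])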
diff --git a/load_env.py b/load_env.py
index d155404..7b57f9d 100644
--- a/load_env.py
+++ b/load_env.py
@@ -4,18 +4,20 @@ from dotenv import load_dotenv

 load_dotenv()

-# ingest
-persist_directory = os.environ.get("PERSIST_DIRECTORY")
-documents_directory = os.environ.get("DOCUMENTS_DIRECTORY")
-# generate
+# generic
 llama_embeddings_model = os.environ.get("LLAMA_EMBEDDINGS_MODEL")
-
 model_n_ctx = int(os.environ.get("MODEL_N_CTX"))
+use_mlock = os.environ.get("USE_MLOCK").lower() == "true"
+
+# ingest
+persist_directory = os.environ.get("PERSIST_DIRECTORY")
+documents_directory = os.environ.get("DOCUMENTS_DIRECTORY")
 chunk_size = int(os.environ.get("INGEST_CHUNK_SIZE"))
 chunk_overlap = int(os.environ.get("INGEST_CHUNK_OVERLAP"))
+
+# generate
 model_type = os.environ.get("MODEL_TYPE")
 model_path = os.environ.get("MODEL_PATH")
 model_temp = float(os.environ.get("MODEL_TEMP"))
 model_stop = os.environ.get("MODEL_STOP").split(",")
-
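One caveat in load_env.py: `os.environ.get("USE_MLOCK")` returns None when the variable is absent, so the `.lower()` call raises AttributeError for any `.env` file that predates this change. A one-line defensive variant, offered as a suggestion rather than as part of the diff:

    # suggested hardening for load_env.py (not in this PR): default USE_MLOCK
    # when the variable is unset, so existing .env files keep working instead
    # of raising AttributeError on None.lower()
    import os

    use_mlock = os.environ.get("USE_MLOCK", "true").lower() == "true"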
diff --git a/source_documents/subfolder/Muscle Spasms Charley Horse MedlinePlus.html b/source_documents/subfolder/Muscle Spasms Charley Horse MedlinePlus.html
index 81141c4..142f337 100644
--- a/source_documents/subfolder/Muscle Spasms Charley Horse MedlinePlus.html
+++ b/source_documents/subfolder/Muscle Spasms Charley Horse MedlinePlus.html
[hunks @@ -7,18 +7,18 @@, @@ -26,22 +26,22 @@, and @@ -54,54 +54,54 @@: markup-only changes to the saved MedlinePlus test page; content not recoverable]
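Finally, a property of the ingest.py storage path worth spelling out: point IDs are the md5 hex digest of each chunk's text, so they are deterministic, and re-running ingestion overwrites existing points rather than duplicating them. A tiny illustration (the chunk text is hypothetical):

    from hashlib import md5

    chunk = "Muscle cramps are sudden, involuntary contractions of a muscle."  # hypothetical chunk text
    print(md5(chunk.encode("utf-8")).hexdigest())  # same text -> same point ID on every run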