Allow appending ingest to existing db #43

Merged May 15, 2023 · 53 commits

Commits
b40a8ef
update requirements.txt
hippalectryon-0 May 12, 2023
3aa1d16
Merge branch 'su77ungr:main' into main
hippalectryon-0 May 12, 2023
599362d
Merge branch 'su77ungr:main' into main
hippalectryon-0 May 14, 2023
1861358
remove state_of_the_union.txt
hippalectryon-0 May 14, 2023
a4b5724
Merge remote-tracking branch 'origin/main' into main-fork
hippalectryon-0 May 14, 2023
66d0677
add poetry config
hippalectryon-0 May 14, 2023
42b9454
update streamlit version
hippalectryon-0 May 14, 2023
c4b2a33
update Dockerfile
hippalectryon-0 May 14, 2023
f5cdbcb
update Dockerfile
hippalectryon-0 May 14, 2023
3621789
fix Dockerfile
hippalectryon-0 May 14, 2023
5eae6c2
update Dockerfile
hippalectryon-0 May 14, 2023
d932557
update README.md
hippalectryon-0 May 14, 2023
a35f7c0
update README.md
hippalectryon-0 May 14, 2023
1cd7df1
update convert.py & pyproject.toml
hippalectryon-0 May 14, 2023
b14b601
Merge remote-tracking branch 'fork/main' into main-fork
hippalectryon-0 May 14, 2023
82f6af6
add tokenizer model
hippalectryon-0 May 14, 2023
e371e23
update README & lint
hippalectryon-0 May 14, 2023
6ffcf25
Merge remote-tracking branch 'fork/main' into main-fork
hippalectryon-0 May 14, 2023
83b8454
add pre-commit
hippalectryon-0 May 14, 2023
8a9ba1f
run pre-commit
hippalectryon-0 May 14, 2023
e3a0b6a
merge
hippalectryon-0 May 14, 2023
01c27f2
fix README.md
hippalectryon-0 May 14, 2023
44f0e18
fix (?) convert.py
hippalectryon-0 May 14, 2023
6c0a46d
fix (?) convert.py
hippalectryon-0 May 14, 2023
1b3e653
fix package versions
hippalectryon-0 May 14, 2023
568ece2
clean for merge
hippalectryon-0 May 14, 2023
821c28e
Merge branch 'main' into main-mr
hippalectryon-0 May 14, 2023
4c132e5
fix README.md
hippalectryon-0 May 14, 2023
f070eae
update README.md for new convert
hippalectryon-0 May 14, 2023
0cdfd80
redirect to main repo
su77ungr May 14, 2023
f41263d
fix ingest.py
hippalectryon-0 May 14, 2023
09b4c1a
Merge branch 'main-fork' into main-mr
hippalectryon-0 May 14, 2023
5b23c15
pre-commit formatting
hippalectryon-0 May 14, 2023
b4d5c1b
Merge branch 'main-fork' into main-mr
hippalectryon-0 May 14, 2023
e306509
rollback README.md
hippalectryon-0 May 14, 2023
1b4cc3a
Merge branch 'main-fork' into main-mr
hippalectryon-0 May 14, 2023
249d2f6
fix Dockerfile and README.md for streamlit
hippalectryon-0 May 14, 2023
488e59d
Merge branch 'main' into main-fork
hippalectryon-0 May 14, 2023
925b779
Merge branch 'main-fork' into main-mr
hippalectryon-0 May 14, 2023
6231aa9
fix README.md
hippalectryon-0 May 14, 2023
d3ff124
cleaner document handling in ingest.py
hippalectryon-0 May 14, 2023
595a75f
add support for ptt, docx
hippalectryon-0 May 14, 2023
0901716
add sample documents
hippalectryon-0 May 14, 2023
520c211
load env variables in centralized file
hippalectryon-0 May 14, 2023
e403170
Merge branch 'main' into main-fork
hippalectryon-0 May 14, 2023
9fd396e
Merge branch 'main-fork' into main-mr
hippalectryon-0 May 14, 2023
c7c367a
remove CI on merge
hippalectryon-0 May 14, 2023
f61e571
check for empty query
hippalectryon-0 May 14, 2023
5d7c81c
Merge remote-tracking branch 'origin/main'
hippalectryon-0 May 14, 2023
26a8ef6
print embedding progress
hippalectryon-0 May 14, 2023
1f37b66
Merge branch 'main' into main-fork
hippalectryon-0 May 14, 2023
273b818
Merge branch 'main-fork' into main-mr
hippalectryon-0 May 15, 2023
3c0f7da
fix model_stop
hippalectryon-0 May 15, 2023
Files changed
example.env: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
 # Generic
 MODEL_N_CTX=1024
 LLAMA_EMBEDDINGS_MODEL=models/ggml-model-q4_0.bin
+USE_MLOCK=true
 
 # Ingestion
 PERSIST_DIRECTORY=db
@@ -12,4 +13,4 @@ INGEST_CHUNK_OVERLAP=50
 MODEL_TYPE=LlamaCpp # GPT4All or LlamaCpp
 MODEL_PATH=models/ggjt-v1-vic7b-uncensored-q4_0.bin
 MODEL_TEMP=0.8
-MODEL_STOP=###,\n
\ No newline at end of file
+MODEL_STOP=###,\n
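Note on the two settings touched here: USE_MLOCK is read in load_env.py (diff below) as a case-insensitive string compare, and MODEL_STOP is split on commas into a list of stop sequences. A quick sketch of the MODEL_STOP parse, assuming python-dotenv's default handling of unquoted values (escapes are not expanded, so \n stays a literal backslash-n):

import os

# simulate what load_dotenv() puts in the environment for MODEL_STOP=###,\n
os.environ["MODEL_STOP"] = r"###,\n"

model_stop = os.environ.get("MODEL_STOP").split(",")
print(model_stop)  # ['###', '\\n'], two stop strings, the second a literal backslash-n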
ingest.py: 62 additions & 10 deletions
@@ -2,17 +2,24 @@
 import os
 import shutil
 import sys
+from hashlib import md5
 from pathlib import Path
 
 from langchain.docstore.document import Document
-from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredEPubLoader, \
-    UnstructuredHTMLLoader, Docx2txtLoader, UnstructuredPowerPointLoader
+from langchain.document_loaders import (
+    CSVLoader,
+    Docx2txtLoader,
+    PDFMinerLoader,
+    TextLoader,
+    UnstructuredEPubLoader,
+    UnstructuredHTMLLoader,
+    UnstructuredPowerPointLoader,
+)
 from langchain.embeddings import LlamaCppEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Qdrant
+from qdrant_client import QdrantClient, models
 
-from load_env import persist_directory, chunk_size, chunk_overlap, llama_embeddings_model, model_n_ctx, \
-    documents_directory
+from load_env import chunk_overlap, chunk_size, documents_directory, llama_embeddings_model, model_n_ctx, persist_directory, use_mlock
 
 file_loaders = { # extension -> loader
     "txt": lambda path: TextLoader(path, encoding="utf8"),
@@ -34,6 +41,17 @@ def load_one_doc(filepath: Path) -> list[Document]:
     return file_loaders[filepath.suffix[1:]](str(filepath)).load()
 
 
+def embed_documents_with_progress(embedding_model: LlamaCppEmbeddings, texts: list[str]) -> list[list[float]]:
+    """wrapper around embed_documents that prints progress"""
+    embeddings = []
+    N_chunks = len(texts)
+    for i, text in enumerate(texts):
+        print(f"embedding chunk {i+1}/{N_chunks}")
+        embeddings.append(embedding_model.client.embed(text))
+
+    return [list(map(float, e)) for e in embeddings]
+
+
 def main(sources_directory: str, cleandb: str) -> None:
     """enables to run python random_path/ to ingest // or 'python random_path/ y' to purge existing db"""
     db_dir = persist_directory # can be changed to ":memory:" but is not persistant
@@ -49,12 +67,46 @@ def main(sources_directory: str, cleandb: str) -> None:
     for file in files:
         documents += load_one_doc(Path(root) / file)
 
+    # Split text
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-    texts = text_splitter.split_documents(documents)
-    print(f"Found {len(texts)} chunks from {len(documents)} documents to index")
-    llama = LlamaCppEmbeddings(model_path=llama_embeddings_model, n_ctx=model_n_ctx)
-    Qdrant.from_documents(texts, llama, path=db_dir, collection_name="test")
-    print(f"Indexed {len(texts)} chunks from {len(documents)} documents in Qdrant")
+    split_documents = text_splitter.split_documents(documents)
+    texts = [d.page_content for d in split_documents]
+    metadatas = [d.metadata for d in split_documents]
+    print(f"Found {len(split_documents)} chunks from {len(documents)} documents to index")
+
+    # Generate embeddings
+    print("Generating embeddings...")
+    embedding_model = LlamaCppEmbeddings(model_path=llama_embeddings_model, n_ctx=model_n_ctx, use_mlock=use_mlock)
+    embeddings = embed_documents_with_progress(embedding_model, texts)
+
+    # Store embeddings
+    print("Storing embeddings...")
+    client = QdrantClient(path=db_dir)  # using Qdrant.from_documents recreates the db each time
+    try:
+        collection = client.get_collection("test")
+    except ValueError:  # doesn't exist
+        print("Creating a new store")
+        # Just do a single quick embedding to get vector size
+        vector_size = max(len(e) for e in embeddings)
+        client.recreate_collection(
+            collection_name="test",
+            vectors_config=models.VectorParams(
+                size=vector_size,
+                distance=models.Distance["COSINE"],
+            ),
+        )
+        collection = client.get_collection("test")
+    print(f"Loaded collection has {collection.points_count} data points")
+    client.upsert(
+        collection_name="test",
+        points=models.Batch.construct(
+            ids=[md5(text.encode("utf-8")).hexdigest() for text in texts],
+            vectors=embeddings,
+            payloads=[{"page_content": text, "metadata": metadatas[i]} for i, text in enumerate(texts)],
+        ),
+    )
+    collection = client.get_collection("test")
+    print(f"Indexed {len(split_documents)} chunks from {len(documents)} documents in Qdrant. Total points: {collection.points_count}")
 
 
 if __name__ == "__main__":
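This hunk is the heart of the PR. Instead of Qdrant.from_documents, which recreates the store on every run, the code now opens the existing collection (creating it only when missing) and upserts points whose ids are the md5 of the chunk text, so re-ingesting unchanged content overwrites the same points instead of duplicating them. Embedding chunk by chunk through embedding_model.client.embed trades some batch throughput for a visible progress counter. A self-contained sketch of the append/dedup behaviour, using qdrant-client with toy 3-dimensional vectors in place of real embeddings (the "toy_db" path and hard-coded vectors are illustrative, not from the PR):

from hashlib import md5

from qdrant_client import QdrantClient, models

client = QdrantClient(path="toy_db")  # local on-disk store, like PERSIST_DIRECTORY
client.recreate_collection(
    collection_name="test",
    vectors_config=models.VectorParams(size=3, distance=models.Distance["COSINE"]),
)

def ingest(texts: list[str], vectors: list[list[float]]) -> None:
    # same id scheme as the PR: a deterministic md5 of the chunk text
    client.upsert(
        collection_name="test",
        points=models.Batch.construct(
            ids=[md5(t.encode("utf-8")).hexdigest() for t in texts],
            vectors=vectors,
            payloads=[{"page_content": t, "metadata": {}} for t in texts],
        ),
    )

ingest(["alpha", "beta"], [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])
ingest(["alpha", "gamma"], [[1.0, 0.0, 0.0], [0.0, 0.0, 1.0]])  # "alpha" hashes to the same id

print(client.get_collection("test").points_count)  # 3, not 4: the second run appended only "gamma"

One consequence of hashing page content alone: two identical chunks from different source files collapse into a single point.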
load_env.py: 8 additions & 6 deletions
@@ -4,18 +4,20 @@
 from dotenv import load_dotenv
 
 load_dotenv()
-# ingest
-persist_directory = os.environ.get("PERSIST_DIRECTORY")
-documents_directory = os.environ.get("DOCUMENTS_DIRECTORY")
-
-# generate
+# generic
 llama_embeddings_model = os.environ.get("LLAMA_EMBEDDINGS_MODEL")
-
 model_n_ctx = int(os.environ.get("MODEL_N_CTX"))
+use_mlock = os.environ.get("USE_MLOCK").lower() == "true"
+
+# ingest
+persist_directory = os.environ.get("PERSIST_DIRECTORY")
+documents_directory = os.environ.get("DOCUMENTS_DIRECTORY")
 chunk_size = int(os.environ.get("INGEST_CHUNK_SIZE"))
 chunk_overlap = int(os.environ.get("INGEST_CHUNK_OVERLAP"))
 
+# generate
 model_type = os.environ.get("MODEL_TYPE")
 model_path = os.environ.get("MODEL_PATH")
 model_temp = float(os.environ.get("MODEL_TEMP"))
 model_stop = os.environ.get("MODEL_STOP").split(",")
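A small subtlety in the new flag: only the literal, case-insensitive string "true" enables mlock; anything else, including "1", silently disables it, and an unset USE_MLOCK raises because .get() returns None before .lower() is called. A minimal sketch:

import os

for value in ("true", "True", "1"):
    os.environ["USE_MLOCK"] = value
    use_mlock = os.environ.get("USE_MLOCK").lower() == "true"
    print(value, "->", use_mlock)  # true -> True, True -> True, 1 -> False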