diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1fa2c1164..bb1913b59 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -31,7 +31,7 @@ jobs: - '3.11' services: postgres: - image: postgres:latest + image: pgvector/pgvector:pg16 env: POSTGRES_USER: postgres POSTGRES_PASSWORD: postgres_password diff --git a/apps/experiments/tasks.py b/apps/experiments/tasks.py index 064766431..50466ce5f 100644 --- a/apps/experiments/tasks.py +++ b/apps/experiments/tasks.py @@ -1,17 +1,24 @@ import time from datetime import datetime +import pymupdf4llm from celery.app import shared_task from langchain.schema import AIMessage, HumanMessage +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_core.documents import Document +from langchain_text_splitters import MarkdownTextSplitter +from pymupdf import Document as PyMuPDFDocument from taskbadger.celery import Task as TaskbadgerTask from apps.channels.datamodels import WebMessage from apps.chat.bots import create_conversation from apps.chat.channels import WebChannel -from apps.experiments.models import ExperimentSession, PromptBuilderHistory, SourceMaterial +from apps.experiments.models import Experiment, ExperimentSession, PromptBuilderHistory, SourceMaterial +from apps.files.models import File from apps.service_providers.models import LlmProvider from apps.users.models import CustomUser from apps.utils.taskbadger import update_taskbadger_data +from apps.vectordb.vectorstore import PGVector @shared_task(bind=True, base=TaskbadgerTask) @@ -23,6 +30,37 @@ def get_response_for_webchat_task(self, experiment_session_id: int, message_text return message_handler.new_user_message(message) +@shared_task(bind=True, base=TaskbadgerTask) +def store_rag_embedding(self, experiment_id: int, file_id: int) -> None: + experiment = Experiment.objects.get(id=experiment_id) + file = experiment.files.get(id=file_id) + documents = load_rag_file(file) + embeddings_model = experiment.get_llm_service().get_openai_embeddings() + PGVector.from_documents(documents, embeddings_model, experiment) + + +def load_rag_file(file: File) -> list[Document]: + """ + Loads a text file of any supported type (PDF, TXT, HTML) into Langchain. + """ + + if file.content_type == "application/pdf": + doc = PyMuPDFDocument(stream=file.file.open(), filetype="pdf") + md_text = pymupdf4llm.to_markdown(doc) + splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0) + documents = splitter.create_documents([md_text]) + elif file.content_type.startswith("text"): + with file.file.open() as f: + metadata = {"source": file.name} + doc = Document(page_content=f.read(), metadata=metadata) + text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) + documents = text_splitter.split_documents([doc]) + else: + raise ValueError(f"Unsupported file type: {file.content_type}") + + return documents + + @shared_task def get_prompt_builder_response_task(team_id: int, user_id, data_dict: dict) -> dict[str, str | int]: llm_service = LlmProvider.objects.get(id=data_dict["provider"]).get_llm_service() diff --git a/apps/experiments/views/experiment.py b/apps/experiments/views/experiment.py index f815c4ab1..66c0b7380 100644 --- a/apps/experiments/views/experiment.py +++ b/apps/experiments/views/experiment.py @@ -53,7 +53,7 @@ from apps.experiments.helpers import get_real_user_or_none from apps.experiments.models import Experiment, ExperimentSession, Participant, SessionStatus, SyntheticVoice from apps.experiments.tables import ExperimentSessionsTable, ExperimentTable -from apps.experiments.tasks import get_response_for_webchat_task +from apps.experiments.tasks import get_response_for_webchat_task, store_rag_embedding from apps.experiments.views.prompt import PROMPT_DATA_SESSION_KEY from apps.files.forms import get_file_formset from apps.files.views import BaseAddFileHtmxView, BaseDeleteFileView @@ -220,6 +220,10 @@ def _validate_prompt_variables(form_data): available_variables = set() if form_data.get("source_material"): available_variables.add("source_material") + # available_variables below should be added by making a + # db request to check if there are any RAG files uploaded + available_variables.add("context") + available_variables.add("input") missing_vars = required_variables - available_variables known_vars = {"source_material"} if missing_vars: @@ -361,6 +365,7 @@ def form_valid(self, form): experiment = get_object_or_404(Experiment, team=self.request.team, pk=self.kwargs["pk"]) file = super().form_valid(form) experiment.files.add(file) + store_rag_embedding(experiment.id, file.id) return file def get_delete_url(self, file): diff --git a/apps/service_providers/llm_service/main.py b/apps/service_providers/llm_service/main.py index ff5260dba..02b567922 100644 --- a/apps/service_providers/llm_service/main.py +++ b/apps/service_providers/llm_service/main.py @@ -7,6 +7,7 @@ from langchain_community.chat_models import ChatAnthropic from langchain_core.callbacks import BaseCallbackHandler from langchain_core.language_models import BaseLanguageModel +from langchain_openai import OpenAIEmbeddings from langchain_openai.chat_models import AzureChatOpenAI, ChatOpenAI from openai import OpenAI from openai._base_client import SyncAPIClient @@ -42,6 +43,14 @@ class OpenAILlmService(LlmService): openai_api_base: str = None openai_organization: str = None + def get_openai_embeddings(self, model="text-embedding-3-small") -> OpenAIEmbeddings: + return OpenAIEmbeddings( + openai_api_key=self.openai_api_key, + openai_api_base=self.openai_api_base, + openai_organization=self.openai_organization, + model=model, + ) + def get_raw_client(self) -> OpenAI: return OpenAI(api_key=self.openai_api_key, organization=self.openai_organization, base_url=self.openai_api_base) diff --git a/apps/service_providers/llm_service/runnables.py b/apps/service_providers/llm_service/runnables.py index 5d15a35a2..a0d6df2de 100644 --- a/apps/service_providers/llm_service/runnables.py +++ b/apps/service_providers/llm_service/runnables.py @@ -29,6 +29,7 @@ from apps.chat.conversation import compress_chat_history from apps.chat.models import ChatMessage, ChatMessageType from apps.experiments.models import Experiment, ExperimentSession +from apps.vectordb.vectorstore import PGVector logger = logging.getLogger(__name__) @@ -51,7 +52,8 @@ def create_experiment_runnable(experiment: Experiment, session: ExperimentSessio assert experiment.llm_provider, "Experiment must have an LLM provider" if experiment.tools_enabled: return AgentExperimentRunnable(experiment=experiment, session=session) - + if experiment.files.exists(): + return RagExperimentRunnable(experiment=experiment, session=session) return SimpleExperimentRunnable(experiment=experiment, session=session) @@ -219,6 +221,25 @@ def _build_chain(self) -> Runnable[dict[str, Any], str]: ) +class RagExperimentRunnable(ExperimentRunnable): + def _build_chain(self) -> Runnable[dict[str, Any], str]: + def format_docs(docs): + return "\n\n".join(doc.page_content for doc in docs) + + model = self.llm_service.get_chat_model(self.experiment.llm, self.experiment.temperature) + embeddings = self.experiment.get_llm_service().get_openai_embeddings() + retriever = PGVector(self.experiment, embeddings).as_retriever() + return ( + {"context": retriever | format_docs, "input": RunnablePassthrough()} + | RunnablePassthrough.assign( + history=RunnableLambda(self.memory.load_memory_variables) | itemgetter("history") + ) + | self.prompt + | model + | StrOutputParser() + ) + + class AgentExperimentRunnable(ExperimentRunnable): def _parse_output(self, output): return output.get("output", "") diff --git a/apps/utils/chunked.py b/apps/utils/chunked.py new file mode 100644 index 000000000..a96e7ac07 --- /dev/null +++ b/apps/utils/chunked.py @@ -0,0 +1,32 @@ +from itertools import islice + + +def chunked(it, n, collection=tuple): + """ + >>> for nums in chunked(range(10), 4): + ... print(nums) + ... + (0, 1, 2, 3) + (4, 5, 6, 7) + (8, 9) + >>> for nums in chunked(range(10), 4, list): + ... print(nums) + ... + [0, 1, 2, 3] + [4, 5, 6, 7] + [8, 9] + """ + itr = iter(it) + while True: + try: + items = take(n, itr, collection) + except StopIteration: + break + if not items: + break + yield items + + +def take(n, iterable, collection=list): + # https://docs.python.org/2/library/itertools.html#recipes + return collection(islice(iterable, n)) diff --git a/apps/vectordb/__init__.py b/apps/vectordb/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/apps/vectordb/apps.py b/apps/vectordb/apps.py new file mode 100644 index 000000000..649e9ec7b --- /dev/null +++ b/apps/vectordb/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class VectorDbConfig(AppConfig): + name = "apps.vectordb" + label = "vectordb" diff --git a/apps/vectordb/const.py b/apps/vectordb/const.py new file mode 100644 index 000000000..e8d5c07c0 --- /dev/null +++ b/apps/vectordb/const.py @@ -0,0 +1,9 @@ +META_EMBEDDING_ID = "embedding_id" + +META_EXPERIMENT_ID = "experiment_id" + +META_FILE_ID = "file_id" + +META_SEARCH_SCORE = "search_score" + +META_ALL = (META_EMBEDDING_ID, META_EXPERIMENT_ID, META_FILE_ID, META_SEARCH_SCORE) diff --git a/apps/vectordb/migrations/0001_initial.py b/apps/vectordb/migrations/0001_initial.py new file mode 100644 index 000000000..69b98bc47 --- /dev/null +++ b/apps/vectordb/migrations/0001_initial.py @@ -0,0 +1,66 @@ +# Generated by Django 4.2.7 on 2024-03-22 12:11 + +from django.db import migrations, models +import django.db.models.deletion +import pgvector.django + + +class Migration(migrations.Migration): + initial = True + + dependencies = [ + ("files", "0001_initial"), + ("experiments", "0070_alter_consentform_name_alter_experiment_llm_and_more"), + ("teams", "0005_invitation_groups"), + ] + + operations = [ + pgvector.django.VectorExtension(), + migrations.CreateModel( + name="Embedding", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ("embedding", pgvector.django.VectorField(dimensions=1536)), + ("document", models.TextField(null=True)), + ("metadata", models.JSONField(null=True)), + ( + "experiment", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="embeddings", + to="experiments.experiment", + ), + ), + ( + "file", + models.ForeignKey( + blank=True, + null=True, + on_delete=django.db.models.deletion.CASCADE, + to="files.file", + ), + ), + ( + "team", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="teams.team", + verbose_name="Team", + ), + ), + ], + options={ + "abstract": False, + }, + ), + ] diff --git a/apps/vectordb/migrations/__init__.py b/apps/vectordb/migrations/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/apps/vectordb/models.py b/apps/vectordb/models.py new file mode 100644 index 000000000..d37d75081 --- /dev/null +++ b/apps/vectordb/models.py @@ -0,0 +1,14 @@ +from django.db import models +from pgvector.django import VectorField + +from apps.teams.models import BaseTeamModel + +ADA_TOKEN_COUNT = 1536 + + +class Embedding(BaseTeamModel): + experiment = models.ForeignKey("experiments.Experiment", on_delete=models.CASCADE, related_name="embeddings") + embedding = VectorField(dimensions=ADA_TOKEN_COUNT) + document = models.TextField(null=True) # noqa: DJ001 + metadata = models.JSONField(null=True) + file = models.ForeignKey("files.File", on_delete=models.CASCADE, null=True, blank=True) diff --git a/apps/vectordb/tests/__init__.py b/apps/vectordb/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/apps/vectordb/tests/test_vectorstore.py b/apps/vectordb/tests/test_vectorstore.py new file mode 100644 index 000000000..c142ed054 --- /dev/null +++ b/apps/vectordb/tests/test_vectorstore.py @@ -0,0 +1,143 @@ +"""Test PGVector functionality. + +Ported from https://github.com/hwchase17/langchain/blob/master/tests/integration_tests/vectorstores/test_pgvector.py +""" + +import pytest +from langchain.docstore.document import Document +from langchain_community.embeddings import DeterministicFakeEmbedding + +from apps.utils.factories.experiment import ExperimentFactory +from apps.vectordb.const import META_ALL +from apps.vectordb.models import ADA_TOKEN_COUNT +from apps.vectordb.vectorstore import DistanceStrategy, PGVector + + +@pytest.fixture() +def experiment(db): + return ExperimentFactory() + + +@pytest.mark.django_db() +def test_vectorstore(experiment): + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = PGVector.from_texts( + texts=texts, + embedding=DeterministicFakeEmbedding(size=ADA_TOKEN_COUNT), + experiment=experiment, + ) + output = docsearch.similarity_search("foo", k=1) + _remove_meta_fields(output) + assert output == [Document(page_content="foo")] + + +@pytest.mark.django_db() +def test_vectorstore_with_metadatas(experiment): + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = PGVector.from_texts( + texts=texts, + embedding=DeterministicFakeEmbedding(size=ADA_TOKEN_COUNT), + metadatas=metadatas, + experiment=experiment, + ) + output = docsearch.similarity_search("foo", k=1) + _remove_meta_fields(output) + assert output == [Document(page_content="foo", metadata={"page": "0"})] + + +@pytest.mark.django_db() +def test_vectorstore_with_metadatas_with_scores(experiment): + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = PGVector.from_texts( + texts=texts, + embedding=DeterministicFakeEmbedding(size=ADA_TOKEN_COUNT), + metadatas=metadatas, + experiment=experiment, + distance_strategy=DistanceStrategy.COSINE, + ) + output = docsearch.similarity_search_with_score("foo", k=1) + _remove_meta_fields(output) + assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] + + +@pytest.mark.django_db() +def test_vectorstore_with_filter_match(experiment): + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = PGVector.from_texts( + texts=texts, + embedding=DeterministicFakeEmbedding(size=ADA_TOKEN_COUNT), + metadatas=metadatas, + experiment=experiment, + ) + output = docsearch.similarity_search_with_score("foo", k=1, filter={"page": "0"}) + _remove_meta_fields(output) + assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] + + +@pytest.mark.django_db() +def test_vectorstore_with_filter_distant_match(experiment): + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = PGVector.from_texts( + texts=texts, + embedding=DeterministicFakeEmbedding(size=ADA_TOKEN_COUNT), + metadatas=metadatas, + experiment=experiment, + ) + output = docsearch.similarity_search_with_score("foo", k=1, filter={"page": "2"}) + _remove_meta_fields(output) + # ordering here is deterministic but random due to fake embeddings + assert output == [(Document(page_content="baz", metadata={"page": "2"}), 0.9290842232061864)] + + +@pytest.mark.django_db() +def test_vectorstore_distant_match_cosine_ordering(experiment): + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = PGVector.from_texts( + texts=texts, + embedding=DeterministicFakeEmbedding(size=ADA_TOKEN_COUNT), + metadatas=metadatas, + experiment=experiment, + distance_strategy=DistanceStrategy.COSINE, + ) + output = docsearch.similarity_search_with_score("foo", k=3) + _remove_meta_fields(output) + # ordering here is deterministic but random due to fake embeddings + assert output == [ + (Document(page_content="foo", metadata={"page": "0"}), 0.0), + (Document(page_content="baz", metadata={"page": "2"}), 0.9290842232061864), + (Document(page_content="bar", metadata={"page": "1"}), 1.0246011368123038), + ] + + +@pytest.mark.django_db() +def test_vectorstore_with_filter_no_match(experiment): + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = PGVector.from_texts( + texts=texts, + embedding=DeterministicFakeEmbedding(size=ADA_TOKEN_COUNT), + metadatas=metadatas, + experiment=experiment, + ) + output = docsearch.similarity_search_with_score("foo", k=1, filter={"page": "5"}) + assert output == [] + + +def _remove_meta_fields(docs, fields=META_ALL): + for doc in docs: + if isinstance(doc, tuple): + # (doc, score) + doc = doc[0] + for field in fields: + doc.metadata.pop(field, None) diff --git a/apps/vectordb/vectorstore.py b/apps/vectordb/vectorstore.py new file mode 100644 index 000000000..3269d1a4b --- /dev/null +++ b/apps/vectordb/vectorstore.py @@ -0,0 +1,270 @@ +"""VectorStore wrapper around a Postgres/PGVector database. + +Port of https://github.com/hwchase17/langchain/blob/master/langchain/vectorstores/pgvector.py +to use Django models. +""" +from __future__ import annotations + +import enum +from collections.abc import Iterable +from typing import Any + +from django.db.models import Q +from langchain.docstore.document import Document +from langchain.embeddings.base import Embeddings +from langchain.vectorstores.base import VectorStore +from pgvector.django import CosineDistance, L2Distance, MaxInnerProduct +from sentry_sdk import capture_exception + +from apps.experiments.models import Experiment +from apps.teams.models import Team +from apps.utils.chunked import chunked + +from .const import META_EMBEDDING_ID, META_EXPERIMENT_ID, META_FILE_ID, META_SEARCH_SCORE +from .models import Embedding + + +class QueryResult: + Embedding: Embedding + distance: float + + +class DistanceStrategy(enum.Enum): + EUCLIDEAN = L2Distance + COSINE = CosineDistance + MAX_INNER_PRODUCT = MaxInnerProduct + + +DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.COSINE + + +class PGVector(VectorStore): + """ + VectorStore implementation using Postgres and pgvector. + - `embedding_function` any embedding function implementing + `langchain.embeddings.base.Embeddings` interface. + - `distance_strategy` is the distance strategy to use. (default: EUCLIDEAN) + - `EUCLIDEAN` is the euclidean distance. + - `COSINE` is the cosine distance. + """ + + def __init__( + self, + experiment: Experiment, + embedding_function: Embeddings, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + ) -> None: + self.experiment = experiment + self.embedding_function = embedding_function + self.distance_strategy = distance_strategy + + def delete_embeddings(self) -> None: + self.experiment.embeddings.all().delete() + + def add_texts(self, texts: Iterable[str], metadatas: list[dict] | None = None, **kwargs) -> None: + """Run more texts through the embeddings and add to the vectorstore. + + Args: + texts: Iterable of strings to add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + + Returns: + List of ids from adding the texts into the vectorstore. + """ + if not metadatas: + metadatas = [{} for _ in texts] + + for chunk_num, chunk in enumerate(chunked(zip(texts, metadatas), 500)): + chunk_texts = [item[0] for item in chunk] + embeddings = self.embedding_function.embed_documents(chunk_texts) + for i in range(len(chunk)): + text, metadata = chunk[i] + embedding = embeddings[i] + # fix postgres null character bug + text = text.replace("\x00", "\uFFFD") + try: + Embedding.objects.create( + team_id=self.experiment.team_id, + experiment=self.experiment, + embedding=embedding, + document=text, + metadata=metadata, + file_id=metadata.pop(META_FILE_ID, None), + ) + except Exception as e: + capture_exception(e) + + def similarity_search( + self, + query: str, + k: int = 4, + filter: dict | None = None, + **kwargs: Any, + ) -> list[Document]: + """Run similarity search with PGVector with distance. + + Args: + query (str): Query text to search for. + k (int): Number of results to return. Defaults to 4. + filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + + Returns: + List of Documents most similar to the query. + """ + embedding = self.embedding_function.embed_query(text=query) + return self.similarity_search_by_vector( + embedding=embedding, + k=k, + filter=filter, + ) + + def similarity_search_with_score( + self, + query: str, + k: int = 4, + filter: dict | None = None, + ) -> list[tuple[Document, float]]: + """Return docs most similar to query. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + + Returns: + List of Documents most similar to the query and score for each + """ + embedding = self.embedding_function.embed_query(query) + docs = self.similarity_search_with_score_by_vector(embedding=embedding, k=k, filter=filter) + return docs + + def similarity_search_with_score_by_vector( + self, + embedding: list[float], + k: int = 4, + filter: dict | None = None, + ) -> list[tuple[Document, float]]: + additional_filter = Q(experiment=self.experiment) + if filter is not None: + for key, value in filter.items(): + additional_filter &= Q(**{f"metadata__{key}": str(value)}) + + return similarity_search_with_score_by_vector( + team=self.experiment.team, + embedding=embedding, + k=k, + distance_strategy=self.distance_strategy, + additional_filter=additional_filter, + ) + + def similarity_search_by_vector( + self, + embedding: list[float], + k: int = 4, + filter: dict | None = None, + **kwargs: Any, + ) -> list[Document]: + """Return docs most similar to embedding vector. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + + Returns: + List of Documents most similar to the query vector. + """ + docs_and_scores = self.similarity_search_with_score_by_vector(embedding=embedding, k=k, filter=filter) + return [doc for doc, _ in docs_and_scores] + + @classmethod + def from_texts( + cls: type[PGVector], + texts: list[str], + embedding: Embeddings, + metadatas: list[dict] | None = None, + experiment: Experiment = None, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + **kwargs, + ) -> PGVector: + """ + Return VectorStore initialized from texts and embeddings. + """ + + store = cls( + experiment=experiment, + embedding_function=embedding, + distance_strategy=distance_strategy, + ) + + store.add_texts(texts=texts, metadatas=metadatas) + return store + + @classmethod + def from_documents( + cls: type[PGVector], + documents: list[Document], + embedding: Embeddings, + experiment: Experiment = None, + **kwargs: Any, + ) -> PGVector: + """ + Return VectorStore initialized from documents and embeddings. + """ + + texts = [d.page_content for d in documents] + metadatas = [d.metadata for d in documents] + + return cls.from_texts( + texts=texts, + embedding=embedding, + metadatas=metadatas, + experiment=experiment, + **kwargs, + ) + + +def similarity_search_by_vector( + team: Team, + embedding: list[float], + k: int = 4, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + additional_filter: Q = None, +) -> list[Document]: + docs_with_score = similarity_search_with_score_by_vector( + team=team, embedding=embedding, k=k, distance_strategy=distance_strategy, additional_filter=additional_filter + ) + return [doc for doc, _ in docs_with_score] + + +def similarity_search_with_score_by_vector( + team: Team, + embedding: list[float], + k: int = 4, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + additional_filter: Q = None, +) -> list[tuple[Document, float]]: + query = Embedding.objects.filter(experiment__team=team) + + query = query.annotate(distance=distance_strategy.value("embedding", embedding)).order_by("distance") + if additional_filter: + query = query.filter(additional_filter) + + docs = [ + ( + Document( + page_content=result.document, + metadata={ + META_EMBEDDING_ID: result.id, + META_EXPERIMENT_ID: result.experiment_id, + META_FILE_ID: result.file_id, + META_SEARCH_SCORE: result.distance, + **result.metadata, + }, + ), + result.distance, + ) + for result in query[:k] + ] + + return docs diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml index 31d51d6fa..27578d566 100644 --- a/docker-compose-dev.yml +++ b/docker-compose-dev.yml @@ -2,7 +2,7 @@ version: "3.8" services: db: - image: postgres + image: pgvector/pgvector:pg16 volumes: - postgres_data:/var/lib/postgresql/data/ environment: diff --git a/gpt_playground/settings.py b/gpt_playground/settings.py index b09964107..b3774fa07 100644 --- a/gpt_playground/settings.py +++ b/gpt_playground/settings.py @@ -88,6 +88,7 @@ "apps.events", "apps.annotations", "apps.pipelines", + "apps.vectordb", ] INSTALLED_APPS = DJANGO_APPS + THIRD_PARTY_APPS + PROJECT_APPS diff --git a/requirements/requirements.in b/requirements/requirements.in index d6151818a..a70563adf 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -44,3 +44,5 @@ django-field-audit>=1.2.7 turn-python>=0.2.0 jinja2 django-taggit +pgvector +pymupdf4llm diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 6772296d5..6467dff7a 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -29,9 +29,7 @@ anyio==3.7.1 asgiref==3.7.2 # via django async-timeout==4.0.3 - # via - # aiohttp - # redis + # via aiohttp attrs==23.1.0 # via # aiohttp @@ -253,6 +251,7 @@ numpy==1.26.2 # langchain-community # langchain-openai # pandas + # pgvector oauthlib==3.2.2 # via requests-oauthlib openai==1.12.0 @@ -267,6 +266,8 @@ packaging==23.2 # marshmallow pandas==2.1.3 # via -r requirements/requirements.in +pgvector==0.2.5 + # via -r requirements/requirements.in prompt-toolkit==3.0.41 # via click-repl psycopg2-binary==2.9.9 @@ -290,7 +291,14 @@ pygments==2.16.1 pyjwt[crypto]==2.8.0 # via # django-allauth + # pyjwt # twilio +pymupdf==1.24.4 + # via pymupdf4llm +pymupdf4llm==0.0.5 + # via -r requirements/requirements.in +pymupdfb==1.24.3 + # via pymupdf pypng==0.20220715.0 # via qrcode pytelegrambotapi==4.12.0 @@ -397,7 +405,9 @@ turn-python==0.2.0 twilio==8.10.1 # via -r requirements/requirements.in typer[all]==0.9.0 - # via taskbadger + # via + # taskbadger + # typer typing-extensions==4.8.0 # via # anthropic