Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial vector db implementation #335

Draft
wants to merge 18 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions apps/utils/chunked.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from itertools import islice


def chunked(it, n, collection=tuple):
    """Yield consecutive groups of at most ``n`` items taken from ``it``.

    Every group is produced by ``take`` using ``collection`` as the container
    (``tuple`` unless told otherwise). The final group may hold fewer than
    ``n`` items. Iteration ends as soon as a group is falsy (i.e. empty for
    the built-in containers) or ``take`` raises ``StopIteration``.

    >>> for nums in chunked(range(10), 4):
    ...     print(nums)
    ...
    (0, 1, 2, 3)
    (4, 5, 6, 7)
    (8, 9)
    >>> for nums in chunked(range(10), 4, list):
    ...     print(nums)
    ...
    [0, 1, 2, 3]
    [4, 5, 6, 7]
    [8, 9]
    """
    # A single shared iterator so each call to ``take`` resumes where the
    # previous one stopped.
    source = iter(it)
    while True:
        try:
            group = take(n, source, collection)
        except StopIteration:
            # Must not leak out of a generator body; treat it as "done".
            return
        if not group:
            return
        yield group


def take(n, iterable, collection=list):
    """Return the first ``n`` items of ``iterable`` wrapped in ``collection``.

    Adapted from the ``take`` recipe in the itertools documentation:
    https://docs.python.org/3/library/itertools.html#itertools-recipes
    """
    head = islice(iterable, n)
    return collection(head)
Empty file added apps/vectordb/__init__.py
Empty file.
6 changes: 6 additions & 0 deletions apps/vectordb/apps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from django.apps import AppConfig


class VectorDbConfig(AppConfig):
    """Django application configuration for the ``apps.vectordb`` package."""

    label = "vectordb"
    name = "apps.vectordb"
9 changes: 9 additions & 0 deletions apps/vectordb/const.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Keys used inside a document's ``metadata`` dict.
# NOTE(review): presumably these are written by the vector store onto the
# documents it returns -- confirm against apps/vectordb/vectorstore.py.
META_EMBEDDING_ID = "embedding_id"
META_EXPERIMENT_ID = "experiment_id"
META_FILE_ID = "file_id"
META_SEARCH_SCORE = "search_score"

# Every metadata key defined above, in declaration order.
META_ALL = (
    META_EMBEDDING_ID,
    META_EXPERIMENT_ID,
    META_FILE_ID,
    META_SEARCH_SCORE,
)
66 changes: 66 additions & 0 deletions apps/vectordb/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Generated by Django 4.2.7 on 2024-03-22 12:11

from django.db import migrations, models
import django.db.models.deletion
import pgvector.django


class Migration(migrations.Migration):
    """Initial migration for the vectordb app: creates the ``Embedding`` model."""

    initial = True

    # Migrations of the apps whose models are referenced by the foreign keys
    # below (files.file, experiments.experiment, teams.team).
    dependencies = [
        ("files", "0001_initial"),
        ("experiments", "0070_alter_consentform_name_alter_experiment_llm_and_more"),
        ("teams", "0005_invitation_groups"),
    ]

    operations = [
        # pgvector's extension operation is listed first so that it runs
        # before the model containing a VectorField is created.
        pgvector.django.VectorExtension(),
        migrations.CreateModel(
            name="Embedding",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("updated_at", models.DateTimeField(auto_now=True)),
                # 1536 matches ADA_TOKEN_COUNT in apps/vectordb/models.py.
                ("embedding", pgvector.django.VectorField(dimensions=1536)),
                ("document", models.TextField(null=True)),
                ("metadata", models.JSONField(null=True)),
                (
                    "experiment",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="embeddings",
                        to="experiments.experiment",
                    ),
                ),
                (
                    # Optional link: the column is nullable and may be left blank.
                    "file",
                    models.ForeignKey(
                        blank=True,
                        null=True,
                        on_delete=django.db.models.deletion.CASCADE,
                        to="files.file",
                    ),
                ),
                (
                    "team",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to="teams.team",
                        verbose_name="Team",
                    ),
                ),
            ],
            options={
                "abstract": False,
            },
        ),
    ]
Empty file.
14 changes: 14 additions & 0 deletions apps/vectordb/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from django.db import models
from pgvector.django import VectorField

from apps.teams.models import BaseTeamModel

# Number of dimensions of the stored embedding vectors: used as the
# ``dimensions`` of ``Embedding.embedding`` and as the fake-embedding size in
# the tests.
# NOTE(review): the name says "token count" but the value is used as a vector
# dimension -- presumably the output size of the embedding model; confirm.
ADA_TOKEN_COUNT = 1536


class Embedding(BaseTeamModel):
    """A stored embedding vector belonging to an experiment.

    Each row holds the vector itself plus the optional source text
    (``document``), optional JSON ``metadata`` and an optional link to a
    ``files.File``. Rows are deleted together with their experiment or file
    (``on_delete=CASCADE``).
    """

    experiment = models.ForeignKey(
        "experiments.Experiment",
        related_name="embeddings",
        on_delete=models.CASCADE,
    )
    embedding = VectorField(dimensions=ADA_TOKEN_COUNT)
    document = models.TextField(null=True)  # noqa: DJ001
    metadata = models.JSONField(null=True)
    file = models.ForeignKey(
        "files.File",
        blank=True,
        null=True,
        on_delete=models.CASCADE,
    )
Empty file added apps/vectordb/tests/__init__.py
Empty file.
143 changes: 143 additions & 0 deletions apps/vectordb/tests/test_vectorstore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
"""Test PGVector functionality.

Ported from https://github.com/hwchase17/langchain/blob/master/tests/integration_tests/vectorstores/test_pgvector.py
"""

import pytest
from langchain.docstore.document import Document
from langchain_community.embeddings import DeterministicFakeEmbedding

from apps.utils.factories.experiment import ExperimentFactory
from apps.vectordb.const import META_ALL
from apps.vectordb.models import ADA_TOKEN_COUNT
from apps.vectordb.vectorstore import DistanceStrategy, PGVector


@pytest.fixture()
def experiment(db):
    """Return a freshly built experiment from ``ExperimentFactory``.

    ``db`` is requested but not used in the body.
    NOTE(review): presumably pytest-django's database-access fixture -- confirm.
    """
    created = ExperimentFactory()
    return created


@pytest.mark.django_db()
def test_vectorstore(experiment):
    """Build a store from plain texts and check the nearest match for "foo"."""
    texts = ["foo", "bar", "baz"]
    docsearch = PGVector.from_texts(
        texts=texts,
        embedding=DeterministicFakeEmbedding(size=ADA_TOKEN_COUNT),
        experiment=experiment,
    )
    output = docsearch.similarity_search("foo", k=1)
    # Drop the META_ALL keys so only caller-supplied metadata is compared.
    _remove_meta_fields(output)
    assert output == [Document(page_content="foo")]


@pytest.mark.django_db()
def test_vectorstore_with_metadatas(experiment):
    """Metadata supplied at construction comes back on the matching document."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(index)} for index, _ in enumerate(texts)]
    fake_embedding = DeterministicFakeEmbedding(size=ADA_TOKEN_COUNT)
    store = PGVector.from_texts(
        texts=texts,
        embedding=fake_embedding,
        metadatas=metadatas,
        experiment=experiment,
    )

    results = store.similarity_search("foo", k=1)
    # Drop the META_ALL keys so only the "page" metadata is compared.
    _remove_meta_fields(results)

    expected = [Document(page_content="foo", metadata={"page": "0"})]
    assert results == expected


@pytest.mark.django_db()
def test_vectorstore_with_metadatas_with_scores(experiment):
    """Scored search with the cosine strategy returns "foo" at distance ~0."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = PGVector.from_texts(
        texts=texts,
        embedding=DeterministicFakeEmbedding(size=ADA_TOKEN_COUNT),
        metadatas=metadatas,
        experiment=experiment,
        distance_strategy=DistanceStrategy.COSINE,
    )
    output = docsearch.similarity_search_with_score("foo", k=1)
    # Drop the META_ALL keys so only the "page" metadata is compared.
    _remove_meta_fields(output)
    # Scores are floats: compare with a tolerance rather than exact equality.
    assert output == [
        (Document(page_content="foo", metadata={"page": "0"}), pytest.approx(0.0, abs=1e-9)),
    ]


@pytest.mark.django_db()
def test_vectorstore_with_filter_match(experiment):
    """A metadata filter that matches the nearest document still returns it."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = PGVector.from_texts(
        texts=texts,
        embedding=DeterministicFakeEmbedding(size=ADA_TOKEN_COUNT),
        metadatas=metadatas,
        experiment=experiment,
    )
    output = docsearch.similarity_search_with_score("foo", k=1, filter={"page": "0"})
    # Drop the META_ALL keys so only the "page" metadata is compared.
    _remove_meta_fields(output)
    # Scores are floats: compare with a tolerance rather than exact equality.
    assert output == [
        (Document(page_content="foo", metadata={"page": "0"}), pytest.approx(0.0, abs=1e-9)),
    ]


@pytest.mark.django_db()
def test_vectorstore_with_filter_distant_match(experiment):
    """A filter can exclude the nearest document and return a farther one."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = PGVector.from_texts(
        texts=texts,
        embedding=DeterministicFakeEmbedding(size=ADA_TOKEN_COUNT),
        metadatas=metadatas,
        experiment=experiment,
    )
    output = docsearch.similarity_search_with_score("foo", k=1, filter={"page": "2"})
    # Drop the META_ALL keys so only the "page" metadata is compared.
    _remove_meta_fields(output)
    # The score is arbitrary (fake embeddings) but reproducible; compare it
    # with a tolerance rather than exact float equality.
    assert output == [
        (Document(page_content="baz", metadata={"page": "2"}), pytest.approx(0.9290842232061864)),
    ]


@pytest.mark.django_db()
def test_vectorstore_distant_match_cosine_ordering(experiment):
    """With the cosine strategy, results come back ordered by ascending score."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = PGVector.from_texts(
        texts=texts,
        embedding=DeterministicFakeEmbedding(size=ADA_TOKEN_COUNT),
        metadatas=metadatas,
        experiment=experiment,
        distance_strategy=DistanceStrategy.COSINE,
    )
    output = docsearch.similarity_search_with_score("foo", k=3)
    # Drop the META_ALL keys so only the "page" metadata is compared.
    _remove_meta_fields(output)
    # The ordering and scores are arbitrary (fake embeddings) but
    # reproducible; scores are compared with a tolerance rather than exact
    # float equality.
    assert output == [
        (Document(page_content="foo", metadata={"page": "0"}), pytest.approx(0.0, abs=1e-9)),
        (Document(page_content="baz", metadata={"page": "2"}), pytest.approx(0.9290842232061864)),
        (Document(page_content="bar", metadata={"page": "1"}), pytest.approx(1.0246011368123038)),
    ]


@pytest.mark.django_db()
def test_vectorstore_with_filter_no_match(experiment):
    """A filter that matches no stored metadata produces an empty result."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(index)} for index, _ in enumerate(texts)]
    fake_embedding = DeterministicFakeEmbedding(size=ADA_TOKEN_COUNT)
    store = PGVector.from_texts(
        texts=texts,
        embedding=fake_embedding,
        metadatas=metadatas,
        experiment=experiment,
    )

    # No document was stored with page "5".
    results = store.similarity_search_with_score("foo", k=1, filter={"page": "5"})

    assert results == []


def _remove_meta_fields(docs, fields=META_ALL):
    """Remove the given metadata keys, in place, from every document in ``docs``.

    ``docs`` may contain bare documents or ``(document, score)`` tuples; for a
    tuple only its first element is touched. Keys that are absent are ignored.
    """
    for entry in docs:
        # Scored results arrive as (doc, score) pairs.
        document = entry[0] if isinstance(entry, tuple) else entry
        for key in fields:
            document.metadata.pop(key, None)
Loading