From 43cc31f74015f8d8fcbf7a8ea7d7d9ecc66cf8c9 Mon Sep 17 00:00:00 2001 From: Jackson <139523303+Jacksonxhx@users.noreply.github.com> Date: Thu, 18 Jul 2024 16:55:45 +0800 Subject: [PATCH] feat(vectordb): Milvus vector db Integration (#1996) * integrate Milvus into Private GPT * adjust milvus settings * update doc info and reformat * adjust milvus initialization * adjust import error * mionr update * adjust format * adjust the db storing path * update doc --- fern/docs/pages/installation/concepts.mdx | 3 +- fern/docs/pages/installation/installation.mdx | 1 + fern/docs/pages/manual/vectordb.mdx | 23 +++++- poetry.lock | 82 ++++++++++++++++++- .../vector_store/vector_store_component.py | 39 +++++++++ private_gpt/settings/settings.py | 24 +++++- pyproject.toml | 2 + settings.yaml | 5 ++ 8 files changed, 173 insertions(+), 6 deletions(-) diff --git a/fern/docs/pages/installation/concepts.mdx b/fern/docs/pages/installation/concepts.mdx index 1fd9da397..0a8dc744b 100644 --- a/fern/docs/pages/installation/concepts.mdx +++ b/fern/docs/pages/installation/concepts.mdx @@ -44,6 +44,7 @@ will load the configuration from `settings.yaml` and `settings-ollama.yaml`. ## About Fully Local Setups In order to run PrivateGPT in a fully local setup, you will need to run the LLM, Embeddings and Vector Store locally. + ### LLM For local LLM there are two options: * (Recommended) You can use the 'ollama' option in PrivateGPT, which will connect to your local Ollama instance. Ollama simplifies a lot the installation of local LLMs. @@ -63,4 +64,4 @@ In order for HuggingFace LLM to work (the second option), you need to download t poetry run python scripts/setup ``` ### Vector stores -The vector stores supported (Qdrant, ChromaDB and Postgres) run locally by default. \ No newline at end of file +The vector stores supported (Qdrant, Milvus, ChromaDB and Postgres) run locally by default. 
\ No newline at end of file diff --git a/fern/docs/pages/installation/installation.mdx b/fern/docs/pages/installation/installation.mdx index 3a6385f49..80f1c74b3 100644 --- a/fern/docs/pages/installation/installation.mdx +++ b/fern/docs/pages/installation/installation.mdx @@ -82,6 +82,7 @@ You need to choose one option per category (LLM, Embeddings, Vector Stores, UI). | **Option** | **Description** | **Extra** | |------------------|-----------------------------------------|-------------------------| | **qdrant** | Adds support for Qdrant vector store | vector-stores-qdrant | +| milvus | Adds support for Milvus vector store | vector-stores-milvus | | chroma | Adds support for Chroma DB vector store | vector-stores-chroma | | postgres | Adds support for Postgres vector store | vector-stores-postgres | | clickhouse | Adds support for Clickhouse vector store| vector-stores-clickhouse| diff --git a/fern/docs/pages/manual/vectordb.mdx b/fern/docs/pages/manual/vectordb.mdx index 2295d0659..ac3dffccd 100644 --- a/fern/docs/pages/manual/vectordb.mdx +++ b/fern/docs/pages/manual/vectordb.mdx @@ -1,6 +1,7 @@ -PrivateGPT supports [Qdrant](https://qdrant.tech/), [Chroma](https://www.trychroma.com/), [PGVector](https://github.com/pgvector/pgvector) and [ClickHouse](https://github.com/ClickHouse/ClickHouse) as vectorstore providers. Qdrant being the default. +## Vectorstores +PrivateGPT supports [Qdrant](https://qdrant.tech/), [Milvus](https://milvus.io/), [Chroma](https://www.trychroma.com/), [PGVector](https://github.com/pgvector/pgvector) and [ClickHouse](https://github.com/ClickHouse/ClickHouse) as vectorstore providers, with Qdrant being the default. -In order to select one or the other, set the `vectorstore.database` property in the `settings.yaml` file to `qdrant`, `chroma`, `postgres` and `clickhouse`. +In order to select one or the other, set the `vectorstore.database` property in the `settings.yaml` file to `qdrant`, `milvus`, `chroma`, `postgres` or `clickhouse`. 
```yaml vectorstore: @@ -38,6 +39,24 @@ qdrant: path: local_data/private_gpt/qdrant ``` +### Milvus configuration + +To enable Milvus, set the `vectorstore.database` property in the `settings.yaml` file to `milvus` and install the `vector-stores-milvus` extra. + +```bash +poetry install --extras vector-stores-milvus +``` + +The available configuration options are: +| Field | Description | +|--------------|-------------| +| uri | Defaults to "local_data/private_gpt/milvus/milvus_local.db", a local file. You can also point it at a more performant Milvus server running on Docker or Kubernetes, e.g. http://localhost:19530. To use Zilliz Cloud, set the uri and token to the endpoint and API key from Zilliz Cloud. | +| token | Access token for a Milvus server on Docker or Kubernetes, or the Zilliz Cloud API key. Defaults to an empty string. | +| collection_name | The name of the collection; the shipped `settings.yaml` sets it to "milvus_db". | +| overwrite | Whether to overwrite the data in the collection if it already exists. Defaults to True; the shipped `settings.yaml` sets it to false. | + +To run a local, disk-based setup without a Milvus server, set the uri value in settings.yaml to a local file path such as local_data/private_gpt/milvus/milvus_local.db. + ### Chroma configuration To enable Chroma, set the `vectorstore.database` property in the `settings.yaml` file to `chroma` and install the `chroma` extra. diff --git a/poetry.lock b/poetry.lock index d2582d41a..035f434cd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
[[package]] name = "aiofiles" @@ -1182,6 +1182,27 @@ files = [ dnspython = ">=2.0.0" idna = ">=2.0.0" +[[package]] +name = "environs" +version = "9.5.0" +description = "simplified environment variable parsing" +optional = true +python-versions = ">=3.6" +files = [ + {file = "environs-9.5.0-py2.py3-none-any.whl", hash = "sha256:1e549569a3de49c05f856f40bce86979e7d5ffbbc4398e7f338574c220189124"}, + {file = "environs-9.5.0.tar.gz", hash = "sha256:a76307b36fbe856bdca7ee9161e6c466fd7fcffc297109a118c59b54e27e30c9"}, +] + +[package.dependencies] +marshmallow = ">=3.0.0" +python-dotenv = "*" + +[package.extras] +dev = ["dj-database-url", "dj-email-url", "django-cache-url", "flake8 (==4.0.1)", "flake8-bugbear (==21.9.2)", "mypy (==0.910)", "pre-commit (>=2.4,<3.0)", "pytest", "tox"] +django = ["dj-database-url", "dj-email-url", "django-cache-url"] +lint = ["flake8 (==4.0.1)", "flake8-bugbear (==21.9.2)", "mypy (==0.910)", "pre-commit (>=2.4,<3.0)"] +tests = ["dj-database-url", "dj-email-url", "django-cache-url", "pytest"] + [[package]] name = "fastapi" version = "0.111.0" @@ -2735,6 +2756,21 @@ files = [ clickhouse-connect = ">=0.7.0,<0.8.0" llama-index-core = ">=0.10.5,<0.11.0" +[[package]] +name = "llama-index-vector-stores-milvus" +version = "0.1.20" +description = "llama-index vector_stores milvus integration" +optional = true +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "llama_index_vector_stores_milvus-0.1.20-py3-none-any.whl", hash = "sha256:27a61fd237e67b648f36964c2e25275df4cb20dd740d111f0b75db477259ef5b"}, + {file = "llama_index_vector_stores_milvus-0.1.20.tar.gz", hash = "sha256:461bccce036be7bb739e57eb3855f64557c506023febfc08f98899778d460602"}, +] + +[package.dependencies] +llama-index-core = ">=0.10.1,<0.11.0" +pymilvus = ">=2.3.6,<3.0.0" + [[package]] name = "llama-index-vector-stores-postgres" version = "0.1.11" @@ -2991,6 +3027,22 @@ files = [ {file = "mdurl-0.1.2.tar.gz", hash = 
"sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, ] +[[package]] +name = "milvus-lite" +version = "2.4.8" +description = "A lightweight version of Milvus wrapped with Python." +optional = true +python-versions = ">=3.7" +files = [ + {file = "milvus_lite-2.4.8-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:b7e90b34b214884cd44cdc112ab243d4cb197b775498355e2437b6cafea025fe"}, + {file = "milvus_lite-2.4.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:519dfc62709d8f642d98a1c5b1dcde7080d107e6e312d677fef5a3412a40ac08"}, + {file = "milvus_lite-2.4.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:b21f36d24cbb0e920b4faad607019bb28c1b2c88b4d04680ac8c7697a4ae8a4d"}, + {file = "milvus_lite-2.4.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:08332a2b9abfe7c4e1d7926068937e46f8fb81f2707928b7bc02c9dc99cebe41"}, +] + +[package.dependencies] +tqdm = "*" + [[package]] name = "minijinja" version = "2.0.1" @@ -4578,6 +4630,31 @@ dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pyte docs = ["sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"] tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] +[[package]] +name = "pymilvus" +version = "2.4.4" +description = "Python Sdk for Milvus" +optional = true +python-versions = ">=3.8" +files = [ + {file = "pymilvus-2.4.4-py3-none-any.whl", hash = "sha256:073b76bc36f6f4e70f0f0a0023a53324f0ba8ef9a60883f87cd30a44b6c6f2b5"}, + {file = "pymilvus-2.4.4.tar.gz", hash = "sha256:50c53eb103e034fbffe936fe942751ea3dbd2452e18cf79acc52360ed4987fb7"}, +] + +[package.dependencies] +environs = "<=9.5.0" +grpcio = ">=1.49.1,<=1.63.0" +milvus-lite = {version = ">=2.4.0,<2.5.0", markers = "sys_platform != \"win32\""} +pandas = ">=1.2.4" +protobuf = ">=3.20.0" +setuptools = ">=67" +ujson = ">=2.0.0" + +[package.extras] +bulk-writer = ["azure-storage-blob", "minio (>=7.0.0)", "pyarrow (>=12.0.0)", "requests"] +dev = ["black", "grpcio (==1.62.2)", "grpcio-testing (==1.62.2)", 
"grpcio-tools (==1.62.2)", "pytest (>=5.3.4)", "pytest-cov (>=2.8.1)", "pytest-timeout (>=1.3.4)", "ruff (>0.4.0)"] +model = ["milvus-model (>=0.1.0)"] + [[package]] name = "pyparsing" version = "3.1.1" @@ -6777,10 +6854,11 @@ storage-nodestore-postgres = ["asyncpg", "llama-index-storage-docstore-postgres" ui = ["gradio"] vector-stores-chroma = ["llama-index-vector-stores-chroma"] vector-stores-clickhouse = ["clickhouse-connect", "llama-index-vector-stores-clickhouse"] +vector-stores-milvus = ["llama-index-vector-stores-milvus"] vector-stores-postgres = ["llama-index-vector-stores-postgres"] vector-stores-qdrant = ["llama-index-vector-stores-qdrant"] [metadata] lock-version = "2.0" python-versions = ">=3.11,<3.12" -content-hash = "5a2ffe28c38fe59d64fcbf2094b804da8e3f784dc42e1926eb7bd8bcd9dc6056" +content-hash = "5e916cce1a7805965795dbaee0e2d24612e54305af4b1936d6bc1fa469b8012f" diff --git a/private_gpt/components/vector_store/vector_store_component.py b/private_gpt/components/vector_store/vector_store_component.py index ee2de3d15..841a1182f 100644 --- a/private_gpt/components/vector_store/vector_store_component.py +++ b/private_gpt/components/vector_store/vector_store_component.py @@ -121,6 +121,45 @@ def __init__(self, settings: Settings) -> None: collection_name="make_this_parameterizable_per_api_call", ), # TODO ) + + case "milvus": + try: + from llama_index.vector_stores.milvus import ( # type: ignore + MilvusVectorStore, + ) + except ImportError as e: + raise ImportError( + "Milvus dependencies not found, install with `poetry install --extras vector-stores-milvus`" + ) from e + + if settings.milvus is None: + logger.info( + "Milvus config not found. Using default settings.\n" + "Trying to connect to Milvus at local_data/private_gpt/milvus/milvus_local.db " + "with collection 'make_this_parameterizable_per_api_call'." 
+ ) + + self.vector_store = typing.cast( + BasePydanticVectorStore, + MilvusVectorStore( + dim=settings.embedding.embed_dim, + collection_name="make_this_parameterizable_per_api_call", + overwrite=True, + ), + ) + + else: + self.vector_store = typing.cast( + BasePydanticVectorStore, + MilvusVectorStore( + dim=settings.embedding.embed_dim, + uri=settings.milvus.uri, + token=settings.milvus.token, + collection_name=settings.milvus.collection_name, + overwrite=settings.milvus.overwrite, + ), + ) + case "clickhouse": try: from clickhouse_connect import ( # type: ignore diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py index 30d816b84..30514ddbe 100644 --- a/private_gpt/settings/settings.py +++ b/private_gpt/settings/settings.py @@ -125,7 +125,7 @@ class LLMSettings(BaseModel): class VectorstoreSettings(BaseModel): - database: Literal["chroma", "qdrant", "postgres", "clickhouse"] + database: Literal["chroma", "qdrant", "postgres", "clickhouse", "milvus"] class NodeStoreSettings(BaseModel): @@ -508,6 +508,27 @@ class QdrantSettings(BaseModel): ) +class MilvusSettings(BaseModel): + uri: str = Field( + "local_data/private_gpt/milvus/milvus_local.db", + description="The URI of the Milvus instance. For example: 'local_data/private_gpt/milvus/milvus_local.db' for Milvus Lite.", + ) + token: str = Field( + "", + description=( + "A valid access token to access the specified Milvus instance. " + "This can be used as a recommended alternative to setting user and password separately. " + ), + ) + collection_name: str = Field( + "make_this_parameterizable_per_api_call", + description="The name of the collection in Milvus. Default is 'make_this_parameterizable_per_api_call'.", + ) + overwrite: bool = Field( + True, description="Overwrite the previous collection schema if it exists." 
+ ) + + class Settings(BaseModel): server: ServerSettings data: DataSettings @@ -527,6 +548,7 @@ class Settings(BaseModel): qdrant: QdrantSettings | None = None postgres: PostgresSettings | None = None clickhouse: ClickHouseSettings | None = None + milvus: MilvusSettings | None = None """ diff --git a/pyproject.toml b/pyproject.toml index d5bf3fad2..f611ad7a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ llama-index-embeddings-openai = {version ="^0.1.10", optional = true} llama-index-embeddings-azure-openai = {version ="^0.1.10", optional = true} llama-index-embeddings-gemini = {version ="^0.1.8", optional = true} llama-index-vector-stores-qdrant = {version ="^0.2.10", optional = true} +llama-index-vector-stores-milvus = {version ="^0.1.20", optional = true} llama-index-vector-stores-chroma = {version ="^0.1.10", optional = true} llama-index-vector-stores-postgres = {version ="^0.1.11", optional = true} llama-index-vector-stores-clickhouse = {version ="^0.1.3", optional = true} @@ -78,6 +79,7 @@ vector-stores-qdrant = ["llama-index-vector-stores-qdrant"] vector-stores-clickhouse = ["llama-index-vector-stores-clickhouse", "clickhouse_connect"] vector-stores-chroma = ["llama-index-vector-stores-chroma"] vector-stores-postgres = ["llama-index-vector-stores-postgres"] +vector-stores-milvus = ["llama-index-vector-stores-milvus"] storage-nodestore-postgres = ["llama-index-storage-docstore-postgres","llama-index-storage-index-store-postgres","psycopg2-binary","asyncpg"] rerank-sentence-transformers = ["torch", "sentence-transformers"] diff --git a/settings.yaml b/settings.yaml index f29a4a837..cd8fccdcb 100644 --- a/settings.yaml +++ b/settings.yaml @@ -85,6 +85,11 @@ vectorstore: nodestore: database: simple +milvus: + uri: local_data/private_gpt/milvus/milvus_local.db + collection_name: milvus_db + overwrite: false + qdrant: path: local_data/private_gpt/qdrant