From 45d205286571380620a20a6101801e745ce7d11c Mon Sep 17 00:00:00 2001
From: wxywb
Date: Wed, 7 Aug 2024 16:31:56 +0800
Subject: [PATCH] feat: Add nomic and mistralai dependencies, remove vertexai temporarily.

Signed-off-by: wxywb
---
 milvus_model/dense/__init__.py  |  6 ---
 milvus_model/dense/mistralai.py | 10 +++--
 milvus_model/dense/nomic.py     | 22 ++++++---
 milvus_model/dense/vertexai.py  | 79 ---------------------------------
 milvus_model/utils/__init__.py  | 12 ++++-
 5 files changed, 32 insertions(+), 97 deletions(-)
 delete mode 100644 milvus_model/dense/vertexai.py

diff --git a/milvus_model/dense/__init__.py b/milvus_model/dense/__init__.py
index a3afefd..e98be90 100644
--- a/milvus_model/dense/__init__.py
+++ b/milvus_model/dense/__init__.py
@@ -5,7 +5,6 @@
     "JinaEmbeddingFunction",
     "OnnxEmbeddingFunction",
     "CohereEmbeddingFunction",
-    "VertexAIEmbeddingFunction",
     "MistralAIEmbeddingFunction",
     "NomicEmbeddingFunction",
 ]
@@ -20,7 +19,6 @@
 voyageai = LazyImport("voyageai", globals(), "milvus_model.dense.voyageai")
 onnx = LazyImport("onnx", globals(), "milvus_model.dense.onnx")
 cohere = LazyImport("cohere", globals(), "milvus_model.dense.cohere")
-vertexai = LazyImport("vertexai", globals(), "milvus_model.dense.vertexai")
 mistralai = LazyImport("mistralai", globals(), "milvus_model.dense.mistralai")
 nomic = LazyImport("nomic", globals(), "milvus_model.dense.nomic")
 
@@ -49,10 +47,6 @@ def CohereEmbeddingFunction(*args, **kwargs):
     return cohere.CohereEmbeddingFunction(*args, **kwargs)
 
 
-def VertexAIEmbeddingFunction(*args, **kwargs):
-    return vertexai.VertexAIEmbeddingFunction(*args, **kwargs)
-
-
 def MistralAIEmbeddingFunction(*args, **kwargs):
     return mistralai.MistralAIEmbeddingFunction(*args, **kwargs)
 
diff --git a/milvus_model/dense/mistralai.py b/milvus_model/dense/mistralai.py
index a8d7392..85a9c67 100644
--- a/milvus_model/dense/mistralai.py
+++ b/milvus_model/dense/mistralai.py
@@ -1,11 +1,15 @@
 from typing import List, Optional
+import os
 import numpy as np
 from collections import defaultdict
-from mistralai.client import MistralClient
-import os
+from milvus_model.base import BaseEmbeddingFunction
+from milvus_model.utils import import_mistralai
+
+import_mistralai()
+from mistralai.client import MistralClient
 
 
-class MistralAIEmbeddingFunction:
+class MistralAIEmbeddingFunction(BaseEmbeddingFunction):
     def __init__(
         self,
         api_key: str,
diff --git a/milvus_model/dense/nomic.py b/milvus_model/dense/nomic.py
index a2a14d5..446ee66 100644
--- a/milvus_model/dense/nomic.py
+++ b/milvus_model/dense/nomic.py
@@ -1,21 +1,25 @@
 from typing import List
 import numpy as np
-from collections import defaultdict
-from nomic import embed
 import os
+from collections import defaultdict
+from milvus_model.base import BaseEmbeddingFunction
+from milvus_model.utils import import_nomic
 
 
-class NomicEmbeddingFunction:
+import_nomic()
+from nomic import embed
+
+class NomicEmbeddingFunction(BaseEmbeddingFunction):
     def __init__(
         self,
         api_key: str,
         model_name: str = "nomic-embed-text-v1.5",
         task_type: str = "search_document",
-        dimensionality: int = 768,
+        dimensions: int = 768,
         **kwargs,
     ):
         self._nomic_model_meta_info = defaultdict(dict)
-        self._nomic_model_meta_info[model_name]["dim"] = dimensionality  # set the dimension
+        self._nomic_model_meta_info[model_name]["dim"] = dimensions  # set the dimension
 
         if api_key is None:
             if "NOMIC_API_KEY" in os.environ and os.environ["NOMIC_API_KEY"]:
@@ -31,11 +35,15 @@ def __init__(
         self.api_key = api_key
         self.model_name = model_name
         self.task_type = task_type
-        self.dimensionality = dimensionality
+        self.dimensionality = dimensions
+        if "dimensionality" in kwargs:
+            self.dimensionality = kwargs["dimensionality"]
+            kwargs.pop("dimensionality")
+
         self._encode_config = {
             "model": model_name,
             "task_type": task_type,
-            "dimensionality": dimensionality,
+            "dimensionality": self.dimensionality,
             **kwargs,
         }
 
diff --git a/milvus_model/dense/vertexai.py b/milvus_model/dense/vertexai.py
deleted file mode 100644
index 91cabb3..0000000
--- a/milvus_model/dense/vertexai.py
+++ /dev/null
@@ -1,79 +0,0 @@
-from typing import List, Optional
-import numpy as np
-from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
-from collections import defaultdict
-import os
-
-from google.auth import credentials as auth_credentials
-
-
-class VertexAIEmbeddingFunction:
-    def __init__(
-        self,
-        model_name: str = "text-embedding-004",
-        api_key: Optional[str] = None,
-        base_url: Optional[str] = None,
-        dimensions: Optional[int] = 256,
-        task: str = "SEMANTIC_SIMILARITY",
-        project_id: Optional[str] = None,
-        location: str = "us-central1",
-        credentials: Optional[auth_credentials.Credentials] = None,
-        **kwargs,
-    ):
-        self._vertexai_model_meta_info = defaultdict(dict)
-        self._model_config = dict({"api_key": api_key, "base_url": base_url}, **kwargs)
-        if dimensions is not None:
-            self._vertexai_model_meta_info[model_name]["dim"] = dimensions
-        if api_key:
-            self.api_key = api_key
-        elif "VERTEXAI_API_KEY" in os.environ and os.environ["VERTEXAI_API_KEY"]:
-            self.api_key = os.environ["VERTEXAI_API_KEY"]
-        elif credentials:
-            self.credentials = credentials
-        else:
-            raise ValueError(
-                "Did not find api_key or credentials, please add an environment variable"
-                " `VERTEXAI_API_KEY` which contains it,"
-                " pass `api_key` as a named parameter, or"
-                " pass `credentials` as a named parameter."
-            )
-
-        self._encode_config = {"model": model_name, "dimensions": dimensions}
-        self.task = task
-        self.model_name = model_name
-        self.project_id = project_id
-        self.location = location
-        self.client = TextEmbeddingModel.from_pretrained(model_name)
-
-    def encode_queries(self, queries: List[str]) -> List[np.array]:
-        self.task = "RETRIEVAL_QUERY"
-        return self._encode(queries)
-
-    def encode_documents(self, documents: List[str]) -> List[np.array]:
-        self.task = "RETRIEVAL_DOCUMENT"
-        return self._encode(documents)
-
-    @property
-    def dim(self):
-        return self._vertexai_model_meta_info[self.model_name]["dim"]
-
-    def __call__(self, texts: List[str], task: str = "SEMANTIC_SIMILARITY") -> List[np.array]:
-        self.task = task
-        return self._encode(texts)
-
-    def _encode_query(self, query: str) -> np.array:
-        self.task = "RETRIEVAL_QUERY"
-        return self._encode([query])[0]
-
-    def _encode_document(self, document: str) -> np.array:
-        self.task = "RETRIEVAL_DOCUMENT"
-        return self._encode([document])[0]
-
-    def _call_vertexai_api(self, texts: List[str]):
-        inputs = [TextEmbeddingInput(text, self.task) for text in texts]
-        kwargs = dict(output_dimensionality=self._vertexai_model_meta_info[self.model_name]["dim"])
-        embeddings = self.client.get_embeddings(inputs, **kwargs)
-        return [np.array(embedding.values) for embedding in embeddings]
-
-    def _encode(self, texts: List[str]):
-        return self._call_vertexai_api(texts)
diff --git a/milvus_model/utils/__init__.py b/milvus_model/utils/__init__.py
index 74bb07e..7d9cfcc 100644
--- a/milvus_model/utils/__init__.py
+++ b/milvus_model/utils/__init__.py
@@ -11,9 +11,11 @@
     "import_protobuf",
     "import_unidic_lite",
     "import_cohere",
-    "import_voyageai"
+    "import_voyageai",
     "import_torch",
-    "import_huggingface_hub"
+    "import_huggingface_hub",
+    "import_mistralai",
+    "import_nomic"
 ]
 
 import importlib.util
@@ -66,6 +68,12 @@ def import_torch():
 def import_huggingface_hub():
     _check_library("huggingface_hub", package="huggingface-hub")
 
+def import_mistralai():
+    _check_library("mistralai", package="mistralai")
+
+def import_nomic():
+    _check_library("nomic", package="nomic")
+
 def _check_library(libname: str, prompt: bool = True, package: Optional[str] = None):
     is_avail = False
     if importlib.util.find_spec(libname):
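
Usage sketch for reviewers (not part of the patch): a minimal example of how the two embedding functions wired up above might be called once the new dependencies are installed (pip install mistralai nomic). It assumes the classes expose the same encode_documents/encode_queries interface as the other dense embedding functions in milvus_model and uses only constructor arguments that appear in this diff; the API keys and sample texts are placeholders.

    from milvus_model.dense import MistralAIEmbeddingFunction, NomicEmbeddingFunction

    docs = ["Milvus is a vector database.", "Embeddings map text to dense vectors."]

    # Mistral AI: the constructor in this diff takes api_key as its first argument.
    mistral_ef = MistralAIEmbeddingFunction(api_key="YOUR_MISTRAL_API_KEY")
    doc_vectors = mistral_ef.encode_documents(docs)

    # Nomic: `dimensions` replaces the old `dimensionality` keyword, which the
    # new code still accepts through **kwargs for backward compatibility.
    nomic_ef = NomicEmbeddingFunction(
        api_key="YOUR_NOMIC_API_KEY",
        model_name="nomic-embed-text-v1.5",
        dimensions=768,
    )
    query_vectors = nomic_ef.encode_queries(["What is Milvus?"])

    print(len(doc_vectors), len(query_vectors))

Because milvus_model.dense routes these classes through LazyImport, and each module calls import_mistralai()/import_nomic() before importing the vendor SDK, the third-party packages are only required when one of these embedding functions is actually used.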