Skip to content

Commit

Permalink
VoyageAI: Adding new models, quantization and multiple output dimensions (#59)
Browse files: browse the repository at this point in the history

VoyageAI: Adding new models, quantization and multiple output dimensions
(for the models supporting it)
  • Loading branch information
fzowl authored Jan 14, 2025
1 parent ddc1268 commit 68a1b88
Showing 1 changed file with 63 additions and 15 deletions.
78 changes: 63 additions & 15 deletions milvus_model/dense/voyageai.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import List, Optional

import numpy as np
import struct

from milvus_model.base import BaseEmbeddingFunction
from milvus_model.utils import import_voyageai
Expand All @@ -11,25 +12,58 @@


class VoyageEmbeddingFunction(BaseEmbeddingFunction):
def __init__(
    self,
    model_name: str = "voyage-3-large",
    api_key: Optional[str] = None,
    embedding_type: Optional[str] = None,
    truncate: Optional[bool] = None,
    dimension: Optional[int] = None,
    **kwargs,
):
    """Configure a Voyage AI embedding function.

    Args:
        model_name: Voyage model identifier.
        api_key: API key passed to ``voyageai.Client``.
        embedding_type: Output data type. ``None`` means the default
            (float). ``"binary"`` and ``"ubinary"`` are accepted only for
            ``voyage-3-large`` and ``voyage-code-3``; ``"int8"`` and
            ``"uint8"`` are always rejected.
        truncate: Stored and later sent as the ``truncation`` argument of
            the embed call.
        dimension: Requested output dimension. ``None`` selects the first
            (default) dimension listed for the model.
        **kwargs: Forwarded unchanged to ``voyageai.Client``.

    Raises:
        ValueError: If ``dimension`` is not listed for a known model, or
            ``embedding_type`` is not supported by the selected model.
    """
    self.model_name = model_name
    self.truncate = truncate

    # Supported output dimensions per model; the first entry is the default.
    supported_dims = {
        "voyage-3-large": [1024, 256, 512, 2048],
        "voyage-code-3": [1024, 256, 512, 2048],
        "voyage-3": [1024],
        "voyage-3-lite": [512],
        "voyage-finance-2": [1024],
        "voyage-multilingual-2": [1024],
        "voyage-law-2": [1024],
        # Older models
        "voyage-large-2": [1536],
        "voyage-code-2": [1536],
        "voyage-2": [1024],
        "voyage-lite-02-instruct": [1024],
    }
    self._voyageai_model_meta_info = defaultdict(dict)
    for name, dims in supported_dims.items():
        self._voyageai_model_meta_info[name]["dim"] = dims

    # Look the model up without indexing the defaultdict: indexing would
    # insert an empty entry and then fail with a bare KeyError('dim') for a
    # model that is not in the table. The supported dimensions of such a
    # model are unknown here, so an explicit dimension is left unvalidated.
    known_dims = supported_dims.get(model_name)
    if dimension is not None and known_dims is not None and dimension not in known_dims:
        raise ValueError(
            f"The provided dimension ({dimension}) is not supported by the selected model ({self.model_name}). "
            "Leave this parameter empty to use the default dimension for the model. "
            "Please check the supported dimensions here: https://docs.voyageai.com/docs/embeddings"
        )

    if embedding_type in ("int8", "uint8"):
        raise ValueError("Currently int8 or uint8 is not supported with PyMilvus model library.")

    # Only these two models accept the binary output types.
    if self.model_name in ("voyage-3-large", "voyage-code-3"):
        allowed_types = ("float", "binary", "ubinary")
    else:
        allowed_types = ("float",)
    if embedding_type is not None and embedding_type not in allowed_types:
        raise ValueError(
            f"The provided embedding_type ({embedding_type}) is not supported by the selected model "
            f"({self.model_name}). Leave this parameter empty for the default embedding_type (float). "
            f"Please check the supported embedding_type values here: https://docs.voyageai.com/docs/embeddings"
        )

    self.embedding_type = embedding_type
    self.dimension = dimension
    self.client = voyageai.Client(api_key, **kwargs)

@property
def dim(self):
    """Return the output dimension used for embeddings.

    An explicitly configured ``self.dimension`` takes precedence; otherwise
    the first dimension listed for ``self.model_name`` in the model table
    is used as the default.
    """
    if self.dimension is not None:
        return self.dimension
    return self._voyageai_model_meta_info[self.model_name]["dim"][0]

def encode_queries(self, queries: List[str]) -> List[np.array]:
    """Embed search queries.

    Delegates to ``_call_voyage_api`` with ``input_type="query"`` and
    returns whatever that call returns.
    """
    query_vectors = self._call_voyage_api(queries, input_type="query")
    return query_vectors
Expand All @@ -41,7 +75,21 @@ def __call__(self, texts: List[str]) -> List[np.array]:
return self._call_voyage_api(texts)

def _call_voyage_api(self, texts: List[str], input_type: Optional[str] = None):
results = self.client.embed(
texts=texts, model=self.model_name, input_type=input_type
embeddings = self.client.embed(
texts=texts,
model=self.model_name,
input_type=input_type,
truncation=self.truncate,
output_dtype=self.embedding_type,
output_dimension=self.dim,
).embeddings
return [np.array(data) for data in results]
if self.embedding_type is None:
results = [np.array(data, dtype=np.float32) for data in embeddings]
else:
if self.embedding_type == "binary":
results = [struct.pack('b' * len(int8_vector), *int8_vector) for int8_vector in embeddings]
elif self.embedding_type == "ubinary":
results = [struct.pack('B' * len(uint8_vector), *uint8_vector) for uint8_vector in embeddings]
elif self.embedding_type == "float":
results = [np.array(result, dtype=np.float32) for result in embeddings]
return results

0 comments on commit 68a1b88

Please sign in to comment.