This repository has been archived by the owner on Nov 13, 2024. It is now read-only.

Add dimension support for OpenAI embeddings #273

Merged

config/config.yaml (3 changes: 1 addition & 2 deletions)

@@ -112,9 +112,8 @@ chat_engine:
       type: OpenAIRecordEncoder # Options: [OpenAIRecordEncoder, AzureOpenAIRecordEncoder]
       params:
-        model_name:          # The name of the model to use for encoding
-          text-embedding-ada-002
+        model_name: text-embedding-3-small  # The name of the model to use for encoding
         batch_size: 400 # The number of document chunks to encode in each call to the encoding model
 
     create_index_params:
       # -------------------------------------------------------------------------------------------
       # Initialization parameters to be passed to create a canopy index. These parameters will

pyproject.toml (4 changes: 2 additions & 2 deletions)

@@ -28,8 +28,8 @@ types-pyyaml = "^6.0.12.12"
 jsonschema = "^4.2.0"
 types-jsonschema = "^4.2.0"
 prompt-toolkit = "^3.0.39"
-pinecone-text = [{version = "^0.7.2"},
-                 {version = "^0.7.2", extras = ["dense"], optional = true}]
+pinecone-text = [{version = "^0.8.0"},
+                 {version = "^0.8.0", extras = ["dense"], optional = true}]
 
 tokenizers = "^0.15.0"
 transformers = {version = "^4.35.2", optional = true}

src/canopy/knowledge_base/knowledge_base.py (33 changes: 14 additions & 19 deletions)

@@ -260,7 +260,6 @@ def verify_index_connection(self) -> None:
 
     def create_canopy_index(self,
                             spec: Union[Dict, ServerlessSpec, PodSpec] = None,
-                            dimension: Optional[int] = None,
                             metric: Optional[str] = "cosine"
                             ):
         """
@@ -283,9 +282,6 @@ def create_canopy_index(self,
             spec: A dictionary containing configurations describing how the index should be deployed. For serverless indexes,
                   specify region and cloud. For pod indexes, specify replicas, shards, pods, pod_type, metadata_config,
                   and source_collection.
-            dimension: The dimension of the vectors to index.
-                       If `dimension` isn't explicitly provided,
-                       Canopy would try to infer the embedding's dimension based on the configured `Encoder`
             metric: The distance metric to be used for similarity search: 'euclidean', 'cosine', or 'dotproduct'. The
                     default is 'cosine'.
 
@@ -297,22 +293,21 @@ def create_canopy_index(self,
                 region="us-west-2"
             )
 
-        if dimension is None:
-            try:
-                encoder_dimension = self._encoder.dimension
-                if encoder_dimension is None:
-                    raise RuntimeError(
-                        f"The selected encoder {self._encoder.__class__.__name__} does "
-                        f"not support inferring the vectors' dimensionality."
-                    )
-                dimension = encoder_dimension
-            except Exception as e:
+        try:
+            encoder_dimension = self._encoder.dimension
+            if encoder_dimension is None:
                 raise RuntimeError(
-                    f"Canopy has failed to infer vectors' dimensionality using the "
-                    f"selected encoder: {self._encoder.__class__.__name__}. You can "
-                    f"provide the dimension manually, try using a different encoder, or"
-                    f" fix the underlying error:\n{e}"
-                ) from e
+                    f"The selected encoder {self._encoder.__class__.__name__} does "
+                    f"not support inferring the vectors' dimensionality."
+                )
+            dimension = encoder_dimension
+        except Exception as e:
+            raise RuntimeError(
+                f"Canopy has failed to infer vectors' dimensionality using the "
+                f"selected encoder: {self._encoder.__class__.__name__}. You can "
+                f"provide the dimension manually, try using a different encoder, or"
+                f" fix the underlying error:\n{e}"
+            ) from e
 
         if self.index_name in list_canopy_indexes(self._pinecone_client):
             raise RuntimeError(
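
With `dimension` removed from `create_canopy_index()`, the index dimensionality is always taken from the configured encoder, and a `RuntimeError` is raised if the encoder cannot report it. A minimal usage sketch of the new flow; the index name and the 256 override are illustrative, the keyword arguments assume Canopy's usual `KnowledgeBase` constructor, and the calls require valid Pinecone and OpenAI credentials:

    from canopy.knowledge_base import KnowledgeBase
    from canopy.knowledge_base.record_encoder import OpenAIRecordEncoder

    # The encoder now carries the dimension; 256 is only an illustrative override.
    encoder = OpenAIRecordEncoder(model_name="text-embedding-3-small", dimension=256)
    kb = KnowledgeBase(index_name="my-index", record_encoder=encoder)

    # No `dimension` argument anymore: the value is inferred from encoder.dimension.
    kb.create_canopy_index()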

src/canopy/knowledge_base/record_encoder/dense.py (5 changes: 1 addition & 4 deletions)

@@ -58,14 +58,11 @@ def _encode_queries_batch(self, queries: List[Query]) -> List[KBQuery]:
     def dimension(self) -> int:
         """
         The dimension is the length of the vector generated by the `DenseRecordEncoder`
-        Canopy will run a single word through the encoder to get the dimension, this will also validate that the encoder
-        is working properly.
 
         Returns:
             dimension(int): the dimension of the encoder
         """  # noqa: E501
-        dummy_doc = KBDocChunk(text="hello", id="dummy_doc", document_id="dummy_doc")
-        return len(self.encode_documents([dummy_doc])[0].values)
+        return self._dense_encoder.dimension
 
     async def _aencode_documents_batch(self,
                                        documents: List[KBDocChunk]
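
The `dimension` property now delegates to the wrapped pinecone-text encoder instead of embedding a dummy `KBDocChunk` and measuring the returned vector. A rough sketch of what callers see; whether the underlying encoder needs a network call to report its dimension depends on pinecone-text ^0.8.0:

    from canopy.knowledge_base.record_encoder import OpenAIRecordEncoder

    encoder = OpenAIRecordEncoder(model_name="text-embedding-3-small")
    # Previously this encoded a dummy chunk and took len() of the result; now it is
    # whatever the underlying encoder reports, e.g. 1536 for text-embedding-3-small.
    print(encoder.dimension)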

src/canopy/knowledge_base/record_encoder/openai.py (8 changes: 5 additions & 3 deletions)

@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional
 
 from openai import OpenAIError, RateLimitError, APIConnectionError, AuthenticationError
 from pinecone_text.dense.openai_encoder import OpenAIEncoder
@@ -18,8 +18,9 @@ class OpenAIRecordEncoder(DenseRecordEncoder):
     def __init__(
             self,
             *,
-            model_name: str = "text-embedding-ada-002",
+            model_name: str = "text-embedding-3-small",
             batch_size: int = 400,
+            dimension: Optional[int] = None,
             **kwargs
     ):
         """
@@ -29,10 +30,11 @@ def __init__(
             model_name: The name of the OpenAI embeddings model to use for encoding. See https://platform.openai.com/docs/models/embeddings
             batch_size: The number of documents or queries to encode at once.
                         Defaults to 400.
+            dimension: The dimension of the embeddings vector to generate.
             **kwargs: Additional arguments to pass to the underlying `pinecone-text. OpenAIEncoder`.
         """  # noqa: E501
         try:
-            encoder = OpenAIEncoder(model_name, **kwargs)
+            encoder = OpenAIEncoder(model_name, dimension=dimension, **kwargs)
         except OpenAIError as e:
             raise RuntimeError(
                 "Failed to connect to OpenAI, please make sure that the OPENAI_API_KEY "
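
The new `dimension` keyword is passed straight through to pinecone-text's `OpenAIEncoder`, which is what lets the text-embedding-3 models return shortened embeddings. A minimal construction sketch; the 512 value is illustrative and OPENAI_API_KEY must be set:

    from canopy.knowledge_base.record_encoder import OpenAIRecordEncoder

    encoder = OpenAIRecordEncoder(
        model_name="text-embedding-3-small",  # new default model
        dimension=512,  # optional; forwarded as OpenAIEncoder(model_name, dimension=...)
        batch_size=400,
    )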

tests/unit/record_encoder/test_jina_record_encoder.py (7 changes: 5 additions & 2 deletions)

@@ -30,13 +30,16 @@ def encoder():
 def test_dimension(encoder):
     with patch('pinecone_text.dense.JinaEncoder.encode_documents') \
             as mock_encode_documents:
-        mock_encode_documents.return_value = [[0.1, 0.2, 0.3]]
+        mock_encode_documents.return_value = [0.1, 0.2, 0.3]
         assert encoder.dimension == 3
 
 
 def custom_encode(*args, **kwargs):
     input_to_encode = args[0]
-    return [[0.1, 0.2, 0.3] for _ in input_to_encode]
+    if isinstance(input_to_encode, list):
+        return [[0.1, 0.2, 0.3] for _ in input_to_encode]
+    else:
+        return [0.1, 0.2, 0.3]
 
 
 @pytest.mark.parametrize("items,function",