From 0019142967c845563feb9592fd63b906c4c19987 Mon Sep 17 00:00:00 2001 From: Izel Levy Date: Sun, 4 Feb 2024 12:26:22 +0200 Subject: [PATCH 1/2] Add dimension support for OpenAI embeddings --- config/config.yaml | 3 +- pyproject.toml | 4 +-- src/canopy/knowledge_base/knowledge_base.py | 33 ++++++++----------- .../knowledge_base/record_encoder/dense.py | 5 +-- .../knowledge_base/record_encoder/openai.py | 8 +++-- 5 files changed, 23 insertions(+), 30 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 83055e0e..b3ea239d 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -112,9 +112,8 @@ chat_engine: type: OpenAIRecordEncoder # Options: [OpenAIRecordEncoder, AzureOpenAIRecordEncoder] params: model_name: # The name of the model to use for encoding - text-embedding-ada-002 + text-embedding-3-small batch_size: 400 # The number of document chunks to encode in each call to the encoding model - create_index_params: # ------------------------------------------------------------------------------------------- # Initialization parameters to be passed to create a canopy index. These parameters will diff --git a/pyproject.toml b/pyproject.toml index 6b4bb60f..79c09ae2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,8 +28,8 @@ types-pyyaml = "^6.0.12.12" jsonschema = "^4.2.0" types-jsonschema = "^4.2.0" prompt-toolkit = "^3.0.39" -pinecone-text = [{version = "^0.7.2"}, - {version = "^0.7.2", extras = ["dense"], optional = true}] +pinecone-text = [{version = "^0.8.0"}, + {version = "^0.8.0", extras = ["dense"], optional = true}] tokenizers = "^0.15.0" transformers = {version = "^4.35.2", optional = true} diff --git a/src/canopy/knowledge_base/knowledge_base.py b/src/canopy/knowledge_base/knowledge_base.py index f91f6669..1ead3573 100644 --- a/src/canopy/knowledge_base/knowledge_base.py +++ b/src/canopy/knowledge_base/knowledge_base.py @@ -260,7 +260,6 @@ def verify_index_connection(self) -> None: def create_canopy_index(self, spec: Union[Dict, ServerlessSpec, PodSpec] = None, - dimension: Optional[int] = None, metric: Optional[str] = "cosine" ): """ @@ -283,9 +282,6 @@ def create_canopy_index(self, spec: A dictionary containing configurations describing how the index should be deployed. For serverless indexes, specify region and cloud. For pod indexes, specify replicas, shards, pods, pod_type, metadata_config, and source_collection. - dimension: The dimension of the vectors to index. - If `dimension` isn't explicitly provided, - Canopy would try to infer the embedding's dimension based on the configured `Encoder` metric: The distance metric to be used for similarity search: 'euclidean', 'cosine', or 'dotproduct'. The default is 'cosine'. @@ -297,22 +293,21 @@ def create_canopy_index(self, region="us-west-2" ) - if dimension is None: - try: - encoder_dimension = self._encoder.dimension - if encoder_dimension is None: - raise RuntimeError( - f"The selected encoder {self._encoder.__class__.__name__} does " - f"not support inferring the vectors' dimensionality." - ) - dimension = encoder_dimension - except Exception as e: + try: + encoder_dimension = self._encoder.dimension + if encoder_dimension is None: raise RuntimeError( - f"Canopy has failed to infer vectors' dimensionality using the " - f"selected encoder: {self._encoder.__class__.__name__}. You can " - f"provide the dimension manually, try using a different encoder, or" - f" fix the underlying error:\n{e}" - ) from e + f"The selected encoder {self._encoder.__class__.__name__} does " + f"not support inferring the vectors' dimensionality." + ) + dimension = encoder_dimension + except Exception as e: + raise RuntimeError( + f"Canopy has failed to infer vectors' dimensionality using the " + f"selected encoder: {self._encoder.__class__.__name__}. You can " + f"provide the dimension manually, try using a different encoder, or" + f" fix the underlying error:\n{e}" + ) from e if self.index_name in list_canopy_indexes(self._pinecone_client): raise RuntimeError( diff --git a/src/canopy/knowledge_base/record_encoder/dense.py b/src/canopy/knowledge_base/record_encoder/dense.py index cd2d55ed..c693368e 100644 --- a/src/canopy/knowledge_base/record_encoder/dense.py +++ b/src/canopy/knowledge_base/record_encoder/dense.py @@ -58,14 +58,11 @@ def _encode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: def dimension(self) -> int: """ The dimension is the length of the vector generated by the `DenseRecordEncoder` - Canopy will run a single word through the encoder to get the dimension, this will also validate that the encoder - is working properly. Returns: dimension(int): the dimension of the encoder """ # noqa: E501 - dummy_doc = KBDocChunk(text="hello", id="dummy_doc", document_id="dummy_doc") - return len(self.encode_documents([dummy_doc])[0].values) + return self._dense_encoder.dimension async def _aencode_documents_batch(self, documents: List[KBDocChunk] diff --git a/src/canopy/knowledge_base/record_encoder/openai.py b/src/canopy/knowledge_base/record_encoder/openai.py index 03f1eb2a..0ccdd3b6 100644 --- a/src/canopy/knowledge_base/record_encoder/openai.py +++ b/src/canopy/knowledge_base/record_encoder/openai.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Optional from openai import OpenAIError, RateLimitError, APIConnectionError, AuthenticationError from pinecone_text.dense.openai_encoder import OpenAIEncoder @@ -18,8 +18,9 @@ class OpenAIRecordEncoder(DenseRecordEncoder): def __init__( self, *, - model_name: str = "text-embedding-ada-002", + model_name: str = "text-embedding-3-small", batch_size: int = 400, + dimension: Optional[int] = None, **kwargs ): """ @@ -29,10 +30,11 @@ def __init__( model_name: The name of the OpenAI embeddings model to use for encoding. See https://platform.openai.com/docs/models/embeddings batch_size: The number of documents or queries to encode at once. Defaults to 400. + dimension: The dimension of the embeddings vector to generate. **kwargs: Additional arguments to pass to the underlying `pinecone-text. OpenAIEncoder`. """ # noqa: E501 try: - encoder = OpenAIEncoder(model_name, **kwargs) + encoder = OpenAIEncoder(model_name, dimension=dimension, **kwargs) except OpenAIError as e: raise RuntimeError( "Failed to connect to OpenAI, please make sure that the OPENAI_API_KEY " From f43a23f0945990e25abed727d02beb57f803591b Mon Sep 17 00:00:00 2001 From: Izel Levy Date: Sun, 4 Feb 2024 12:45:18 +0200 Subject: [PATCH 2/2] Fix Jina tests --- tests/unit/record_encoder/test_jina_record_encoder.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/unit/record_encoder/test_jina_record_encoder.py b/tests/unit/record_encoder/test_jina_record_encoder.py index 9798fb38..203b87eb 100644 --- a/tests/unit/record_encoder/test_jina_record_encoder.py +++ b/tests/unit/record_encoder/test_jina_record_encoder.py @@ -30,13 +30,16 @@ def encoder(): def test_dimension(encoder): with patch('pinecone_text.dense.JinaEncoder.encode_documents') \ as mock_encode_documents: - mock_encode_documents.return_value = [[0.1, 0.2, 0.3]] + mock_encode_documents.return_value = [0.1, 0.2, 0.3] assert encoder.dimension == 3 def custom_encode(*args, **kwargs): input_to_encode = args[0] - return [[0.1, 0.2, 0.3] for _ in input_to_encode] + if isinstance(input_to_encode, list): + return [[0.1, 0.2, 0.3] for _ in input_to_encode] + else: + return [0.1, 0.2, 0.3] @pytest.mark.parametrize("items,function",