From 0019142967c845563feb9592fd63b906c4c19987 Mon Sep 17 00:00:00 2001
From: Izel Levy <izel@pinecone.io>
Date: Sun, 4 Feb 2024 12:26:22 +0200
Subject: [PATCH 1/2] Add dimension support for OpenAI embeddings

---
 config/config.yaml                            |  3 +-
 pyproject.toml                                |  4 +--
 src/canopy/knowledge_base/knowledge_base.py   | 33 ++++++++-----------
 .../knowledge_base/record_encoder/dense.py    |  5 +--
 .../knowledge_base/record_encoder/openai.py   |  8 +++--
 5 files changed, 23 insertions(+), 30 deletions(-)

diff --git a/config/config.yaml b/config/config.yaml
index 83055e0e..b3ea239d 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -112,9 +112,8 @@ chat_engine:
         type: OpenAIRecordEncoder       # Options: [OpenAIRecordEncoder, AzureOpenAIRecordEncoder]
         params:
           model_name:                   # The name of the model to use for encoding
-            text-embedding-ada-002
+            text-embedding-3-small
           batch_size: 400               # The number of document chunks to encode in each call to the encoding model
-
 create_index_params:
   # -------------------------------------------------------------------------------------------
   # Initialization parameters to be passed to create a canopy index. These parameters will
diff --git a/pyproject.toml b/pyproject.toml
index 6b4bb60f..79c09ae2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,8 +28,8 @@ types-pyyaml = "^6.0.12.12"
 jsonschema = "^4.2.0"
 types-jsonschema = "^4.2.0"
 prompt-toolkit = "^3.0.39"
-pinecone-text = [{version = "^0.7.2"},
-                 {version = "^0.7.2", extras = ["dense"], optional = true}]
+pinecone-text = [{version = "^0.8.0"},
+                 {version = "^0.8.0", extras = ["dense"], optional = true}]
 
 tokenizers = "^0.15.0"
 transformers = {version = "^4.35.2", optional = true}
diff --git a/src/canopy/knowledge_base/knowledge_base.py b/src/canopy/knowledge_base/knowledge_base.py
index f91f6669..1ead3573 100644
--- a/src/canopy/knowledge_base/knowledge_base.py
+++ b/src/canopy/knowledge_base/knowledge_base.py
@@ -260,7 +260,6 @@ def verify_index_connection(self) -> None:
 
     def create_canopy_index(self,
                             spec: Union[Dict, ServerlessSpec, PodSpec] = None,
-                            dimension: Optional[int] = None,
                             metric: Optional[str] = "cosine"
                             ):
         """
@@ -283,9 +282,6 @@ def create_canopy_index(self,
            spec: A dictionary containing configurations describing how the index should be deployed. For serverless indexes,
                  specify region and cloud. For pod indexes, specify replicas, shards, pods, pod_type, metadata_config,
                  and source_collection.
-           dimension: The dimension of the vectors to index.
-                       If `dimension` isn't explicitly provided,
-                       Canopy would try to infer the embedding's dimension based on the configured `Encoder`
            metric: The distance metric to be used for similarity search: 'euclidean', 'cosine', or 'dotproduct'. The
                    default is 'cosine'.
 
@@ -297,22 +293,21 @@ def create_canopy_index(self,
                 region="us-west-2"
             )
 
-        if dimension is None:
-            try:
-                encoder_dimension = self._encoder.dimension
-                if encoder_dimension is None:
-                    raise RuntimeError(
-                        f"The selected encoder {self._encoder.__class__.__name__} does "
-                        f"not support inferring the vectors' dimensionality."
-                    )
-                dimension = encoder_dimension
-            except Exception as e:
+        try:
+            encoder_dimension = self._encoder.dimension
+            if encoder_dimension is None:
                 raise RuntimeError(
-                    f"Canopy has failed to infer vectors' dimensionality using the "
-                    f"selected encoder: {self._encoder.__class__.__name__}. You can "
-                    f"provide the dimension manually, try using a different encoder, or"
-                    f" fix the underlying error:\n{e}"
-                ) from e
+                    f"The selected encoder {self._encoder.__class__.__name__} does "
+                    f"not support inferring the vectors' dimensionality."
+                )
+            dimension = encoder_dimension
+        except Exception as e:
+            raise RuntimeError(
+                f"Canopy has failed to infer vectors' dimensionality using the "
+                f"selected encoder: {self._encoder.__class__.__name__}. You can "
+                f"try using a different encoder, or"
+                f" fix the underlying error:\n{e}"
+            ) from e
 
         if self.index_name in list_canopy_indexes(self._pinecone_client):
             raise RuntimeError(
diff --git a/src/canopy/knowledge_base/record_encoder/dense.py b/src/canopy/knowledge_base/record_encoder/dense.py
index cd2d55ed..c693368e 100644
--- a/src/canopy/knowledge_base/record_encoder/dense.py
+++ b/src/canopy/knowledge_base/record_encoder/dense.py
@@ -58,14 +58,11 @@ def _encode_queries_batch(self, queries: List[Query]) -> List[KBQuery]:
     def dimension(self) -> int:
         """
         The dimension is the length of the vector generated by the `DenseRecordEncoder`
-        Canopy will run a single word through the encoder to get the dimension, this will also validate that the encoder
-        is working properly.
 
         Returns:
             dimension(int): the dimension of the encoder
         """  # noqa: E501
-        dummy_doc = KBDocChunk(text="hello", id="dummy_doc", document_id="dummy_doc")
-        return len(self.encode_documents([dummy_doc])[0].values)
+        return self._dense_encoder.dimension
 
     async def _aencode_documents_batch(self,
                                        documents: List[KBDocChunk]
diff --git a/src/canopy/knowledge_base/record_encoder/openai.py b/src/canopy/knowledge_base/record_encoder/openai.py
index 03f1eb2a..0ccdd3b6 100644
--- a/src/canopy/knowledge_base/record_encoder/openai.py
+++ b/src/canopy/knowledge_base/record_encoder/openai.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional
 
 from openai import OpenAIError, RateLimitError, APIConnectionError, AuthenticationError
 from pinecone_text.dense.openai_encoder import OpenAIEncoder
@@ -18,8 +18,9 @@ class OpenAIRecordEncoder(DenseRecordEncoder):
     def __init__(
         self,
         *,
-        model_name: str = "text-embedding-ada-002",
+        model_name: str = "text-embedding-3-small",
         batch_size: int = 400,
+        dimension: Optional[int] = None,
         **kwargs
     ):
         """
@@ -29,10 +30,11 @@ def __init__(
             model_name: The name of the OpenAI embeddings model to use for encoding. See https://platform.openai.com/docs/models/embeddings
             batch_size: The number of documents or queries to encode at once.
                         Defaults to 400.
+            dimension: The dimension of the embedding vectors to generate. Defaults to None; the value is passed as-is to the underlying `OpenAIEncoder`.
             **kwargs: Additional arguments to pass to the underlying `pinecone-text. OpenAIEncoder`.
         """  # noqa: E501
         try:
-            encoder = OpenAIEncoder(model_name, **kwargs)
+            encoder = OpenAIEncoder(model_name, dimension=dimension, **kwargs)
         except OpenAIError as e:
             raise RuntimeError(
                 "Failed to connect to OpenAI, please make sure that the OPENAI_API_KEY "

From f43a23f0945990e25abed727d02beb57f803591b Mon Sep 17 00:00:00 2001
From: Izel Levy <izel@pinecone.io>
Date: Sun, 4 Feb 2024 12:45:18 +0200
Subject: [PATCH 2/2] Fix Jina tests

---
 tests/unit/record_encoder/test_jina_record_encoder.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/unit/record_encoder/test_jina_record_encoder.py b/tests/unit/record_encoder/test_jina_record_encoder.py
index 9798fb38..203b87eb 100644
--- a/tests/unit/record_encoder/test_jina_record_encoder.py
+++ b/tests/unit/record_encoder/test_jina_record_encoder.py
@@ -30,13 +30,16 @@ def encoder():
 def test_dimension(encoder):
     with patch('pinecone_text.dense.JinaEncoder.encode_documents') \
             as mock_encode_documents:
-        mock_encode_documents.return_value = [[0.1, 0.2, 0.3]]
+        mock_encode_documents.return_value = [0.1, 0.2, 0.3]
         assert encoder.dimension == 3
 
 
 def custom_encode(*args, **kwargs):
     input_to_encode = args[0]
-    return [[0.1, 0.2, 0.3] for _ in input_to_encode]
+    if isinstance(input_to_encode, list):
+        return [[0.1, 0.2, 0.3] for _ in input_to_encode]
+    else:
+        return [0.1, 0.2, 0.3]
 
 
 @pytest.mark.parametrize("items,function",