Merge branch 'mainline' into joshua/hybrid-search

marqo-ai · Jun 26, 2024 · 55b5e03 · 55b5e03
2 parents 39e2ce2 + e5780d4
commit 55b5e03
Show file tree

Hide file tree

Showing 6 changed files with 54 additions and 49 deletions.
diff --git a/setup.py b/setup.py
@@ -18,7 +18,7 @@
         "tox"
     ],
     name="marqo",
-    version="3.5.0",
+    version="3.5.1",
     author="marqo org",
     author_email="org@marqo.io",
     description="Tensor search for humans",

diff --git a/src/marqo/client.py b/src/marqo/client.py
@@ -167,7 +167,10 @@ def delete_index(self, index_name: str, wait_for_readiness=True) -> Dict[str, An
                 cloud_wait_for_index_status(self.http, index_name, enums.IndexStatus.DELETED)
             return res
         except errors.MarqoWebError as e:
-            return e.message
+            if "index_not_found" in str(e):
+                return e.message
+            else:
+                raise e
 
     def get_index(self, index_name: str) -> Index:
         """Get the index.

diff --git a/src/marqo/version.py b/src/marqo/version.py
@@ -1,7 +1,7 @@
-__marqo_version__ = "2.6.0"
+__marqo_version__ = "2.8.0"
 __marqo_release_page__ = f"https://github.com/marqo-ai/marqo/releases/tag/{__marqo_version__}"
 
-__minimum_supported_marqo_version__ = "2.0"
+__minimum_supported_marqo_version__ = "2.6.0"
 
 
 def supported_marqo_version() -> str:

diff --git a/tests/cloud_test_logic/cloud_test_index.py b/tests/cloud_test_logic/cloud_test_index.py
@@ -42,14 +42,14 @@ class CloudTestIndex(str, Enum):
 
 index_name_to_settings_mappings = {
     # TODO: Due to the resource limits of the staging cluster, we only use 2 indexes for testing purposes now
-    # CloudTestIndex.unstructured_text: {
-    #     "type": "unstructured",
-    #     "treatUrlsAndPointersAsImages": False,
-    #     "model": "hf/e5-base-v2",
-    #
-    #     "inferenceType": "marqo.CPU.small",
-    #     "storageClass": "marqo.basic",
-    # },
+    CloudTestIndex.unstructured_text: {
+        "type": "unstructured",
+        "treatUrlsAndPointersAsImages": False,
+        "model": "hf/e5-base-v2",
+
+        "inferenceType": "marqo.CPU.small",
+        "storageClass": "marqo.basic",
+    },
     CloudTestIndex.unstructured_image: {
         "type": "unstructured",
         "treatUrlsAndPointersAsImages": True,
@@ -61,7 +61,7 @@ class CloudTestIndex(str, Enum):
     CloudTestIndex.structured_image: {
         "type": "structured",
         "model": "open_clip/ViT-B-32/laion2b_s34b_b79k",
-        "infereceType": "marqo.CPU.small",
+        "inferenceType": "marqo.CPU.small",
         "storageClass": "marqo.basic",
         "allFields": [
             {"name": "text_field_1", "type": "text", "features": ["lexical_search", "filter"]},
@@ -144,19 +144,14 @@ class CloudTestIndex(str, Enum):
     # },
     CloudTestIndex.structured_text: {
         "type": "structured",
-        "treatUrlsAndPointersAsImages": False,
-        "model": "hf/all_datasets_v4_MiniLM-L6",
+        "model": "hf/e5-base-v2",
         "allFields": [
             {"name": "text_field_1", "type": "text", "features": ["lexical_search", "filter"]},
-            {"name": "text_field_2", "type": "text", "features": ["filter"]},
+            {"name": "text_field_2", "type": "text", "features": ["lexical_search", "filter"]},
             {"name": "text_field_3", "type": "text", "features": ["lexical_search"]},
-            {"name": "array_field_1", "type": "array<text>", "features": ["filter"]},
-            {"name": "float_field_1", "type": "float", "features": ["filter", "score_modifier"]},
-            {"name": "int_field_1", "type": "int", "features": ["filter", "score_modifier"]},
-            {"name": "bool_field_1", "type": "bool", "features": ["filter"]},
-        ],
+            {"name": "int_field_1", "type": "int", "features": ["score_modifier"]},
+            {"name": "int_filter_field_1", "type": "int", "features": ["filter", "score_modifier"]}],
         "tensorFields": ["text_field_1", "text_field_2", "text_field_3"],
-
         "storageClass": "marqo.balanced",
         "numberOfShards": 2,
     },

diff --git a/tests/v2_tests/test_embed.py b/tests/v2_tests/test_embed.py
@@ -1,19 +1,16 @@
-import copy
-import marqo
-from marqo import enums
-from unittest import mock
-import requests
-import random
-import math
-import time
-from tests.marqo_test import MarqoTestCase, CloudTestIndex
-from marqo.errors import MarqoWebError
-from pytest import mark
 import numpy as np
+from pytest import mark
+
+from marqo.errors import MarqoWebError
+from tests.marqo_test import MarqoTestCase, CloudTestIndex
 
 
 @mark.fixed
 class TestEmbed(MarqoTestCase):
+
+    def setUp(self):
+        self.test_cases = [(CloudTestIndex.structured_text, self.unstructured_index_name)]
+
     def test_embed_single_string(self):
         """Embeds a string. Use add docs and get docs with tensor facets to ensure the vector is correct.
                 Checks the basic functionality and response structure. Also checks that the request level prefix override works."""
@@ -29,21 +26,22 @@ def test_embed_single_string(self):
                     "_id": "doc1",
                     "text_field_1": "Jimmy Butler is the GOAT."
                 }
-               
+
                 res_1 = self.client.index(test_index_name).add_documents([d1], tensor_fields=tensor_fields)
 
                 # Get doc with tensor facets (for reference vector)
                 retrieved_d1 = self.client.index(test_index_name).get_document(
                     document_id="doc1", expose_facets=True)
 
                 # Call embed
-                embed_res_1 = self.client.index(test_index_name).embed("Jimmy Butler is the GOAT.", content_type="document")
+                embed_res_1 = self.client.index(test_index_name).embed("Jimmy Butler is the GOAT.",
+                                                                       content_type="document")
 
                 # Assert that the embed response has the expected structure and matches the stored embedding
                 self.assertIn("processingTimeMs", embed_res_1)
                 self.assertEqual(embed_res_1["content"], "Jimmy Butler is the GOAT.")
-                self.assertTrue(np.allclose(embed_res_1["embeddings"][0], retrieved_d1["_tensor_facets"][0]["_embedding"]))
-
+                self.assertTrue(np.allclose(embed_res_1["embeddings"][0], retrieved_d1["_tensor_facets"][0]["_embedding"],
+                                            atol=1e-4))
 
     def test_request_level_prefix_override_embed_add_docs(self):
         """Checks that the request level prefix override works."""
@@ -59,18 +57,21 @@ def test_request_level_prefix_override_embed_add_docs(self):
                     "_id": "doc1",
                     "text_field_1": "Jimmy Butler is the GOAT."
                 }
-                res = self.client.index(test_index_name).add_documents([d1], tensor_fields=tensor_fields, text_chunk_prefix="test query: ")
+                res = self.client.index(test_index_name).add_documents([d1], tensor_fields=tensor_fields,
+                                                                       text_chunk_prefix="test query: ")
 
                 # Get doc with tensor facets (for reference vector)
                 retrieved_d1 = self.client.index(test_index_name).get_document(
                     document_id="doc1", expose_facets=True)
 
-                embed_res = self.client.index(test_index_name).embed("test query: Jimmy Butler is the GOAT.", content_type=None)
+                embed_res = self.client.index(test_index_name).embed("test query: Jimmy Butler is the GOAT.",
+                                                                     content_type=None)
 
                 # Assert request level prefix override
                 self.assertIn("processingTimeMs", embed_res)
                 self.assertEqual(embed_res["content"], "test query: Jimmy Butler is the GOAT.")
-                self.assertTrue(np.allclose(embed_res["embeddings"][0], retrieved_d1["_tensor_facets"][0]["_embedding"]))
+                self.assertTrue(np.allclose(embed_res["embeddings"][0], retrieved_d1["_tensor_facets"][0]["_embedding"],
+                                            atol=1e-4))
 
 
     def test_embed_with_device(self):
@@ -95,10 +96,12 @@ def test_embed_with_device(self):
                     document_id="doc1", expose_facets=True)
 
                 # Call embed
-                embed_res = self.client.index(test_index_name).embed(content="Jimmy Butler is the GOAT.", device="cpu", content_type="document")
+                embed_res = self.client.index(test_index_name).embed(content="Jimmy Butler is the GOAT.", device="cpu",
+                                                                     content_type="document")
                 self.assertIn("processingTimeMs", embed_res)
                 self.assertEqual(embed_res["content"], "Jimmy Butler is the GOAT.")
-                self.assertTrue(np.allclose(embed_res["embeddings"][0], retrieved_d1["_tensor_facets"][0] ["_embedding"]))
+                self.assertTrue(np.allclose(embed_res["embeddings"][0], retrieved_d1["_tensor_facets"][0] ["_embedding"],
+                                            atol=1e-4))
 
     def test_embed_single_dict(self):
         """Embeds a dict. Use add docs and get docs with tensor facets to ensure the vector is correct.
@@ -122,11 +125,14 @@ def test_embed_single_dict(self):
                     document_id="doc1", expose_facets=True)
 
                 # Call embed
-                embed_res = self.client.index(test_index_name).embed(content={"Jimmy Butler is the GOAT.": 1}, content_type="document")
+                embed_res = self.client.index(test_index_name).embed(content={"Jimmy Butler is the GOAT.": 1},
+                                                                     content_type="document")
 
                 self.assertIn("processingTimeMs", embed_res)
                 self.assertEqual(embed_res["content"], {"Jimmy Butler is the GOAT.": 1})
-                self.assertTrue(np.allclose(embed_res["embeddings"][0], retrieved_d1["_tensor_facets"][0] ["_embedding"]))
+                self.assertTrue(np.allclose(embed_res["embeddings"][0], retrieved_d1["_tensor_facets"][0]["_embedding"],
+                                            atol=1e-4))
+
 
     def test_embed_list_content(self):
         """Embeds a list with string and dict. Use add docs and get docs with tensor facets to ensure the vector is correct.
@@ -161,9 +167,9 @@ def test_embed_list_content(self):
                 self.assertIn("processingTimeMs", embed_res)
                 self.assertEqual(embed_res["content"], [{"Jimmy Butler is the GOAT.": 1}, "Alex Caruso is the GOAT."])
                 self.assertTrue(
-                    np.allclose(embed_res["embeddings"][0], retrieved_docs["results"][0]["_tensor_facets"][0]["_embedding"], atol=1e-6))
+                    np.allclose(embed_res["embeddings"][0], retrieved_docs["results"][0]["_tensor_facets"][0]["_embedding"], atol=1e-4))
                 self.assertTrue(
-                    np.allclose(embed_res["embeddings"][1], retrieved_docs["results"][1]["_tensor_facets"][0]["_embedding"], atol=1e-6))
+                    np.allclose(embed_res["embeddings"][1], retrieved_docs["results"][1]["_tensor_facets"][0]["_embedding"], atol=1e-4))
 
 
     def test_embed_non_numeric_weight_fails(self):
@@ -176,4 +182,4 @@ def test_embed_non_numeric_weight_fails(self):
                 with self.assertRaises(MarqoWebError) as e:
                     self.client.index(test_index_name).embed(content={"text to embed": "not a number"})
 
-            self.assertIn("not a valid float", str(e.exception))
+            self.assertIn("not a valid float", str(e.exception))
diff --git a/tests/v2_tests/test_recommend.py b/tests/v2_tests/test_recommend.py
@@ -2,7 +2,7 @@
 
 from marqo.enums import InterpolationMethod
 from tests.marqo_test import MarqoTestCase
-
+from tests.cloud_test_logic.cloud_test_index import CloudTestIndex
 
 @mark.fixed
 class TestRecommend(MarqoTestCase):
@@ -52,8 +52,9 @@ def test_recommend_allFields(self):
         """
         Test recommend with all fields provided
         """
+
+        self.test_cases = [(CloudTestIndex.structured_text, self.structured_index_name), ]
         for cloud_test_index_to_use, open_source_test_index_name in self.test_cases:
-            open_source_test_index_name = self.structured_index_name
             test_index_name = self.get_test_index_name(
                 cloud_test_index_to_use=cloud_test_index_to_use,
                 open_source_test_index_name=open_source_test_index_name