From 39e2ce2ce4b1ccacf2262c131b4d1840739ca384 Mon Sep 17 00:00:00 2001
From: Joshua <joshua@marqo.ai>
Date: Tue, 21 May 2024 03:01:34 +0800
Subject: [PATCH 1/6] add hybrid_parameters to search

---
 src/marqo/index.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/marqo/index.py b/src/marqo/index.py
index a08aba2a..c2d16d34 100644
--- a/src/marqo/index.py
+++ b/src/marqo/index.py
@@ -209,7 +209,7 @@ def search(self, q: Optional[Union[str, dict]] = None, searchable_attributes: Op
                context: Optional[dict] = None, score_modifiers: Optional[dict] = None,
                model_auth: Optional[dict] = None,
                ef_search: Optional[int] = None, approximate: Optional[bool] = None,
-               text_query_prefix: Optional[str] = None,
+               text_query_prefix: Optional[str] = None, hybrid_parameters: Optional[dict] = None
                ) -> Dict[str, Any]:
         """Search the index.
 
@@ -273,6 +273,7 @@ def search(self, q: Optional[Union[str, dict]] = None, searchable_attributes: Op
             "reRanker": reranker,
             "boost": boost,
             "textQueryPrefix": text_query_prefix,
+            "hybridParameters": hybrid_parameters
         }
 
         body = {k: v for k, v in body.items() if v is not None}

From 723e201a126af088afe0f40308234d01d3207622 Mon Sep 17 00:00:00 2001
From: Joshua <joshua@marqo.ai>
Date: Tue, 9 Jul 2024 01:46:37 +0800
Subject: [PATCH 2/6] add HybridParameters model and hybrid search tests

---
 src/marqo/index.py                   |   3 +-
 src/marqo/models/search_models.py    |  29 ++++
 tests/v2_tests/test_hybrid_search.py | 199 +++++++++++++++++++++++++++
 3 files changed, 230 insertions(+), 1 deletion(-)
 create mode 100644 tests/v2_tests/test_hybrid_search.py

diff --git a/src/marqo/index.py b/src/marqo/index.py
index c2d16d34..fbd8f218 100644
--- a/src/marqo/index.py
+++ b/src/marqo/index.py
@@ -15,6 +15,7 @@
 from marqo.errors import MarqoWebError, UnsupportedOperationError, MarqoCloudIndexNotFoundError
 from marqo.marqo_logging import mq_logger
 from marqo.models import marqo_index
+from marqo.models.search_models import HybridParameters
 from marqo.models.create_index_settings import IndexSettings
 from marqo.models.marqo_cloud import CloudIndexSettings
 from marqo.version import minimum_supported_marqo_version
@@ -209,7 +210,7 @@ def search(self, q: Optional[Union[str, dict]] = None, searchable_attributes: Op
                context: Optional[dict] = None, score_modifiers: Optional[dict] = None,
                model_auth: Optional[dict] = None,
                ef_search: Optional[int] = None, approximate: Optional[bool] = None,
-               text_query_prefix: Optional[str] = None, hybrid_parameters: Optional[dict] = None
+               text_query_prefix: Optional[str] = None, hybrid_parameters: Optional[HybridParameters] = None
                ) -> Dict[str, Any]:
         """Search the index.
 
diff --git a/src/marqo/models/search_models.py b/src/marqo/models/search_models.py
index 8c390530..6ad09bf8 100644
--- a/src/marqo/models/search_models.py
+++ b/src/marqo/models/search_models.py
@@ -1,5 +1,9 @@
 from typing import Dict, List, Optional, Union
 from marqo.models.marqo_models import StrictBaseModel
+from abc import ABC
+from enum import Enum
+
+from pydantic import validator, BaseModel, root_validator
 
 
 class SearchBody(StrictBaseModel):
@@ -26,3 +30,28 @@ class BulkSearchBody(SearchBody):
 class BulkSearchQuery(StrictBaseModel):
     queries: List[BulkSearchBody]
 
+
+class RetrievalMethod(str, Enum):
+    Disjunction = 'disjunction'
+    Tensor = 'tensor'
+    Lexical = 'lexical'
+
+class RankingMethod(str, Enum):
+    RRF = 'rrf'
+    NormalizeLinear = 'normalize_linear'
+    Tensor = 'tensor'
+    Lexical = 'lexical'
+
+class HybridParameters:
+    retrieval_method: Optional[RetrievalMethod] = RetrievalMethod.Disjunction
+    ranking_method: Optional[RankingMethod] = RankingMethod.RRF
+    alpha: Optional[float] = None
+    rrf_k: Optional[int] = None
+    searchable_attributes_lexical: Optional[List[str]] = None
+    searchable_attributes_tensor: Optional[List[str]] = None
+    verbose: bool = False
+
+    # Input for API, but form will change before being passed to core Hybrid Query.
+    score_modifiers_lexical: Optional[dict] = None
+    score_modifiers_tensor: Optional[dict] = None
+
diff --git a/tests/v2_tests/test_hybrid_search.py b/tests/v2_tests/test_hybrid_search.py
new file mode 100644
index 00000000..13333153
--- /dev/null
+++ b/tests/v2_tests/test_hybrid_search.py
@@ -0,0 +1,199 @@
+import copy
+import marqo
+from marqo import enums
+from unittest import mock
+import requests
+import random
+import math
+import time
+from tests.marqo_test import MarqoTestCase, CloudTestIndex
+from marqo.errors import MarqoWebError
+from pytest import mark
+
+
+@mark.fixed
+class TestHybridSearch(MarqoTestCase):
+    @staticmethod
+    def strip_marqo_fields(doc, strip_id=True):
+        """Strips Marqo fields from a returned doc to get the original doc"""
+        copied = copy.deepcopy(doc)
+
+        strip_fields = ["_highlights", "_score"]
+        if strip_id:
+            strip_fields += ["_id"]
+
+        for to_strip in strip_fields:
+            del copied[to_strip]
+
+        return copied
+
+    def setUp(self):
+        super().setUp()
+        self.docs_list = [
+            # TODO: add score modifiers
+            # similar semantics to dogs
+            {"_id": "doc1", "text_field_1": "dogs"},
+            {"_id": "doc2", "text_field_1": "puppies"},
+            {"_id": "doc3", "text_field_1": "canines"},
+            {"_id": "doc4", "text_field_1": "huskies"},
+            {"_id": "doc5", "text_field_1": "four-legged animals"},
+
+            # shares lexical token with dogs
+            {"_id": "doc6", "text_field_1": "hot dogs"},
+            {"_id": "doc7", "text_field_1": "dogs is a word"},
+            {"_id": "doc8", "text_field_1": "something something dogs"},
+            {"_id": "doc9", "text_field_1": "dogs random words"},
+            {"_id": "doc10", "text_field_1": "dogs dogs dogs"},
+
+            {"_id": "doc11", "text_field_2": "dogs but wrong field"},
+            {"_id": "doc12", "text_field_2": "puppies puppies"},
+            {"_id": "doc13", "text_field_2": "canines canines"},
+        ]
+
+    def test_hybrid_search_searchable_attributes(self):
+        """
+        Tests that searchable attributes work as expected for all methods
+        """
+
+        index_test_cases = [
+            (CloudTestIndex.structured_text, self.structured_index_name)    # TODO: unstructured
+        ]
+        for cloud_test_index_to_use, open_source_test_index_name in index_test_cases:
+            test_index_name = self.get_test_index_name(
+                cloud_test_index_to_use=cloud_test_index_to_use,
+                open_source_test_index_name=open_source_test_index_name
+            )
+            self.client.index(test_index_name).add_documents(self.docs_list)
+
+            with self.subTest("retrieval: disjunction, ranking: rrf"):
+                hybrid_res = self.client.index(test_index_name).search(
+                    "puppies",
+                    search_method="HYBRID",
+                    hybrid_parameters={
+                        "retrieval_method": "disjunction",
+                        "ranking_method": "rrf",
+                        "alpha": 0.5,
+                        "searchable_attributes_lexical": ["text_field_2"],
+                        "searchable_attributes_tensor": ["text_field_2"]
+                    },
+                    limit=10
+                )
+                self.assertEqual(len(hybrid_res["hits"]), 3)  # Only 3 documents have text_field_2 at all
+                self.assertEqual(hybrid_res["hits"][0]["_id"], "doc12")  # puppies puppies in text field 2
+                self.assertEqual(hybrid_res["hits"][1]["_id"], "doc13")
+                self.assertEqual(hybrid_res["hits"][2]["_id"], "doc11")
+
+            with self.subTest("retrieval: lexical, ranking: tensor"):
+                hybrid_res = self.client.index(test_index_name).search(
+                    "puppies",
+                    search_method="HYBRID",
+                    hybrid_parameters={
+                        "retrieval_method": "lexical",
+                        "ranking_method": "tensor",
+                        "searchable_attributes_lexical": ["text_field_2"]
+                    },
+                    limit=10
+                )
+                self.assertEqual(len(hybrid_res["hits"]),
+                                    1)  # Only 1 document has puppies in text_field_2. Lexical retrieval will only get this one.
+                self.assertEqual(hybrid_res["hits"][0]["_id"], "doc12")
+
+            with self.subTest("retrieval: tensor, ranking: lexical"):
+                hybrid_res = self.client.index(test_index_name).search(
+                    "puppies",
+                    search_method="HYBRID",
+                    hybrid_parameters={
+                        "retrieval_method": "tensor",
+                        "ranking_method": "lexical",
+                        "searchable_attributes_tensor": ["text_field_2"]
+                    },
+                    limit=10
+                )
+                self.assertEqual(len(hybrid_res["hits"]),
+                                    3)  # Only 3 documents have text field 2. Tensor retrieval will get them all.
+                self.assertEqual(hybrid_res["hits"][0]["_id"], "doc12")
+                self.assertEqual(hybrid_res["hits"][1]["_id"], "doc11")
+                self.assertEqual(hybrid_res["hits"][2]["_id"], "doc13")
+
+    def test_hybrid_search_same_retrieval_and_ranking_matches_original_method(self):
+        """
+        Tests that hybrid search with:
+        retrieval_method = "lexical", ranking_method = "lexical" and
+        retrieval_method = "tensor", ranking_method = "tensor"
+
+        Results must be the same as lexical search and tensor search respectively.
+        """
+
+        index_test_cases = [
+            (CloudTestIndex.structured_text, self.structured_index_name)  # TODO: unstructured
+        ]
+        for cloud_test_index_to_use, open_source_test_index_name in index_test_cases:
+            test_index_name = self.get_test_index_name(
+                cloud_test_index_to_use=cloud_test_index_to_use,
+                open_source_test_index_name=open_source_test_index_name
+            )
+            self.client.index(test_index_name).add_documents(self.docs_list)
+
+            test_cases = [
+                ("lexical", "lexical"),
+                ("tensor", "tensor")
+            ]
+
+            for retrieval_method, ranking_method in test_cases:
+                with self.subTest(retrieval=retrieval_method, ranking=ranking_method):
+                    hybrid_res = self.client.index(test_index_name).search(
+                        "dogs",
+                        search_method="HYBRID",
+                        hybrid_parameters={
+                            "retrieval_method": retrieval_method,
+                            "ranking_method": ranking_method
+                        },
+                        limit=10
+                    )
+
+                    base_res = self.client.index(test_index_name).search(
+                        "dogs",
+                        search_method=retrieval_method,     # will be either lexical or tensor
+                        limit=10
+                    )
+
+                    self.assertEqual(len(hybrid_res["hits"]), len(base_res["hits"]))
+                    for i in range(len(hybrid_res["hits"])):
+                        self.assertEqual(hybrid_res["hits"][i]["_id"], base_res["hits"][i]["_id"])
+
+    def test_hybrid_search_with_filter(self):
+        """
+        Tests that filter is applied correctly in hybrid search.
+        """
+
+        index_test_cases = [
+            (CloudTestIndex.structured_text, self.structured_index_name)  # TODO: unstructured
+        ]
+        for cloud_test_index_to_use, open_source_test_index_name in index_test_cases:
+            test_index_name = self.get_test_index_name(
+                cloud_test_index_to_use=cloud_test_index_to_use,
+                open_source_test_index_name=open_source_test_index_name
+            )
+            self.client.index(test_index_name).add_documents(self.docs_list)
+
+            test_cases = [
+                ("disjunction", "rrf"),
+                ("lexical", "lexical"),
+                ("tensor", "tensor")
+            ]
+
+            for retrieval_method, ranking_method in test_cases:
+                with self.subTest(retrieval=retrieval_method, ranking=ranking_method):
+                    hybrid_res = self.client.index(test_index_name).search(
+                        "dogs",
+                        search_method="HYBRID",
+                        filter_string="text_field_1:(something something dogs)",
+                        hybrid_parameters={
+                            "retrieval_method": retrieval_method,
+                            "ranking_method": ranking_method
+                        },
+                        limit=10
+                    )
+
+                    self.assertEqual(len(hybrid_res["hits"]), 1)
+                    self.assertEqual(hybrid_res["hits"][0]["_id"], "doc8")
\ No newline at end of file

From a1d1c0f9db769e6d3d66f2d3683f1644939de872 Mon Sep 17 00:00:00 2001
From: Joshua <joshua@marqo.ai>
Date: Wed, 10 Jul 2024 17:42:22 +1000
Subject: [PATCH 3/6] Clarify unstructured-index TODO comments in test_hybrid_search.py

---
 tests/v2_tests/test_hybrid_search.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/v2_tests/test_hybrid_search.py b/tests/v2_tests/test_hybrid_search.py
index 13333153..c34ba0bf 100644
--- a/tests/v2_tests/test_hybrid_search.py
+++ b/tests/v2_tests/test_hybrid_search.py
@@ -56,7 +56,7 @@ def test_hybrid_search_searchable_attributes(self):
         """
 
         index_test_cases = [
-            (CloudTestIndex.structured_text, self.structured_index_name)    # TODO: unstructured
+            (CloudTestIndex.structured_text, self.structured_index_name)    # TODO: add unstructured when supported
         ]
         for cloud_test_index_to_use, open_source_test_index_name in index_test_cases:
             test_index_name = self.get_test_index_name(
@@ -125,8 +125,7 @@ def test_hybrid_search_same_retrieval_and_ranking_matches_original_method(self):
         """
 
         index_test_cases = [
-            (CloudTestIndex.structured_text, self.structured_index_name)  # TODO: unstructured
-        ]
+            (CloudTestIndex.structured_text, self.structured_index_name)  # TODO: add unstructured when supported
         for cloud_test_index_to_use, open_source_test_index_name in index_test_cases:
             test_index_name = self.get_test_index_name(
                 cloud_test_index_to_use=cloud_test_index_to_use,
@@ -167,7 +166,7 @@ def test_hybrid_search_with_filter(self):
         """
 
         index_test_cases = [
-            (CloudTestIndex.structured_text, self.structured_index_name)  # TODO: unstructured
+            (CloudTestIndex.structured_text, self.structured_index_name)  # TODO: add unstructured when supported
         ]
         for cloud_test_index_to_use, open_source_test_index_name in index_test_cases:
             test_index_name = self.get_test_index_name(
@@ -196,4 +195,4 @@ def test_hybrid_search_with_filter(self):
                     )
 
                     self.assertEqual(len(hybrid_res["hits"]), 1)
-                    self.assertEqual(hybrid_res["hits"][0]["_id"], "doc8")
\ No newline at end of file
+                    self.assertEqual(hybrid_res["hits"][0]["_id"], "doc8")

From 2d1128b250000395f424fad7f54722d4ca5d9cd8 Mon Sep 17 00:00:00 2001
From: Joshua <joshua@marqo.ai>
Date: Wed, 10 Jul 2024 17:42:50 +1000
Subject: [PATCH 4/6] Add PEP 8 blank lines between classes in search_models.py

---
 src/marqo/models/search_models.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/marqo/models/search_models.py b/src/marqo/models/search_models.py
index 6ad09bf8..f23638cb 100644
--- a/src/marqo/models/search_models.py
+++ b/src/marqo/models/search_models.py
@@ -36,12 +36,14 @@ class RetrievalMethod(str, Enum):
     Tensor = 'tensor'
     Lexical = 'lexical'
 
+
 class RankingMethod(str, Enum):
     RRF = 'rrf'
     NormalizeLinear = 'normalize_linear'
     Tensor = 'tensor'
     Lexical = 'lexical'
 
+
 class HybridParameters:
     retrieval_method: Optional[RetrievalMethod] = RetrievalMethod.Disjunction
     ranking_method: Optional[RankingMethod] = RankingMethod.RRF

From aaa78e90cbbc88e2a42d900abfb3c4636e4be263 Mon Sep 17 00:00:00 2001
From: Joshua <joshua@marqo.ai>
Date: Wed, 10 Jul 2024 16:48:22 +0800
Subject: [PATCH 5/6] remove HybridParameters object

---
 src/marqo/index.py                |  2 +-
 src/marqo/models/search_models.py | 16 +---------------
 2 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/src/marqo/index.py b/src/marqo/index.py
index fbd8f218..81d03745 100644
--- a/src/marqo/index.py
+++ b/src/marqo/index.py
@@ -210,7 +210,7 @@ def search(self, q: Optional[Union[str, dict]] = None, searchable_attributes: Op
                context: Optional[dict] = None, score_modifiers: Optional[dict] = None,
                model_auth: Optional[dict] = None,
                ef_search: Optional[int] = None, approximate: Optional[bool] = None,
-               text_query_prefix: Optional[str] = None, hybrid_parameters: Optional[HybridParameters] = None
+               text_query_prefix: Optional[str] = None, hybrid_parameters: Optional[dict] = None
                ) -> Dict[str, Any]:
         """Search the index.
 
diff --git a/src/marqo/models/search_models.py b/src/marqo/models/search_models.py
index 6ad09bf8..c40c69c8 100644
--- a/src/marqo/models/search_models.py
+++ b/src/marqo/models/search_models.py
@@ -40,18 +40,4 @@ class RankingMethod(str, Enum):
     RRF = 'rrf'
     NormalizeLinear = 'normalize_linear'
     Tensor = 'tensor'
-    Lexical = 'lexical'
-
-class HybridParameters:
-    retrieval_method: Optional[RetrievalMethod] = RetrievalMethod.Disjunction
-    ranking_method: Optional[RankingMethod] = RankingMethod.RRF
-    alpha: Optional[float] = None
-    rrf_k: Optional[int] = None
-    searchable_attributes_lexical: Optional[List[str]] = None
-    searchable_attributes_tensor: Optional[List[str]] = None
-    verbose: bool = False
-
-    # Input for API, but form will change before being passed to core Hybrid Query.
-    score_modifiers_lexical: Optional[dict] = None
-    score_modifiers_tensor: Optional[dict] = None
-
+    Lexical = 'lexical'
\ No newline at end of file

From a658b3d4c2784259a2aa047887aef2321fd22f8f Mon Sep 17 00:00:00 2001
From: Joshua <joshua@marqo.ai>
Date: Wed, 10 Jul 2024 17:19:50 +0800
Subject: [PATCH 6/6] fix hybrid search tests: use camelCase hybrid_parameters keys

---
 src/marqo/index.py                   |  1 -
 tests/v2_tests/test_hybrid_search.py | 43 ++++++++++++++--------------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/marqo/index.py b/src/marqo/index.py
index 81d03745..c2d16d34 100644
--- a/src/marqo/index.py
+++ b/src/marqo/index.py
@@ -15,7 +15,6 @@
 from marqo.errors import MarqoWebError, UnsupportedOperationError, MarqoCloudIndexNotFoundError
 from marqo.marqo_logging import mq_logger
 from marqo.models import marqo_index
-from marqo.models.search_models import HybridParameters
 from marqo.models.create_index_settings import IndexSettings
 from marqo.models.marqo_cloud import CloudIndexSettings
 from marqo.version import minimum_supported_marqo_version
diff --git a/tests/v2_tests/test_hybrid_search.py b/tests/v2_tests/test_hybrid_search.py
index c34ba0bf..6bba7587 100644
--- a/tests/v2_tests/test_hybrid_search.py
+++ b/tests/v2_tests/test_hybrid_search.py
@@ -70,11 +70,11 @@ def test_hybrid_search_searchable_attributes(self):
                     "puppies",
                     search_method="HYBRID",
                     hybrid_parameters={
-                        "retrieval_method": "disjunction",
-                        "ranking_method": "rrf",
+                        "retrievalMethod": "disjunction",
+                        "rankingMethod": "rrf",
                         "alpha": 0.5,
-                        "searchable_attributes_lexical": ["text_field_2"],
-                        "searchable_attributes_tensor": ["text_field_2"]
+                        "searchableAttributesLexical": ["text_field_2"],
+                        "searchableAttributesTensor": ["text_field_2"]
                     },
                     limit=10
                 )
@@ -88,9 +88,9 @@ def test_hybrid_search_searchable_attributes(self):
                     "puppies",
                     search_method="HYBRID",
                     hybrid_parameters={
-                        "retrieval_method": "lexical",
-                        "ranking_method": "tensor",
-                        "searchable_attributes_lexical": ["text_field_2"]
+                        "retrievalMethod": "lexical",
+                        "rankingMethod": "tensor",
+                        "searchableAttributesLexical": ["text_field_2"]
                     },
                     limit=10
                 )
@@ -103,9 +103,9 @@ def test_hybrid_search_searchable_attributes(self):
                     "puppies",
                     search_method="HYBRID",
                     hybrid_parameters={
-                        "retrieval_method": "tensor",
-                        "ranking_method": "lexical",
-                        "searchable_attributes_tensor": ["text_field_2"]
+                        "retrievalMethod": "tensor",
+                        "rankingMethod": "lexical",
+                        "searchableAttributesTensor": ["text_field_2"]
                     },
                     limit=10
                 )
@@ -118,14 +118,15 @@ def test_hybrid_search_searchable_attributes(self):
     def test_hybrid_search_same_retrieval_and_ranking_matches_original_method(self):
         """
         Tests that hybrid search with:
-        retrieval_method = "lexical", ranking_method = "lexical" and
-        retrieval_method = "tensor", ranking_method = "tensor"
+        retrievalMethod = "lexical", rankingMethod = "lexical" and
+        retrievalMethod = "tensor", rankingMethod = "tensor"
 
         Results must be the same as lexical search and tensor search respectively.
         """
 
         index_test_cases = [
             (CloudTestIndex.structured_text, self.structured_index_name)  # TODO: add unstructured when supported
+        ]
         for cloud_test_index_to_use, open_source_test_index_name in index_test_cases:
             test_index_name = self.get_test_index_name(
                 cloud_test_index_to_use=cloud_test_index_to_use,
@@ -138,21 +139,21 @@ def test_hybrid_search_same_retrieval_and_ranking_matches_original_method(self):
                 ("tensor", "tensor")
             ]
 
-            for retrieval_method, ranking_method in test_cases:
-                with self.subTest(retrieval=retrieval_method, ranking=ranking_method):
+            for retrievalMethod, rankingMethod in test_cases:
+                with self.subTest(retrieval=retrievalMethod, ranking=rankingMethod):
                     hybrid_res = self.client.index(test_index_name).search(
                         "dogs",
                         search_method="HYBRID",
                         hybrid_parameters={
-                            "retrieval_method": retrieval_method,
-                            "ranking_method": ranking_method
+                            "retrievalMethod": retrievalMethod,
+                            "rankingMethod": rankingMethod
                         },
                         limit=10
                     )
 
                     base_res = self.client.index(test_index_name).search(
                         "dogs",
-                        search_method=retrieval_method,     # will be either lexical or tensor
+                        search_method=retrievalMethod,     # will be either lexical or tensor
                         limit=10
                     )
 
@@ -181,15 +182,15 @@ def test_hybrid_search_with_filter(self):
                 ("tensor", "tensor")
             ]
 
-            for retrieval_method, ranking_method in test_cases:
-                with self.subTest(retrieval=retrieval_method, ranking=ranking_method):
+            for retrievalMethod, rankingMethod in test_cases:
+                with self.subTest(retrieval=retrievalMethod, ranking=rankingMethod):
                     hybrid_res = self.client.index(test_index_name).search(
                         "dogs",
                         search_method="HYBRID",
                         filter_string="text_field_1:(something something dogs)",
                         hybrid_parameters={
-                            "retrieval_method": retrieval_method,
-                            "ranking_method": ranking_method
+                            "retrievalMethod": retrievalMethod,
+                            "rankingMethod": rankingMethod
                         },
                         limit=10
                     )