From 39e2ce2ce4b1ccacf2262c131b4d1840739ca384 Mon Sep 17 00:00:00 2001 From: Joshua Date: Tue, 21 May 2024 03:01:34 +0800 Subject: [PATCH 1/6] add hybrid_parameters to search --- src/marqo/index.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/marqo/index.py b/src/marqo/index.py index a08aba2a..c2d16d34 100644 --- a/src/marqo/index.py +++ b/src/marqo/index.py @@ -209,7 +209,7 @@ def search(self, q: Optional[Union[str, dict]] = None, searchable_attributes: Op context: Optional[dict] = None, score_modifiers: Optional[dict] = None, model_auth: Optional[dict] = None, ef_search: Optional[int] = None, approximate: Optional[bool] = None, - text_query_prefix: Optional[str] = None, + text_query_prefix: Optional[str] = None, hybrid_parameters: Optional[dict] = None ) -> Dict[str, Any]: """Search the index. @@ -273,6 +273,7 @@ def search(self, q: Optional[Union[str, dict]] = None, searchable_attributes: Op "reRanker": reranker, "boost": boost, "textQueryPrefix": text_query_prefix, + "hybridParameters": hybrid_parameters } body = {k: v for k, v in body.items() if v is not None} From 723e201a126af088afe0f40308234d01d3207622 Mon Sep 17 00:00:00 2001 From: Joshua Date: Tue, 9 Jul 2024 01:46:37 +0800 Subject: [PATCH 2/6] add hybrid search tests --- src/marqo/index.py | 3 +- src/marqo/models/search_models.py | 29 ++++ tests/v2_tests/test_hybrid_search.py | 199 +++++++++++++++++++++++++++ 3 files changed, 230 insertions(+), 1 deletion(-) create mode 100644 tests/v2_tests/test_hybrid_search.py diff --git a/src/marqo/index.py b/src/marqo/index.py index c2d16d34..fbd8f218 100644 --- a/src/marqo/index.py +++ b/src/marqo/index.py @@ -15,6 +15,7 @@ from marqo.errors import MarqoWebError, UnsupportedOperationError, MarqoCloudIndexNotFoundError from marqo.marqo_logging import mq_logger from marqo.models import marqo_index +from marqo.models.search_models import HybridParameters from marqo.models.create_index_settings import IndexSettings from marqo.models.marqo_cloud import CloudIndexSettings from marqo.version import minimum_supported_marqo_version @@ -209,7 +210,7 @@ def search(self, q: Optional[Union[str, dict]] = None, searchable_attributes: Op context: Optional[dict] = None, score_modifiers: Optional[dict] = None, model_auth: Optional[dict] = None, ef_search: Optional[int] = None, approximate: Optional[bool] = None, - text_query_prefix: Optional[str] = None, hybrid_parameters: Optional[dict] = None + text_query_prefix: Optional[str] = None, hybrid_parameters: Optional[HybridParameters] = None ) -> Dict[str, Any]: """Search the index. diff --git a/src/marqo/models/search_models.py b/src/marqo/models/search_models.py index 8c390530..6ad09bf8 100644 --- a/src/marqo/models/search_models.py +++ b/src/marqo/models/search_models.py @@ -1,5 +1,9 @@ from typing import Dict, List, Optional, Union from marqo.models.marqo_models import StrictBaseModel +from abc import ABC +from enum import Enum + +from pydantic import validator, BaseModel, root_validator class SearchBody(StrictBaseModel): @@ -26,3 +30,28 @@ class BulkSearchBody(SearchBody): class BulkSearchQuery(StrictBaseModel): queries: List[BulkSearchBody] + +class RetrievalMethod(str, Enum): + Disjunction = 'disjunction' + Tensor = 'tensor' + Lexical = 'lexical' + +class RankingMethod(str, Enum): + RRF = 'rrf' + NormalizeLinear = 'normalize_linear' + Tensor = 'tensor' + Lexical = 'lexical' + +class HybridParameters: + retrieval_method: Optional[RetrievalMethod] = RetrievalMethod.Disjunction + ranking_method: Optional[RankingMethod] = RankingMethod.RRF + alpha: Optional[float] = None + rrf_k: Optional[int] = None + searchable_attributes_lexical: Optional[List[str]] = None + searchable_attributes_tensor: Optional[List[str]] = None + verbose: bool = False + + # Input for API, but form will change before being passed to core Hybrid Query. + score_modifiers_lexical: Optional[dict] = None + score_modifiers_tensor: Optional[dict] = None + diff --git a/tests/v2_tests/test_hybrid_search.py b/tests/v2_tests/test_hybrid_search.py new file mode 100644 index 00000000..13333153 --- /dev/null +++ b/tests/v2_tests/test_hybrid_search.py @@ -0,0 +1,199 @@ +import copy +import marqo +from marqo import enums +from unittest import mock +import requests +import random +import math +import time +from tests.marqo_test import MarqoTestCase, CloudTestIndex +from marqo.errors import MarqoWebError +from pytest import mark + + +@mark.fixed +class TestHybridSearch(MarqoTestCase): + @staticmethod + def strip_marqo_fields(doc, strip_id=True): + """Strips Marqo fields from a returned doc to get the original doc""" + copied = copy.deepcopy(doc) + + strip_fields = ["_highlights", "_score"] + if strip_id: + strip_fields += ["_id"] + + for to_strip in strip_fields: + del copied[to_strip] + + return copied + + def setUp(self): + super().setUp() + self.docs_list = [ + # TODO: add score modifiers + # similar semantics to dogs + {"_id": "doc1", "text_field_1": "dogs"}, + {"_id": "doc2", "text_field_1": "puppies"}, + {"_id": "doc3", "text_field_1": "canines"}, + {"_id": "doc4", "text_field_1": "huskies"}, + {"_id": "doc5", "text_field_1": "four-legged animals"}, + + # shares lexical token with dogs + {"_id": "doc6", "text_field_1": "hot dogs"}, + {"_id": "doc7", "text_field_1": "dogs is a word"}, + {"_id": "doc8", "text_field_1": "something something dogs"}, + {"_id": "doc9", "text_field_1": "dogs random words"}, + {"_id": "doc10", "text_field_1": "dogs dogs dogs"}, + + {"_id": "doc11", "text_field_2": "dogs but wrong field"}, + {"_id": "doc12", "text_field_2": "puppies puppies"}, + {"_id": "doc13", "text_field_2": "canines canines"}, + ] + + def test_hybrid_search_searchable_attributes(self): + """ + Tests that searchable attributes work as expected for all methods + """ + + index_test_cases = [ + (CloudTestIndex.structured_text, self.structured_index_name) # TODO: unstructured + ] + for cloud_test_index_to_use, open_source_test_index_name in index_test_cases: + test_index_name = self.get_test_index_name( + cloud_test_index_to_use=cloud_test_index_to_use, + open_source_test_index_name=open_source_test_index_name + ) + self.client.index(test_index_name).add_documents(self.docs_list) + + with self.subTest("retrieval: disjunction, ranking: rrf"): + hybrid_res = self.client.index(test_index_name).search( + "puppies", + search_method="HYBRID", + hybrid_parameters={ + "retrieval_method": "disjunction", + "ranking_method": "rrf", + "alpha": 0.5, + "searchable_attributes_lexical": ["text_field_2"], + "searchable_attributes_tensor": ["text_field_2"] + }, + limit=10 + ) + self.assertEqual(len(hybrid_res["hits"]), 3) # Only 3 documents have text_field_2 at all + self.assertEqual(hybrid_res["hits"][0]["_id"], "doc12") # puppies puppies in text field 2 + self.assertEqual(hybrid_res["hits"][1]["_id"], "doc13") + self.assertEqual(hybrid_res["hits"][2]["_id"], "doc11") + + with self.subTest("retrieval: lexical, ranking: tensor"): + hybrid_res = self.client.index(test_index_name).search( + "puppies", + search_method="HYBRID", + hybrid_parameters={ + "retrieval_method": "lexical", + "ranking_method": "tensor", + "searchable_attributes_lexical": ["text_field_2"] + }, + limit=10 + ) + self.assertEqual(len(hybrid_res["hits"]), + 1) # Only 1 document has puppies in text_field_2. Lexical retrieval will only get this one. + self.assertEqual(hybrid_res["hits"][0]["_id"], "doc12") + + with self.subTest("retrieval: tensor, ranking: lexical"): + hybrid_res = self.client.index(test_index_name).search( + "puppies", + search_method="HYBRID", + hybrid_parameters={ + "retrieval_method": "tensor", + "ranking_method": "lexical", + "searchable_attributes_tensor": ["text_field_2"] + }, + limit=10 + ) + self.assertEqual(len(hybrid_res["hits"]), + 3) # Only 3 documents have text field 2. Tensor retrieval will get them all. + self.assertEqual(hybrid_res["hits"][0]["_id"], "doc12") + self.assertEqual(hybrid_res["hits"][1]["_id"], "doc11") + self.assertEqual(hybrid_res["hits"][2]["_id"], "doc13") + + def test_hybrid_search_same_retrieval_and_ranking_matches_original_method(self): + """ + Tests that hybrid search with: + retrieval_method = "lexical", ranking_method = "lexical" and + retrieval_method = "tensor", ranking_method = "tensor" + + Results must be the same as lexical search and tensor search respectively. + """ + + index_test_cases = [ + (CloudTestIndex.structured_text, self.structured_index_name) # TODO: unstructured + ] + for cloud_test_index_to_use, open_source_test_index_name in index_test_cases: + test_index_name = self.get_test_index_name( + cloud_test_index_to_use=cloud_test_index_to_use, + open_source_test_index_name=open_source_test_index_name + ) + self.client.index(test_index_name).add_documents(self.docs_list) + + test_cases = [ + ("lexical", "lexical"), + ("tensor", "tensor") + ] + + for retrieval_method, ranking_method in test_cases: + with self.subTest(retrieval=retrieval_method, ranking=ranking_method): + hybrid_res = self.client.index(test_index_name).search( + "dogs", + search_method="HYBRID", + hybrid_parameters={ + "retrieval_method": retrieval_method, + "ranking_method": ranking_method + }, + limit=10 + ) + + base_res = self.client.index(test_index_name).search( + "dogs", + search_method=retrieval_method, # will be either lexical or tensor + limit=10 + ) + + self.assertEqual(len(hybrid_res["hits"]), len(base_res["hits"])) + for i in range(len(hybrid_res["hits"])): + self.assertEqual(hybrid_res["hits"][i]["_id"], base_res["hits"][i]["_id"]) + + def test_hybrid_search_with_filter(self): + """ + Tests that filter is applied correctly in hybrid search. + """ + + index_test_cases = [ + (CloudTestIndex.structured_text, self.structured_index_name) # TODO: unstructured + ] + for cloud_test_index_to_use, open_source_test_index_name in index_test_cases: + test_index_name = self.get_test_index_name( + cloud_test_index_to_use=cloud_test_index_to_use, + open_source_test_index_name=open_source_test_index_name + ) + self.client.index(test_index_name).add_documents(self.docs_list) + + test_cases = [ + ("disjunction", "rrf"), + ("lexical", "lexical"), + ("tensor", "tensor") + ] + + for retrieval_method, ranking_method in test_cases: + with self.subTest(retrieval=retrieval_method, ranking=ranking_method): + hybrid_res = self.client.index(test_index_name).search( + "dogs", + search_method="HYBRID", + filter_string="text_field_1:(something something dogs)", + hybrid_parameters={ + "retrieval_method": retrieval_method, + "ranking_method": ranking_method + }, + limit=10 + ) + + self.assertEqual(len(hybrid_res["hits"]), 1) + self.assertEqual(hybrid_res["hits"][0]["_id"], "doc8") \ No newline at end of file From a1d1c0f9db769e6d3d66f2d3683f1644939de872 Mon Sep 17 00:00:00 2001 From: Joshua Date: Wed, 10 Jul 2024 17:42:22 +1000 Subject: [PATCH 3/6] Update test_hybrid_search.py --- tests/v2_tests/test_hybrid_search.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/v2_tests/test_hybrid_search.py b/tests/v2_tests/test_hybrid_search.py index 13333153..c34ba0bf 100644 --- a/tests/v2_tests/test_hybrid_search.py +++ b/tests/v2_tests/test_hybrid_search.py @@ -56,7 +56,7 @@ def test_hybrid_search_searchable_attributes(self): """ index_test_cases = [ - (CloudTestIndex.structured_text, self.structured_index_name) # TODO: unstructured + (CloudTestIndex.structured_text, self.structured_index_name) # TODO: add unstructured when supported ] for cloud_test_index_to_use, open_source_test_index_name in index_test_cases: test_index_name = self.get_test_index_name( @@ -125,8 +125,7 @@ def test_hybrid_search_same_retrieval_and_ranking_matches_original_method(self): """ index_test_cases = [ - (CloudTestIndex.structured_text, self.structured_index_name) # TODO: unstructured - ] + (CloudTestIndex.structured_text, self.structured_index_name) # TODO: add unstructured when supported for cloud_test_index_to_use, open_source_test_index_name in index_test_cases: test_index_name = self.get_test_index_name( cloud_test_index_to_use=cloud_test_index_to_use, @@ -167,7 +166,7 @@ def test_hybrid_search_with_filter(self): """ index_test_cases = [ - (CloudTestIndex.structured_text, self.structured_index_name) # TODO: unstructured + (CloudTestIndex.structured_text, self.structured_index_name) # TODO: add unstructured when supported ] for cloud_test_index_to_use, open_source_test_index_name in index_test_cases: test_index_name = self.get_test_index_name( @@ -196,4 +195,4 @@ def test_hybrid_search_with_filter(self): ) self.assertEqual(len(hybrid_res["hits"]), 1) - self.assertEqual(hybrid_res["hits"][0]["_id"], "doc8") \ No newline at end of file + self.assertEqual(hybrid_res["hits"][0]["_id"], "doc8") From 2d1128b250000395f424fad7f54722d4ca5d9cd8 Mon Sep 17 00:00:00 2001 From: Joshua Date: Wed, 10 Jul 2024 17:42:50 +1000 Subject: [PATCH 4/6] Update search_models.py --- src/marqo/models/search_models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/marqo/models/search_models.py b/src/marqo/models/search_models.py index 6ad09bf8..f23638cb 100644 --- a/src/marqo/models/search_models.py +++ b/src/marqo/models/search_models.py @@ -36,12 +36,14 @@ class RetrievalMethod(str, Enum): Tensor = 'tensor' Lexical = 'lexical' + class RankingMethod(str, Enum): RRF = 'rrf' NormalizeLinear = 'normalize_linear' Tensor = 'tensor' Lexical = 'lexical' + class HybridParameters: retrieval_method: Optional[RetrievalMethod] = RetrievalMethod.Disjunction ranking_method: Optional[RankingMethod] = RankingMethod.RRF From aaa78e90cbbc88e2a42d900abfb3c4636e4be263 Mon Sep 17 00:00:00 2001 From: Joshua Date: Wed, 10 Jul 2024 16:48:22 +0800 Subject: [PATCH 5/6] remove HybridParameters object --- src/marqo/index.py | 2 +- src/marqo/models/search_models.py | 16 +--------------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/src/marqo/index.py b/src/marqo/index.py index fbd8f218..81d03745 100644 --- a/src/marqo/index.py +++ b/src/marqo/index.py @@ -210,7 +210,7 @@ def search(self, q: Optional[Union[str, dict]] = None, searchable_attributes: Op context: Optional[dict] = None, score_modifiers: Optional[dict] = None, model_auth: Optional[dict] = None, ef_search: Optional[int] = None, approximate: Optional[bool] = None, - text_query_prefix: Optional[str] = None, hybrid_parameters: Optional[HybridParameters] = None + text_query_prefix: Optional[str] = None, hybrid_parameters: Optional[dict] = None ) -> Dict[str, Any]: """Search the index. diff --git a/src/marqo/models/search_models.py b/src/marqo/models/search_models.py index 6ad09bf8..c40c69c8 100644 --- a/src/marqo/models/search_models.py +++ b/src/marqo/models/search_models.py @@ -40,18 +40,4 @@ class RankingMethod(str, Enum): RRF = 'rrf' NormalizeLinear = 'normalize_linear' Tensor = 'tensor' - Lexical = 'lexical' - -class HybridParameters: - retrieval_method: Optional[RetrievalMethod] = RetrievalMethod.Disjunction - ranking_method: Optional[RankingMethod] = RankingMethod.RRF - alpha: Optional[float] = None - rrf_k: Optional[int] = None - searchable_attributes_lexical: Optional[List[str]] = None - searchable_attributes_tensor: Optional[List[str]] = None - verbose: bool = False - - # Input for API, but form will change before being passed to core Hybrid Query. - score_modifiers_lexical: Optional[dict] = None - score_modifiers_tensor: Optional[dict] = None - + Lexical = 'lexical' \ No newline at end of file From a658b3d4c2784259a2aa047887aef2321fd22f8f Mon Sep 17 00:00:00 2001 From: Joshua Date: Wed, 10 Jul 2024 17:19:50 +0800 Subject: [PATCH 6/6] fix hybrid search tests --- src/marqo/index.py | 1 - tests/v2_tests/test_hybrid_search.py | 43 ++++++++++++++-------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/marqo/index.py b/src/marqo/index.py index 81d03745..c2d16d34 100644 --- a/src/marqo/index.py +++ b/src/marqo/index.py @@ -15,7 +15,6 @@ from marqo.errors import MarqoWebError, UnsupportedOperationError, MarqoCloudIndexNotFoundError from marqo.marqo_logging import mq_logger from marqo.models import marqo_index -from marqo.models.search_models import HybridParameters from marqo.models.create_index_settings import IndexSettings from marqo.models.marqo_cloud import CloudIndexSettings from marqo.version import minimum_supported_marqo_version diff --git a/tests/v2_tests/test_hybrid_search.py b/tests/v2_tests/test_hybrid_search.py index c34ba0bf..6bba7587 100644 --- a/tests/v2_tests/test_hybrid_search.py +++ b/tests/v2_tests/test_hybrid_search.py @@ -70,11 +70,11 @@ def test_hybrid_search_searchable_attributes(self): "puppies", search_method="HYBRID", hybrid_parameters={ - "retrieval_method": "disjunction", - "ranking_method": "rrf", + "retrievalMethod": "disjunction", + "rankingMethod": "rrf", "alpha": 0.5, - "searchable_attributes_lexical": ["text_field_2"], - "searchable_attributes_tensor": ["text_field_2"] + "searchableAttributesLexical": ["text_field_2"], + "searchableAttributesTensor": ["text_field_2"] }, limit=10 ) @@ -88,9 +88,9 @@ def test_hybrid_search_searchable_attributes(self): "puppies", search_method="HYBRID", hybrid_parameters={ - "retrieval_method": "lexical", - "ranking_method": "tensor", - "searchable_attributes_lexical": ["text_field_2"] + "retrievalMethod": "lexical", + "rankingMethod": "tensor", + "searchableAttributesLexical": ["text_field_2"] }, limit=10 ) @@ -103,9 +103,9 @@ def test_hybrid_search_searchable_attributes(self): "puppies", search_method="HYBRID", hybrid_parameters={ - "retrieval_method": "tensor", - "ranking_method": "lexical", - "searchable_attributes_tensor": ["text_field_2"] + "retrievalMethod": "tensor", + "rankingMethod": "lexical", + "searchableAttributesTensor": ["text_field_2"] }, limit=10 ) @@ -118,14 +118,15 @@ def test_hybrid_search_searchable_attributes(self): def test_hybrid_search_same_retrieval_and_ranking_matches_original_method(self): """ Tests that hybrid search with: - retrieval_method = "lexical", ranking_method = "lexical" and - retrieval_method = "tensor", ranking_method = "tensor" + retrievalMethod = "lexical", rankingMethod = "lexical" and + retrievalMethod = "tensor", rankingMethod = "tensor" Results must be the same as lexical search and tensor search respectively. """ index_test_cases = [ (CloudTestIndex.structured_text, self.structured_index_name) # TODO: add unstructured when supported + ] for cloud_test_index_to_use, open_source_test_index_name in index_test_cases: test_index_name = self.get_test_index_name( cloud_test_index_to_use=cloud_test_index_to_use, @@ -138,21 +139,21 @@ def test_hybrid_search_same_retrieval_and_ranking_matches_original_method(self): ("tensor", "tensor") ] - for retrieval_method, ranking_method in test_cases: - with self.subTest(retrieval=retrieval_method, ranking=ranking_method): + for retrievalMethod, rankingMethod in test_cases: + with self.subTest(retrieval=retrievalMethod, ranking=rankingMethod): hybrid_res = self.client.index(test_index_name).search( "dogs", search_method="HYBRID", hybrid_parameters={ - "retrieval_method": retrieval_method, - "ranking_method": ranking_method + "retrievalMethod": retrievalMethod, + "rankingMethod": rankingMethod }, limit=10 ) base_res = self.client.index(test_index_name).search( "dogs", - search_method=retrieval_method, # will be either lexical or tensor + search_method=retrievalMethod, # will be either lexical or tensor limit=10 ) @@ -181,15 +182,15 @@ def test_hybrid_search_with_filter(self): ("tensor", "tensor") ] - for retrieval_method, ranking_method in test_cases: - with self.subTest(retrieval=retrieval_method, ranking=ranking_method): + for retrievalMethod, rankingMethod in test_cases: + with self.subTest(retrieval=retrievalMethod, ranking=rankingMethod): hybrid_res = self.client.index(test_index_name).search( "dogs", search_method="HYBRID", filter_string="text_field_1:(something something dogs)", hybrid_parameters={ - "retrieval_method": retrieval_method, - "ranking_method": ranking_method + "retrievalMethod": retrievalMethod, + "rankingMethod": rankingMethod }, limit=10 )