Skip to content

Commit

Permalink
Add extra hybrid search RRF duplicate tests and replicas to indexes (#250)
Browse files Browse the repository at this point in the history

- Adds replicas to 3 of the 5 indexes (unstructured text, structured text, structured image)
- Adds duplicate tests for hybrid search with RRF (should fail on Marqo 2.11.3 and earlier)
  • Loading branch information
vicilliar authored Sep 10, 2024
1 parent 68c853a commit 29bb70d
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 54 deletions.
96 changes: 52 additions & 44 deletions tests/cloud_test_logic/cloud_test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,19 @@ class CloudTestIndex(str, Enum):
Please try to keep names short to avoid hitting name-length limits
We create 3 unstructured indexes and 2 structured indexes to test:
1) unstructured_text: a basic text-only index with default settings.
2) unstructured_image: an image-compatible index with GPU inference pod and performance storage class.
3) unstructured_text_custom_prepro: a text-only index with custom model and text preprocessing, with 1 replica.
4) structured_image_prepro: a structured index with image-compatible models with image preprocessing
5) structured_image_custom: a structured index with custom image-compatible models using 2 inference pods
6) structured_text: a text-only index with balanced storage class and 2 shards.
1) unstructured_text: Text-only index using hf/e5-base-v2, 2 shards, 1 replica, CPU, balanced storage, for hybrid duplicates testing.
2) unstructured_image: Image-compatible index using open_clip/ViT-B-32/laion2b_s34b_b79k, 1 shard, no replicas, CPU, basic storage.
3) unstructured_no_model: 512-dimension custom vectors, 1 shard, no replicas, CPU, basic storage.
4) structured_text: Structured text index with hf/e5-base-v2, lexical search, 2 shards, 1 replica, CPU, balanced storage.
5) structured_image: Structured image-text index with open_clip/ViT-B-32/laion2b_s34b_b79k, 2 shards, 1 replica, CPU, balanced storage, with image preprocessing.
For more information on the settings of each index, please refer to index_name_to_settings_mappings.
FOR CLOUD REPLICAS AND SHARDS:
- Use unstructured_text, structured_text, or structured_image for 1 replica & 2 shards
- Use all other indexes for 0 replicas & 1 shard
We design these indexes to maximize the coverage of different settings and features. For each test method,
we will have to manually specify which index to use.
For example,
1) You want to test text fields without text preprocessing
-> use 1) unstructured_text or 6) structured_text
2) You want to test image fields without image preprocessing
-> use 2) unstructured_image or 5) structured_image_custom
3) You want to test text fields with text preprocessing
-> 3) use unstructured_text_custom_prepro
4) You want to test image fields with image preprocessing
-> 4) use structured_image_prepro
"""

unstructured_text = "pymarqo_unstr_txt"
Expand All @@ -48,21 +41,60 @@ class CloudTestIndex(str, Enum):
"model": "hf/e5-base-v2",

"inferenceType": "marqo.CPU.small",
"storageClass": "marqo.basic",
"storageClass": "marqo.balanced",
"numberOfShards": 2,
"numberOfReplicas": 1, # For hybrid duplicates test
},
CloudTestIndex.unstructured_image: {
"type": "unstructured",
"treatUrlsAndPointersAsImages": True,
"model": "open_clip/ViT-B-32/laion2b_s34b_b79k",

"inferenceType": "marqo.GPU",
"storageClass": "marqo.performance",
"inferenceType": "marqo.CPU.small",
"storageClass": "marqo.basic",
"numberOfShards": 1,
"numberOfReplicas": 0,
},
CloudTestIndex.unstructured_no_model: {
"type": "unstructured",
"treatUrlsAndPointersAsImages": False,

"inferenceType": "marqo.CPU.small",
"storageClass": "marqo.basic",
"numberOfShards": 1,
"numberOfReplicas": 0,

"model": "no_model",
"modelProperties": {
"type": "no_model",
"dimensions": 512
},
},
CloudTestIndex.structured_text: {
"type": "structured",
"model": "hf/e5-base-v2",
"allFields": [
{"name": "text_field_1", "type": "text", "features": ["lexical_search", "filter"]},
{"name": "text_field_2", "type": "text", "features": ["lexical_search", "filter"]},
{"name": "text_field_3", "type": "text", "features": ["lexical_search"]},
{"name": "int_field_1", "type": "int", "features": ["score_modifier"]},
{"name": "int_filter_field_1", "type": "int", "features": ["filter", "score_modifier"]}],
"tensorFields": ["text_field_1", "text_field_2", "text_field_3"],

"inferenceType": "marqo.CPU.small",
"storageClass": "marqo.balanced",
"numberOfShards": 2,
"numberOfReplicas": 1, # For hybrid duplicates test
},
CloudTestIndex.structured_image: {
"type": "structured",
"model": "open_clip/ViT-B-32/laion2b_s34b_b79k",

"inferenceType": "marqo.CPU.small",
"storageClass": "marqo.basic",
"storageClass": "marqo.balanced",
"numberOfShards": 2,
"numberOfReplicas": 1, # For hybrid duplicates test

"allFields": [
{"name": "text_field_1", "type": "text", "features": ["lexical_search", "filter"]},
{"name": "text_field_2", "type": "text", "features": ["lexical_search", "filter"]},
Expand All @@ -78,29 +110,5 @@ class CloudTestIndex(str, Enum):
"imagePreprocessing": {
"patchMethod": "simple",
}
},
CloudTestIndex.structured_text: {
"type": "structured",
"model": "hf/e5-base-v2",
"allFields": [
{"name": "text_field_1", "type": "text", "features": ["lexical_search", "filter"]},
{"name": "text_field_2", "type": "text", "features": ["lexical_search", "filter"]},
{"name": "text_field_3", "type": "text", "features": ["lexical_search"]},
{"name": "int_field_1", "type": "int", "features": ["score_modifier"]},
{"name": "int_filter_field_1", "type": "int", "features": ["filter", "score_modifier"]}],
"tensorFields": ["text_field_1", "text_field_2", "text_field_3"],
"storageClass": "marqo.balanced",
"numberOfShards": 2,
},
CloudTestIndex.unstructured_no_model: {
"type": "unstructured",
"treatUrlsAndPointersAsImages": False,
"inferenceType": "marqo.CPU.small",
"storageClass": "marqo.basic",
"model": "no_model",
"modelProperties": {
"type": "no_model",
"dimensions": 512
},
}
}
10 changes: 5 additions & 5 deletions tests/cloud_test_logic/populate_indices_for_cloud_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,16 @@ def populate_indices():
raise Exception("Some cloud index name exceeds 32 characters limit")

for index_name, index_settings_dicts in index_name_to_settings_mappings.items():
print(f"Creating {index_name} with config: {index_settings_dicts}")
print(f"Creating {index_name} with config: {index_settings_dicts}", flush=True)
try:
print(mq.create_index(
index_name=index_name + INDEX_NAME_SEPARATOR + test_uniqueness_id,
wait_for_readiness=False,
settings_dict=index_settings_dicts
)
), flush=True
)
except MarqoWebError as e:
print(f"Attempting to create index {index_name} resulting in error {e}")
print(f"Attempting to create index {index_name} resulting in error {e}", flush=True)
raise e


Expand All @@ -57,8 +57,8 @@ def populate_indices():
mq.config.instance_mapping._refresh_urls()
time.sleep(10)
print(f"Waiting for indexes to be created. Current Mappings: "
f"{mq.config.instance_mapping._urls_mapping}")
f"{mq.config.instance_mapping._urls_mapping}", flush=True)
attempt += 1
if attempt > max_retries:
raise Exception("Timed out waiting for indexes to be created")
print(f"Populating indices took {time.time() - populate_indices_start_time} seconds")
print(f"Populating indices took {time.time() - populate_indices_start_time} seconds", flush=True)
2 changes: 2 additions & 0 deletions tests/marqo_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,8 @@ def get_test_index_name(
index_name_to_return = f"{cloud_test_index_to_use.value}_{self.index_suffix}"
self.prepare_cloud_index_for_test(index_name_to_return, delete_index_documents_before_test)
else:
if open_source_test_index_name is None:
raise ValueError("open_source_test_index_name must be specified for non-cloud tests")
index_name_to_return = open_source_test_index_name
return index_name_to_return

Expand Down
37 changes: 37 additions & 0 deletions tests/v2_tests/test_hybrid_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,3 +231,40 @@ def test_hybrid_search_with_filter(self):

self.assertEqual(len(hybrid_res["hits"]), 1)
self.assertEqual(hybrid_res["hits"][0]["_id"], "doc8")

def test_hybrid_search_rrf_with_replicas_has_no_duplicates(self):
"""
Tests that running 100 hybrid searches on indexes with 1 replica and 2 shards (structured text & unstructured text)
does not return duplicate hits in the results.
Only relevant for cloud tests.
"""

if not self.client.config.is_marqo_cloud:
self.skipTest("Test is not relevant for non-Marqo Cloud instances")

index_test_cases = [CloudTestIndex.structured_text, CloudTestIndex.unstructured_text]
for cloud_test_index_to_use in index_test_cases:
test_index_name = self.get_test_index_name(
cloud_test_index_to_use=cloud_test_index_to_use,
open_source_test_index_name=None
)
self.client.index(test_index_name).add_documents(
self.docs_list,
tensor_fields=["text_field_1", "text_field_2", "text_field_3"] \
if "unstr" in test_index_name else None
)

for _ in range(100):
hybrid_res = self.client.index(test_index_name).search(
"dogs",
search_method="HYBRID",
limit=10
)

# check for duplicates
hit_ids = [hit["_id"] for hit in hybrid_res["hits"]]
self.assertEqual(len(hit_ids), len(set(hit_ids)),
f"Duplicates found in results. Only {len(set(hit_ids))} unique results out of "
f"{len(hit_ids)}")


8 changes: 4 additions & 4 deletions tests/v2_tests/test_image_chunking.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,13 @@ def test_image_simple_chunking(self):
},
}

if not self.client.config.is_marqo_cloud:
self.client.create_index(self.generic_test_index_name, settings_dict=settings)

test_index_name = self.get_test_index_name(
cloud_test_index_to_use=CloudTestIndex.structured_image,
open_source_test_index_name=None
open_source_test_index_name=self.generic_test_index_name
)
if not self.client.config.is_marqo_cloud:
self.client.create_index(self.generic_test_index_name, settings_dict=settings)
test_index_name = self.generic_test_index_name
temp_file_name = 'https://avatars.githubusercontent.com/u/13092433?v=4'

img = Image.open(requests.get(temp_file_name, stream=True).raw)
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,4 @@ deps =
{[testenv]deps}
pytest-html
commands =
python tests/cloud_test_logic/run_cloud_tests.py {posargs}
python -u tests/cloud_test_logic/run_cloud_tests.py {posargs}

0 comments on commit 29bb70d

Please sign in to comment.