diff --git a/tests/cloud_test_logic/cloud_test_index.py b/tests/cloud_test_logic/cloud_test_index.py index 3c99f6d0..c1c2f41a 100644 --- a/tests/cloud_test_logic/cloud_test_index.py +++ b/tests/cloud_test_logic/cloud_test_index.py @@ -8,26 +8,19 @@ class CloudTestIndex(str, Enum): Please try to keep names short to avoid hitting name-length limits We create 3 unstructured indexes and 3 structured indexes to test: - 1) unstructured_text: a basic text-only index with default settings. - 2) unstructured_image: an image-compatible index with GPU inference pod and performance storage class. - 3) unstructured_text_custom_prepro: a text-only index with custom model and text preprocessing, with 1 replica. - 4) structured_image_prepro: a structured index with image-compatible models with image preprocessing - 5) structured_image_custom: a structured index with custom image-compatible models using 2 inference pods - 6) structured_text: a text-only index with balanced storage class and 2 shards. + 1) unstructured_text: Text-only index using hf/e5-base-v2, 2 shards, 1 replica, CPU, balanced storage, for hybrid duplicates testing. + 2) unstructured_image: Image-compatible index using open_clip/ViT-B-32/laion2b_s34b_b79k, 1 shard, no replicas, CPU, basic storage. + 3) unstructured_no_model: 512-dimension custom vectors, 1 shard, no replicas, CPU, basic storage. + 4) structured_text: Structured text index with hf/e5-base-v2, lexical search, 2 shards, 1 replica, CPU, balanced storage. + 5) structured_image: Structured image-text index with open_clip/ViT-B-32, 2 shards, 1 replica, CPU, balanced storage, with image preprocessing. For more information on the settings of each index, please refer to index_name_to_settings_mappings. 
+ FOR CLOUD REPLICAS AND SHARDS: + - Use unstructured_text, structured_text, or structured_image for 1 replica & 2 shards + - Use all other indexes for 0 replicas & 1 shard + We design these indexes to maximize the coverage of different settings and features. For each test method, we will have to manually specify which index to use. - - For example, - 1) You want to test text fields without text preprocessing - -> use 1) unstructured_text or 6) structured_text - 2) You want to test image fields without image preprocessing - -> use 2) unstructured_image or 5) structured_image_custom - 3) You want to test text fields with text preprocessing - -> 3) use unstructured_text_custom_prepro - 4) You want to test image fields with image preprocessing - -> 4) use structured_image_prepro """ unstructured_text = "pymarqo_unstr_txt" @@ -48,21 +41,60 @@ class CloudTestIndex(str, Enum): "model": "hf/e5-base-v2", "inferenceType": "marqo.CPU.small", - "storageClass": "marqo.basic", + "storageClass": "marqo.balanced", + "numberOfShards": 2, + "numberOfReplicas": 1, # For hybrid duplicates test }, CloudTestIndex.unstructured_image: { "type": "unstructured", "treatUrlsAndPointersAsImages": True, "model": "open_clip/ViT-B-32/laion2b_s34b_b79k", - "inferenceType": "marqo.GPU", - "storageClass": "marqo.performance", + "inferenceType": "marqo.CPU.small", + "storageClass": "marqo.basic", + "numberOfShards": 1, + "numberOfReplicas": 0, + }, + CloudTestIndex.unstructured_no_model: { + "type": "unstructured", + "treatUrlsAndPointersAsImages": False, + + "inferenceType": "marqo.CPU.small", + "storageClass": "marqo.basic", + "numberOfShards": 1, + "numberOfReplicas": 0, + + "model": "no_model", + "modelProperties": { + "type": "no_model", + "dimensions": 512 + }, + }, + CloudTestIndex.structured_text: { + "type": "structured", + "model": "hf/e5-base-v2", + "allFields": [ + {"name": "text_field_1", "type": "text", "features": ["lexical_search", "filter"]}, + {"name": "text_field_2", "type": 
"text", "features": ["lexical_search", "filter"]}, + {"name": "text_field_3", "type": "text", "features": ["lexical_search"]}, + {"name": "int_field_1", "type": "int", "features": ["score_modifier"]}, + {"name": "int_filter_field_1", "type": "int", "features": ["filter", "score_modifier"]}], + "tensorFields": ["text_field_1", "text_field_2", "text_field_3"], + + "inferenceType": "marqo.CPU.small", + "storageClass": "marqo.balanced", + "numberOfShards": 2, + "numberOfReplicas": 1, # For hybrid duplicates test }, CloudTestIndex.structured_image: { "type": "structured", "model": "open_clip/ViT-B-32/laion2b_s34b_b79k", + "inferenceType": "marqo.CPU.small", - "storageClass": "marqo.basic", + "storageClass": "marqo.balanced", + "numberOfShards": 2, + "numberOfReplicas": 1, # For hybrid duplicates test + "allFields": [ {"name": "text_field_1", "type": "text", "features": ["lexical_search", "filter"]}, {"name": "text_field_2", "type": "text", "features": ["lexical_search", "filter"]}, @@ -78,29 +110,5 @@ class CloudTestIndex(str, Enum): "imagePreprocessing": { "patchMethod": "simple", } - }, - CloudTestIndex.structured_text: { - "type": "structured", - "model": "hf/e5-base-v2", - "allFields": [ - {"name": "text_field_1", "type": "text", "features": ["lexical_search", "filter"]}, - {"name": "text_field_2", "type": "text", "features": ["lexical_search", "filter"]}, - {"name": "text_field_3", "type": "text", "features": ["lexical_search"]}, - {"name": "int_field_1", "type": "int", "features": ["score_modifier"]}, - {"name": "int_filter_field_1", "type": "int", "features": ["filter", "score_modifier"]}], - "tensorFields": ["text_field_1", "text_field_2", "text_field_3"], - "storageClass": "marqo.balanced", - "numberOfShards": 2, - }, - CloudTestIndex.unstructured_no_model: { - "type": "unstructured", - "treatUrlsAndPointersAsImages": False, - "inferenceType": "marqo.CPU.small", - "storageClass": "marqo.basic", - "model": "no_model", - "modelProperties": { - "type": "no_model", 
- "dimensions": 512 - }, } } diff --git a/tests/cloud_test_logic/populate_indices_for_cloud_tests.py b/tests/cloud_test_logic/populate_indices_for_cloud_tests.py index 40f3a61a..346cd256 100644 --- a/tests/cloud_test_logic/populate_indices_for_cloud_tests.py +++ b/tests/cloud_test_logic/populate_indices_for_cloud_tests.py @@ -33,16 +33,16 @@ def populate_indices(): raise Exception("Some cloud index name exceeds 32 characters limit") for index_name, index_settings_dicts in index_name_to_settings_mappings.items(): - print(f"Creating {index_name} with config: {index_settings_dicts}") + print(f"Creating {index_name} with config: {index_settings_dicts}", flush=True) try: print(mq.create_index( index_name=index_name + INDEX_NAME_SEPARATOR + test_uniqueness_id, wait_for_readiness=False, settings_dict=index_settings_dicts - ) + ), flush=True ) except MarqoWebError as e: - print(f"Attempting to create index {index_name} resulting in error {e}") + print(f"Attempting to create index {index_name} resulting in error {e}", flush=True) raise e @@ -57,8 +57,8 @@ def populate_indices(): mq.config.instance_mapping._refresh_urls() time.sleep(10) print(f"Waiting for indexes to be created. 
Current Mappings: " - f"{mq.config.instance_mapping._urls_mapping}") + f"{mq.config.instance_mapping._urls_mapping}", flush=True) attempt += 1 if attempt > max_retries: raise Exception("Timed out waiting for indexes to be created") - print(f"Populating indices took {time.time() - populate_indices_start_time} seconds") + print(f"Populating indices took {time.time() - populate_indices_start_time} seconds", flush=True) diff --git a/tests/marqo_test.py b/tests/marqo_test.py index 45a76b6f..5c24edbd 100644 --- a/tests/marqo_test.py +++ b/tests/marqo_test.py @@ -348,6 +348,8 @@ def get_test_index_name( index_name_to_return = f"{cloud_test_index_to_use.value}_{self.index_suffix}" self.prepare_cloud_index_for_test(index_name_to_return, delete_index_documents_before_test) else: + if open_source_test_index_name is None: + raise ValueError("open_source_test_index_name must be specified for non-cloud tests") index_name_to_return = open_source_test_index_name return index_name_to_return diff --git a/tests/v2_tests/test_hybrid_search.py b/tests/v2_tests/test_hybrid_search.py index 4d67f949..336e0c6c 100644 --- a/tests/v2_tests/test_hybrid_search.py +++ b/tests/v2_tests/test_hybrid_search.py @@ -231,3 +231,40 @@ def test_hybrid_search_with_filter(self): self.assertEqual(len(hybrid_res["hits"]), 1) self.assertEqual(hybrid_res["hits"][0]["_id"], "doc8") + + def test_hybrid_search_rrf_with_replicas_has_no_duplicates(self): + """ + Tests that running 100 searches on indexes with 1 replica and 2 shards (structured text & unstructured text) + will not produce duplicates in results. + Only relevant for cloud tests. 
+ """ + + if not self.client.config.is_marqo_cloud: + self.skipTest("Test is not relevant for non-Marqo Cloud instances") + + index_test_cases = [CloudTestIndex.structured_text, CloudTestIndex.unstructured_text] + for cloud_test_index_to_use in index_test_cases: + test_index_name = self.get_test_index_name( + cloud_test_index_to_use=cloud_test_index_to_use, + open_source_test_index_name=None + ) + self.client.index(test_index_name).add_documents( + self.docs_list, + tensor_fields=["text_field_1", "text_field_2", "text_field_3"] \ + if "unstr" in test_index_name else None + ) + + for _ in range(100): + hybrid_res = self.client.index(test_index_name).search( + "dogs", + search_method="HYBRID", + limit=10 + ) + + # check for duplicates + hit_ids = [hit["_id"] for hit in hybrid_res["hits"]] + self.assertEqual(len(hit_ids), len(set(hit_ids)), + f"Duplicates found in results. Only {len(set(hit_ids))} unique results out of " + f"{len(hit_ids)}") + + diff --git a/tests/v2_tests/test_image_chunking.py b/tests/v2_tests/test_image_chunking.py index 0f421c42..dd28db4c 100644 --- a/tests/v2_tests/test_image_chunking.py +++ b/tests/v2_tests/test_image_chunking.py @@ -85,13 +85,13 @@ def test_image_simple_chunking(self): }, } + if not self.client.config.is_marqo_cloud: + self.client.create_index(self.generic_test_index_name, settings_dict=settings) + test_index_name = self.get_test_index_name( cloud_test_index_to_use=CloudTestIndex.structured_image, - open_source_test_index_name=None + open_source_test_index_name=self.generic_test_index_name ) - if not self.client.config.is_marqo_cloud: - self.client.create_index(self.generic_test_index_name, settings_dict=settings) - test_index_name = self.generic_test_index_name temp_file_name = 'https://avatars.githubusercontent.com/u/13092433?v=4' img = Image.open(requests.get(temp_file_name, stream=True).raw) diff --git a/tox.ini b/tox.ini index 6c75c831..354a9d8d 100644 --- a/tox.ini +++ b/tox.ini @@ -27,4 +27,4 @@ deps = {[testenv]deps} 
pytest-html commands = - python tests/cloud_test_logic/run_cloud_tests.py {posargs} \ No newline at end of file + python -u tests/cloud_test_logic/run_cloud_tests.py {posargs} \ No newline at end of file