From fe2e37d83a8640fded6bdb96c61ffd1ab71dd1d9 Mon Sep 17 00:00:00 2001 From: Michael Chouinard <46358556+chouinar@users.noreply.github.com> Date: Thu, 7 Nov 2024 12:00:12 -0500 Subject: [PATCH] [Issue #2729] OpenSearch should return an accurate count of total results (#2730) ## Summary Fixes #2729 ### Time to review: __3 mins__ ## Changes proposed Set `track_total_hits` to True when calling OpenSearch ## Context for reviewers While this field says it has possible performance cost due to needing to count all the records, we also request a count for various facet counts anyways, so I imagine this won't matter at all. ## Additional information https://opensearch.org/docs/latest/api-reference/search/ I loaded ~16k records into my local search index. Querying it with no filters returns this pagination info now: ```json { "order_by": "opportunity_id", "page_offset": 1, "page_size": 25, "sort_direction": "ascending", "total_pages": 676, "total_records": 16884 } ``` --- api/src/adapters/search/opensearch_query_builder.py | 12 ++++++++++++ .../opportunities_v1/search_opportunities.py | 3 +++ .../adapters/search/test_opensearch_query_builder.py | 12 +++++++++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/api/src/adapters/search/opensearch_query_builder.py b/api/src/adapters/search/opensearch_query_builder.py index a7e900c4d..7e2cebe4d 100644 --- a/api/src/adapters/search/opensearch_query_builder.py +++ b/api/src/adapters/search/opensearch_query_builder.py @@ -94,6 +94,8 @@ def __init__(self) -> None: self.sort_values: list[dict[str, dict[str, str]]] = [] + self._track_total_hits: bool = True + self.must: list[dict] = [] self.filters: list[dict] = [] @@ -133,6 +135,15 @@ def sort_by(self, sort_values: list[typing.Tuple[str, SortDirection]]) -> typing return self + def track_total_hits(self, track_total_hits: bool) -> typing.Self: + """ + Whether or not to track the total number of hits in the response accurately. + + By default OpenSearch will stop counting after 10k records are counted. + """ + self._track_total_hits = track_total_hits + return self + def simple_query(self, query: str, fields: list[str]) -> typing.Self: """ Adds a simple_query_string which queries against the provided fields. @@ -238,6 +249,7 @@ def build(self) -> dict: # Always include the scores in the response objects # even if we're sorting by non-relevancy "track_scores": True, + "track_total_hits": self._track_total_hits, } # Add sorting if any was provided diff --git a/api/src/services/opportunities_v1/search_opportunities.py b/api/src/services/opportunities_v1/search_opportunities.py index aff12321d..9f232cc79 100644 --- a/api/src/services/opportunities_v1/search_opportunities.py +++ b/api/src/services/opportunities_v1/search_opportunities.py @@ -143,6 +143,9 @@ def _add_aggregations(builder: search.SearchQueryBuilder) -> None: def _get_search_request(params: SearchOpportunityParams) -> dict: builder = search.SearchQueryBuilder() + # Make sure total hit count gets counted for more than 10k records + builder.track_total_hits(True) + # Pagination builder.pagination( page_size=params.pagination.page_size, page_number=params.pagination.page_offset diff --git a/api/tests/src/adapters/search/test_opensearch_query_builder.py b/api/tests/src/adapters/search/test_opensearch_query_builder.py index 33f9b2b19..9ce82f109 100644 --- a/api/tests/src/adapters/search/test_opensearch_query_builder.py +++ b/api/tests/src/adapters/search/test_opensearch_query_builder.py @@ -161,7 +161,12 @@ def seed_data(self, search_client, search_index): def test_query_builder_empty(self, search_client, search_index): builder = SearchQueryBuilder() - assert builder.build() == {"size": 25, "from": 0, "track_scores": True} + assert builder.build() == { + "size": 25, + "from": 0, + "track_scores": True, + "track_total_hits": True, + } validate_valid_request(search_client, search_index, builder, FULL_DATA) @@ -265,6 +270,7 @@ def test_query_builder_pagination_and_sorting( "size": page_size, "from": page_size * (page_number - 1), "track_scores": True, + "track_total_hits": True, "sort": expected_sort, } @@ -369,6 +375,7 @@ def test_query_builder_filter_terms( "size": 25, "from": 0, "track_scores": True, + "track_total_hits": True, "query": {"bool": {"filter": expected_terms}}, } @@ -429,6 +436,7 @@ def test_query_builder_filter_date_range( "size": 25, "from": 0, "track_scores": True, + "track_total_hits": True, "query": {"bool": {"filter": [{"range": {"publication_date": expected_ranges}}]}}, } @@ -474,6 +482,7 @@ def test_query_builder_filter_int_range( "size": 25, "from": 0, "track_scores": True, + "track_total_hits": True, "query": {"bool": {"filter": [{"range": {"page_count": expected_ranges}}]}}, } @@ -633,6 +642,7 @@ def test_query_builder_simple_query_and_aggregations( "size": 25, "from": 0, "track_scores": True, + "track_total_hits": True, "query": { "bool": { "must": [