-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Overview query breakout #89
Changes from all commits
a509e41
2eba97e
1351ed0
cf465e5
fafa04f
3dbaef0
7fb1cad
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,6 +43,21 @@ class QueryBuilder: | |
VALID_SORT_ORDERS = ["asc", "desc"] | ||
VALID_SORT_FIELDS = ["publication_date", "indexed_date"] | ||
|
||
class Aggregators(Enum): | ||
DAILY_COUNTS = { | ||
"dailycounts": { | ||
"date_histogram": { | ||
"field": "publication_date", | ||
"calendar_interval": "day", | ||
"min_doc_count": 1, | ||
} | ||
} | ||
} | ||
TOP_LANGS = {"toplangs": {"terms": {"field": "language.keyword", "size": 100}}} | ||
TOP_DOMAINS = { | ||
"topdomains": {"terms": {"field": "canonical_domain.keyword", "size": 100}} | ||
} | ||
|
||
def __init__(self, query_text): | ||
self.query_text = query_text | ||
self._source = [ | ||
|
@@ -95,30 +110,17 @@ def basic_query(self, expanded: bool = False) -> Dict: | |
} | ||
return default | ||
|
||
def overview_query(self): | ||
def aggregator_query(self, *aggs: "QueryBuilder.Aggregators") -> Dict: | ||
query = self.basic_query() | ||
query.update( | ||
{ | ||
"aggregations": { | ||
"daily": { | ||
"date_histogram": { | ||
"field": "publication_date", | ||
"calendar_interval": "day", | ||
"min_doc_count": 1, | ||
} | ||
}, | ||
"lang": {"terms": {"field": "language.keyword", "size": 100}}, | ||
"domain": { | ||
"terms": {"field": "canonical_domain.keyword", "size": 100} | ||
}, | ||
"tld": {"terms": {"field": "tld", "size": 100}}, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are we dropping There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. None of the other layers above this point in the API use TLD- we had been calculating it here, I think, so that we had a backup for when the total count was above the 10,000 document limit imposed by elasticsearch- and we can replace that function with the domain field- so I removed it here in the spirit of speeding up the query! But, happy to consider leaving it in- @rahulbot any thoughts? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I can't think of a researc huse of TLD in our current practices. I'd support removal. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm going to put this snippet of a conversation I had with @ibnesayeed many months ago here for context-
I think this property of TLDs, where it is always filled, is also true of canonical_domain- so we can drop that in instead. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Implemented this backup in the most recent commit |
||
}, | ||
"aggregations": {k: v for agg in aggs for k, v in agg.value.items()}, | ||
"track_total_hits": True, | ||
} | ||
) | ||
return query | ||
|
||
def terms_query(self, field): | ||
def terms_query(self, field) -> Dict: | ||
resct = 200 | ||
aggr_map = { | ||
"terms": { | ||
|
@@ -172,7 +174,7 @@ def paged_query( | |
query["search_after"] = [decode_key(resume)] | ||
return query | ||
|
||
def article_query(self): | ||
def article_query(self) -> Dict: | ||
default: dict = { | ||
"_source": self._expanded_source, | ||
"query": {"match": {"_id": self.query_text}}, | ||
|
@@ -251,30 +253,101 @@ def format_day_counts(self, bucket: list): | |
def format_counts(self, bucket: list): | ||
return {item["key"]: item["doc_count"] for item in bucket} | ||
|
||
def search_overview(self, collection: str, q: str): | ||
def aggregator_query( | ||
self, collection: str, q: str, *aggs: QueryBuilder.Aggregators, **options | ||
): | ||
""" | ||
Get overview statistics for a query | ||
Abstraction to DRY out permutations of the 'overview' query getting broken out into their own calls | ||
""" | ||
res = self.ES.search(index=collection, body=QueryBuilder(q).overview_query()) # type: ignore [call-arg] | ||
query_body = QueryBuilder(q).aggregator_query(*aggs) | ||
|
||
res = self.ES.search(index=collection, body=query_body) # type: ignore [call-arg] | ||
if not res["hits"]["hits"]: | ||
raise HTTPException(status_code=404, detail="No results found!") | ||
|
||
total = res["hits"]["total"]["value"] | ||
tldsum = sum( | ||
item["doc_count"] for item in res["aggregations"]["tld"]["buckets"] | ||
) | ||
return { | ||
|
||
return_dict = { | ||
"query": q, | ||
"total": max(total, tldsum), | ||
"topdomains": self.format_counts(res["aggregations"]["domain"]["buckets"]), | ||
"toptlds": self.format_counts(res["aggregations"]["tld"]["buckets"]), | ||
"toplangs": self.format_counts(res["aggregations"]["lang"]["buckets"]), | ||
"dailycounts": self.format_day_counts( | ||
res["aggregations"]["daily"]["buckets"] | ||
), | ||
"matches": [self.format_match(h, collection) for h in res["hits"]["hits"]], | ||
} | ||
|
||
# Add the results of each aggregator to the return value | ||
for agg in aggs: | ||
agg_name = list(agg.value.keys())[0] | ||
return_dict.update( | ||
{agg_name: self.format_counts(res["aggregations"][agg_name]["buckets"])} | ||
) | ||
|
||
# Only return the total and matches if explicitly requested | ||
if "overview" in options: | ||
# We use a sum of the top_domains to supplement the total, as elasticsearch has a hard limit | ||
# of 10,000 results per page in a source query, but aggregators can go around this. | ||
# | ||
if QueryBuilder.Aggregators.TOP_DOMAINS not in aggs: | ||
raise HTTPException( | ||
status_code=500, | ||
detail="Can't run overview query without top_domains aggregator", | ||
) | ||
|
||
domain_sum = sum( | ||
item["doc_count"] | ||
for item in res["aggregations"]["topdomains"]["buckets"] | ||
) | ||
|
||
return_dict.update( | ||
{ | ||
"total": max(total, domain_sum), | ||
"matches": [ # type: ignore [dict-item] | ||
self.format_match(h, collection) for h in res["hits"]["hits"] | ||
], | ||
} | ||
) | ||
|
||
return return_dict | ||
|
||
def search_overview(self, collection: str, q: str): | ||
""" | ||
Get overview statistics for a query | ||
""" | ||
return self.aggregator_query( | ||
collection, | ||
q, | ||
QueryBuilder.Aggregators.DAILY_COUNTS, | ||
QueryBuilder.Aggregators.TOP_LANGS, | ||
QueryBuilder.Aggregators.TOP_DOMAINS, | ||
overview=True, | ||
) | ||
|
||
def daily_counts(self, collection: str, q: str): | ||
""" | ||
Return just a daily count histogram for a query | ||
""" | ||
return self.aggregator_query( | ||
collection, | ||
q, | ||
QueryBuilder.Aggregators.DAILY_COUNTS, | ||
) | ||
|
||
def top_languages(self, collection: str, q: str): | ||
""" | ||
Return top languagues for a query | ||
""" | ||
return self.aggregator_query( | ||
collection, | ||
q, | ||
QueryBuilder.Aggregators.TOP_LANGS, | ||
) | ||
|
||
def top_domains(self, collection: str, q: str): | ||
""" | ||
Return top domains for a query | ||
""" | ||
return self.aggregator_query( | ||
collection, | ||
q, | ||
QueryBuilder.Aggregators.TOP_DOMAINS, | ||
) | ||
|
||
def search_result( | ||
self, | ||
collection: str, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Field changing from
canonical_domain
tocanonical_domain.keyword
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes! In trying to benchmark the latency of these operations I noticed that aggregating on the .keywords field of canonical domain made the query return significantly faster- ultimately it made the overview query almost twice as fast.