From c0d525b10891c6d7d3adb3d250ebfdaa9c383757 Mon Sep 17 00:00:00 2001 From: markharwood Date: Tue, 31 Jan 2017 09:45:25 +0000 Subject: [PATCH] [DOCS] [TEST] enhancement - added CONSOLE scripts for sampler aggs (#22869) Added missing CONSOLE scripts to documentation for sampler and diversified_sampler aggs. Includes new StackOverflow index setup in build.gradle Closes #22746 * Formatting tweaks --- docs/build.gradle | 58 +++++- .../diversified-sampler-aggregation.asciidoc | 166 +++++++++++------- .../bucket/sampler-aggregation.asciidoc | 106 +++++++++-- 3 files changed, 257 insertions(+), 73 deletions(-) diff --git a/docs/build.gradle b/docs/build.gradle index 98c79887b749f..b617218c73d37 100644 --- a/docs/build.gradle +++ b/docs/build.gradle @@ -24,7 +24,6 @@ apply plugin: 'elasticsearch.docs-test' * only remove entries from this list. When it is empty we'll remove it * entirely and have a party! There will be cake and everything.... */ buildRestTests.expectedUnconvertedCandidates = [ - 'reference/aggregations/bucket/diversified-sampler-aggregation.asciidoc', 'reference/aggregations/bucket/geodistance-aggregation.asciidoc', 'reference/aggregations/bucket/geohashgrid-aggregation.asciidoc', 'reference/aggregations/bucket/histogram-aggregation.asciidoc', @@ -33,7 +32,6 @@ buildRestTests.expectedUnconvertedCandidates = [ 'reference/aggregations/bucket/nested-aggregation.asciidoc', 'reference/aggregations/bucket/range-aggregation.asciidoc', 'reference/aggregations/bucket/reverse-nested-aggregation.asciidoc', - 'reference/aggregations/bucket/sampler-aggregation.asciidoc', 'reference/aggregations/bucket/significantterms-aggregation.asciidoc', 'reference/aggregations/bucket/terms-aggregation.asciidoc', 'reference/aggregations/matrix/stats-aggregation.asciidoc', @@ -386,3 +384,59 @@ buildRestTests.setups['index_boost'] = ''' index: index1 name: alias1 ''' +// Used by sampler and diversified-sampler aggregation docs +buildRestTests.setups['stackoverflow'] = ''' + - do: + 
indices.create: + index: stackoverflow + body: + settings: + number_of_shards: 1 + number_of_replicas: 1 + mappings: + question: + properties: + author: + type: keyword + tags: + type: keyword + - do: + bulk: + index: stackoverflow + type: question + refresh: true + body: |''' + +// Make Kibana strongly connected to elasticsearch and logstash +// Make Kibana rarer (and therefore higher-ranking) than Javascript +// Make Javascript strongly connected to jquery and angular +// Make Cabana strongly connected to elasticsearch but only as a result of a single author + +for (int i = 0; i < 150; i++) { + buildRestTests.setups['stackoverflow'] += """ + {"index":{}} + {"author": "very_relevant_$i", "tags": ["elasticsearch", "kibana"]}""" +} +for (int i = 0; i < 50; i++) { + buildRestTests.setups['stackoverflow'] += """ + {"index":{}} + {"author": "very_relevant_$i", "tags": ["logstash", "kibana"]}""" +} +for (int i = 0; i < 200; i++) { + buildRestTests.setups['stackoverflow'] += """ + {"index":{}} + {"author": "partially_relevant_$i", "tags": ["javascript", "jquery"]}""" +} +for (int i = 0; i < 200; i++) { + buildRestTests.setups['stackoverflow'] += """ + {"index":{}} + {"author": "partially_relevant_$i", "tags": ["javascript", "angular"]}""" +} +for (int i = 0; i < 50; i++) { + buildRestTests.setups['stackoverflow'] += """ + {"index":{}} + {"author": "noisy author", "tags": ["elasticsearch", "cabana"]}""" +} +buildRestTests.setups['stackoverflow'] += """ +""" + diff --git a/docs/reference/aggregations/bucket/diversified-sampler-aggregation.asciidoc b/docs/reference/aggregations/bucket/diversified-sampler-aggregation.asciidoc index f5a45da976b03..e8d90bde08ff7 100644 --- a/docs/reference/aggregations/bucket/diversified-sampler-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/diversified-sampler-aggregation.asciidoc @@ -3,35 +3,51 @@ experimental[] -A filtering aggregation used to limit any sub aggregations' processing to a sample of the top-scoring documents. 
Diversity settings are -used to limit the number of matches that share a common value such as an "author". +Like the `sampler` aggregation, this is a filtering aggregation used to limit any sub aggregations' processing to a sample of the top-scoring documents. +The `diversified_sampler` aggregation adds the ability to limit the number of matches that share a common value such as an "author". + +NOTE: Any good market researcher will tell you that when working with samples of data it is important +that the sample represents a healthy variety of opinions rather than being skewed by any single voice. +The same is true with aggregations, and sampling with these diversify settings can offer a way to remove the bias in your content (an over-populated geography, +a large spike in a timeline or an over-active forum spammer). + .Example use cases: * Tightening the focus of analytics to high-relevance matches rather than the potentially very long tail of low-quality matches * Removing bias from analytics by ensuring fair representation of content from different sources * Reducing the running cost of aggregations that can produce useful results using only samples e.g. `significant_terms` +A choice of `field` or `script` setting is used to provide values used for de-duplication and the `max_docs_per_value` setting controls the maximum +number of documents collected on any one shard which share a common value. The default setting for `max_docs_per_value` is 1. + +The aggregation will throw an error if the choice of `field` or `script` produces multiple values for a single document (de-duplication using multi-valued fields is not supported due to efficiency concerns). + Example: +We might want to see which tags are strongly associated with `#elasticsearch` on StackOverflow +forum posts but ignoring the effects of some prolific users with a tendency to misspell #Kibana as #Cabana. 
+ [source,js] -------------------------------------------------- +POST /stackoverflow/_search?size=0 { "query": { - "match": { - "text": "iphone" + "query_string": { + "query": "tags:elasticsearch" } }, "aggs": { - "sample": { + "my_unbiased_sample": { "diversified_sampler": { "shard_size": 200, - "field" : "user.id" + "field" : "author" }, "aggs": { "keywords": { "significant_terms": { - "field": "text" + "field": "tags", + "exclude": ["elasticsearch"] } } } @@ -39,6 +55,8 @@ Example: } } -------------------------------------------------- +// CONSOLE +// TEST[setup:stackoverflow] Response: @@ -46,92 +64,118 @@ Response: -------------------------------------------------- { ... - "aggregations": { - "sample": { + "aggregations": { + "my_unbiased_sample": { "doc_count": 1000,<1> "keywords": {<2> "doc_count": 1000, "buckets": [ - ... { - "key": "bend", - "doc_count": 58, - "score": 37.982536582524276, - "bg_count": 103 - }, - .... + "key": "kibana", + "doc_count": 150, + "score": 2.213, + "bg_count": 200 + } + ] + } + } + } } -------------------------------------------------- +// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/] +// TESTRESPONSE[s/1000/151/] +// TESTRESPONSE[s/2.213/$body.aggregations.my_unbiased_sample.keywords.buckets.0.score/] <1> 1000 documents were sampled in total because we asked for a maximum of 200 from an index with 5 shards. The cost of performing the nested significant_terms aggregation was therefore limited rather than unbounded. -<2> The results of the significant_terms aggregation are not skewed by any single over-active Twitter user because we asked for a maximum of one tweet from any one user in our sample. - - -==== shard_size - -The `shard_size` parameter limits how many top-scoring documents are collected in the sample processed on each shard. -The default value is 100. 
- -==== Controlling diversity -=`field` or `script` and `max_docs_per_value` settings are used to control the maximum number of documents collected on any one shard which share a common value. -The choice of value (e.g. `author`) is loaded from a regular `field` or derived dynamically by a `script`. - -The aggregation will throw an error if the choice of field or script produces multiple values for a document. -It is currently not possible to offer this form of de-duplication using many values, primarily due to concerns over efficiency. - -NOTE: Any good market researcher will tell you that when working with samples of data it is important -that the sample represents a healthy variety of opinions rather than being skewed by any single voice. -The same is true with aggregations and sampling with these diversify settings can offer a way to remove the bias in your content (an over-populated geography, a large spike in a timeline or an over-active forum spammer). +<2> The results of the significant_terms aggregation are not skewed by any single author's quirks because we asked for a maximum of one post from any one author in our sample. -==== Field +==== Scripted example: -Controlling diversity using a field: +In this scenario we might want to diversify on a combination of field values. We can use a `script` to produce a hash of the +multiple values in a tags field to ensure we don't have a sample that consists of the same repeated combinations of tags. 
[source,js] -------------------------------------------------- +POST /stackoverflow/_search?size=0 { - "aggs" : { - "sample" : { - "diversified_sampler" : { - "field" : "author", - "max_docs_per_value" : 3 + "query": { + "query_string": { + "query": "tags:kibana" + } + }, + "aggs": { + "my_unbiased_sample": { + "diversified_sampler": { + "shard_size": 200, + "max_docs_per_value" : 3, + "script" : { + "lang": "painless", + "inline": "doc['tags'].values.hashCode()" + } + }, + "aggs": { + "keywords": { + "significant_terms": { + "field": "tags", + "exclude": ["kibana"] + } + } } } } } -------------------------------------------------- +// CONSOLE +// TEST[setup:stackoverflow] -Note that the `max_docs_per_value` setting applies on a per-shard basis only for the purposes of shard-local sampling. -It is not intended as a way of providing a global de-duplication feature on search results. - - - -==== Script - -Controlling diversity using a script: +Response: [source,js] -------------------------------------------------- { - "aggs" : { - "sample" : { - "diversified_sampler" : { - "script" : { - "lang" : "painless", - "inline" : "doc['author'].value + '/' + doc['genre'].value" - } + ... + "aggregations": { + "my_unbiased_sample": { + "doc_count": 1000,<1> + "keywords": {<2> + "doc_count": 1000, + "buckets": [ + { + "key": "logstash", + "doc_count": 3, + "score": 2.213, + "bg_count": 50 + }, + { + "key": "elasticsearch", + "doc_count": 3, + "score": 1.34, + "bg_count": 200 + } + ] } } } } -------------------------------------------------- -Note in the above example we chose to use the default `max_docs_per_value` setting of 1 and combine author and genre fields to ensure -each shard sample has, at most, one match for an author/genre pair. 
+// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/] +// TESTRESPONSE[s/1000/6/] +// TESTRESPONSE[s/2.213/$body.aggregations.my_unbiased_sample.keywords.buckets.0.score/] +// TESTRESPONSE[s/1.34/$body.aggregations.my_unbiased_sample.keywords.buckets.1.score/] + +==== shard_size + +The `shard_size` parameter limits how many top-scoring documents are collected in the sample processed on each shard. +The default value is 100. + +==== max_docs_per_value +The `max_docs_per_value` is an optional parameter and limits how many documents are permitted per choice of de-duplicating value. +The default setting is "1". ==== execution_hint -When using the settings to control diversity, the optional `execution_hint` setting can influence the management of the values used for de-duplication. +The optional `execution_hint` setting can influence the management of the values used for de-duplication. Each option will hold up to `shard_size` values in memory while performing de-duplication but the type of value held can be controlled as follows: - hold field values directly (`map`) @@ -145,12 +189,12 @@ Please note that Elasticsearch will ignore the choice of execution hint if it is ==== Limitations ===== Cannot be nested under `breadth_first` aggregations -Being a quality-based filter the sampler aggregation needs access to the relevance score produced for each document. +Being a quality-based filter the diversified_sampler aggregation needs access to the relevance score produced for each document. It therefore cannot be nested under a `terms` aggregation which has the `collect_mode` switched from the default `depth_first` mode to `breadth_first` as this discards scores. In this situation an error will be thrown. ===== Limited de-dup logic. -The de-duplication logic in the diversify settings applies only at a shard level so will not apply across shards. 
+The de-duplication logic applies only at a shard level so will not apply across shards. ===== No specialized syntax for geo/date fields Currently the syntax for defining the diversifying values is defined by a choice of `field` or diff --git a/docs/reference/aggregations/bucket/sampler-aggregation.asciidoc b/docs/reference/aggregations/bucket/sampler-aggregation.asciidoc index e1f3e708aa6c7..693df5e814423 100644 --- a/docs/reference/aggregations/bucket/sampler-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/sampler-aggregation.asciidoc @@ -12,12 +12,18 @@ A filtering aggregation used to limit any sub aggregations' processing to a samp Example: +A query on StackOverflow data for the popular term `javascript` OR the rarer term +`kibana` will match many documents - most of them missing the word Kibana. To focus +the `significant_terms` aggregation on top-scoring documents that are more likely to match +the most interesting parts of our query we use a sample. + [source,js] -------------------------------------------------- +POST /stackoverflow/_search?size=0 { "query": { - "match": { - "text": "iphone" + "query_string": { + "query": "tags:kibana OR tags:javascript" } }, "aggs": { @@ -28,7 +34,8 @@ Example: "aggs": { "keywords": { "significant_terms": { - "field": "text" + "field": "tags", + "exclude": ["kibana", "javascript"] } } } @@ -36,6 +43,8 @@ Example: } } -------------------------------------------------- +// CONSOLE +// TEST[setup:stackoverflow] Response: @@ -43,26 +52,103 @@ Response: -------------------------------------------------- { ... - "aggregations": { + "aggregations": { "sample": { "doc_count": 1000,<1> "keywords": { "doc_count": 1000, "buckets": [ - ... { - "key": "bend", - "doc_count": 58, - "score": 37.982536582524276, - "bg_count": 103 + "key": "elasticsearch", + "doc_count": 150, + "score": 1.078125, + "bg_count": 200 }, - .... 
+ { + "key": "logstash", + "doc_count": 50, + "score": 0.5625, + "bg_count": 50 + } + ] + } + } + } } -------------------------------------------------- +// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/] +// TESTRESPONSE[s/1000/200/] <1> 1000 documents were sampled in total because we asked for a maximum of 200 from an index with 5 shards. The cost of performing the nested significant_terms aggregation was therefore limited rather than unbounded. +Without the `sampler` aggregation the request query considers the full "long tail" of low-quality matches and therefore identifies +less significant terms such as `jquery` and `angular` rather than focusing on the more insightful Kibana-related terms. + + +[source,js] +-------------------------------------------------- +POST /stackoverflow/_search?size=0 +{ + "query": { + "query_string": { + "query": "tags:kibana OR tags:javascript" + } + }, + "aggs": { + "low_quality_keywords": { + "significant_terms": { + "field": "tags", + "size": 3, + "exclude":["kibana", "javascript"] + } + } + } +} +-------------------------------------------------- +// CONSOLE +// TEST[setup:stackoverflow] + +Response: + +[source,js] +-------------------------------------------------- +{ + ... 
+ "aggregations": { + "low_quality_keywords": { + "doc_count": 1000, + "buckets": [ + { + "key": "angular", + "doc_count": 200, + "score": 0.02777, + "bg_count": 200 + }, + { + "key": "jquery", + "doc_count": 200, + "score": 0.02777, + "bg_count": 200 + }, + { + "key": "logstash", + "doc_count": 50, + "score": 0.0069, + "bg_count": 50 + } + ] + } + } +} +-------------------------------------------------- +// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/] +// TESTRESPONSE[s/1000/600/] +// TESTRESPONSE[s/0.02777/$body.aggregations.low_quality_keywords.buckets.0.score/] +// TESTRESPONSE[s/0.0069/$body.aggregations.low_quality_keywords.buckets.2.score/] + + + ==== shard_size The `shard_size` parameter limits how many top-scoring documents are collected in the sample processed on each shard.