diff --git a/docs/build.gradle b/docs/build.gradle index 4250e50bcc03a..3b92b014ac61c 100644 --- a/docs/build.gradle +++ b/docs/build.gradle @@ -19,21 +19,6 @@ apply plugin: 'elasticsearch.docs-test' -/* List of files that have snippets that probably should be converted to - * `// CONSOLE` and `// TESTRESPONSE` but have yet to be converted. Try and - * only remove entries from this list. When it is empty we'll remove it - * entirely and have a party! There will be cake and everything.... */ -buildRestTests.expectedUnconvertedCandidates = [ - 'reference/aggregations/bucket/nested-aggregation.asciidoc', - 'reference/aggregations/bucket/range-aggregation.asciidoc', - 'reference/aggregations/bucket/reverse-nested-aggregation.asciidoc', - 'reference/aggregations/bucket/significantterms-aggregation.asciidoc', - 'reference/aggregations/bucket/terms-aggregation.asciidoc', - 'reference/aggregations/matrix/stats-aggregation.asciidoc', - 'reference/aggregations/metrics/tophits-aggregation.asciidoc', - 'reference/cluster/allocation-explain.asciidoc', -] - integTestCluster { setting 'script.max_compilations_per_minute', '1000' /* Enable regexes in painless so our tests don't complain about example diff --git a/docs/reference/aggregations/bucket/nested-aggregation.asciidoc b/docs/reference/aggregations/bucket/nested-aggregation.asciidoc index 89142df13e786..d8dbd638f1205 100644 --- a/docs/reference/aggregations/bucket/nested-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/nested-aggregation.asciidoc @@ -8,9 +8,9 @@ price for the product. The mapping could look like: [source,js] -------------------------------------------------- +PUT /index { - ... - + "mappings": { "product" : { "properties" : { "resellers" : { <1> @@ -22,15 +22,18 @@ price for the product. The mapping could look like: } } } + } } -------------------------------------------------- - +// CONSOLE +// TESTSETUP <1> The `resellers` is an array that holds nested documents under the `product` object. 
The following aggregations will return the minimum price products can be purchased in: [source,js] -------------------------------------------------- +GET /_search { "query" : { "match" : { "name" : "led tv" } @@ -47,6 +50,9 @@ The following aggregations will return the minimum price products can be purchas } } -------------------------------------------------- +// CONSOLE +// TEST[s/GET \/_search/GET \/_search\?filter_path=aggregations/] +// TEST[s/^/PUT index\/product\/0\?refresh\n{"name":"led", "resellers": [{"name": "foo", "price": 350.00}, {"name": "bar", "price": 500.00}]}\n/] As you can see above, the nested aggregation requires the `path` of the nested documents within the top level documents. Then one can define any type of aggregation over these nested documents. @@ -56,12 +62,16 @@ Response: [source,js] -------------------------------------------------- { - "aggregations": { - "resellers": { - "min_price": { - "value" : 350 - } - } + ... + "aggregations": { + "resellers": { + "doc_count": 2, + "min_price": { + "value": 350 + } } + } } -------------------------------------------------- +// TESTRESPONSE[s/\.\.\.//] +// TESTRESPONSE[s/: [0-9]+/: $body.$_path/] diff --git a/docs/reference/aggregations/bucket/range-aggregation.asciidoc b/docs/reference/aggregations/bucket/range-aggregation.asciidoc index 7ce8ec699f0de..8ff26c7c92f5c 100644 --- a/docs/reference/aggregations/bucket/range-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/range-aggregation.asciidoc @@ -8,21 +8,25 @@ Example: [source,js] -------------------------------------------------- +GET /_search { "aggs" : { "price_ranges" : { "range" : { "field" : "price", "ranges" : [ - { "to" : 50 }, - { "from" : 50, "to" : 100 }, - { "from" : 100 } + { "to" : 100.0 }, + { "from" : 100.0, "to" : 200.0 }, + { "from" : 200.0 } ] } } } } -------------------------------------------------- +// CONSOLE +// TEST[setup:sales] +// TEST[s/GET \/_search/GET \/_search\?filter_path=aggregations/] Response: 
@@ -30,28 +34,31 @@ Response: -------------------------------------------------- { ... - "aggregations": { "price_ranges" : { "buckets": [ { - "to": 50, + "key": "*-100.0", + "to": 100.0, "doc_count": 2 }, { - "from": 50, - "to": 100, - "doc_count": 4 + "key": "100.0-200.0", + "from": 100.0, + "to": 200.0, + "doc_count": 2 }, { - "from": 100, - "doc_count": 4 + "key": "200.0-*", + "from": 200.0, + "doc_count": 3 } ] } } } -------------------------------------------------- +// TESTRESPONSE[s/\.\.\.//] ==== Keyed Response @@ -59,6 +66,7 @@ Setting the `keyed` flag to `true` will associate a unique string key with each [source,js] -------------------------------------------------- +GET /_search { "aggs" : { "price_ranges" : { @@ -66,15 +74,18 @@ Setting the `keyed` flag to `true` will associate a unique string key with each "field" : "price", "keyed" : true, "ranges" : [ - { "to" : 50 }, - { "from" : 50, "to" : 100 }, - { "from" : 100 } + { "to" : 100 }, + { "from" : 100, "to" : 200 }, + { "from" : 200 } ] } } } } -------------------------------------------------- +// CONSOLE +// TEST[setup:sales] +// TEST[s/GET \/_search/GET \/_search\?filter_path=aggregations/] Response: @@ -82,33 +93,34 @@ Response: -------------------------------------------------- { ... 
- "aggregations": { "price_ranges" : { "buckets": { - "*-50.0": { - "to": 50, + "*-100.0": { + "to": 100.0, "doc_count": 2 }, - "50.0-100.0": { - "from": 50, - "to": 100, - "doc_count": 4 + "100.0-200.0": { + "from": 100.0, + "to": 200.0, + "doc_count": 2 }, - "100.0-*": { - "from": 100, - "doc_count": 4 + "200.0-*": { + "from": 200.0, + "doc_count": 3 } } } } } -------------------------------------------------- +// TESTRESPONSE[s/\.\.\.//] It is also possible to customize the key for each range: [source,js] -------------------------------------------------- +GET /_search { "aggs" : { "price_ranges" : { @@ -116,20 +128,58 @@ It is also possible to customize the key for each range: "field" : "price", "keyed" : true, "ranges" : [ - { "key" : "cheap", "to" : 50 }, - { "key" : "average", "from" : 50, "to" : 100 }, - { "key" : "expensive", "from" : 100 } + { "key" : "cheap", "to" : 100 }, + { "key" : "average", "from" : 100, "to" : 200 }, + { "key" : "expensive", "from" : 200 } ] } } } } -------------------------------------------------- +// CONSOLE +// TEST[setup:sales] +// TEST[s/GET \/_search/GET \/_search\?filter_path=aggregations/] + +Response: + +[source,js] +-------------------------------------------------- +{ + ... + "aggregations": { + "price_ranges" : { + "buckets": { + "cheap": { + "to": 100.0, + "doc_count": 2 + }, + "average": { + "from": 100.0, + "to": 200.0, + "doc_count": 2 + }, + "expensive": { + "from": 200.0, + "doc_count": 3 + } + } + } + } +} +-------------------------------------------------- +// TESTRESPONSE[s/\.\.\.//] ==== Script +Range aggregation accepts a `script` parameter. This parameter allows you to define an inline `script` that +will be executed during aggregation execution. 
+ +The following example shows how to use an `inline` script with the `painless` script language and no script parameters: + [source,js] -------------------------------------------------- +GET /_search { "aggs" : { "price_ranges" : { @@ -139,33 +189,50 @@ It is also possible to customize the key for each range: "source": "doc['price'].value" }, "ranges" : [ - { "to" : 50 }, - { "from" : 50, "to" : 100 }, - { "from" : 100 } + { "to" : 100 }, + { "from" : 100, "to" : 200 }, + { "from" : 200 } ] } } } } -------------------------------------------------- +// CONSOLE + +It is also possible to use stored scripts. Here is a simple stored script: + +[source,js] +-------------------------------------------------- +POST /_scripts/convert_currency +{ + "script": { + "lang": "painless", + "source": "doc[params.field].value * params.conversion_rate" + } +} +-------------------------------------------------- +// CONSOLE +// TEST[setup:sales] -This will interpret the `script` parameter as an `inline` script with the `painless` script language and no script parameters. 
To use a stored script use the following syntax: +And this new stored script can be used in the range aggregation like this: [source,js] -------------------------------------------------- +GET /_search { "aggs" : { "price_ranges" : { "range" : { "script" : { - "id": "my_script", - "params": { - "field": "price" + "id": "convert_currency", <1> + "params": { <2> + "field": "price", + "conversion_rate": 0.835526591 } }, "ranges" : [ - { "to" : 50 }, - { "from" : 50, "to" : 100 }, + { "from" : 0, "to" : 100 }, { "from" : 100 } ] } @@ -173,6 +240,39 @@ This will interpret the `script` parameter as an `inline` script with the `painl } } -------------------------------------------------- +// CONSOLE +// TEST[s/GET \/_search/GET \/_search\?filter_path=aggregations/] +// TEST[continued] +<1> Id of the stored script +<2> Parameters to use when executing the stored script + +////////////////////////// + +[source,js] +-------------------------------------------------- +{ + "aggregations": { + "price_ranges" : { + "buckets": [ + { + "key" : "0.0-100.0", + "from" : 0.0, + "to" : 100.0, + "doc_count" : 2 + }, + { + "key" : "100.0-*", + "from" : 100.0, + "doc_count" : 5 + } + ] + } + } +} +-------------------------------------------------- +// TESTRESPONSE + +////////////////////////// ==== Value Script @@ -180,13 +280,13 @@ Lets say the product prices are in USD but we would like to get the price ranges [source,js] -------------------------------------------------- +GET /sales/_search { "aggs" : { "price_ranges" : { "range" : { "field" : "price", "script" : { - "lang": "painless", "source": "_value * params.conversion_rate", "params" : { "conversion_rate" : 0.8 @@ -202,6 +302,8 @@ Lets say the product prices are in USD but we would like to get the price ranges } } -------------------------------------------------- +// CONSOLE +// TEST[setup:sales] ==== Sub Aggregations @@ -209,15 +311,16 @@ The following example, not only "bucket" the documents to the different buckets 
[source,js] -------------------------------------------------- +GET /_search { "aggs" : { "price_ranges" : { "range" : { "field" : "price", "ranges" : [ - { "to" : 50 }, - { "from" : 50, "to" : 100 }, - { "from" : 100 } + { "to" : 100 }, + { "from" : 100, "to" : 200 }, + { "from" : 200 } ] }, "aggs" : { @@ -229,68 +332,77 @@ The following example, not only "bucket" the documents to the different buckets } } -------------------------------------------------- +// CONSOLE +// TEST[setup:sales] +// TEST[s/GET \/_search/GET \/_search\?filter_path=aggregations/] Response: [source,js] -------------------------------------------------- { - "aggregations": { - "price_ranges" : { - "buckets": [ - { - "to": 50, - "doc_count": 2, - "price_stats": { - "count": 2, - "min": 20, - "max": 47, - "avg": 33.5, - "sum": 67 - } - }, - { - "from": 50, - "to": 100, - "doc_count": 4, - "price_stats": { - "count": 4, - "min": 60, - "max": 98, - "avg": 82.5, - "sum": 330 - } - }, - { - "from": 100, - "doc_count": 4, - "price_stats": { - "count": 4, - "min": 134, - "max": 367, - "avg": 216, - "sum": 864 - } - } - ] + ... + "aggregations": { + "price_ranges": { + "buckets": [ + { + "key": "*-100.0", + "to": 100.0, + "doc_count": 2, + "price_stats": { + "count": 2, + "min": 10.0, + "max": 50.0, + "avg": 30.0, + "sum": 60.0 + } + }, + { + "key": "100.0-200.0", + "from": 100.0, + "to": 200.0, + "doc_count": 2, + "price_stats": { + "count": 2, + "min": 150.0, + "max": 175.0, + "avg": 162.5, + "sum": 325.0 + } + }, + { + "key": "200.0-*", + "from": 200.0, + "doc_count": 3, + "price_stats": { + "count": 3, + "min": 200.0, + "max": 200.0, + "avg": 200.0, + "sum": 600.0 + } } + ] } + } } -------------------------------------------------- +// TESTRESPONSE[s/\.\.\.//] If a sub aggregation is also based on the same value source as the range aggregation (like the `stats` aggregation in the example above) it is possible to leave out the value source definition for it. 
The following will return the same response as above: [source,js] -------------------------------------------------- +GET /_search { "aggs" : { "price_ranges" : { "range" : { "field" : "price", "ranges" : [ - { "to" : 50 }, - { "from" : 50, "to" : 100 }, - { "from" : 100 } + { "to" : 100 }, + { "from" : 100, "to" : 200 }, + { "from" : 200 } ] }, "aggs" : { @@ -302,5 +414,5 @@ If a sub aggregation is also based on the same value source as the range aggrega } } -------------------------------------------------- - +// CONSOLE <1> We don't need to specify the `price` as we "inherit" it by default from the parent `range` aggregation diff --git a/docs/reference/aggregations/bucket/reverse-nested-aggregation.asciidoc b/docs/reference/aggregations/bucket/reverse-nested-aggregation.asciidoc index b6074298e1c03..8797e6041d5f3 100644 --- a/docs/reference/aggregations/bucket/reverse-nested-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/reverse-nested-aggregation.asciidoc @@ -17,36 +17,48 @@ the issue documents as nested documents. The mapping could look like: [source,js] -------------------------------------------------- +PUT /issues { - ... - - "issue" : { - "properties" : { - "tags" : { "type" : "text" }, - "comments" : { <1> - "type" : "nested", - "properties" : { - "username" : { "type" : "keyword" }, - "comment" : { "type" : "text" } + "mappings": { + "issue" : { + "properties" : { + "tags" : { "type" : "keyword" }, + "comments" : { <1> + "type" : "nested", + "properties" : { + "username" : { "type" : "keyword" }, + "comment" : { "type" : "text" } + } } } } } } -------------------------------------------------- - +// CONSOLE <1> The `comments` is an array that holds nested documents under the `issue` object. 
The following aggregations will return the top commenters' username that have commented and per top commenter the top tags of the issues the user has commented on: +////////////////////////// + +[source,js] +-------------------------------------------------- +POST /issues/issue/0?refresh +{"tags": ["tag_1"], "comments": [{"username": "username_1"}]} +-------------------------------------------------- +// CONSOLE +// TEST[continued] + +////////////////////////// + [source,js] -------------------------------------------------- +GET /issues/_search { "query": { - "match": { - "name": "led tv" - } + "match_all": {} }, "aggs": { "comments": { @@ -76,6 +88,9 @@ tags of the issues the user has commented on: } } -------------------------------------------------- +// CONSOLE +// TEST[continued] +// TEST[s/_search/_search\?filter_path=aggregations/] As you can see above, the `reverse_nested` aggregation is put in to a `nested` aggregation as this is the only place in the dsl where the `reversed_nested` aggregation can be used. Its sole purpose is to join back to a parent doc higher @@ -92,23 +107,29 @@ Possible response snippet: { "aggregations": { "comments": { + "doc_count": 1, "top_usernames": { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, "buckets": [ { "key": "username_1", - "doc_count": 12, + "doc_count": 1, "comment_to_issue": { + "doc_count": 1, "top_tags_per_comment": { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, "buckets": [ { - "key": "tag1", - "doc_count": 9 - }, + "key": "tag_1", + "doc_count": 1 + } ... ] } } - }, + } ... 
] } @@ -116,3 +137,4 @@ Possible response snippet: } } -------------------------------------------------- +// TESTRESPONSE[s/\.\.\.//] diff --git a/docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc b/docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc index 93efb779b2c2f..1db54611b31cf 100644 --- a/docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc @@ -15,6 +15,50 @@ They are the terms that have undergone a significant change in popularity measur If the term "H5N1" only exists in 5 documents in a 10 million document index and yet is found in 4 of the 100 documents that make up a user's search results that is significant and probably very relevant to their search. 5/10,000,000 vs 4/100 is a big swing in frequency. +////////////////////////// + +[source,js] +-------------------------------------------------- +PUT /reports +{ + "mappings": { + "report": { + "properties": { + "force": { + "type": "keyword" + }, + "crime_type": { + "type": "keyword" + } + } + } + } +} + +POST /reports/report/_bulk?refresh +{"index":{"_id":0}} +{"force": "British Transport Police", "crime_type": "Bicycle theft"} +{"index":{"_id":1}} +{"force": "British Transport Police", "crime_type": "Bicycle theft"} +{"index":{"_id":2}} +{"force": "British Transport Police", "crime_type": "Bicycle theft"} +{"index":{"_id":3}} +{"force": "British Transport Police", "crime_type": "Robbery"} +{"index":{"_id":4}} +{"force": "Metropolitan Police Service", "crime_type": "Robbery"} +{"index":{"_id":5}} +{"force": "Metropolitan Police Service", "crime_type": "Bicycle theft"} +{"index":{"_id":6}} +{"force": "Metropolitan Police Service", "crime_type": "Robbery"} +{"index":{"_id":7}} +{"force": "Metropolitan Police Service", "crime_type": "Robbery"} + +------------------------------------------------- +// NOTCONSOLE +// TESTSETUP + +////////////////////////// + ==== 
Single-set analysis In the simplest case, the _foreground_ set of interest is the search results matched by a query and the _background_ @@ -24,17 +68,20 @@ Example: [source,js] -------------------------------------------------- +GET /_search { "query" : { "terms" : {"force" : [ "British Transport Police" ]} }, "aggregations" : { - "significantCrimeTypes" : { + "significant_crime_types" : { "significant_terms" : { "field" : "crime_type" } } } } -------------------------------------------------- +// CONSOLE +// TEST[s/_search/_search\?filter_path=aggregations/] Response: @@ -42,9 +89,8 @@ Response: -------------------------------------------------- { ... - "aggregations" : { - "significantCrimeTypes" : { + "significant_crime_types" : { "doc_count": 47347, "bg_count": 5064554, "buckets" : [ @@ -60,6 +106,8 @@ Response: } } -------------------------------------------------- +// TESTRESPONSE[s/\.\.\.//] +// TESTRESPONSE[s/: (0\.)?[0-9]+/: $body.$_path/] When querying an index of all crimes from all police forces, what these results show is that the British Transport Police force stand out as a force dealing with a disproportionately large number of bicycle thefts. Ordinarily, bicycle thefts represent only 1% of crimes (66799/5064554) @@ -81,12 +129,13 @@ Example using a parent aggregation for segmentation: [source,js] -------------------------------------------------- +GET /_search { "aggregations": { "forces": { "terms": {"field": "force"}, "aggregations": { - "significantCrimeTypes": { + "significant_crime_types": { "significant_terms": {"field": "crime_type"} } } @@ -94,6 +143,8 @@ Example using a parent aggregation for segmentation: } } -------------------------------------------------- +// CONSOLE +// TEST[s/_search/_search\?filter_path=aggregations/] Response: @@ -101,14 +152,15 @@ Response: -------------------------------------------------- { ... 
- "aggregations": { "forces": { + "doc_count_error_upper_bound": 1375, + "sum_other_doc_count": 7879845, "buckets": [ { "key": "Metropolitan Police Service", "doc_count": 894038, - "significantCrimeTypes": { + "significant_crime_types": { "doc_count": 894038, "bg_count": 5064554, "buckets": [ @@ -117,7 +169,7 @@ Response: "doc_count": 27617, "score": 0.0599, "bg_count": 53182 - }, + } ... ] } @@ -125,7 +177,7 @@ Response: { "key": "British Transport Police", "doc_count": 47347, - "significantCrimeTypes": { + "significant_crime_types": { "doc_count": 47347, "bg_count": 5064554, "buckets": [ @@ -134,16 +186,19 @@ Response: "doc_count": 3640, "score": 0.371, "bg_count": 66799 - }, + } ... ] } } ] } + } } - -------------------------------------------------- +// TESTRESPONSE[s/\.\.\.//] +// TESTRESPONSE[s/: (0\.)?[0-9]+/: $body.$_path/] +// TESTRESPONSE[s/: "[^"]*"/: $body.$_path/] Now we have anomaly detection for each of the police forces using a single request. @@ -152,15 +207,16 @@ area to identify unusual hot-spots of a particular crime type: [source,js] -------------------------------------------------- +GET /_search { "aggs": { "hotspots": { - "geohash_grid" : { - "field":"location", - "precision":5, + "geohash_grid": { + "field": "location", + "precision": 5 }, "aggs": { - "significantCrimeTypes": { + "significant_crime_types": { "significant_terms": {"field": "crime_type"} } } @@ -168,6 +224,7 @@ area to identify unusual hot-spots of a particular crime type: } } -------------------------------------------------- +// CONSOLE This example uses the `geohash_grid` aggregation to create result buckets that represent geographic areas, and inside each bucket we can identify anomalous levels of a crime type in these tightly-focused areas e.g. 
@@ -283,6 +340,7 @@ Mutual information as described in "Information Retrieval", Manning et al., Chap "include_negatives": true } -------------------------------------------------- +// NOTCONSOLE Mutual information does not differentiate between terms that are descriptive for the subset or for documents outside the subset. The significant terms therefore can contain terms that appear more or less frequent in the subset than outside the subset. To filter out the terms that appear less often in the subset than in documents outside the subset, `include_negatives` can be set to `false`. @@ -293,7 +351,7 @@ Per default, the assumption is that the documents in the bucket are also contain "background_is_superset": false -------------------------------------------------- - +// NOTCONSOLE ===== Chi square Chi square as described in "Information Retrieval", Manning et al., Chapter 13.5.2 can be used as significance score by adding the parameter @@ -304,7 +362,7 @@ Chi square as described in "Information Retrieval", Manning et al., Chapter 13.5 "chi_square": { } -------------------------------------------------- - +// NOTCONSOLE Chi square behaves like mutual information and can be configured with the same parameters `include_negatives` and `background_is_superset`. @@ -317,7 +375,7 @@ Google normalized distance as described in "The Google Similarity Distance", Ci "gnd": { } -------------------------------------------------- - +// NOTCONSOLE `gnd` also accepts the `background_is_superset` parameter. @@ -336,7 +394,7 @@ Multiple observations are typically required to reinforce a view so it is recomm "percentage": { } -------------------------------------------------- - +// NOTCONSOLE ===== Which one is best? @@ -360,7 +418,7 @@ Customized scores can be implemented via a script: } } -------------------------------------------------- - +// NOTCONSOLE Scripts can be inline (as in above example), indexed or stored on disk. For details on the options, see <>. 
Available parameters in the script are @@ -400,6 +458,7 @@ It is possible to only return terms that match more than a configured number of [source,js] -------------------------------------------------- +GET /_search { "aggs" : { "tags" : { @@ -411,7 +470,7 @@ It is possible to only return terms that match more than a configured number of } } -------------------------------------------------- - +// CONSOLE The above aggregation would only return tags which have been found in 10 hits or more. Default value is `3`. @@ -442,9 +501,12 @@ context: [source,js] -------------------------------------------------- +GET /_search { "query" : { - "match" : "madrid" + "match" : { + "city" : "madrid" + } }, "aggs" : { "tags" : { @@ -458,6 +520,7 @@ context: } } -------------------------------------------------- +// CONSOLE The above filter would help focus in on terms that were peculiar to the city of Madrid rather than revealing terms like "Spanish" that are unusual in the full index's worldwide context but commonplace in the subset of documents containing the @@ -491,6 +554,7 @@ ordinals. [source,js] -------------------------------------------------- +GET /_search { "aggs" : { "tags" : { @@ -502,6 +566,7 @@ ordinals. } } -------------------------------------------------- +// CONSOLE <1> the possible values are `map`, `global_ordinals` diff --git a/docs/reference/aggregations/bucket/terms-aggregation.asciidoc b/docs/reference/aggregations/bucket/terms-aggregation.asciidoc index 4d96ad4c4543a..ba6b912780c9f 100644 --- a/docs/reference/aggregations/bucket/terms-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/terms-aggregation.asciidoc @@ -3,10 +3,61 @@ A multi-bucket value source based aggregation where buckets are dynamically built - one per unique value. 
+////////////////////////// + +[source,js] +-------------------------------------------------- +PUT /products +{ + "mappings": { + "product": { + "properties": { + "genre": { + "type": "keyword" + }, + "product": { + "type": "keyword" + } + } + } + } +} + +POST /products/product/_bulk?refresh +{"index":{"_id":0}} +{"genre": "rock", "product": "Product A"} +{"index":{"_id":1}} +{"genre": "rock"} +{"index":{"_id":2}} +{"genre": "rock"} +{"index":{"_id":3}} +{"genre": "jazz", "product": "Product Z"} +{"index":{"_id":4}} +{"genre": "jazz"} +{"index":{"_id":5}} +{"genre": "electronic"} +{"index":{"_id":6}} +{"genre": "electronic"} +{"index":{"_id":7}} +{"genre": "electronic"} +{"index":{"_id":8}} +{"genre": "electronic"} +{"index":{"_id":9}} +{"genre": "electronic"} +{"index":{"_id":10}} +{"genre": "electronic"} + +------------------------------------------------- +// NOTCONSOLE +// TESTSETUP + +////////////////////////// + Example: [source,js] -------------------------------------------------- +GET /_search { "aggs" : { "genres" : { @@ -15,6 +66,8 @@ Example: } } -------------------------------------------------- +// CONSOLE +// TEST[s/_search/_search\?filter_path=aggregations/] Response: @@ -22,30 +75,29 @@ Response: -------------------------------------------------- { ... 
- "aggregations" : { "genres" : { "doc_count_error_upper_bound": 0, <1> "sum_other_doc_count": 0, <2> "buckets" : [ <3> { - "key" : "jazz", - "doc_count" : 10 + "key" : "electronic", + "doc_count" : 6 }, { "key" : "rock", - "doc_count" : 10 + "doc_count" : 3 }, { - "key" : "electronic", - "doc_count" : 10 - }, + "key" : "jazz", + "doc_count" : 2 + } ] } } } -------------------------------------------------- - +// TESTRESPONSE[s/\.\.\.//] <1> an upper bound of the error on the document counts for each term, see <> <2> when there are lots of unique terms, elasticsearch only returns the top terms; this number is the sum of the document counts for all buckets that are not part of the response <3> the list of the top buckets, the meaning of `top` being defined by the <> @@ -74,6 +126,7 @@ A request is made to obtain the top 5 terms in the field product, ordered by des [source,js] -------------------------------------------------- +GET /_search { "aggs" : { "products" : { @@ -85,6 +138,8 @@ A request is made to obtain the top 5 terms in the field product, ordered by des } } -------------------------------------------------- +// CONSOLE +// TEST[s/_search/_search\?filter_path=aggregations/] The terms for each of the three shards are shown below with their respective document counts in brackets: @@ -108,7 +163,6 @@ respective document counts in brackets: The shards will return their top 5 terms so the results from the shards will be: - [width="100%",cols="^2,^2,^2,^2",options="header"] |========================================================= | | Shard A | Shard B | Shard C @@ -165,9 +219,9 @@ otherwise. ==== Calculating Document Count Error -There are two error values which can be shown on the terms aggregation. The first gives a value for the aggregation as +There are two error values which can be shown on the terms aggregation. 
The first gives a value for the aggregation as a whole which represents the maximum potential document count for a term which did not make it into the final list of -terms. This is calculated as the sum of the document count from the last term returned from each shard .For the example +terms. This is calculated as the sum of the document count from the last term returned from each shard. For the example given above the value would be 46 (2 + 15 + 29). This means that in the worst case scenario a term which was not returned could have the 4th highest document count. @@ -175,10 +229,10 @@ could have the 4th highest document count. -------------------------------------------------- { ... - "aggregations" : { "products" : { "doc_count_error_upper_bound" : 46, + "sum_other_doc_count" : 79, "buckets" : [ { "key" : "Product A", @@ -187,33 +241,55 @@ could have the 4th highest document count. { "key" : "Product Z", "doc_count" : 52 - }, + } ... ] } } } -------------------------------------------------- +// TESTRESPONSE[s/\.\.\.//] +// TESTRESPONSE[s/: (\-)?[0-9]+/: $body.$_path/] ==== Per bucket document count error -The second error value can be enabled by setting the `show_term_doc_count_error` parameter to true. This shows an error value -for each term returned by the aggregation which represents the 'worst case' error in the document count and can be useful when -deciding on a value for the `shard_size` parameter. This is calculated by summing the document counts for the last term returned -by all shards which did not return the term. In the example above the error in the document count for Product C would be 15 as -Shard B was the only shard not to return the term and the document count of the last term it did return was 15. The actual document -count of Product C was 54 so the document count was only actually off by 4 even though the worst case was that it would be off by -15. 
Product A, however has an error of 0 for its document count, since every shard returned it we can be confident that the count -returned is accurate. +The second error value can be enabled by setting the `show_term_doc_count_error` parameter to true: [source,js] -------------------------------------------------- +GET /_search { - ... + "aggs" : { + "products" : { + "terms" : { + "field" : "product", + "size" : 5, + "show_term_doc_count_error": true + } + } + } +} +-------------------------------------------------- +// CONSOLE +// TEST[s/_search/_search\?filter_path=aggregations/] + + +This shows an error value for each term returned by the aggregation which represents the 'worst case' error in the document count +and can be useful when deciding on a value for the `shard_size` parameter. This is calculated by summing the document counts for +the last term returned by all shards which did not return the term. In the example above the error in the document count for Product C +would be 15 as Shard B was the only shard not to return the term and the document count of the last term it did return was 15. +The actual document count of Product C was 54 so the document count was only actually off by 4 even though the worst case was that +it would be off by 15. Product A, however has an error of 0 for its document count, since every shard returned it we can be confident +that the count returned is accurate. +[source,js] +-------------------------------------------------- +{ + ... "aggregations" : { "products" : { "doc_count_error_upper_bound" : 46, + "sum_other_doc_count" : 79, "buckets" : [ { "key" : "Product A", @@ -224,13 +300,15 @@ returned is accurate. "key" : "Product Z", "doc_count" : 52, "doc_count_error_upper_bound" : 2 - }, + } ... 
] } } } -------------------------------------------------- +// TESTRESPONSE[s/\.\.\.//] +// TESTRESPONSE[s/: (\-)?[0-9]+/: $body.$_path/] These errors can only be calculated in this way when the terms are ordered by descending document count. When the aggregation is ordered by the terms values themselves (either ascending or descending) there is no error in the document count since if a shard @@ -257,6 +335,7 @@ Ordering the buckets by their doc `_count` in an ascending manner: [source,js] -------------------------------------------------- +GET /_search { "aggs" : { "genres" : { @@ -268,11 +347,13 @@ Ordering the buckets by their doc `_count` in an ascending manner: } } -------------------------------------------------- +// CONSOLE Ordering the buckets alphabetically by their terms in an ascending manner: [source,js] -------------------------------------------------- +GET /_search { "aggs" : { "genres" : { @@ -284,6 +365,7 @@ Ordering the buckets alphabetically by their terms in an ascending manner: } } -------------------------------------------------- +// CONSOLE deprecated[6.0.0, Use `_key` instead of `_term` to order buckets by their term] @@ -291,6 +373,7 @@ Ordering the buckets by single value metrics sub-aggregation (identified by the [source,js] -------------------------------------------------- +GET /_search { "aggs" : { "genres" : { @@ -305,11 +388,13 @@ Ordering the buckets by single value metrics sub-aggregation (identified by the } } -------------------------------------------------- +// CONSOLE Ordering the buckets by multi value metrics sub-aggregation (identified by the aggregation name): [source,js] -------------------------------------------------- +GET /_search { "aggs" : { "genres" : { @@ -324,6 +409,7 @@ Ordering the buckets by multi value metrics sub-aggregation (identified by the a } } -------------------------------------------------- +// CONSOLE [NOTE] .Pipeline aggs cannot be used for sorting @@ -355,6 +441,7 @@ PATH = [ , ]* [ [ , ]* [ 
the possible values are `breadth_first` and `depth_first` @@ -742,6 +866,7 @@ ordinals. [source,js] -------------------------------------------------- +GET /_search { "aggs" : { "tags" : { @@ -753,6 +878,7 @@ ordinals. } } -------------------------------------------------- +// CONSOLE <1> The possible values are `map`, `global_ordinals` @@ -766,6 +892,7 @@ had a value. [source,js] -------------------------------------------------- +GET /_search { "aggs" : { "tags" : { @@ -777,6 +904,7 @@ had a value. } } -------------------------------------------------- +// CONSOLE <1> Documents without a value in the `tags` field will fall into the same bucket as documents that have the value `N/A`. diff --git a/docs/reference/aggregations/matrix/stats-aggregation.asciidoc b/docs/reference/aggregations/matrix/stats-aggregation.asciidoc index bb66115ecd571..3cc207fef7d2a 100644 --- a/docs/reference/aggregations/matrix/stats-aggregation.asciidoc +++ b/docs/reference/aggregations/matrix/stats-aggregation.asciidoc @@ -13,13 +13,34 @@ The `matrix_stats` aggregation is a numeric aggregation that computes the follow `correlation`:: The covariance matrix scaled to a range of -1 to 1, inclusive. Describes the relationship between field distributions. +////////////////////////// + +[source,js] +-------------------------------------------------- +PUT /statistics/doc/0 +{"poverty": 24.0, "income": 50000.0} + +PUT /statistics/doc/1 +{"poverty": 13.0, "income": 95687.0} + +PUT /statistics/doc/2 +{"poverty": 69.0, "income": 7890.0} + +POST /_refresh +-------------------------------------------------- +// NOTCONSOLE +// TESTSETUP + +////////////////////////// + The following example demonstrates the use of matrix stats to describe the relationship between income and poverty. 
[source,js] -------------------------------------------------- +GET /_search { "aggs": { - "matrixstats": { + "statistics": { "matrix_stats": { "fields": ["poverty", "income"] } @@ -27,6 +48,8 @@ The following example demonstrates the use of matrix stats to describe the relat } } -------------------------------------------------- +// CONSOLE +// TEST[s/_search/_search\?filter_path=aggregations/] The aggregation type is `matrix_stats` and the `fields` setting defines the set of fields (as an array) for computing the statistics. The above request returns the following response: @@ -36,7 +59,7 @@ the statistics. The above request returns the following response: { ... "aggregations": { - "matrixstats": { + "statistics": { "doc_count": 50, "fields": [{ "name": "income", @@ -73,6 +96,8 @@ the statistics. The above request returns the following response: } } -------------------------------------------------- +// TESTRESPONSE[s/\.\.\.//] +// TESTRESPONSE[s/: (\-)?[0-9\.E]+/: $body.$_path/] The `doc_count` field indicates the number of documents involved in the computation of the statistics. @@ -96,6 +121,7 @@ This is done by adding a set of fieldname : value mappings to specify default va [source,js] -------------------------------------------------- +GET /_search { "aggs": { "matrixstats": { @@ -107,6 +133,7 @@ This is done by adding a set of fieldname : value mappings to specify default va } } -------------------------------------------------- +// CONSOLE <1> Documents without a value in the `income` field will have the default value `50000`. diff --git a/docs/reference/aggregations/metrics/tophits-aggregation.asciidoc b/docs/reference/aggregations/metrics/tophits-aggregation.asciidoc index 05fb51e6d8530..93414147c43e0 100644 --- a/docs/reference/aggregations/metrics/tophits-aggregation.asciidoc +++ b/docs/reference/aggregations/metrics/tophits-aggregation.asciidoc @@ -177,6 +177,7 @@ relevancy order of the most relevant document in a bucket. 
[source,js] -------------------------------------------------- +POST /sales/_search { "query": { "match": { @@ -184,7 +185,7 @@ relevancy order of the most relevant document in a bucket. } }, "aggs": { - "top-sites": { + "top_sites": { "terms": { "field": "domain", "order": { @@ -207,6 +208,8 @@ relevancy order of the most relevant document in a bucket. } } -------------------------------------------------- +// CONSOLE +// TEST[setup:sales] At the moment the `max` (or `min`) aggregator is needed to make sure the buckets from the `terms` aggregator are ordered according to the score of the most relevant webpage per domain. Unfortunately the `top_hits` aggregator @@ -224,31 +227,129 @@ the same id. In order to determine the identity of a nested hit there is more ne nested hits also include their nested identity. The nested identity is kept under the `_nested` field in the search hit and includes the array field and the offset in the array field the nested hit belongs to. The offset is zero based. -Top hits response snippet with a nested hit, which resides in the third slot of array field `nested_field1` in document with id `1`: +Let's see how it works with a real sample. Consider the following mapping: [source,js] -------------------------------------------------- -... -"hits": { - "total": 25365, - "max_score": 1, - "hits": [ - { - "_index": "a", - "_type": "b", - "_id": "1", - "_score": 1, - "_nested" : { - "field" : "nested_field1", - "offset" : 2 - } - "_source": ... - }, - ... - ] +PUT /sales +{ + "mappings": { + "product" : { + "properties" : { + "tags" : { "type" : "keyword" }, + "comments" : { <1> + "type" : "nested", + "properties" : { + "username" : { "type" : "keyword" }, + "comment" : { "type" : "text" } + } + } + } + } + } +} +-------------------------------------------------- +// CONSOLE +<1> The `comments` field is an array that holds nested documents under the `product` object.
+ +And some documents: + +[source,js] +-------------------------------------------------- +PUT /sales/product/1?refresh +{ + "tags": ["car", "auto"], + "comments": [ + {"username": "baddriver007", "comment": "This car could have better brakes"}, + {"username": "dr_who", "comment": "Where's the autopilot? Can't find it"}, + {"username": "ilovemotorbikes", "comment": "This car has two extra wheels"} + ] +} +-------------------------------------------------- +// CONSOLE +// TEST[continued] + +It's now possible to execute the following `top_hits` aggregation (wrapped in a `nested` aggregation): + +[source,js] +-------------------------------------------------- +POST /sales/_search +{ + "query": { + "term": { "tags": "car" } + }, + "aggs": { + "by_sale": { + "nested" : { + "path" : "comments" + }, + "aggs": { + "by_user": { + "terms": { + "field": "comments.username", + "size": 1 + }, + "aggs": { + "by_nested": { + "top_hits":{} + } + } + } + } + } + } +} +-------------------------------------------------- +// CONSOLE +// TEST[continued] +// TEST[s/_search/_search\?filter_path=aggregations.by_sale.by_user.buckets/] + +Top hits response snippet with a nested hit, which resides in the first slot of array field `comments`: + +[source,js] +-------------------------------------------------- +{ + ... + "aggregations": { + "by_sale": { + "by_user": { + "buckets": [ + { + "key": "baddriver007", + "doc_count": 1, + "by_nested": { + "hits": { + "total": 1, + "max_score": 0.2876821, + "hits": [ + { + "_nested": { + "field": "comments", <1> + "offset": 0 <2> + }, + "_score": 0.2876821, + "_source": { + "comments": { + "comment": "This car could have better brakes", <3> + "username": "baddriver007" + } + } + } + ] + } + } + } + ... + ] + } + } + } } -... 
-------------------------------------------------- +// TESTRESPONSE[s/\.\.\.//] +<1> Name of the array field containing the nested hit +<2> Position of the nested hit in the containing array +<3> Source of the nested hit If `_source` is requested then just the part of the source of the nested object is returned, not the entire source of the document. Also stored fields on the *nested* inner object level are accessible via `top_hits` aggregator residing in a `nested` or `reverse_nested` aggregator. @@ -290,3 +391,4 @@ the second slow of the `nested_child_field` field: } ... -------------------------------------------------- +// NOTCONSOLE \ No newline at end of file diff --git a/docs/reference/cluster/allocation-explain.asciidoc b/docs/reference/cluster/allocation-explain.asciidoc index 8749970aeb27c..615a8a0108427 100644 --- a/docs/reference/cluster/allocation-explain.asciidoc +++ b/docs/reference/cluster/allocation-explain.asciidoc @@ -19,6 +19,7 @@ To explain the allocation of a shard, first an index should exist: -------------------------------------------------- PUT /myindex -------------------------------------------------- +// CONSOLE // TESTSETUP And then the allocation for shards of that index can be explained: @@ -72,6 +73,24 @@ GET /_cluster/allocation/explain This section includes examples of the cluster allocation explain API response output under various scenarios.
+////////////////////////// + +[source,js] +-------------------------------------------------- +PUT /idx?master_timeout=1s&timeout=1s +{"settings": {"index.routing.allocation.include._name": "non_existent_node"} } + +GET /_cluster/allocation/explain +{ + "index": "idx", + "shard": 0, + "primary": true +} +-------------------------------------------------- +// CONSOLE + +////////////////////////// + The API response for an unassigned shard: [source,js] @@ -91,8 +110,9 @@ The API response for an unassigned shard: "node_allocation_decisions" : [ { "node_id" : "8qt2rY-pT6KNZB3-hGfLnw", - "node_name" : "node_t1", + "node_name" : "node-0", "transport_address" : "127.0.0.1:9401", + "node_attributes" : {}, "node_decision" : "no", <4> "weight_ranking" : 1, "deciders" : [ @@ -102,24 +122,15 @@ The API response for an unassigned shard: "explanation" : "node does not match index setting [index.routing.allocation.include] filters [_name:\"non_existent_node\"]" <6> } ] - }, - { - "node_id" : "7Wr-QxLXRLKDxhzNm50pFA", - "node_name" : "node_t0", - "transport_address" : "127.0.0.1:9400", - "node_decision" : "no", - "weight_ranking" : 2, - "deciders" : [ - { - "decider" : "filter", - "decision" : "NO", - "explanation" : "node does not match index setting [index.routing.allocation.include] filters [_name:\"non_existent_node\"]" - } - ] } ] } -------------------------------------------------- +// TESTRESPONSE[s/"at" : "[^"]*"/"at" : $body.$_path/] +// TESTRESPONSE[s/"node_id" : "[^"]*"/"node_id" : $body.$_path/] +// TESTRESPONSE[s/"transport_address" : "[^"]*"/"transport_address" : $body.$_path/] +// TESTRESPONSE[s/"node_attributes" : \{\}/"node_attributes" : $body.$_path/] + <1> The current state of the shard <2> The reason for the shard originally becoming unassigned <3> Whether to allocate the shard @@ -171,6 +182,7 @@ allocated to a node in the cluster: "allocate_explanation" : "cannot allocate because a previous copy of the primary shard existed but can no longer be found on the 
nodes in the cluster" } -------------------------------------------------- +// NOTCONSOLE The API response output for a replica that is unassigned due to delayed allocation: @@ -220,6 +232,7 @@ The API response output for a replica that is unassigned due to delayed allocati ] } -------------------------------------------------- +// NOTCONSOLE <1> The configured delay before allocating a replica shard that does not exist due to the node holding it leaving the cluster <2> The remaining delay before allocating the replica shard <3> Information about the shard data found on a node @@ -267,6 +280,7 @@ remain on its current node and is required to move: ] } -------------------------------------------------- +// NOTCONSOLE <1> Whether the shard is allowed to remain on its current node <2> The deciders that factored into the decision of why the shard is not allowed to remain on its current node <3> Whether the shard is allowed to be allocated to another node @@ -302,6 +316,7 @@ because moving the shard to another node does not form a better cluster balance: ] } -------------------------------------------------- +// NOTCONSOLE <1> Whether rebalancing is allowed on the cluster <2> Whether the shard can be rebalanced to another node <3> The reason the shard cannot be rebalanced to the node, in this case indicating that it offers no better balance than the current node