diff --git a/docs/build.gradle b/docs/build.gradle index 84c39992fe632..bc6afa37ebaae 100644 --- a/docs/build.gradle +++ b/docs/build.gradle @@ -53,18 +53,6 @@ buildRestTests.expectedUnconvertedCandidates = [ 'reference/aggregations/pipeline/serial-diff-aggregation.asciidoc', 'reference/aggregations/pipeline/stats-bucket-aggregation.asciidoc', 'reference/aggregations/pipeline/sum-bucket-aggregation.asciidoc', - 'reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc', - 'reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc', - 'reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc', - 'reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc', - 'reference/analysis/tokenfilters/elision-tokenfilter.asciidoc', - 'reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc', - 'reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc', - 'reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc', - 'reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc', - 'reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc', - 'reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc', - 'reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc', 'reference/cat/snapshots.asciidoc', 'reference/cat/templates.asciidoc', 'reference/cat/thread_pool.asciidoc', @@ -131,10 +119,14 @@ integTestCluster { configFile 'scripts/my_map_script.painless' configFile 'scripts/my_combine_script.painless' configFile 'scripts/my_reduce_script.painless' + configFile 'analysis/example_word_list.txt' + configFile 'analysis/hyphenation_patterns.xml' configFile 'analysis/synonym.txt' configFile 'analysis/stemmer_override.txt' configFile 'userdict_ja.txt' configFile 'KeywordTokenizer.rbbi' + extraConfigFile 'hunspell/en_US/en_US.aff', '../core/src/test/resources/indices/analyze/conf_dir/hunspell/en_US/en_US.aff' + extraConfigFile 'hunspell/en_US/en_US.dic', '../core/src/test/resources/indices/analyze/conf_dir/hunspell/en_US/en_US.dic' // Whitelist reindexing from the local node so we can test it. setting 'reindex.remote.whitelist', '127.0.0.1:*' } diff --git a/docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc index 68891c18e2365..73d35549da8b6 100644 --- a/docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc @@ -8,17 +8,21 @@ equivalents, if one exists. Example: [source,js] -------------------------------------------------- -"index" : { - "analysis" : { - "analyzer" : { - "default" : { - "tokenizer" : "standard", - "filter" : ["standard", "asciifolding"] +PUT /asciifold_example +{ + "settings" : { + "analysis" : { + "analyzer" : { + "default" : { + "tokenizer" : "standard", + "filter" : ["standard", "asciifolding"] + } } } } } -------------------------------------------------- +// CONSOLE Accepts `preserve_original` setting which defaults to false but if true will keep the original token as well as emit the folded token. 
For @@ -26,20 +30,24 @@ example: [source,js] -------------------------------------------------- -"index" : { - "analysis" : { - "analyzer" : { - "default" : { - "tokenizer" : "standard", - "filter" : ["standard", "my_ascii_folding"] - } - }, - "filter" : { - "my_ascii_folding" : { - "type" : "asciifolding", - "preserve_original" : true +PUT /asciifold_example +{ + "settings" : { + "analysis" : { + "analyzer" : { + "default" : { + "tokenizer" : "standard", + "filter" : ["standard", "my_ascii_folding"] + } + }, + "filter" : { + "my_ascii_folding" : { + "type" : "asciifolding", + "preserve_original" : true + } } } } } -------------------------------------------------- +// CONSOLE diff --git a/docs/reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc index c1e278b2183b3..cc26d025f04f9 100644 --- a/docs/reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc @@ -16,8 +16,9 @@ Bigrams are generated for characters in `han`, `hiragana`, `katakana` and [source,js] -------------------------------------------------- +PUT /cjk_bigram_example { - "index" : { + "settings" : { "analysis" : { "analyzer" : { "han_bigrams" : { @@ -40,3 +41,4 @@ Bigrams are generated for characters in `han`, `hiragana`, `katakana` and } } -------------------------------------------------- +// CONSOLE diff --git a/docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc index eb1469af80344..c7d8ff660d347 100644 --- a/docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc @@ -41,21 +41,33 @@ Here is an example: [source,js] -------------------------------------------------- -index : - analysis : - analyzer : - index_grams : - tokenizer : whitespace - filter : [common_grams] - search_grams : - tokenizer : whitespace - filter : [common_grams_query] - filter : - common_grams : - type : common_grams - common_words: [a, an, the] - common_grams_query : - type : common_grams - query_mode: true - common_words: [a, an, the] +PUT /common_grams_example +{ + "settings": { + "analysis": { + "analyzer": { + "index_grams": { + "tokenizer": "whitespace", + "filter": ["common_grams"] + }, + "search_grams": { + "tokenizer": "whitespace", + "filter": ["common_grams_query"] + } + }, + "filter": { + "common_grams": { + "type": "common_grams", + "common_words": ["a", "an", "the"] + }, + "common_grams_query": { + "type": "common_grams", + "query_mode": true, + "common_words": ["a", "an", "the"] + } + } + } + } +} -------------------------------------------------- +// CONSOLE diff --git a/docs/reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc index 1268727b2efd8..a47b66ecfebd9 100644 --- a/docs/reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc @@ -1,5 +1,5 @@ [[analysis-compound-word-tokenfilter]] -=== Compound Word Token Filter +=== Compound Word Token Filters The `hyphenation_decompounder` and `dictionary_decompounder` token filters can decompose compound words found in many German languages into word parts.
@@ -84,20 +84,31 @@ Here is an example: [source,js] -------------------------------------------------- -index : - analysis : - analyzer : - myAnalyzer2 : - type : custom - tokenizer : standard - filter : [myTokenFilter1, myTokenFilter2] - filter : - myTokenFilter1 : - type : dictionary_decompounder - word_list: [one, two, three] - myTokenFilter2 : - type : hyphenation_decompounder - word_list_path: path/to/words.txt - hyphenation_patterns_path: path/to/fop.xml - max_subword_size : 22 +PUT /compound_word_example +{ + "index": { + "analysis": { + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["dictionary_decompounder", "hyphenation_decompounder"] + } + }, + "filter": { + "dictionary_decompounder": { + "type": "dictionary_decompounder", + "word_list": ["one", "two", "three"] + }, + "hyphenation_decompounder": { + "type" : "hyphenation_decompounder", + "word_list_path": "analysis/example_word_list.txt", + "hyphenation_patterns_path": "analysis/hyphenation_patterns.xml", + "max_subword_size": 22 + } + } + } + } +} -------------------------------------------------- +// CONSOLE diff --git a/docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc index c44ccffd51e61..956c5ad13d034 100644 --- a/docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc @@ -9,20 +9,24 @@ example: [source,js] -------------------------------------------------- -"index" : { - "analysis" : { - "analyzer" : { - "default" : { - "tokenizer" : "standard", - "filter" : ["standard", "elision"] - } - }, - "filter" : { - "elision" : { - "type" : "elision", - "articles" : ["l", "m", "t", "qu", "n", "s", "j"] +PUT /elision_example +{ + "settings" : { + "analysis" : { + "analyzer" : { + "default" : { + "tokenizer" : "standard", + "filter" : ["standard", "elision"] + } + }, + "filter" : { + "elision" : { + "type" : "elision", + "articles" : ["l", "m", "t", "qu", "n", "s", "j"] + } } } } } -------------------------------------------------- +// CONSOLE diff --git a/docs/reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc index 9b3f188d95145..cef687f761905 100644 --- a/docs/reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc @@ -10,7 +10,7 @@ one or more `*.dic` files (all of which will automatically be picked up). 
For example, assuming the default hunspell location is used, the following directory layout will define the `en_US` dictionary: -[source,js] +[source,txt] -------------------------------------------------- - conf |-- hunspell @@ -42,24 +42,28 @@ settings: [source,js] -------------------------------------------------- +PUT /hunspell_example { - "analysis" : { - "analyzer" : { - "en" : { - "tokenizer" : "standard", - "filter" : [ "lowercase", "en_US" ] - } - }, - "filter" : { - "en_US" : { - "type" : "hunspell", - "locale" : "en_US", - "dedup" : true + "settings": { + "analysis" : { + "analyzer" : { + "en" : { + "tokenizer" : "standard", + "filter" : [ "lowercase", "en_US" ] + } + }, + "filter" : { + "en_US" : { + "type" : "hunspell", + "locale" : "en_US", + "dedup" : true + } } } } } -------------------------------------------------- +// CONSOLE The hunspell token filter accepts four options: diff --git a/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc index bb1103dff8a2d..afaf4f8fa8c46 100644 --- a/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc @@ -1,7 +1,7 @@ [[analysis-keep-types-tokenfilter]] === Keep Types Token Filter -A token filter of type `keep_types` that only keeps tokens with a token type +A token filter of type `keep_types` that only keeps tokens with a token type contained in a predefined set. @@ -14,24 +14,61 @@ types:: a list of types to keep [float] === Settings example +You can set it up like: + [source,js] -------------------------------------------------- +PUT /keep_types_example { - "index" : { + "settings" : { "analysis" : { "analyzer" : { "my_analyzer" : { "tokenizer" : "standard", "filter" : ["standard", "lowercase", "extract_numbers"] - }, + } }, "filter" : { "extract_numbers" : { "type" : "keep_types", "types" : [ "<NUM>" ] - }, + } } } } } -------------------------------------------------- +// CONSOLE + +And test it like: + +[source,js] +-------------------------------------------------- +POST /keep_types_example/_analyze +{ + "analyzer" : "my_analyzer", + "text" : "this is just 1 a test" +} +-------------------------------------------------- +// CONSOLE +// TEST[continued] + +And it'd respond: + +[source,js] +-------------------------------------------------- +{ + "tokens": [ + { + "token": "1", + "start_offset": 13, + "end_offset": 14, + "type": "<NUM>", + "position": 3 + } + ] +} +-------------------------------------------------- +// TESTRESPONSE + +Note how only the `<NUM>` token is in the output.
diff --git a/docs/reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc index e4abbeff15dea..50c74942a0101 100644 --- a/docs/reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc @@ -20,17 +20,18 @@ keep_words_case:: a boolean indicating whether to lower case the words (defaults [source,js] -------------------------------------------------- +PUT /keep_words_example { - "index" : { + "settings" : { "analysis" : { "analyzer" : { - "my_analyzer" : { + "example_1" : { "tokenizer" : "standard", "filter" : ["standard", "lowercase", "words_till_three"] }, - "my_analyzer1" : { + "example_2" : { "tokenizer" : "standard", - "filter" : ["standard", "lowercase", "words_on_file"] + "filter" : ["standard", "lowercase", "words_in_file"] } }, "filter" : { @@ -38,12 +39,13 @@ keep_words_case:: a boolean indicating whether to lower case the words (defaults "type" : "keep", "keep_words" : [ "one", "two", "three"] }, - "words_on_file" : { + "words_in_file" : { "type" : "keep", - "keep_words_path" : "/path/to/word/file" + "keep_words_path" : "analysis/example_word_list.txt" } } } } } -------------------------------------------------- +// CONSOLE diff --git a/docs/reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc index abf7e8d7beaef..1f1e4e655c55e 100644 --- a/docs/reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc @@ -19,19 +19,124 @@ in the text. `false`. |======================================================================= -Here is an example: +You can configure it like: [source,js] -------------------------------------------------- -index : - analysis : - analyzer : - myAnalyzer : - type : custom - tokenizer : standard - filter : [lowercase, protwords, porter_stem] - filter : - protwords : - type : keyword_marker - keywords_path : analysis/protwords.txt +PUT /keyword_marker_example +{ + "settings": { + "analysis": { + "analyzer": { + "protect_cats": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "protect_cats", "porter_stem"] + }, + "normal": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "porter_stem"] + } + }, + "filter": { + "protect_cats": { + "type": "keyword_marker", + "keywords": ["cats"] + } + } + } + } +} -------------------------------------------------- +// CONSOLE + +And test it with: + +[source,js] +-------------------------------------------------- +POST /keyword_marker_example/_analyze +{ + "analyzer" : "protect_cats", + "text" : "I like cats" +} +-------------------------------------------------- +// CONSOLE +// TEST[continued] + +And it'd respond: + +[source,js] +-------------------------------------------------- +{ + "tokens": [ + { + "token": "i", + "start_offset": 0, + "end_offset": 1, + "type": "<ALPHANUM>", + "position": 0 + }, + { + "token": "like", + "start_offset": 2, + "end_offset": 6, + "type": "<ALPHANUM>", + "position": 1 + }, + { + "token": "cats", + "start_offset": 7, + "end_offset": 11, + "type": "<ALPHANUM>", + "position": 2 + } + ] +} +-------------------------------------------------- +// TESTRESPONSE + +As compared to the `normal` analyzer which has `cats` stemmed to `cat`: + +[source,js] +-------------------------------------------------- +POST /keyword_marker_example/_analyze +{ + "analyzer" : 
"normal", + "text" : "I like cats" +} +-------------------------------------------------- +// CONSOLE +// TEST[continued] + +Response: + +[source,js] +-------------------------------------------------- +{ + "tokens": [ + { + "token": "i", + "start_offset": 0, + "end_offset": 1, + "type": "", + "position": 0 + }, + { + "token": "like", + "start_offset": 2, + "end_offset": 6, + "type": "", + "position": 1 + }, + { + "token": "cat", + "start_offset": 7, + "end_offset": 11, + "type": "", + "position": 2 + } + ] +} +-------------------------------------------------- +// TESTRESPONSE diff --git a/docs/reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc index aa8c7a9b75285..044e8c1476951 100644 --- a/docs/reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc @@ -9,18 +9,85 @@ subsequent stemmer will be indexed twice. Therefore, consider adding a `unique` filter with `only_on_same_position` set to `true` to drop unnecessary duplicates. -Here is an example: +Here is an example of using the `keyword_repeat` token filter to +preserve both the stemmed and unstemmed version of tokens: [source,js] -------------------------------------------------- -index : - analysis : - analyzer : - myAnalyzer : - type : custom - tokenizer : standard - filter : [lowercase, keyword_repeat, porter_stem, unique_stem] - unique_stem: - type: unique - only_on_same_position : true +PUT /keyword_repeat_example +{ + "settings": { + "analysis": { + "analyzer": { + "stemmed_and_unstemmed": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "keyword_repeat", "porter_stem", "unique_stem"] + } + }, + "filter": { + "unique_stem": { + "type": "unique", + "only_on_same_position": true + } + } + } + } +} -------------------------------------------------- +// CONSOLE + +And you can test it with: + +[source,js] +-------------------------------------------------- +POST /keyword_repeat_example/_analyze +{ + "analyzer" : "stemmed_and_unstemmed", + "text" : "I like cats" +} +-------------------------------------------------- +// CONSOLE +// TEST[continued] + +And it'd respond: + +[source,js] +-------------------------------------------------- +{ + "tokens": [ + { + "token": "i", + "start_offset": 0, + "end_offset": 1, + "type": "", + "position": 0 + }, + { + "token": "like", + "start_offset": 2, + "end_offset": 6, + "type": "", + "position": 1 + }, + { + "token": "cats", + "start_offset": 7, + "end_offset": 11, + "type": "", + "position": 2 + }, + { + "token": "cat", + "start_offset": 7, + "end_offset": 11, + "type": "", + "position": 2 + } + ] +} +-------------------------------------------------- +// TESTRESPONSE + +Which preserves both the `cat` and `cats` tokens. Compare this to the example +on the <>. 
diff --git a/docs/reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc index a6598be6095ec..ba2018c107626 100644 --- a/docs/reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc @@ -18,15 +18,25 @@ Here is an example: [source,js] -------------------------------------------------- -index : - analysis : - analyzer : - myAnalyzer : - type : custom - tokenizer : standard - filter : [lowercase, five_token_limit] - filter : - five_token_limit : - type : limit - max_token_count : 5 +PUT /limit_example +{ + "settings": { + "analysis": { + "analyzer": { + "limit_example": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "five_token_limit"] + } + }, + "filter": { + "five_token_limit": { + "type": "limit", + "max_token_count": 5 + } + } + } + } +} -------------------------------------------------- +// CONSOLE diff --git a/docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc index 674dfe541c9cf..519fd77ba2afd 100644 --- a/docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc @@ -10,28 +10,30 @@ custom analyzer [source,js] -------------------------------------------------- -index : - analysis : - analyzer : - myAnalyzer2 : - type : custom - tokenizer : myTokenizer1 - filter : [myTokenFilter1, myGreekLowerCaseFilter] - char_filter : [my_html] - tokenizer : - myTokenizer1 : - type : standard - max_token_length : 900 - filter : - myTokenFilter1 : - type : stop - stopwords : [stop1, stop2, stop3, stop4] - myGreekLowerCaseFilter : - type : lowercase - language : greek - char_filter : - my_html : - type : html_strip - escaped_tags : [xxx, yyy] - read_ahead : 1024 +PUT /lowercase_example +{ + "settings": { + "analysis": { + "analyzer": { + "standard_lowercase_example": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase"] + }, + "greek_lowercase_example": { + "type": "custom", + "tokenizer": "standard", + "filter": ["greek_lowercase"] + } + }, + "filter": { + "greek_lowercase": { + "type": "lowercase", + "language": "greek" + } + } + } + } +} -------------------------------------------------- +// CONSOLE diff --git a/docs/src/test/cluster/config/analysis/example_word_list.txt b/docs/src/test/cluster/config/analysis/example_word_list.txt new file mode 100644 index 0000000000000..f79aea42af203 --- /dev/null +++ b/docs/src/test/cluster/config/analysis/example_word_list.txt @@ -0,0 +1,4 @@ +test +list +of +words diff --git a/docs/src/test/cluster/config/analysis/hyphenation_patterns.xml b/docs/src/test/cluster/config/analysis/hyphenation_patterns.xml new file mode 100644 index 0000000000000..6241b3fc6cc80 --- /dev/null +++ b/docs/src/test/cluster/config/analysis/hyphenation_patterns.xml @@ -0,0 +1,21 @@ + + + + + + + + + + + +aA + + + + + + +.a2 + +
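A quick way to check the `preserve_original` behaviour described in the asciifolding section is to run `_analyze` against the `asciifold_example` index once the second asciifolding snippet has created it. This request is only a sketch and is not part of the patch or its docs tests:

[source,js]
--------------------------------------------------
POST /asciifold_example/_analyze
{
  "text" : "résumé"
}
--------------------------------------------------

Because the index's `default` analyzer uses `my_ascii_folding` with `preserve_original` set to `true`, the response should contain both the original `résumé` token and the folded `resume` token at the same position.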