CONSOLEify analysis docs
Converts the analysis docs that were marked as json into `CONSOLE`
format. A few of them were in yaml but marked as json for historical
reasons. I added more complete examples for a few of the less
obvious-sounding ones.

Relates to #18160
nik9000 committed Apr 2, 2017
1 parent c2c48ed commit fa81e2a
Showing 15 changed files with 439 additions and 158 deletions.
16 changes: 4 additions & 12 deletions docs/build.gradle
@@ -53,18 +53,6 @@ buildRestTests.expectedUnconvertedCandidates = [
'reference/aggregations/pipeline/serial-diff-aggregation.asciidoc',
'reference/aggregations/pipeline/stats-bucket-aggregation.asciidoc',
'reference/aggregations/pipeline/sum-bucket-aggregation.asciidoc',
'reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/elision-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc',
'reference/cat/snapshots.asciidoc',
'reference/cat/templates.asciidoc',
'reference/cat/thread_pool.asciidoc',
@@ -131,10 +119,14 @@ integTestCluster {
configFile 'scripts/my_map_script.painless'
configFile 'scripts/my_combine_script.painless'
configFile 'scripts/my_reduce_script.painless'
configFile 'analysis/example_word_list.txt'
configFile 'analysis/hyphenation_patterns.xml'
configFile 'analysis/synonym.txt'
configFile 'analysis/stemmer_override.txt'
configFile 'userdict_ja.txt'
configFile 'KeywordTokenizer.rbbi'
extraConfigFile 'hunspell/en_US/en_US.aff', '../core/src/test/resources/indices/analyze/conf_dir/hunspell/en_US/en_US.aff'
extraConfigFile 'hunspell/en_US/en_US.dic', '../core/src/test/resources/indices/analyze/conf_dir/hunspell/en_US/en_US.dic'
// Whitelist reindexing from the local node so we can test it.
setting 'reindex.remote.whitelist', '127.0.0.1:*'
}
docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc
@@ -8,38 +8,46 @@ equivalents, if one exists. Example:

[source,js]
--------------------------------------------------
"index" : {
"analysis" : {
"analyzer" : {
"default" : {
"tokenizer" : "standard",
"filter" : ["standard", "asciifolding"]
PUT /asciifold_example
{
"settings" : {
"analysis" : {
"analyzer" : {
"default" : {
"tokenizer" : "standard",
"filter" : ["standard", "asciifolding"]
}
}
}
}
}
--------------------------------------------------
// CONSOLE

Accepts a `preserve_original` setting which defaults to `false` but, if set
to `true`, will emit the original token as well as the folded token. For
example:

[source,js]
--------------------------------------------------
"index" : {
"analysis" : {
"analyzer" : {
"default" : {
"tokenizer" : "standard",
"filter" : ["standard", "my_ascii_folding"]
}
},
"filter" : {
"my_ascii_folding" : {
"type" : "asciifolding",
"preserve_original" : true
PUT /asciifold_example
{
"settings" : {
"analysis" : {
"analyzer" : {
"default" : {
"tokenizer" : "standard",
"filter" : ["standard", "my_ascii_folding"]
}
},
"filter" : {
"my_ascii_folding" : {
"type" : "asciifolding",
"preserve_original" : true
}
}
}
}
}
--------------------------------------------------
// CONSOLE
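
To see what `preserve_original` does, one could send a quick `_analyze` request
against the index defined above (this is only an illustrative sketch, not part of
the original page; it assumes the `asciifold_example` index and its `default`
analyzer):

[source,js]
--------------------------------------------------
POST /asciifold_example/_analyze
{
  "analyzer" : "default",
  "text" : "déjà vu"
}
--------------------------------------------------
// CONSOLE

With `preserve_original` enabled, the output should contain both the folded
terms (`deja`, `vu`) and the unfolded original `déjà`.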
docs/reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc
@@ -16,8 +16,9 @@ Bigrams are generated for characters in `han`, `hiragana`, `katakana` and

[source,js]
--------------------------------------------------
PUT /cjk_bigram_example
{
"index" : {
"settings" : {
"analysis" : {
"analyzer" : {
"han_bigrams" : {
@@ -40,3 +41,4 @@ Bigrams are generated for characters in `han`, `hiragana`, `katakana` and
}
}
--------------------------------------------------
// CONSOLE
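
As a quick check of the `han_bigrams` analyzer defined above, a request along
these lines could be used (an illustrative sketch only, assuming the
`cjk_bigram_example` index; the text is an arbitrary Han sample):

[source,js]
--------------------------------------------------
POST /cjk_bigram_example/_analyze
{
  "analyzer" : "han_bigrams",
  "text" : "東京都"
}
--------------------------------------------------
// CONSOLE

The response should contain overlapping bigrams of the Han characters, for
example `東京` and `京都`.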
docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc
@@ -41,21 +41,33 @@ Here is an example:

[source,js]
--------------------------------------------------
PUT /common_grams_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "index_grams": {
          "tokenizer": "whitespace",
          "filter": ["common_grams"]
        },
        "search_grams": {
          "tokenizer": "whitespace",
          "filter": ["common_grams_query"]
        }
      },
      "filter": {
        "common_grams": {
          "type": "common_grams",
          "common_words": ["a", "an", "the"]
        },
        "common_grams_query": {
          "type": "common_grams",
          "query_mode": true,
          "common_words": ["a", "an", "the"]
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE
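
The difference between the two analyzers can be seen by analyzing the same
phrase with each of them (a sketch only, not part of the original page,
assuming the `common_grams_example` index above):

[source,js]
--------------------------------------------------
POST /common_grams_example/_analyze
{
  "analyzer" : "index_grams",
  "text" : "the quick brown fox"
}
--------------------------------------------------
// CONSOLE

The `index_grams` analyzer should emit the plain terms plus bigrams involving
the common words (such as `the` and `the_quick`), while repeating the request
with `search_grams` should drop the bare common words because of `query_mode`.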
docs/reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc
@@ -1,5 +1,5 @@
[[analysis-compound-word-tokenfilter]]
=== Compound Word Token Filters

The `hyphenation_decompounder` and `dictionary_decompounder` token filters can
decompose compound words found in many Germanic languages into word parts.
@@ -84,20 +84,31 @@ Here is an example:

[source,js]
--------------------------------------------------
PUT /compound_word_example
{
  "index": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["dictionary_decompounder", "hyphenation_decompounder"]
        }
      },
      "filter": {
        "dictionary_decompounder": {
          "type": "dictionary_decompounder",
          "word_list": ["one", "two", "three"]
        },
        "hyphenation_decompounder": {
          "type": "hyphenation_decompounder",
          "word_list_path": "analysis/example_word_list.txt",
          "hyphenation_patterns_path": "analysis/hyphenation_patterns.xml",
          "max_subword_size": 22
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE
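
To try out the dictionary-based variant, an `_analyze` request like the
following could be used (a sketch only, not part of the original page,
assuming the `compound_word_example` index above; the made-up token is chosen
to match the inline `word_list`):

[source,js]
--------------------------------------------------
POST /compound_word_example/_analyze
{
  "analyzer" : "my_analyzer",
  "text" : "onetwothree"
}
--------------------------------------------------
// CONSOLE

The `dictionary_decompounder` should emit the original token plus the subwords
`one`, `two` and `three`; whatever the `hyphenation_decompounder` adds depends
on the contents of the word list and hyphenation pattern files.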
28 changes: 16 additions & 12 deletions docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc
@@ -9,20 +9,24 @@ example:

[source,js]
--------------------------------------------------
"index" : {
"analysis" : {
"analyzer" : {
"default" : {
"tokenizer" : "standard",
"filter" : ["standard", "elision"]
}
},
"filter" : {
"elision" : {
"type" : "elision",
"articles" : ["l", "m", "t", "qu", "n", "s", "j"]
PUT /elision_example
{
"settings" : {
"analysis" : {
"analyzer" : {
"default" : {
"tokenizer" : "standard",
"filter" : ["standard", "elision"]
}
},
"filter" : {
"elision" : {
"type" : "elision",
"articles" : ["l", "m", "t", "qu", "n", "s", "j"]
}
}
}
}
}
--------------------------------------------------
// CONSOLE
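
The effect of the filter can be checked with a request such as this (a sketch
only, not part of the original page, assuming the `elision_example` index
above):

[source,js]
--------------------------------------------------
POST /elision_example/_analyze
{
  "analyzer" : "default",
  "text" : "l'avion"
}
--------------------------------------------------
// CONSOLE

Since `l` is in the `articles` list, the elided article should be stripped and
the output should be the single token `avion`.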
30 changes: 17 additions & 13 deletions docs/reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc
@@ -10,7 +10,7 @@ one or more `*.dic` files (all of which will automatically be picked up).
For example, assuming the default hunspell location is used, the
following directory layout will define the `en_US` dictionary:

[source,txt]
--------------------------------------------------
- conf
|-- hunspell
@@ -42,24 +42,28 @@ settings:

[source,js]
--------------------------------------------------
PUT /hunspell_example
{
  "settings" : {
    "analysis" : {
      "analyzer" : {
        "en" : {
          "tokenizer" : "standard",
          "filter" : [ "lowercase", "en_US" ]
        }
      },
      "filter" : {
        "en_US" : {
          "type" : "hunspell",
          "locale" : "en_US",
          "dedup" : true
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE
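
A quick sanity check of the `en` analyzer could look like this (a sketch only,
not part of the original page, assuming the `hunspell_example` index above and
the bundled `en_US` dictionary; the exact tokens depend on the dictionary
files):

[source,js]
--------------------------------------------------
POST /hunspell_example/_analyze
{
  "analyzer" : "en",
  "text" : "the foxes jumping quickly"
}
--------------------------------------------------
// CONSOLE

Stems such as `fox` and `jump` would be expected in the output.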

The hunspell token filter accepts four options:

docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc
@@ -1,7 +1,7 @@
[[analysis-keep-types-tokenfilter]]
=== Keep Types Token Filter

A token filter of type `keep_types` that only keeps tokens with a token type
contained in a predefined set.


@@ -14,24 +14,61 @@ types:: a list of types to keep
[float]
=== Settings example

You can set it up like:

[source,js]
--------------------------------------------------
PUT /keep_types_example
{
  "settings" : {
    "analysis" : {
      "analyzer" : {
        "my_analyzer" : {
          "tokenizer" : "standard",
          "filter" : ["standard", "lowercase", "extract_numbers"]
        }
      },
      "filter" : {
        "extract_numbers" : {
          "type" : "keep_types",
          "types" : [ "<NUM>" ]
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE

And test it like:

[source,js]
--------------------------------------------------
POST /keep_types_example/_analyze
{
"analyzer" : "my_analyzer",
"text" : "this is just 1 a test"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]

And it'd respond:

[source,js]
--------------------------------------------------
{
"tokens": [
{
"token": "1",
"start_offset": 13,
"end_offset": 14,
"type": "<NUM>",
"position": 3
}
]
}
--------------------------------------------------
// TESTRESPONSE

Note how only the `<NUM>` token is in the output.
