diff --git a/.github/workflows/_integration_tests.yml b/.github/workflows/_integration_tests.yml index cffd0edb..a3761029 100644 --- a/.github/workflows/_integration_tests.yml +++ b/.github/workflows/_integration_tests.yml @@ -10,6 +10,7 @@ jobs: node-version: [18.x, 20.x, 22.x] es-version: [7.6.1] jdk-version: [oraclejdk11] + icuTokenizer: [true, false] steps: - uses: actions/checkout@v4 - name: Install node.js ${{ matrix.node-version }} @@ -23,6 +24,10 @@ jobs: run: ./scripts/setup_ci.sh - name: Run integration tests run: | + if [ "${{ matrix.icuTokenizer }}" = "true" ]; then + jq -n '{ schema: { icuTokenizer: true } }' > $(pwd)/config-icu.json + export PELIAS_CONFIG=$(pwd)/config-icu.json + fi npm install curl http://127.0.0.1:9200/ ./bin/create_index diff --git a/.github/workflows/_unit_tests.yml b/.github/workflows/_unit_tests.yml index b371970f..d0ef57f0 100644 --- a/.github/workflows/_unit_tests.yml +++ b/.github/workflows/_unit_tests.yml @@ -8,6 +8,7 @@ jobs: os: - ubuntu-22.04 node-version: [18.x, 20.x, 22.x] + icuTokenizer: [true, false] steps: - uses: actions/checkout@v4 - name: Install node.js ${{ matrix.node-version }} @@ -17,4 +18,8 @@ jobs: - name: Run unit tests run: | npm install - npm run test + if [ "${{ matrix.icuTokenizer }}" = "true" ]; then + jq -n '{ schema: { icuTokenizer: true } }' > $(pwd)/config-icu.json + export PELIAS_CONFIG=$(pwd)/config-icu.json + fi + npm run test \ No newline at end of file diff --git a/.gitignore b/.gitignore index b0e3907b..27df0198 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ node_modules npm-debug.log .DS_Store +config-icu.json diff --git a/configValidation.js b/configValidation.js index 41e824e2..02d462ed 100644 --- a/configValidation.js +++ b/configValidation.js @@ -2,10 +2,12 @@ const Joi = require('@hapi/joi'); // Schema Configuration // schema.indexName: populated by defaults if not overridden +// schema.icuTokenizer: boolean, optional, defaults to false // esclient: object, validation performed by elasticsearch module const schema = Joi.object().required().keys({ schema: Joi.object().required().keys({ indexName: Joi.string().required(), + icuTokenizer: Joi.boolean().optional() }), esclient: Joi.object().required() }).unknown(true); diff --git a/integration/analyzer_peliasIndexOneEdgeGram.js b/integration/analyzer_peliasIndexOneEdgeGram.js index 1db32362..cd4fdb4c 100644 --- a/integration/analyzer_peliasIndexOneEdgeGram.js +++ b/integration/analyzer_peliasIndexOneEdgeGram.js @@ -1,8 +1,9 @@ // validate analyzer is behaving as expected -var tape = require('tape'), +const tape = require('tape'), Suite = require('../test/elastictest/Suite'), - punctuation = require('../punctuation'); + punctuation = require('../punctuation'), + config = require('pelias-config').generate(); module.exports.tests = {}; @@ -85,6 +86,15 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'british_american_english', 'town theatre', ['0:town', '1:theatre', '1:theater'] ); assertAnalysis( 'british_american_english', 'town theater', ['0:town', '1:theater', '1:theatre'] ); + if (config.schema.icuTokenizer) { + assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', [ + '0:ซ', '0:ซอ', '0:ซอย', + '1:เพชรบุรี1', '1:เพชรบุรี', '1:เพชรบุร', '1:เพชรบุ', '1:เพชรบ', '1:เพชร', '1:เพช', '1:เพ', '1:เ', + '2:f', '2:fo', '2:foo'] ); + } else { + // no ICU tokenization, so we split only on spaces + assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', ['0:ซอยเพชรบุรี1foo']); + } suite.run( t.end ); }); diff --git a/integration/analyzer_peliasQuery.js b/integration/analyzer_peliasQuery.js index f0cf199e..47d2f76b 100644 --- a/integration/analyzer_peliasQuery.js +++ b/integration/analyzer_peliasQuery.js @@ -1,8 +1,9 @@ // validate analyzer is behaving as expected -var tape = require('tape'), +const tape = require('tape'), Suite = require('../test/elastictest/Suite'), - punctuation = require('../punctuation'); + punctuation = require('../punctuation'), + config = require('pelias-config').generate(); module.exports.tests = {}; @@ -49,6 +50,33 @@ module.exports.tests.functional = function(test, common){ assertAnalysis( 'place', 'Toys "R" Us!', [ 'toys', 'r', 'us' ]); assertAnalysis( 'address', '101 mapzen place', [ '101', 'mapzen', 'place' ]); + // complicated tokenization for some Asian languages + if (config.schema.icuTokenizer) { + assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] ); + assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] ); + assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]); + assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室', + ['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']); + // correct word by word split according to native speaker: 马来西亚 / 霹雳州 / 怡保 / 31400, 怡保花园 / 第5巷 / 45号 + assertAnalysis('chinese_address2', '马来西亚霹雳州怡保31400怡保花园第5巷45号', + ["马来", "西亚", "霹", "雳", "州", "怡", "保", "31400", "怡", "保", "花园", "第", "5", "巷", "45", "号"]); + // correct word by word split: 马来西亚 / 柔佛新山 / 81200 / , / 士古来路 / , / 百万时尚广场 + assertAnalysis('chinese_address3', '马来西亚柔佛新山81200士古来路百万时尚广场', + ["马来", "西亚", "柔", "佛", "新山", "81200", "士", "古来", "路", "百万", "时尚", "广场"]); + // correct word by word split: 马来西亚/ 槟城 / 亚依淡 / 11500 / , / 极乐寺 / , / 回返路 + assertAnalysis('chinese_address4', '马来西亚槟城亚依淡11500极乐寺回返路', + ["马来", "西亚", "槟", "城", "亚", "依", "淡", "11500", "极乐", "寺", "回", "返", "路"]); + // correct word by word split: 马来西亚 / 吉隆坡 / 50000 / , / 茨厂街 / 123号 + assertAnalysis('chinese_address5', '马来西亚吉隆坡50000茨厂街123号', + ["马来", "西亚", "吉隆坡", "50000", "茨", "厂", "街", "123", "号"]); + + assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]); + assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]); + assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]); + } else { + // no ICU tokenization, so we split only on spaces + assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอยเพชรบุรี1'] ); + } suite.run( t.end ); }); }; diff --git a/integration/analyzer_peliasStreet.js b/integration/analyzer_peliasStreet.js index 2fa0e494..7f861f9a 100644 --- a/integration/analyzer_peliasStreet.js +++ b/integration/analyzer_peliasStreet.js @@ -1,5 +1,6 @@ // validate analyzer is behaving as expected const Suite = require('../test/elastictest/Suite') +const config = require('pelias-config').generate() module.exports.tests = {}; @@ -22,6 +23,20 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'remove_ordinals', '1st 2nd 3rd 4th 5th', ['1','2','3','4','5'] ); assertAnalysis( 'remove_ordinals', 'Ast th 101st', ['ast','th','101'] ); + // complicated tokenization for some Asian languages + if (config.schema.icuTokenizer) { + assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] ); + assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] ); + assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]); + assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室', + ['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']); + assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]); + assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]); + assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]); + } else { + // no ICU tokenization, so we split only on spaces + assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอยเพชรบุรี1'] ); + } suite.run( t.end ); }); }; diff --git a/settings-icu.js b/settings-icu.js new file mode 100644 index 00000000..2ebfa2c0 --- /dev/null +++ b/settings-icu.js @@ -0,0 +1,49 @@ +const _ = require('lodash'); + +/** + * This module contains modifications to the Pelias schema to adopt the elastic ICU tokenizer. + * This tokenizer improves word-splitting of non-latin alphabets (particularly Asian languages). + * + * It can be enabled by setting `config.schema.icuTokenizer` in your `pelias.json` config. + * Note: this must be set *before* you create your elasticsearch index or it will have no effect. + * + * This feature is considered beta, we encourage testing & feedback from the community in order + * to adopt the ICU tokenizer as our default. + * + * https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu-tokenizer.html + * https://github.com/pelias/schema/pull/498 + */ + +module.exports = (settings) => { + + // replace pattern tokenizer with icu_tokenizer + _.set(settings, 'analysis.tokenizer.peliasTokenizer', { + 'type': 'icu_tokenizer' + }); + + // add ampersand_replacer filter + // replaces ampersand placeholders back to `&` (see `ampersand_mapper` char_filter) + _.set(settings, 'analysis.filter.ampersand_replacer', { + 'type': 'pattern_replace', + 'pattern': 'AMPERSANDPLACEHOLDER', + 'replacement': '&' + }); + + // add ampersand_mapper char_filter + // icu-tokenizer treats ampersands as a word boundary, so we replace them with a placeholder to avoid it, + // as we want to handle them separately, we replace them back after tokenization (see `ampersand_replacer` filter) + _.set(settings, 'analysis.char_filter.ampersand_mapper', { + 'type': 'pattern_replace', + 'pattern': '&', + 'replacement': ' AMPERSANDPLACEHOLDER ' + }); + + // prepend ampersand mapper/replacer to each analyzer + _.forEach(_.get(settings, 'analysis.analyzer'), (block) => { + if (block?.tokenizer !== 'peliasTokenizer') { return; } + block.filter.unshift('ampersand_replacer'); + block.char_filter.unshift('ampersand_mapper'); + }); + + return settings; +} \ No newline at end of file diff --git a/settings.js b/settings.js index f2dd633a..48fb2781 100644 --- a/settings.js +++ b/settings.js @@ -2,14 +2,15 @@ const _ = require('lodash'); const peliasConfig = require('pelias-config'); const punctuation = require('./punctuation'); const synonyms = require('./synonyms/loader').load(); +const settingsICU = require('./settings-icu'); require('./configValidation').validate(peliasConfig.generate()); function generate(){ - var config = peliasConfig.generate(); + const config = peliasConfig.generate(); // Default settings - var settings = { + let settings = { "index": { "similarity": { "peliasDefaultSimilarity": { @@ -299,6 +300,11 @@ function generate(){ }; }); + // Experimental ICU tokenizer + if (config.schema.icuTokenizer) { + settings = settingsICU(settings); + } + // Merge settings from pelias/config settings = _.merge({}, settings, _.get(config, 'elasticsearch.settings', {})); diff --git a/test/compile.js b/test/compile.js index 719dde7a..b1d922ac 100644 --- a/test/compile.js +++ b/test/compile.js @@ -2,7 +2,7 @@ const _ = require('lodash'); const path = require('path'); const schema = require('../'); const fixture = require('./fixtures/expected.json'); -const config = require('pelias-config').generate(); +const fixtureICUTokenizer = require('./fixtures/expected-icu-tokenizer.json'); const forEachDeep = (obj, cb) => _.forEach(obj, (val, key) => { @@ -97,6 +97,19 @@ module.exports.tests.analyzers = function (test, common) { }); }; +function overridePeliasConfig(value, cb) { + const OLD_PELIAS_CONFIG = process.env.PELIAS_CONFIG; + process.env.PELIAS_CONFIG = value; + + cb(); + + if (OLD_PELIAS_CONFIG) { + process.env.PELIAS_CONFIG = OLD_PELIAS_CONFIG; + } else { + delete process.env.PELIAS_CONFIG; + } +} + // current schema (compiled) - requires schema to be copied and settings to // be regenerated from a fixture in order to pass in CI environments. module.exports.tests.current_schema = function(test, common) { @@ -106,9 +119,9 @@ module.exports.tests.current_schema = function(test, common) { var schemaCopy = JSON.parse( JSON.stringify( schema ) ); // use the pelias config fixture instead of the local config - process.env.PELIAS_CONFIG = path.resolve( __dirname + '/fixtures/config.json' ); - schemaCopy.settings = require('../settings')(); - delete process.env.PELIAS_CONFIG; + overridePeliasConfig(path.resolve( __dirname + '/fixtures/config.json' ), () => { + schemaCopy.settings = require('../settings')(); + }); // code intentionally commented to allow quick debugging of expected.json // common.diff(schemaCopy, fixture); @@ -121,6 +134,28 @@ module.exports.tests.current_schema = function(test, common) { t.deepEqual(schemaCopy, fixture); t.end(); }); + + test('current schema vs. fixture with ICU tokenizer', function(t) { + + // copy schema + var schemaCopy = JSON.parse( JSON.stringify( schema ) ); + + // use the pelias config fixture instead of the local config + overridePeliasConfig(path.resolve( __dirname + '/fixtures/config-icu-tokenizer.json' ), () => { + schemaCopy.settings = require('../settings')(); + }); + + // code intentionally commented to allow quick debugging of expected.json + // common.diff(schemaCopy, fixtureICUTokenizer); + // console.error( JSON.stringify( schemaCopy, null, 2 ) ); + + // code to write expected output to the fixture + // const fs = require('fs'); + // fs.writeFileSync(path.resolve( __dirname + '/fixtures/expected-icu-tokenizer.json' ), JSON.stringify(schemaCopy, null, 2)); + + t.deepEqual(schemaCopy, fixtureICUTokenizer); + t.end(); + }); }; module.exports.all = function (tape, common) { diff --git a/test/fixtures/config-icu-tokenizer.json b/test/fixtures/config-icu-tokenizer.json new file mode 100644 index 00000000..81e70ce7 --- /dev/null +++ b/test/fixtures/config-icu-tokenizer.json @@ -0,0 +1,15 @@ +{ + "elasticsearch": { + "settings": { + "index": { + "number_of_replicas": "999", + "number_of_shards": "5", + "refresh_interval": "1m" + } + } + }, + "schema": { + "icuTokenizer": true + } +} + \ No newline at end of file diff --git a/test/fixtures/expected-icu-tokenizer.json b/test/fixtures/expected-icu-tokenizer.json new file mode 100644 index 00000000..fcfa2008 --- /dev/null +++ b/test/fixtures/expected-icu-tokenizer.json @@ -0,0 +1,3059 @@ +{ + "settings": { + "index": { + "similarity": { + "peliasDefaultSimilarity": { + "type": "BM25", + "k1": 1.2, + "b": 0.75 + } + }, + "number_of_replicas": "999", + "number_of_shards": "5", + "refresh_interval": "1m" + }, + "analysis": { + "tokenizer": { + "peliasTokenizer": { + "type": "icu_tokenizer" + } + }, + "analyzer": { + "peliasAdmin": { + "type": "custom", + "tokenizer": "peliasTokenizer", + "char_filter": [ + "ampersand_mapper", + "punctuation", + "nfkc_normalizer" + ], + "filter": [ + "ampersand_replacer", + "lowercase", + "trim", + "synonyms/custom_admin/multiword", + "admin_synonyms_multiplexer", + "icu_folding", + "word_delimiter", + "unique_only_same_position", + "notnull", + "flatten_graph" + ] + }, + "peliasIndexOneEdgeGram": { + "type": "custom", + "tokenizer": "peliasTokenizer", + "char_filter": [ + "ampersand_mapper", + "punctuation", + "nfkc_normalizer" + ], + "filter": [ + "ampersand_replacer", + "lowercase", + "trim", + "synonyms/custom_name/multiword", + "synonyms/custom_street/multiword", + "synonyms/custom_admin/multiword", + "name_synonyms_multiplexer", + "icu_folding", + "remove_ordinals", + "removeAllZeroNumericPrefix", + "peliasOneEdgeGramFilter", + "unique_only_same_position", + "notnull", + "flatten_graph" + ] + }, + "peliasQuery": { + "type": "custom", + "tokenizer": "peliasTokenizer", + "char_filter": [ + "ampersand_mapper", + "punctuation", + "nfkc_normalizer" + ], + "filter": [ + "ampersand_replacer", + "lowercase", + "trim", + "icu_folding", + "remove_ordinals", + "removeAllZeroNumericPrefix", + "unique_only_same_position", + "notnull" + ] + }, + "peliasPhrase": { + "type": "custom", + "tokenizer": "peliasTokenizer", + "char_filter": [ + "ampersand_mapper", + "punctuation", + "nfkc_normalizer" + ], + "filter": [ + "ampersand_replacer", + "lowercase", + "trim", + "remove_duplicate_spaces", + "synonyms/custom_name/multiword", + "synonyms/custom_street/multiword", + "synonyms/custom_admin/multiword", + "name_synonyms_multiplexer", + "icu_folding", + "remove_ordinals", + "removeAllZeroNumericPrefix", + "unique_only_same_position", + "notnull", + "flatten_graph" + ] + }, + "peliasZip": { + "type": "custom", + "tokenizer": "keyword", + "char_filter": [ + "alphanumeric", + "nfkc_normalizer" + ], + "filter": [ + "lowercase", + "trim", + "icu_folding", + "unique_only_same_position", + "notnull" + ] + }, + "peliasUnit": { + "type": "custom", + "tokenizer": "keyword", + "char_filter": [ + "alphanumeric", + "nfkc_normalizer" + ], + "filter": [ + "lowercase", + "trim", + "icu_folding", + "unique_only_same_position", + "notnull" + ] + }, + "peliasHousenumber": { + "type": "custom", + "tokenizer": "standard", + "char_filter": [ + "numeric" + ] + }, + "peliasStreet": { + "type": "custom", + "tokenizer": "peliasTokenizer", + "char_filter": [ + "ampersand_mapper", + "punctuation", + "nfkc_normalizer" + ], + "filter": [ + "ampersand_replacer", + "lowercase", + "trim", + "remove_duplicate_spaces", + "synonyms/custom_street/multiword", + "street_synonyms_multiplexer", + "icu_folding", + "remove_ordinals", + "trim", + "unique_only_same_position", + "notnull", + "flatten_graph" + ] + }, + "peliasIndexCountryAbbreviation": { + "type": "custom", + "tokenizer": "peliasTokenizer", + "char_filter": [ + "ampersand_mapper", + "punctuation", + "nfkc_normalizer" + ], + "filter": [ + "ampersand_replacer", + "lowercase", + "trim", + "icu_folding", + "country_abbreviation_synonyms_multiplexer", + "unique_only_same_position", + "notnull", + "flatten_graph" + ] + }, + "peliasIndexCountryAbbreviationOneEdgeGram": { + "type": "custom", + "tokenizer": "peliasTokenizer", + "char_filter": [ + "ampersand_mapper", + "punctuation", + "nfkc_normalizer" + ], + "filter": [ + "ampersand_replacer", + "lowercase", + "trim", + "icu_folding", + "country_abbreviation_synonyms_multiplexer", + "peliasOneEdgeGramFilter", + "unique_only_same_position", + "notnull", + "flatten_graph" + ] + } + }, + "filter": { + "ampersand_replacer": { + "type": "pattern_replace", + "pattern": "AMPERSANDPLACEHOLDER", + "replacement": "&" + }, + "street_synonyms_multiplexer": { + "type": "multiplexer", + "preserve_original": false, + "filters": [ + "synonyms/custom_street", + "synonyms/personal_titles", + "synonyms/streets", + "synonyms/directionals", + "synonyms/british_american_english" + ] + }, + "name_synonyms_multiplexer": { + "type": "multiplexer", + "preserve_original": false, + "filters": [ + "synonyms/custom_name", + "synonyms/custom_street", + "synonyms/custom_admin", + "synonyms/personal_titles", + "synonyms/place_names", + "synonyms/streets", + "synonyms/directionals", + "synonyms/punctuation", + "synonyms/british_american_english" + ] + }, + "admin_synonyms_multiplexer": { + "type": "multiplexer", + "preserve_original": false, + "filters": [ + "synonyms/custom_admin", + "synonyms/personal_titles", + "synonyms/place_names" + ] + }, + "country_abbreviation_synonyms_multiplexer": { + "type": "multiplexer", + "preserve_original": false, + "filters": [ + "synonyms/country_codes" + ] + }, + "notnull": { + "type": "length", + "min": 1 + }, + "unique_only_same_position": { + "type": "unique", + "only_on_same_position": "true" + }, + "peliasOneEdgeGramFilter": { + "type": "edge_ngram", + "min_gram": 1, + "max_gram": 24 + }, + "removeAllZeroNumericPrefix": { + "type": "pattern_replace", + "pattern": "^(0*)", + "replacement": "" + }, + "remove_ordinals": { + "type": "pattern_replace", + "pattern": "(?i)((^| )((1)st?|(2)nd?|(3)rd?|([4-9])th?)|(([0-9]*)(1[0-9])th?)|(([0-9]*[02-9])((1)st?|(2)nd?|(3)rd?|([04-9])th?))($| ))", + "replacement": "$2$4$5$6$7$9$10$12$14$15$16$17$18" + }, + "remove_duplicate_spaces": { + "type": "pattern_replace", + "pattern": " +", + "replacement": " " + }, + "synonyms/british_american_english": { + "type": "synonym", + "synonyms": [ + "accessorize,accessorise", + "eon,aeon", + "airplane,aeroplane", + "esthetic,aesthetic", + "esthetics,aesthetics", + "aging,ageing", + "almanac,almanack", + "aluminum,aluminium", + "amphitheater,amphitheatre", + "analog,analogue", + "analyze,analyse", + "analyzes,analyses", + "arbor,arbour", + "arbors,arbours", + "archeological,archaeological", + "archeology,archaeology", + "ardor,ardour", + "armor,armour", + "armored,armoured", + "armories,armouries", + "armory,armoury", + "artifact,artefact", + "artifacts,artefacts", + "authorized,authorised", + "ax,axe", + "banister,bannister", + "banisters,bannisters", + "baptize,baptise", + "baptized,baptised", + "balk,baulk", + "behavior,behaviour", + "behavioral,behavioural", + "bevy,bevvy", + "bougainvillea,bougainvillaea", + "busses,buses", + "bussing,busing", + "caliber,calibre", + "caliper,calliper", + "calisthenics,callisthenics", + "candor,candour", + "catalog,catalogue", + "centralized,centralised", + "center,centre", + "centered,centred", + "centers,centres", + "checkered,chequered", + "chili,chilli", + "chimera,chimaera", + "clamor,clamour", + "colonization,colonisation", + "color,colour", + "colored,coloured", + "colorful,colourful", + "colors,colours", + "computerized,computerised", + "connection,connexion", + "connections,connexions", + "cozy,cosy", + "councilor,councillor", + "councilors,councillors", + "counseling,counselling", + "counselor,counsellor", + "counselors,counsellors", + "customize,customise", + "customized,customised", + "defense,defence", + "defenses,defences", + "dialog,dialogue", + "digitized,digitised", + "disk,disc", + "disks,discs", + "draft,draught", + "drafts,draughts", + "dueling,duelling", + "edema,edoema", + "endeavor,endeavour", + "endeavors,endeavours", + "epicenter,epicentre", + "equalization,equalisation", + "favor,favour", + "favorite,favourite", + "favorites,favourites", + "favors,favours", + "fertilizer,fertiliser", + "fertilizers,fertilisers", + "fiber,fibre", + "fiberglass,fibreglass", + "fibers,fibres", + "filet,fillet", + "filets,fillets", + "flavor,flavour", + "flavors,flavours", + "fetal,foetal", + "fulfillment,fulfilment", + "gasses,gases", + "gage,gauge", + "gages,gauges", + "gaging,gauging", + "gypsies,gipsies", + "glamor,glamour", + "goiter,goitre", + "gram,gramme", + "grams,grammes", + "gray,grey", + "grays,greys", + "groin,groyne", + "griffin,gryphon", + "griffins,gryphons", + "gynecologist,gynaecologist", + "gynecology,gynaecology", + "hematology,haematology", + "harbor,harbour", + "harbors,harbours", + "homeopathic,homoeopathic", + "homeopathy,homoeopathy", + "honor,honour", + "honorable,honourable", + "honors,honours", + "hospitalization,hospitalisation", + "humor,humour", + "immunization,immunisation", + "install,instal", + "jeweler,jeweller", + "jewelers,jewellers", + "jewelry,jewellery", + "judgment,judgement", + "kilometer,kilometre", + "labor,labour", + "laborers,labourers", + "laboring,labouring", + "labors,labours", + "license,licence", + "licensed,licenced", + "licensing,licencing", + "liter,litre", + "liters,litres", + "louver,louvre", + "louvers,louvres", + "luster,lustre", + "maneuver,manoeuvre", + "maneuvers,manoeuvres", + "marshaling,marshalling", + "marvelous,marvellous", + "meager,meagre", + "mechanized,mechanised", + "medieval,mediaeval", + "meter,metre", + "meters,metres", + "minibusses,minibuses", + "miter,mitre", + "modelers,modellers", + "modeling,modelling", + "motorized,motorised", + "mold,mould", + "molder,moulder", + "molders,moulders", + "molding,moulding", + "moldings,mouldings", + "molds,moulds", + "molt,moult", + "mustache,moustache", + "naturalization,naturalisation", + "neighbor,neighbour", + "neighborhood,neighbourhood", + "neighborhoods,neighbourhoods", + "neighborly,neighbourly", + "neighbors,neighbours", + "odor,odour", + "omelet,omelette", + "organization,organisation", + "organizational,organisational", + "organizations,organisations", + "organize,organise", + "organized,organised", + "orthopedic,orthopaedic", + "orthopedics,orthopaedics", + "pediatric,paediatric", + "pediatrician,paediatrician", + "pediatrics,paediatrics", + "paleontology,palaeontology", + "parceling,parcelling", + "parlor,parlour", + "personalized,personalised", + "filter,philtre", + "filters,philtres", + "plow,plough", + "plowed,ploughed", + "plowing,ploughing", + "plowman,ploughman", + "plowshare,ploughshare", + "practice,practise", + "practices,practises", + "practicing,practising", + "primeval,primaeval", + "program,programme", + "programs,programmes", + "prolog,prologue", + "psychoanalyze,psychoanalyse", + "pajama,pyjama", + "pizzazz,pzazz", + "realization,realisation", + "refueling,refuelling", + "remodeling,remodelling", + "reorganized,reorganised", + "reveler,reveller", + "revitalize,revitalise", + "rigor,rigour", + "rumor,rumour", + "rumors,rumours", + "saber,sabre", + "sabers,sabres", + "saltpeter,saltpetre", + "savior,saviour", + "saviors,saviours", + "savor,savour", + "savory,savoury", + "scepter,sceptre", + "sepulcher,sepulchre", + "sheik,sheikh", + "signaling,signalling", + "smolders,smoulders", + "snorkeling,snorkelling", + "somber,sombre", + "specialized,specialised", + "specter,spectre", + "splendor,splendour", + "stabilization,stabilisation", + "succor,succour", + "sulfide,sulphide", + "sulfur,sulphur", + "siphon,syphon", + "technicolor,technicolour", + "theater,theatre", + "theaters,theatres", + "ton,tonne", + "tons,tonnes", + "tranquility,tranquilly", + "traveled,travelled", + "traveler,traveller", + "travelers,travellers", + "traveling,travelling", + "tricolor,tricolour", + "tire,tyre", + "tires,tyres", + "unorganized,unorganised", + "valor,valour", + "vaporized,vaporised", + "vapor,vapour", + "vapors,vapours", + "vigor,vigour", + "watercolor,watercolour", + "watercolors,watercolours", + "woolen,woollen", + "wooly,woolly", + "yogurt,yoghurt" + ] + }, + "synonyms/british_american_english/multiword": { + "type": "synonym", + "synonyms": [ + "" + ] + }, + "synonyms/country_codes": { + "type": "synonym", + "synonyms": [ + "abw,aw", + "afg,af", + "ago,ao", + "aia,ai", + "ala,ax", + "alb,al", + "and,ad", + "are,ae", + "arg,ar", + "arm,am", + "asm,as", + "ata,aq", + "atf,tf", + "atg,ag", + "aus,au", + "aut,at", + "aze,az", + "bdi,bi", + "bel,be", + "ben,bj", + "bes,bq", + "bfa,bf", + "bgd,bd", + "bgr,bg", + "bhr,bh", + "bhs,bs", + "bih,ba", + "blm,bl", + "blr,by", + "blz,bz", + "bmu,bm", + "bol,bo", + "bra,br", + "brb,bb", + "brn,bn", + "btn,bt", + "bvt,bv", + "bwa,bw", + "caf,cf", + "can,ca", + "cck,cc", + "che,ch", + "chl,cl", + "chn,cn", + "civ,ci", + "cmr,cm", + "cod,cd", + "cog,cg", + "cok,ck", + "col,co", + "com,km", + "cpv,cv", + "cri,cr", + "cub,cu", + "cuw,cw", + "cxr,cx", + "cym,ky", + "cyp,cy", + "cze,cz", + "deu,de", + "dji,dj", + "dma,dm", + "dnk,dk", + "dom,do", + "dza,dz", + "ecu,ec", + "egy,eg", + "eri,er", + "esh,eh", + "esp,es", + "est,ee", + "eth,et", + "fin,fi", + "fji,fj", + "flk,fk", + "fra,fr", + "fro,fo", + "fsm,fm", + "gab,ga", + "gbr,gb,uk", + "geo,ge", + "ggy,gg", + "gha,gh", + "gib,gi", + "gin,gn", + "glp,gp", + "gmb,gm", + "gnb,gw", + "gnq,gq", + "grc,gr", + "grd,gd", + "grl,gl", + "gtm,gt", + "guf,gf", + "gum,gu", + "guy,gy", + "hkg,hk", + "hmd,hm", + "hnd,hn", + "hrv,hr", + "hti,ht", + "hun,hu", + "idn,id", + "imn,im", + "ind,in", + "iot,io", + "irl,ie", + "irn,ir", + "irq,iq", + "isl,is", + "isr,il", + "ita,it", + "jam,jm", + "jey,je", + "jor,jo", + "jpn,jp", + "kaz,kz", + "ken,ke", + "kgz,kg", + "khm,kh", + "kir,ki", + "kna,kn", + "kor,kr", + "kwt,kw", + "lao,la", + "lbn,lb", + "lbr,lr", + "lby,ly", + "lca,lc", + "lie,li", + "lka,lk", + "lso,ls", + "ltu,lt", + "lux,lu", + "lva,lv", + "mac,mo", + "maf,mf", + "mar,ma", + "mco,mc", + "mda,md", + "mdg,mg", + "mdv,mv", + "mex,mx", + "mhl,mh", + "mkd,mk", + "mli,ml", + "mlt,mt", + "mmr,mm", + "mne,me", + "mng,mn", + "mnp,mp", + "moz,mz", + "mrt,mr", + "msr,ms", + "mtq,mq", + "mus,mu", + "mwi,mw", + "mys,my", + "myt,yt", + "nam,na", + "ncl,nc", + "ner,ne", + "nfk,nf", + "nga,ng", + "nic,ni", + "niu,nu", + "nld,nl", + "nor,no", + "npl,np", + "nru,nr", + "nzl,nz", + "omn,om", + "pak,pk", + "pan,pa", + "pcn,pn", + "per,pe", + "phl,ph", + "plw,pw", + "png,pg", + "pol,pl", + "pri,pr", + "prk,kp", + "prt,pt", + "pry,py", + "pse,ps", + "pyf,pf", + "qat,qa", + "reu,re", + "rou,ro", + "rus,ru", + "rwa,rw", + "sau,sa", + "sdn,sd", + "sen,sn", + "sgp,sg", + "sgs,gs", + "shn,sh", + "sjm,sj", + "slb,sb", + "sle,sl", + "slv,sv", + "smr,sm", + "som,so", + "spm,pm", + "srb,rs", + "ssd,ss", + "stp,st", + "sur,sr", + "svk,sk", + "svn,si", + "swe,se", + "swz,sz", + "sxm,sx", + "syc,sc", + "syr,sy", + "tca,tc", + "tcd,td", + "tgo,tg", + "tha,th", + "tjk,tj", + "tkl,tk", + "tkm,tm", + "tls,tl", + "ton,to", + "tto,tt", + "tun,tn", + "tur,tr", + "tuv,tv", + "twn,tw", + "tza,tz", + "uga,ug", + "ukr,ua", + "umi,um", + "ury,uy", + "usa,us", + "uzb,uz", + "vat,va", + "vct,vc", + "ven,ve", + "vgb,vg", + "vir,vi", + "vnm,vn", + "vut,vu", + "wlf,wf", + "wsm,ws", + "yem,ye", + "zaf,za", + "zmb,zm", + "zwe,zw" + ] + }, + "synonyms/country_codes/multiword": { + "type": "synonym", + "synonyms": [ + "" + ] + }, + "synonyms/custom_admin": { + "type": "synonym", + "synonyms": [ + "" + ] + }, + "synonyms/custom_admin/multiword": { + "type": "synonym", + "synonyms": [ + "" + ] + }, + "synonyms/custom_name": { + "type": "synonym", + "synonyms": [ + "" + ] + }, + "synonyms/custom_name/multiword": { + "type": "synonym", + "synonyms": [ + "" + ] + }, + "synonyms/custom_street": { + "type": "synonym", + "synonyms": [ + "" + ] + }, + "synonyms/custom_street/multiword": { + "type": "synonym", + "synonyms": [ + "" + ] + }, + "synonyms/directionals": { + "type": "synonym", + "synonyms": [ + "nord,n", + "nördlich,nördl,nordl,nordlich,noerdlich", + "nördliche,nordliche,noerdliche", + "nördlicher,nordlicher,noerdlicher", + "nördliches,nordliches,noerdliches", + "nordost,no", + "nordöstlich,nordostlich,nordoestlich", + "nordwest,nw", + "ost,o", + "östlich,östl,ostlich,ostl,oestlich", + "östliche,ostliche,oestliche", + "östlicher,ostlicher,oestlicher", + "östliches,ostliches,oestliches", + "süd,s,sud,sued", + "süden,suden,sueden", + "südlich,südl,sudl,sudlich,suedlich", + "südliche,sudliche,suedliche", + "südlicher,sudlicher,suedlicher", + "südliches,sudliches,suedliches", + "südost,so,sudost,suedost", + "südosten,sudosten,suedosten", + "südöstlich,sudostlich,suedoestlich", + "südöstliche,sudostliche,suedoestliche", + "südöstlicher,sudostlicher,suedoestlicher", + "südöstliches,sudostliches,suedoestliches", + "südwest,sw,sudwest,suedwest", + "südwesten,sudwesten,suedwesten", + "südwestlich,sudwestlich,suedwestlich", + "südwestliche,sudwestliche,suedwestliche", + "südwestlicher,sudwestlicher,suedwestlicher", + "südwestliches,sudwestliches,suedwestliches", + "west,w", + "westlich,westl", + "center,ctr", + "central,cn,ctrl,cntrl", + "centre,ctr", + "east,e", + "eastern,eastrn,estrn,estn", + "lower,lowr,lwr", + "middle,mdl,midle,mddl", + "north,n,nrt,nrth,nth,norh,nort,no", + "northeast,northe,neast,ne", + "northeastern,northeastrn,northestrn,northestn,neastern", + "northwest,northw,northwst,nwest,nw", + "northwestern,northwestrn,northwstrn,northwstn", + "south,s,so,sth", + "southeast,southe,seast,se", + "southeastern,southeastrn,southestrn,southestn,seastern", + "southwest,southw,southwst,swest,sw", + "southwestern,southwestrn,southwstrn,southwstn,swestern", + "upper,uppr,upr,up", + "west,w,wst", + "western,westrn,wstrn,wstn", + "este,e", + "norte,n", + "noreste,nordeste,ne", + "noroeste,nw", + "oeste,w", + "oriente,ote", + "poniente,pte", + "sur,s", + "sureste,se", + "suroeste,sw", + "central,ctrl", + "centre,ctre,cntre", + "est,e", + "nord,n", + "ouest,o", + "sud,s" + ] + }, + "synonyms/directionals/multiword": { + "type": "synonym", + "synonyms": [ + "" + ] + }, + "synonyms/personal_titles": { + "type": "synonym", + "synonyms": [ + "doktor,dr", + "bürgermeister,buergermeister,burgermeister,bgm", + "direktor,dir", + "frau,fr", + "fraulein,fri", + "heilige,hl", + "herr,hr", + "herrn,hrn", + "ingnieur,ing", + "oberburgermeister,ob,obgm", + "professor,prof", + "sankt,st,skt", + "colonel,col", + "commander,cmdr", + "corporal,cpl", + "captain,capt,cpt", + "czar,tsar", + "doctor,dr,doc", + "doctors,drs,docs", + "brother,br", + "brothers,bros", + "father,fr", + "sister,sr", + "general,gen", + "honorable,honourable,hon", + "king,kg", + "major,maj", + "mr,mister", + "mrs,misses", + "ms,miss", + "officer,ofcr", + "president,pres", + "professor,prof", + "professors,profs", + "reverend,rev", + "saint,st", + "saints,ss", + "sainte,ste", + "santa,sta", + "sargeant,sgt", + "secretary,sec", + "representative,rep", + "representatives,reps", + "senator,sen", + "senators,sens", + "junior,jr,jnr", + "senior,sr,snr", + "abad,abd", + "abogada,abga,abgda", + "abogado,abg,abgdo", + "administrador,admor,admr", + "administradora,admora,admra", + "agrimensor,agrim", + "alcalde,alcde", + "alférez,alf,alferez,alfz", + "almirante,alm,almte,alte", + "arquitecto,arq", + "arquitecta,arqa", + "arquitectors,arqs,arqos", + "arzobispo,arz", + "barón,baron,bar", + "brigadier,brg,bg,brig", + "caballero,cab", + "cacique,cque", + "capitán,cap,capt,ctn,cptn,capitan", + "cardenal,card,cnal", + "clérigo,clg,clerigo", + "comandante,cmdt,cmte,comte,cte", + "comisario,cmro", + "conde,cde", + "condesa,cdesa", + "consejal,cjal", + "consejero,cons,consejo", + "contralmirante,contralmte", + "coronel,cnel,col,crn", + "diacono,diac", + "diacona,diaca", + "diputado,dip", + "director,dir", + "directora,dira", + "doctor,dr", + "doctora,dra", + "dragoneante,dg", + "duque,dq", + "duquesa,dqsa,dqa", + "embajador,emb", + "enfermera,enf,enfa", + "excelencia,exca", + "excelentisima,excma", + "excelentisimo,excmo", + "fray,fr", + "gendarme,gdme", + "general,gral,genl,gnal,gn", + "gobernadora,gdora,goba", + "gobernador,gdor,gob", + "hermana,hna", + "hermanas,hnas", + "hermano,hno", + "hermanos,hnos", + "infanta,infa,infta", + "infante,inf,infte", + "ingeniera,inga", + "ingeniero,ing,ingo", + "inspector,insp", + "intendente,int", + "jefe,jf", + "juez,jz", + "licenciada,lda,lica", + "licenciado,ldo,lic,lico", + "locutor,loc", + "maestro,mstro,mtro", + "mariscal,mscal", + "marques,mq,marqs", + "marquesa,mqa,marqsa", + "mayor,my,may", + "ministerio,min", + "ministro,mtro", + "monseñor,monsenor,mons,msnr", + "notario,not", + "obispo,ob", + "presidenta,pdta,presa,presida,pta", + "presidente,pdte,pres,presid,pte", + "príncipe,principe", + "principal,pral", + "procurador,proc", + "profesor,prof,profr", + "profesora,profª,profa,profrª,profra", + "reina,ra", + "reverendo,rvdo,rdo,rvd,rev", + "reverendo,rvdo,rdo,rvd,rev", + "reverendisima,rma,rvdma", + "reverendisimo,rmo,rvdmo", + "san,s", + "santa,stª,sta", + "sant,st", + "santo,stº,st°,sto", + "sargento,sarg,sgto,sargto", + "secretario,secreto,secto", + "secretaria,secreta,secta", + "senador,sen", + "señor,sr,senor", + "señora,sª,srª,sra,senora", + "señores,srs,sres,senores", + "señorita,srta,senorita", + "serenisima,serma", + "serenisimo,sermo", + "soldado,sold", + "subteniente,subte,stn", + "teniente,tn,tte", + "vizconde,vde", + "vizcondesa,vdesa", + "junior,iunior,jr", + "amiral,am", + "capitaine,capte,capt", + "caporal,capl", + "chez,chz", + "colonel,col,cel", + "commandant,cdt", + "commander,cmdr,cdr", + "commodore,cmdre", + "comte,cte", + "comtesse,ctesse", + "docteur,dr", + "docteurs,drs", + "frère,frere", + "général,general,gén,gen,gal", + "lieutenant,lt", + "madame,mme", + "mesdames,mmes", + "mademoiselle,mlle,mle", + "mademoiselles,mlles,mles", + "maître,maitre,me", + "maîtres,maitres", + "maîtresse,maitresse", + "major,maj", + "maréchal,marechal,mal", + "marquis,mis", + "marquise,mise", + "messieurs,mm,mrs", + "monseigneur,mgr", + "monsieur,mr", + "père,pere", + "professeur,prof,pr", + "professeure,profe,pre", + "professeures,profes,pres", + "professeurs,profs,prs", + "révérend,rév,reverend,rev", + "saint,st", + "sainte,ste", + "sergente,sgt", + "veuve,vve" + ] + }, + "synonyms/personal_titles/multiword": { + "type": "synonym", + "synonyms": [ + "" + ] + }, + "synonyms/place_names": { + "type": "synonym", + "synonyms": [ + "abteilung,abt", + "altstoffsammelzentrum,asz", + "bücherei,bucherei,buecherei", + "bundesgymnasium,bg", + "bundesrealgymnasium,brg", + "büro,buro,buero", + "bustenhalter,bh", + "café,cafe", + "denkmal,dkm", + "deutsch,dt", + "ehemalige,ehem", + "fabrik,fb", + "fachhochschule,fh", + "fähranlegestelle,fahranlegestelle,faehranlegestelle", + "gebruder,gebr", + "gasthaus,gh", + "gaststätte,gaststatte,gaststaette", + "gasthof,ghf", + "gefängnis,gefangnis,gefaengnis", + "geschäft,geschaft,geschaeft", + "haltestelle,hst", + "handelsakademie,hak", + "handelsschule,hasch", + "hauptbahnhof,hbf", + "hinter,hint,ht", + "höhle,hohle", + "hörsäle,horsale,hoersaele", + "hütte,hutte,huette,htt", + "internationale,int", + "jagdhutte,jhtt", + "jungenherberge,jh", + "kapelle,kap,kpl", + "kläranlage,ka,klaranlage,klaeranlage", + "kleine,kl", + "kleiner,kl", + "kleines,kl", + "kogel,kg", + "konzentrationslager,kz,kl", + "magistratsabteilung,ma", + "markt,mkt", + "marktplatz,marktpl", + "nationalpark,np", + "naturschutzgebiet,nsg", + "niedere,nd", + "resevoir,res", + "rhein,rh", + "schloss,schl", + "schutzhütte,schutzhutte,schutzhuette", + "spitze,sp", + "sportplatz,sportpl", + "steinbruch,stb", + "supermarkt,supermkt", + "truppenubüngsplatz,tüpl,tupl,truppenubungsplatz,truppenubuengsplatz", + "universität,uni,universitat,universitaet", + "universitätsbibliothek,universitatsbibliothek,universitaetsbibliothek", + "volksschule,vs", + "vordere,vd,vord", + "warenhäuser,warenhauser,warenhaeuser", + "wasserfall,wsf,wssf", + "wiese,ws", + "wirtshaus,wh", + "abbey,abby", + "airport,aprt", + "amphitheatre,amphitheater", + "annex,anex,annx,anx", + "apartments,apts", + "barbecue,barbeque,bbq", + "barracks,barrack,baracks", + "beach,bch", + "boardwalk,bwk,bwlk", + "cafe,café", + "cape,cpe,cp", + "castle,cst", + "church,chr", + "circus,crcs,crc", + "city,cty", + "club,clb", + "community,cmnty,cmty", + "cottage,cott,cottg", + "cottages,cotts,cottgs", + "county,co,cty", + "creek,cr,crk", + "dam,dm", + "deli,delicatessen", + "department,dept", + "detention,det,detn", + "distributor,dstr,distrib,dstrb", + "district,dist", + "dormitories,dorms", + "dormitory,dorm", + "downs,downes,dwns", + "factory,fcty,fty,fy", + "farm,frm", + "flats,flts", + "fords,frds", + "forest,frst,fst", + "forge,frg", + "forges,frgs", + "fork,frk", + "forks,frks", + "fort,ft", + "glens,glns", + "great,grt,gt", + "greater,grtr,gtr", + "green,grn,gn", + "greens,grns", + "groves,grvs", + "gym,gymnasium", + "heights,hghts,hgts,hieghts,ht,hts,hgths", + "home,hm,hme,hom", + "hospital,hos,hosp,hospice,hosptl,hsp,hsptl", + "hostel,host,hostl,hstel,hstl", + "hotel,hot,hotl,htel", + "houses,hses", + "institute,inst", + "international,intl", + "jetty,jtty,jty", + "lake,lk", + "lakes,lks", + "little,ltl,lttl,littl,litl", + "lock,lck", + "locks,lcks", + "lodge,ldge,lodg,ldg", + "lower,low,lwr,lr", + "mall,mll", + "manor,mnr", + "manors,mnrs", + "market,mkt,mrkt", + "marketplace,mktpl,mktplc", + "medical,med", + "memorial,mem", + "middle,mid,midl", + "military,mil", + "mission,msn", + "monastery,monastry", + "motel,mot,motl,mtel", + "mount,mt,mnt", + "mountain,mtn", + "mountains,mtns", + "municipal,mun,mpal", + "museum,mus", + "national,natl", + "neck,nck", + "office,ofc", + "offices,ofcs", + "orchard,orch", + "paradise,pde,pdse", + "park,pk,prk", + "pharmacy,pharm", + "po,postoffice", + "port,pt,prt", + "precinct,pct", + "reservation,res,resrv,resv,rsrv,rserv,rs", + "reserve,res,resrv,resv,rsrv,rserv,rserve,rsrve", + "reservoir,res", + "retreat,rtt", + "river,riv,rvr,rivr", + "rotary,rty", + "sanctuary,sanct", + "service,svc", + "services,svcs,svc", + "shop,shp", + "slope,slpe,slp", + "springs,spgs,sprngs", + "station,sta,stn", + "store,stor", + "stream,strm,stm", + "terminal,term", + "tower,twr", + "towers,twrs", + "triangle,tri", + "university,uni,univ,univers,unvrsty", + "upper,up,upr,uppr", + "villa,vll,vla", + "village,vil,vge,vill,villag,villg,vlg,vlge,vllg,vilg,vilge", + "villages,vlgs", + "villas,vlls,vlas", + "ville,vl", + "wood,wd", + "woods,wds", + "abril,abr,abl", + "agosto,ag,agto,agt", + "altura,alt", + "alturas,alts", + "arboleda,arb", + "arrabal,arral", + "bosque,bsq", + "brigada,brig", + "cabo,cbo", + "campo,cpo,cmpo", + "campos,cpos,cmpos", + "canal,cnl", + "centro,cntro,ctro", + "cerro,crro", + "corral,crral", + "corralillo,crrlo", + "diseminado,disem", + "enero,en,eno,ene", + "diciembre,dic,dicbre,dice,dbre,10bre,xbre", + "febrero,febo,febro,febr,feb", + "gobierno,gob,gobno", + "grande,gr", + "guerra,ga", + "independencia,indep", + "infantería,infanteria,infa,ynfa,ynfanta", + "jardín,jdin,jard,jardin", + "jardínes,jdins,jards,jardines", + "junio,jun,jn", + "julio,jul,jl", + "lago,lg", + "lagos,lgs", + "laguna,lgna", + "llanura,llnra", + "llanuras,llnras", + "marzo,mzo,mar", + "mayo,my,may", + "militar,milr", + "monte,mt,mte,mnte", + "montes,mts,mtes,mntes,mnts", + "nacional,nal,nacl", + "noviembre,nbre,nvre,nove,novre,novbre,9bre", + "octubre,oct,octbre,octe,8bre", + "portillo,ptilo,ptllo", + "prado,prdo", + "primeros,pros", + "privada,priv", + "punta,pnta", + "quebrada,qbda", + "real,rl", + "republica,rep", + "revolucion,rev", + "ribera,ribr", + "río,rio", + "septiembre,setbre,sepe,sepbre,7bre,7re,sep,set", + "sierra,srra", + "valle,vlle", + "volcan,vlcn", + "voluntarios,voluntos", + "abbaye,abe", + "autoécole,autoecole", + "aéroport,aeroport", + "bastide,bstd", + "baston,bast", + "bibliothèque,bibliotheque", + "bourg,brg", + "béguinage,beguinage,begi", + "béguinages,beguinages,begis", + "café,cafe", + "camping,cpg", + "castel,cst", + "chapelle,chp", + "charmille,chi", + "château,chateau", + "cimetière,cimetiere", + "cinéma,cinema", + "colline,coli", + "collines,colis", + "cottage,cott", + "cottages,cott,cotts", + "crématorium,crematorium", + "darse,dars", + "département,dept,departement", + "enceinte,en", + "escaliers,escs", + "ferme,frm", + "fermes,frms", + "fleuve,fl", + "fontaine,fon", + "fort,ft", + "forum,form", + "grand,gd,gr", + "halle,hle", + "halles,hles,hls", + "hippodrome,hip", + "hôpital,hopital", + "hôtel,hotel", + "jardin,jard,jrd", + "jardins,jards,jrds", + "manoir,man", + "marché,marche,mar", + "marchés,marches,mars", + "mont,mt,mnt", + "mont,mt,mnt,montagne", + "moulin,mln", + "moulins,mlns", + "musée,musee,mus", + "médecin,medecin", + "mémorial,memorial", + "palais,pal", + "parc,prc", + "parking,pkg", + "pavillon,pav", + "pavillons,pavs", + "petite,pt", + "porche,pch", + "poterne,pot,potrn", + "préscolaire,prescolaire", + "péristyle,peristyle,psty", + "rivière,riviere,riv", + "résidence,rés,residence,res", + "résidences,residences", + "stade,stde", + "station,sta", + "supermarché,supermarche", + "théâtre,theatre", + "université,universite,univ,uni", + "villa,vla", + "village,vge", + "villages,vges", + "villas,vlas", + "vétérinaire,veterinaire", + "école,ecole", + "église,eglise,egl,égl", + "étang,etang" + ] + }, + "synonyms/place_names/multiword": { + "type": "synonym", + "synonyms": [ + "" + ] + }, + "synonyms/punctuation": { + "type": "synonym", + "synonyms": [ + "&,and", + "&,und" + ] + }, + "synonyms/punctuation/multiword": { + "type": "synonym", + "synonyms": [ + "" + ] + }, + "synonyms/streets": { + "type": "synonym", + "synonyms": [ + "allee,al", + "bahnhof,bhf,bf", + "boulevard,bd", + "brücke,br,brucke,bruecke", + "bühl,buhl,buehl", + "chaussee,ch", + "forsthaus,fh", + "graben,gr", + "großser,grosser", + "große,grosse,gr", + "großes,grosses", + "obere,ob", + "oberer,ob", + "platz,pl", + "quelle,qu", + "rundwanderweg,rww", + "siedlung,sdlg", + "stiege,stg", + "straße,str,strasse", + "wiese,ws", + "abbey,abby", + "access,accs,acc", + "acres,acrs", + "alley,aly,ally,alee,al", + "alleyway,alwy,allyway,allwy", + "amble,ambl", + "anchorage,ancg", + "annex,anx", + "apartments,apts", + "approach,app,apch,appr", + "arcade,arc", + "arterial,artl", + "artery,art,arty", + "avenue,av,ave,aven,avenu,avn,avnu,avnue", + "avenues,avs,aves,avens,avenus,avns,avnus,avnues", + "autoroute,aut", + "back,bk", + "bank,bnk", + "basin,basn,bsn", + "bay,by", + "bayou,byu,bayoo", + "beach,baech,bch,beech", + "belt,blt", + "bend,bnd", + "block,blk,blck", + "bluff,blf,bluf,bluffs,blfs", + "boardwalk,bwk,bwlk", + "boulevard,blvd,bd,bde,blv,bl,blvde,blvrd,boulavard,boul,boulv,bvd,boulevarde", + "bottom,bot,bottm,btm,bttm", + "bottoms,bttms,btms,bottms", + "boundary,bdy", + "bowl,bl", + "brace,br,brce", + "branch,br,brnch,brch", + "brae,br", + "break,brk", + "bridge,bdge,br,brdg,bri,brg", + "broadway,bdwy,bway,bwy,brdway", + "brook,brk", + "brooks,brks", + "brow,brw", + "burg,bg", + "burgs,brgs", + "burrow,burw", + "butte,btte,bte", + "bypass,bypa,byps,bps,byp", + "byway,bywy", + "camp,cp", + "cape,cpe,cp", + "canyon,cyn,cnyn", + "caravan,cvan,cvn", + "causeway,csway,cswy,causewy,caus,cause,cway", + "center,centre,cetr,cntr,ctr,cen", + "centers,ctrs", + "centreway,cnwy", + "chase,ch,chas", + "circle,cir,circel", + "circles,cirs", + "circlet,clt", + "circuit,crct,circ,cct,cirt,ci,circt", + "circus,crcs,crc", + "claim,clm", + "cliff,clf", + "cliffs,clfs", + "close,cl,cls,clse", + "cluster,clr,clstr", + "colonnade,clde,clnde", + "common,cmmn,comm,cmn,com,cm", + "commons,cmmns,cmns,comms", + "concord,cncd,cncrd", + "concession,conc", + "concourse,con,concs,concse,cnc", + "connection,cntn,cxn", + "connector,conr,cnctr,cntr", + "copse,cps", + "corner,cnr,crn,cor", + "corners,cnrs,crns,cors", + "corseo,cseo", + "corso,cso", + "ch,chw,cohw,ctyhw,chgwy,cohgwy,ctyhgwy,chway,cohway,ctyhway,chwy,cohwy,ctyhwy,chi,cohi,ctyhi", + "cr,cor,crd,cord,ctyr,ctyrd", + "cr,cor,crt,cort,ctyr,ctyrt,crte,corte,ctyrte", + "course,crse", + "court,ct,crt", + "courts,crts,cts", + "courtyard,cyd,ctyd", + "cove,cov,ce,cv", + "creek,cr,crk", + "crescent,cr,cres,crs,crecent", + "crest,crst,cst", + "crief,crf", + "croft,cft", + "cross,cs,crss", + "crossing,crsg,xing,csg", + "crossroad,crd,xroad,xrd", + "crossroads,xrds", + "crossway,cowy,crwy,xway,xwy", + "cruiseway,cuwy,crwy", + "culdesac,cds,cusac,csac", + "curve,cve,crv,crve,curv", + "cutting,cttg,ctg,cutt", + "dale,dle", + "deviation,devn", + "distributor,dstr", + "divide,div", + "diversion,divers", + "down,dn", + "downs,dns,dwns", + "drive,dr,drv,dv,dve", + "driveway,drwy,dvwy,dwy,dway,drvwy", + "drove,drov", + "easement,esmt", + "edge,edg", + "elbow,elb", + "entrance,ent,entr", + "esplanade,esp,espl", + "estate,est", + "estates,ests", + "expressway,exp,expwy,expway,expy,exwy", + "extension,ex,ext,extn,exten", + "extensions,exts", + "fairway,fawy,fy", + "fall,fl", + "falls,fls", + "farm,frm", + "farms,frms", + "ferry,fry,fy", + "field,fld,fd", + "fields,flds,fds", + "fireline,fline,flne", + "firetrack,ftrk", + "firetrail,fit,fitr", + "flat,fl,flt", + "flats,flts", + "follow,folw", + "footway,ftwy", + "ford,frd", + "foreshore,fshr", + "formation,form,fmtn", + "freeway,frwy,fw,fwy,fway", + "front,frnt", + "frontage,frtg,fr", + "gap,gp", + "garden,gdn,grd,grdn", + "gardens,gdns,grds,grdns", + "gate,ga,gte", + "gates,gtes", + "gateway,gwy,gway,gtwy,gtway", + "glade,gl,gld,glde", + "glen,gln", + "gbd,grbd,grdbd,gdbd", + "grange,gra", + "green,grn,gn,gren", + "greenway,grwy", + "ground,grnd", + "grounds,grnds", + "grove,gr,grv,grve,gro", + "gulch,glch", + "gully,gly", + "hanger,hngr", + "harbor,harbour,hbr,hrbr", + "harbors,hbrs", + "haven,hvn,havn", + "head,hd", + "heads,hds", + "heath,hth,heth", + "heights,hghts,hgts,ht,hts,hgths", + "highlands,hghlds,hlds,hglds", + "highroad,hrd,hird", + "highway,hgwy,hw,hway,hwy,hi,hwye,hywy", + "hill,hl", + "hills,hls,hils", + "hollow,hllw,holw", + "impasse,imp", + "inlet,inlt", + "interchange,intg,intchg", + "intersection,intn,intsctn", + "interstate,ih", + "island,is,id,isl,isld", + "islands,iss,ids,islds", + "junction,jct,jnc,jnct,jctn,jtn,junct", + "junctions,jcts", + "key,ky", + "keys,kys", + "knoll,knol,knl", + "knolls,knls", + "ladder,ladr", + "lagoon,lagn,lgn,lagon", + "landing,ldg,lndg,landng", + "lane,ln,la", + "laneway,lnwy", + "light,lgt,lt", + "limits,lmts", + "line,ln", + "link,lnk,lk", + "little,ltl,lttl,littl,litl,lit,lt", + "loaf,lf", + "lookout,lkt", + "loop,lp", + "loops,lps", + "lot,lt", + "lynne,lynn", + "mall,ml", + "manor,mnr", + "meadow,mdw", + "meadows,mdws,mead", + "mead,md", + "meander,mndr,mdr,mr", + "mew,mw", + "mews,mws", + "mile,mi", + "mill,ml", + "mills,mls", + "motorway,mway,mwy,mtwy", + "mount,mt", + "neaves,nvs", + "nook,nk", + "number,nbr,num,no,nmbr,nr", + "outlet,otlt", + "outlook,out,otlk", + "overbridge,ovrb", + "overlook,ovlk", + "overpass,opas", + "paddock,padk", + "palms,plms", + "parade,pde,prd,prde,pard", + "park,pk,prk", + "parklands,pkld,pklds,parkland", + "parkway,pkwy,parkwy,pky,pkway,prkwy,prkway,pkw,pwy,prkw", + "parkways,pkwys", + "part,prt", + "pass,ps", + "passage,psge,pass,pasg", + "path,pth", + "pathway,phwy,pway,pthway,pthwy,ptway,ptwy", + "peninsula,psla", + "piazza,piaz,pzza", + "pike,pk,pke", + "pine,pne,pn", + "pines,pns,pnes", + "place,pl,pla,plc,plac", + "plain,pln,pl", + "plains,plns,pls", + "plateau,plat,plt", + "plaza,plz,plza,pz", + "prarie,pr", + "pocket,pkt,pokt,pckt", + "point,piont,pnt,pt", + "pointe,pte,pnte", + "port,prt", + "ports,prts", + "prairie,pr", + "priors,prrs", + "private,pvt", + "promenade,prom,prm", + "pursuit,pur", + "quad,qd", + "quadrangle,qdgl", + "quadrant,qdrt,qd", + "quay,quy,qy", + "quays,quys,qys", + "radial,radl", + "ramble,ra,rmbl", + "ramp,rmp", + "ranae,ran", + "ranch,rnch", + "rapid,rpd", + "rapids,rpds", + "range,rng,rnge,rang", + "reach,rch", + "reserve,res,resrv,resv,rsrv,rserv,rserve,rsrve", + "rest,rst", + "retreat,rt,rtt", + "return,rtn", + "ridge,rdge,rdg", + "ridges,rdgs", + "ridgeway,rgwy,rdgwy", + "rowy,rightofway,rofw,row", + "rise,ri", + "riverway,rvwy", + "riviera,rvra", + "road,rd,ro,roa", + "roads,rds", + "roadside,rdsd", + "roadway,rdwy,rdw,rdy", + "rocks,rks", + "ronde,rnde", + "rosebowl,rsbl", + "rotary,rty", + "round,rnd", + "route,rt,rte", + "row,rw", + "run,rn", + "serviceway,swy,svwy,svcwy", + "shoal,shl", + "shoals,shls", + "shore,shor,shr", + "shores,shors,shrs", + "shunt,shun,shnt", + "siding,sdng,sdg", + "skyway,skwy", + "slope,slpe,slp", + "sound,snd", + "space,spc", + "spring,spg,sprng,sprn", + "springs,spgs,sprngs,spns", + "spur,spr", + "square,sq,sqr", + "squares,sqs", + "stairs,strs", + "stairway,stwy,strwy,strway", + "shighway,sthighway,sh,sth,shw,sthw,shwy,shgwy,sthgwy,shway,sthway,sthwy,shi,sthi,statehighway", + "sr,stateroad,sroad,stroad,staterd,srd,strd", + "sr,stateroute,sroute,stroute,statert,srt,srte,strt,strte", + "steps,stps", + "strand,stra,strnd,strd", + "strands,strnds,strds", + "stravenue,stra,strav", + "street,st,str,stre,stree,strt", + "streets,sts", + "strip,strp", + "subdivision,subdiv", + "subway,sbwy", + "summit,smt,sumt", + "tarn,tn", + "terrace,tce,ter,tr,terr,terace,terrac,terrasse,tsse", + "thicket,thick", + "thoroughfare,thor,throughfare,thfr", + "thoroughway,thwy", + "throughway,thru,thro,thruway,trwy,thwy", + "tollway,tlwy,twy", + "th,twph,tshph,thw,twphw,tshphw,thgwy,twphgwy,tshphgwy,thway,twphway,tshphway,thwy,twphwy,tshphwy,thi,twphi,tshphi", + "tr,trd,twpr,twprd,tshpr,tshprd", + "tr,trt,trte,twpr,twprt,twprte,tshpr,tshprt,tshprte", + "tower,twr", + "towers,twrs", + "townline,tline", + "trace,trce,trc", + "track,tr,trk,trak", + "trafficway,trfy", + "trail,tr,trl", + "trailer,trlr", + "tramway,tmwy", + "trees,trs", + "triangle,tri", + "trunkway,tkwy", + "tunnel,tun,tunl", + "turnabout,trnabt", + "turn,tn,trn", + "turnpike,tpk,tpke", + "underpass,upas,upass,ups", + "union,un", + "unions,uns", + "vale,va,vl", + "valley,vlly,vly,vy", + "valleys,vlys,vllys", + "viaduct,via,viad,vdct,viadct", + "view,vw", + "views,vws", + "villa,vla", + "village,vlge", + "villas,vlas", + "vista,vst,vsta,vis", + "walk,wlk,wk", + "walkway,wkwy,wky,wlkwy", + "waters,wtrs", + "way,wy", + "ways,wys", + "well,wl", + "wells,wls", + "wharf,whrf,whf", + "wynd,wyn", + "yard,yd,yrd", + "acceso,acces", + "alameda,alam", + "alquería,alqueria,alque", + "andador,andad", + "angosta,angta", + "apeadero,apdro", + "autopista,auto,autop,aut,ap", + "autovía,autovia,autov", + "avenida,av,avd,avda", + "bajada,bjada", + "banda,bda", + "barranco,branc", + "barranquillo,bqllo", + "barriada,barda", + "boulevard,blvd,bvd", + "brazal,brzal", + "bulevar,bulev,blev,blv,bv,bl", + "calle,cl,cll,ca,call", + "calleja,cllja", + "callejón,callejon,callej,cjón,cjon,cllon,cllón,cj", + "callejuela,cjla", + "callizo,cllzo", + "calzada,czada,calz", + "camino,cno,cmo,cmno,cm", + "caminito,cmt", + "camping,campg", + "cantera,cantr", + "cantón,canton,cant", + "carrera,cra,carra,carr,cr,kra,kr", + "carretera,ctra,cr,ct", + "carreterín,carreterin,ctrin", + "carretil,crtil", + "carril,crril", + "cerrada,cda,cer", + "cinturón,cinturon,cint", + "circular,cq", + "circuito,cto", + "circunvalar,cv,crv,cirv", + "circunvalación,circunvalacion,ccvcn", + "corredor,crrdo", + "costanilla,cstan", + "cuesta,custa", + "diagonal,diag,dg", + "diseminado,disem", + "espalda,eslda", + "estrada,estda", + "explanada,expla", + "extensión,ext,extension", + "extramuros,extrm", + "galería,galeria,gale", + "glorieta,gta", + "hacienda,hda", + "ladera,ldera", + "laderas,lderas", + "llanura,llnra", + "malecón,malecon,malec", + "mirador,mrdor", + "muelle,meull", + "pantano,pant", + "paraje,praje", + "parque,pque,parq,pq,pqe", + "particular,parti", + "partida,ptda", + "pasadizo,pzo", + "pasaje,psaje,psj", + "paseo,pº,p°,po,pso,pseo,pas,ps", + "pasillo,psllo", + "peatonal,peat", + "periferico,perif", + "plaza,pl,plza,pza,pz", + "plazoleta,pzta,plzta,plta", + "plazuela,plzla", + "poblado,pbdo", + "prolongación,prolongacion,prol", + "puebla,pbla", + "pueblo,pblo", + "puente,pnte", + "rambla,rbla", + "rampla,rampa,rpla", + "retorno,ret,rt", + "rincón,rincon,rcon,rin,rncn,rncon", + "rinconada,rcda,rcnda", + "ronda,rda", + "rotonda,rtda", + "ruta,rta", + "sector,sect", + "sendera,sedra", + "sendero,send,sedro", + "subida,sbida", + "tránsito,transito,trans", + "transversal,trval,trvsal,tv,tr", + "trasera,tras", + "travesía,travesia,trva,trvsía,trvsia", + "vereda,vreda,ver", + "viaducto,vcto,vd", + "vista,vst,vsta,vist", + "allée,allee,all", + "allées,alls,allees", + "arcade,arc", + "autoroute,aut", + "avenue,av,ave,aven,avenu,avn,avnu,avnue", + "avenues,avs,aves,avens,avenus,avns,avnus,avnues", + "barriêre,barriere,bre", + "barriêres,barrieres,bres", + "berge,ber", + "berges,bers", + "boucle,bcle", + "boulevard,bd,bde,blv,blvd,blvde,blvrd,boulavard,boul,boulv,bvd,boulevarde,bld", + "butte,but", + "côte,cote", + "côteau,coteau", + "campagne,cgne", + "carreau,cau,carru", + "carrefour,carf,carref", + "carrière,carriere,care", + "carrières,carrieres,cares", + "carré,carre,carr,car", + "cavée,cavee,cav", + "cercle,cercl", + "chalet,chl", + "chaussée,chaussee,chs,chee", + "chaussées,chaussees,chss,chees", + "chemin,ch,che", + "cheminement,chem", + "chemins,ches", + "château,chateau,cht", + "cloître,cloitre,cloi", + "contour,ctr", + "corniche,cor", + "corniches,cors", + "cours,crs", + "degré,degre,deg", + "degrés,degres,degs", + "descente,dsg", + "descentes,dsgs", + "digue,dig", + "digues,digs", + "échangeur,éch", + "écluse,ecluse,ecl,écl", + "écluses,ecluses,ecls,écls", + "enclave,env", + "enclos,enc", + "espace,espa", + "esplanade,esp", + "esplanades,esps", + "fosse,fos", + "fosses,fos,foss", + "foyer,foyr", + "galerie,gal", + "galeries,gals", + "garenne,garn", + "gbd,grbd,grdbd,gdbd", + "gch,grch,gdch,grdch", + "gden,gdens", + "grandrue,gr,grdr,gdr", + "gdsen,gdsens", + "grille,gri", + "grimpette,grim", + "hameau,ham", + "hchs,hschs", + "impasse,imp", + "impasses,imps", + "jetée,jetee,jte", + "jetées,jetees,jtes", + "levée,levee,leve,lve", + "montée,montee,mte", + "montées,montees,mtes", + "métro,metro,mét,met", + "parc,prc", + "parcs,prcs", + "parvis,prv", + "passage,pas,psg", + "passe,pass", + "passerelle,ple", + "passerelles,ples", + "patio,pat", + "périphérique,peripherique,peri,péri", + "place,pl", + "placis,plci", + "plage,plag", + "plages,plags", + "plaine,pln", + "plateau,plt,plat", + "plateaux,pltx,platx", + "pointe,pte,pnte", + "portique,porq,portq", + "portiques,porqs,portqs", + "pourtour,pour", + "presquîle,presquile,prq,prql", + "promenade,prom", + "peripherique,peri", + "quai,au", + "raccourci,rac,racc", + "raidillon,raid", + "rampe,rpe,rmpe,rmp", + "rempart,rem,remp", + "rocade,rocd", + "ronde,rnde", + "rdpt,rpt", + "roquet,roqt", + "rotonde,rtd,rtnd,rtde,rtnde", + "route,rt,rte", + "routes,rts,rtes", + "ruelle,rle", + "ruelles,rles", + "rues,rs", + "residence,res", + "residences,ress", + "sente,sen", + "sentes,sens", + "sentier,sent", + "sentiers,sents", + "square,sq", + "terrain,terr,trn", + "terrasse,tsse", + "terrasses,tsses", + "terte,trt", + "tertes,trts", + "traverse,tra,trvs,trvrs", + "vallon,val", + "valée,vallee,val", + "venelle,ven", + "venelles,vens", + "voie,voi", + "voies,voiss", + "aly,alley,allee,ally", + "anx,anex,annex,annx", + "arc,arcade", + "ave,avenue,av,aven,avenu,avn,avnue", + "byu,bayou,bayoo", + "bch,beach", + "bnd,bend", + "blf,bluff,bluf", + "blfs,bluffs", + "btm,bottom,bot,bottm", + "blvd,boulevard,boul,boulv", + "br,branch,brnch", + "brg,bridge,brdge", + "brk,brook", + "brks,brooks", + "bg,burg", + "bgs,burgs", + "byp,bypass,bypa,bypas,byps", + "cp,camp,cmp", + "cyn,canyon,canyn,cnyn", + "cpe,cape", + "cswy,causeway,causwa", + "ctr,center,cen,cent,centr,centre,cnter,cntr", + "ctrs,centers", + "cir,circle,circ,circl,crcl,crcle", + "cirs,circles", + "clf,cliff", + "clfs,cliffs", + "clb,club", + "cmn,common", + "cmns,commons", + "cor,corner", + "cors,corners", + "crse,course", + "ct,court", + "cts,courts", + "cv,cove", + "cvs,coves", + "crk,creek", + "cres,crescent,crsent,crsnt", + "crst,crest", + "xing,crossing,crssng", + "xrd,crossroad", + "xrds,crossroads", + "curv,curve", + "dl,dale", + "dm,dam", + "dv,divide,div,dvd", + "dr,drive,driv,drv", + "drs,drives", + "est,estate", + "ests,estates", + "expy,expressway,exp,expr,express,expw", + "ext,extension,extn,extnsn", + "exts,extensions", + "fls,falls", + "fry,ferry,frry", + "fld,field", + "flds,fields", + "flt,flat", + "flts,flats", + "frd,ford", + "frds,fords", + "frst,forest,forests", + "frg,forge,forg", + "frgs,forges", + "frk,fork", + "frks,forks", + "ft,fort,frt", + "fwy,freeway,freewy,frway,frwy", + "gdn,garden,gardn,grden,grdn", + "gdns,gardens,grdns", + "gtwy,gateway,gatewy,gatway,gtway", + "gln,glen", + "glns,glens", + "grn,green", + "grns,greens", + "grv,grove,grov", + "grvs,groves", + "hbr,harbor,harb,harbr,hrbor", + "hbrs,harbors", + "hvn,haven", + "hts,heights,ht", + "hwy,highway,highwy,hiway,hiwy,hway", + "hl,hill", + "hls,hills", + "holw,hollow,hllw,hollows,holws", + "inlt,inlet", + "is,island,islnd", + "iss,islands,islnds", + "isle,isles", + "jct,junction,jction,jctn,junctn,juncton", + "jcts,junctions,jctns", + "ky,key", + "kys,keys", + "knl,knoll,knol", + "knls,knolls", + "lk,lake", + "lks,lakes", + "lndg,landing,lndng", + "ln,lane", + "lgt,light", + "lgts,lights", + "lf,loaf", + "lck,lock", + "lcks,locks", + "ldg,lodge,ldge,lodg", + "loop,loops", + "mnr,manor", + "mnrs,manors", + "mdw,meadow", + "mdws,meadows,mdw,medows", + "ml,mill", + "mls,mills", + "msn,mission,missn,mssn", + "mtwy,motorway", + "mt,mount,mnt", + "mtn,mountain,mntain,mntn,mountin,mtin", + "mtns,mountains,mntns", + "nck,neck", + "orch,orchard,orchrd", + "oval,ovl", + "opas,overpass", + "park,parks", + "pkwy,parkway,parkwy,pkway,pky,parkways,pkwys", + "psge,passage", + "path,paths", + "pike,pikes", + "pne,pine", + "pnes,pines", + "pl,place", + "pln,plain", + "plns,plains", + "plz,plaza,plza", + "pt,point", + "pts,points", + "prt,port", + "prts,ports", + "pr,prairie,prr", + "radl,radial,rad,radiel", + "rnch,ranch,ranches,rnchs", + "rpd,rapid", + "rpds,rapids", + "rst,rest", + "rdg,ridge,rdge", + "rdgs,ridges", + "riv,river,rvr,rivr", + "rd,road", + "rds,roads", + "rte,route", + "shl,shoal", + "shls,shoals", + "shr,shore,shoar", + "shrs,shores,shoars", + "skwy,skyway", + "spg,spring,spng,sprng", + "spgs,springs,spngs,sprngs", + "spur,spurs", + "sq,square,sqr,sqre,squ", + "sqs,squares,sqrs", + "sta,station,statn,stn", + "stra,stravenue,strav,straven,stravn,strvn,strvnue", + "strm,stream,streme", + "st,street,strt,str", + "sts,streets", + "smt,summit,sumit,sumitt", + "ter,terrace,terr", + "trwy,throughway", + "trce,trace,traces", + "trak,track,tracks,trk,trks", + "trfy,trafficway", + "trl,trail,trails,trls", + "trlr,trailer,trlrs", + "tunl,tunnel,tunel,tunls,tunnels,tunnl", + "tpke,turnpike,trnpk,turnpk", + "upas,underpass", + "un,union", + "uns,unions", + "vly,valley,vally,vlly", + "vlys,valleys", + "via,viaduct,vdct,viadct", + "vw,view", + "vws,views", + "vlg,village,vill,villag,villg,villiage", + "vlgs,villages", + "vl,ville", + "vis,vista,vist,vst,vsta", + "walk,walks", + "way,wy", + "wl,well", + "wls,wells" + ] + }, + "synonyms/streets/multiword": { + "type": "synonym", + "synonyms": [ + "" + ] + } + }, + "char_filter": { + "ampersand_mapper": { + "type": "pattern_replace", + "pattern": "&", + "replacement": " AMPERSANDPLACEHOLDER " + }, + "punctuation": { + "type": "mapping", + "mappings": [ + ".=>", + "`=>", + "‘=>", + "’=>", + "‛=>", + "_=>", + "==>", + "?=>", + "'=>", + "|=>", + "\"=>", + "(=>", + ")=>", + "{=>", + "}=>", + "[=>", + "]=>", + "<=>", + ">=>", + "*=>", + "#=>", + "^=>", + "$=>", + "@=>", + "!=>", + "~=>", + ":=>", + ";=>", + "+=>", + "《=>", + "》=>", + "—=>", + "-=>", + ",=>", + "。=>", + "‹=>", + "›=>", + "⹂=>", + "〝=>", + "〞=>", + "、=>", + ":=>", + ";=>", + "!=>", + "·=>", + "?=>", + "„=>", + "“=>", + "”=>", + "‟=>", + ")=>", + "(=>", + "【=>", + "】=>", + "[=>", + "]=>", + "●=>", + "«=>", + "»=>" + ] + }, + "alphanumeric": { + "type": "pattern_replace", + "pattern": "[^a-zA-Z0-9]", + "replacement": "" + }, + "numeric": { + "type": "pattern_replace", + "pattern": "[^0-9]", + "replacement": " " + }, + "nfkc_normalizer": { + "type": "icu_normalizer", + "name": "nfkc", + "mode": "compose" + } + } + } + }, + "mappings": { + "properties": { + "source": { + "type": "keyword" + }, + "layer": { + "type": "keyword" + }, + "name": { + "type": "object", + "dynamic": true + }, + "phrase": { + "type": "object", + "dynamic": true + }, + "address_parts": { + "type": "object", + "dynamic": "strict", + "properties": { + "name": { + "type": "text", + "analyzer": "keyword", + "search_analyzer": "keyword", + "similarity": "peliasDefaultSimilarity" + }, + "unit": { + "type": "text", + "analyzer": "peliasUnit", + "search_analyzer": "peliasUnit", + "similarity": "peliasDefaultSimilarity" + }, + "number": { + "type": "text", + "analyzer": "peliasHousenumber", + "search_analyzer": "peliasHousenumber", + "similarity": "peliasDefaultSimilarity" + }, + "street": { + "type": "text", + "analyzer": "peliasStreet", + "search_analyzer": "peliasQuery", + "similarity": "peliasDefaultSimilarity" + }, + "cross_street": { + "type": "text", + "analyzer": "peliasStreet", + "search_analyzer": "peliasQuery", + "similarity": "peliasDefaultSimilarity" + }, + "zip": { + "type": "text", + "analyzer": "peliasZip", + "search_analyzer": "peliasZip", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "parent": { + "type": "object", + "dynamic": "strict", + "properties": { + "continent": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "continent_a": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "continent_id": { + "type": "keyword", + "doc_values": false + }, + "continent_source": { + "type": "keyword", + "doc_values": false + }, + "ocean": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "ocean_a": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "ocean_id": { + "type": "keyword", + "doc_values": false + }, + "ocean_source": { + "type": "keyword", + "doc_values": false + }, + "empire": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "empire_a": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "empire_id": { + "type": "keyword", + "doc_values": false + }, + "empire_source": { + "type": "keyword", + "doc_values": false + }, + "country": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "country_a": { + "type": "text", + "analyzer": "peliasIndexCountryAbbreviation", + "search_analyzer": "peliasQuery", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexCountryAbbreviationOneEdgeGram", + "search_analyzer": "peliasQuery", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "country_id": { + "type": "keyword", + "doc_values": false + }, + "country_source": { + "type": "keyword", + "doc_values": false + }, + "dependency": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "dependency_a": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "dependency_id": { + "type": "keyword", + "doc_values": false + }, + "dependency_source": { + "type": "keyword", + "doc_values": false + }, + "marinearea": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "marinearea_a": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "marinearea_id": { + "type": "keyword", + "doc_values": false + }, + "marinearea_source": { + "type": "keyword", + "doc_values": false + }, + "macroregion": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "macroregion_a": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "macroregion_id": { + "type": "keyword", + "doc_values": false + }, + "macroregion_source": { + "type": "keyword", + "doc_values": false + }, + "region": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "region_a": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "region_id": { + "type": "keyword", + "doc_values": false + }, + "region_source": { + "type": "keyword", + "doc_values": false + }, + "macrocounty": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "macrocounty_a": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "macrocounty_id": { + "type": "keyword", + "doc_values": false + }, + "macrocounty_source": { + "type": "keyword", + "doc_values": false + }, + "county": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "county_a": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "county_id": { + "type": "keyword", + "doc_values": false + }, + "county_source": { + "type": "keyword", + "doc_values": false + }, + "locality": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "locality_a": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "locality_id": { + "type": "keyword", + "doc_values": false + }, + "locality_source": { + "type": "keyword", + "doc_values": false + }, + "borough": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "borough_a": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "borough_id": { + "type": "keyword", + "doc_values": false + }, + "borough_source": { + "type": "keyword", + "doc_values": false + }, + "localadmin": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "localadmin_a": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "localadmin_id": { + "type": "keyword", + "doc_values": false + }, + "localadmin_source": { + "type": "keyword", + "doc_values": false + }, + "neighbourhood": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "neighbourhood_a": { + "type": "text", + "analyzer": "peliasAdmin", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasAdmin", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "neighbourhood_id": { + "type": "keyword", + "doc_values": false + }, + "neighbourhood_source": { + "type": "keyword", + "doc_values": false + }, + "postalcode": { + "type": "text", + "analyzer": "peliasZip", + "search_analyzer": "peliasZip", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasZip", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "postalcode_a": { + "type": "text", + "analyzer": "peliasZip", + "search_analyzer": "peliasZip", + "similarity": "peliasDefaultSimilarity", + "fields": { + "ngram": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasZip", + "similarity": "peliasDefaultSimilarity" + } + } + }, + "postalcode_id": { + "type": "keyword", + "doc_values": false + }, + "postalcode_source": { + "type": "keyword", + "doc_values": false + } + } + }, + "center_point": { + "type": "geo_point" + }, + "shape": { + "type": "geo_shape" + }, + "bounding_box": { + "type": "keyword", + "index": false + }, + "source_id": { + "type": "keyword", + "doc_values": false + }, + "category": { + "type": "keyword", + "doc_values": false + }, + "population": { + "type": "long", + "null_value": 0 + }, + "popularity": { + "type": "long", + "null_value": 0 + }, + "addendum": { + "type": "object", + "dynamic": true + } + }, + "dynamic_templates": [ + { + "nameGram": { + "path_match": "name.*", + "match_mapping_type": "string", + "mapping": { + "type": "text", + "analyzer": "peliasIndexOneEdgeGram", + "search_analyzer": "peliasQuery", + "similarity": "peliasDefaultSimilarity" + } + } + }, + { + "phrase": { + "path_match": "phrase.*", + "match_mapping_type": "string", + "mapping": { + "type": "text", + "analyzer": "peliasPhrase", + "search_analyzer": "peliasQuery", + "similarity": "peliasDefaultSimilarity" + } + } + }, + { + "addendum": { + "path_match": "addendum.*", + "match_mapping_type": "string", + "mapping": { + "type": "keyword", + "index": false, + "doc_values": false + } + } + } + ], + "_source": { + "excludes": [ + "shape", + "phrase" + ] + }, + "dynamic": "strict" + } +} diff --git a/test/settings.js b/test/settings.js index 78c6b2ba..3e8c0e36 100644 --- a/test/settings.js +++ b/test/settings.js @@ -1,6 +1,7 @@ -var path = require('path'), +const path = require('path'), settings = require('../settings'), - fs = require('fs'); + fs = require('fs'), + config = require('pelias-config').generate(); module.exports.tests = {}; @@ -48,6 +49,20 @@ module.exports.tests.analysis = function(test, common) { }); }; +function mayBeAmpersandMapper() { + if (config.schema.icuTokenizer) { + return ['ampersand_mapper']; + } + return []; +} + +function mayBeAmpersandReplacer() { + if (config.schema.icuTokenizer) { + return ['ampersand_replacer']; + } + return []; +} + // -- analyzers -- module.exports.tests.peliasAdminAnalyzer = function(test, common) { @@ -57,13 +72,13 @@ module.exports.tests.peliasAdminAnalyzer = function(test, common) { var analyzer = s.analysis.analyzer.peliasAdmin; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ['punctuation', 'nfkc_normalizer'], 'character filters specified'); + t.deepEqual(analyzer.char_filter, [...mayBeAmpersandMapper(), 'punctuation', 'nfkc_normalizer'], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasAdmin token filters', function (t) { var analyzer = settings().analysis.analyzer.peliasAdmin; - t.deepEqual(analyzer.filter, [ + t.deepEqual(analyzer.filter, [...mayBeAmpersandReplacer(), "lowercase", "trim", "synonyms/custom_admin/multiword", @@ -85,13 +100,14 @@ module.exports.tests.peliasIndexOneEdgeGramAnalyzer = function(test, common) { var analyzer = s.analysis.analyzer.peliasIndexOneEdgeGram; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ["punctuation","nfkc_normalizer"], 'character filters specified'); + t.deepEqual(analyzer.char_filter, [...mayBeAmpersandMapper(), "punctuation","nfkc_normalizer"], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasIndexOneEdgeGram token filters', function(t) { var analyzer = settings().analysis.analyzer.peliasIndexOneEdgeGram; t.deepEqual( analyzer.filter, [ + ...mayBeAmpersandReplacer(), "lowercase", "trim", "synonyms/custom_name/multiword", @@ -117,13 +133,14 @@ module.exports.tests.peliasQueryAnalyzer = function (test, common) { var analyzer = s.analysis.analyzer.peliasQuery; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ['punctuation', 'nfkc_normalizer'], 'character filters specified'); + t.deepEqual(analyzer.char_filter, [...mayBeAmpersandMapper(), 'punctuation', 'nfkc_normalizer'], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasQuery token filters', function (t) { var analyzer = settings().analysis.analyzer.peliasQuery; t.deepEqual(analyzer.filter, [ + ...mayBeAmpersandReplacer(), 'lowercase', 'trim', 'icu_folding', @@ -143,13 +160,14 @@ module.exports.tests.peliasPhraseAnalyzer = function(test, common) { var analyzer = s.analysis.analyzer.peliasPhrase; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ["punctuation","nfkc_normalizer"], 'character filters specified'); + t.deepEqual(analyzer.char_filter, [...mayBeAmpersandMapper(), "punctuation", "nfkc_normalizer"], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasPhrase token filters', function(t) { var analyzer = settings().analysis.analyzer.peliasPhrase; t.deepEqual( analyzer.filter, [ + ...mayBeAmpersandReplacer(), "lowercase", "trim", "remove_duplicate_spaces", @@ -236,13 +254,13 @@ module.exports.tests.peliasStreetAnalyzer = function(test, common) { var analyzer = s.analysis.analyzer.peliasStreet; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ['punctuation', 'nfkc_normalizer'], 'character filters specified'); + t.deepEqual(analyzer.char_filter, [...mayBeAmpersandMapper(), 'punctuation', 'nfkc_normalizer'], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasStreet token filters', function(t) { var analyzer = settings().analysis.analyzer.peliasStreet; - t.deepEqual( analyzer.filter, [ + t.deepEqual( analyzer.filter, [...mayBeAmpersandReplacer(), "lowercase", "trim", "remove_duplicate_spaces", @@ -266,13 +284,13 @@ module.exports.tests.peliasIndexCountryAbbreviation = function (test, common) { var analyzer = s.analysis.analyzer.peliasIndexCountryAbbreviation; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ['punctuation', 'nfkc_normalizer'], 'character filters specified'); + t.deepEqual(analyzer.char_filter, [...mayBeAmpersandMapper(), 'punctuation', 'nfkc_normalizer'], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasIndexCountryAbbreviation token filters', function (t) { var analyzer = settings().analysis.analyzer.peliasIndexCountryAbbreviation; - t.deepEqual(analyzer.filter, [ + t.deepEqual(analyzer.filter, [...mayBeAmpersandReplacer(), "lowercase", "trim", "icu_folding", @@ -292,13 +310,14 @@ module.exports.tests.peliasIndexCountryAbbreviationOneEdgeGramAnalyzer = functio var analyzer = s.analysis.analyzer.peliasIndexCountryAbbreviationOneEdgeGram; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ["punctuation", "nfkc_normalizer"], 'character filters specified'); + t.deepEqual(analyzer.char_filter, [...mayBeAmpersandMapper(), "punctuation", "nfkc_normalizer"], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasIndexCountryAbbreviationOneEdgeGram token filters', function (t) { var analyzer = settings().analysis.analyzer.peliasIndexCountryAbbreviationOneEdgeGram; t.deepEqual(analyzer.filter, [ + ...mayBeAmpersandReplacer(), "lowercase", "trim", "icu_folding",