From 80d483cb2e512ae130005048d2b19a1330aa45bb Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Tue, 4 Feb 2025 12:47:34 +0100 Subject: [PATCH] feat(schema): add beta ICU tokenizer --- .github/workflows/_unit_tests.yml | 4 +- .gitignore | 1 + .../analyzer_peliasIndexOneEdgeGram.js | 2 +- integration/analyzer_peliasQuery.js | 10 +-- integration/analyzer_peliasStreet.js | 2 +- settings-icu.js | 49 +++++++++++++ settings.js | 71 +++++-------------- test/compile.js | 1 - test/fixtures/expected-icu-tokenizer.json | 2 +- test/fixtures/expected.json | 10 --- 10 files changed, 76 insertions(+), 76 deletions(-) create mode 100644 settings-icu.js diff --git a/.github/workflows/_unit_tests.yml b/.github/workflows/_unit_tests.yml index c4d77d29..d0ef57f0 100644 --- a/.github/workflows/_unit_tests.yml +++ b/.github/workflows/_unit_tests.yml @@ -22,6 +22,4 @@ jobs: jq -n '{ schema: { icuTokenizer: true } }' > $(pwd)/config-icu.json export PELIAS_CONFIG=$(pwd)/config-icu.json fi - npm run test - - + npm run test \ No newline at end of file diff --git a/.gitignore b/.gitignore index b0e3907b..27df0198 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ node_modules npm-debug.log .DS_Store +config-icu.json diff --git a/integration/analyzer_peliasIndexOneEdgeGram.js b/integration/analyzer_peliasIndexOneEdgeGram.js index 71f03b7c..cd4fdb4c 100644 --- a/integration/analyzer_peliasIndexOneEdgeGram.js +++ b/integration/analyzer_peliasIndexOneEdgeGram.js @@ -89,7 +89,7 @@ module.exports.tests.analyze = function(test, common){ if (config.schema.icuTokenizer) { assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', [ '0:ซ', '0:ซอ', '0:ซอย', - '1:เพชรบุรี1', '1:เพชรบุรี', '1:เพชรบุร', '1:เพชรบุ', '1:เพชรบ', '1:เพชร', '1:เพช', '1:เพ', '1:เ', + '1:เพชรบุรี1', '1:เพชรบุรี', '1:เพชรบุร', '1:เพชรบุ', '1:เพชรบ', '1:เพชร', '1:เพช', '1:เพ', '1:เ', '2:f', '2:fo', '2:foo'] ); } else { // no ICU tokenization, so we split only on spaces diff --git a/integration/analyzer_peliasQuery.js b/integration/analyzer_peliasQuery.js index 86e21ac1..47d2f76b 100644 --- a/integration/analyzer_peliasQuery.js +++ b/integration/analyzer_peliasQuery.js @@ -55,19 +55,19 @@ module.exports.tests.functional = function(test, common){ assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] ); assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] ); assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]); - assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室', + assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室', ['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']); // correct word by word split according to native speaker: 马来西亚 / 霹雳州 / 怡保 / 31400, 怡保花园 / 第5巷 / 45号 - assertAnalysis('chinese_address2', '马来西亚霹雳州怡保31400怡保花园第5巷45号', + assertAnalysis('chinese_address2', '马来西亚霹雳州怡保31400怡保花园第5巷45号', ["马来", "西亚", "霹", "雳", "州", "怡", "保", "31400", "怡", "保", "花园", "第", "5", "巷", "45", "号"]); // correct word by word split: 马来西亚 / 柔佛新山 / 81200 / , / 士古来路 / , / 百万时尚广场 - assertAnalysis('chinese_address3', '马来西亚柔佛新山81200士古来路百万时尚广场', + assertAnalysis('chinese_address3', '马来西亚柔佛新山81200士古来路百万时尚广场', ["马来", "西亚", "柔", "佛", "新山", "81200", "士", "古来", "路", "百万", "时尚", "广场"]); // correct word by word split: 马来西亚/ 槟城 / 亚依淡 / 11500 / , / 极乐寺 / , / 回返路 - assertAnalysis('chinese_address4', '马来西亚槟城亚依淡11500极乐寺回返路', + assertAnalysis('chinese_address4', '马来西亚槟城亚依淡11500极乐寺回返路', ["马来", "西亚", "槟", "城", "亚", "依", "淡", "11500", "极乐", "寺", "回", "返", "路"]); // correct word by word split: 马来西亚 / 吉隆坡 / 50000 / , / 茨厂街 / 123号 - assertAnalysis('chinese_address5', '马来西亚吉隆坡50000茨厂街123号', + assertAnalysis('chinese_address5', '马来西亚吉隆坡50000茨厂街123号', ["马来", "西亚", "吉隆坡", "50000", "茨", "厂", "街", "123", "号"]); assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]); diff --git a/integration/analyzer_peliasStreet.js b/integration/analyzer_peliasStreet.js index a43d93c8..7f861f9a 100644 --- a/integration/analyzer_peliasStreet.js +++ b/integration/analyzer_peliasStreet.js @@ -28,7 +28,7 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] ); assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] ); assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]); - assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室', + assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室', ['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']); assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]); assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]); diff --git a/settings-icu.js b/settings-icu.js new file mode 100644 index 00000000..89f8205c --- /dev/null +++ b/settings-icu.js @@ -0,0 +1,49 @@ +const _ = require('lodash'); + +/** + * This module contains modifications to the Pelias schema to adopt the elastic ICU tokenizer. + * This tokenizer improves word-splitting of non-latin alphabets (particularly Asian languages). + * + * It can be enabled by setting `config.schema.icuTokenizer` in your `pelias.json` config. + * Note: this must be set *before* you run create your elasticsearch index or it will have no effect. + * + * This feature is considered beta, we encourage testing & feedback from the community in order + * to adopt the ICU tokenizer as our default. + * + * https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu-tokenizer.html + * https://github.com/pelias/schema/pull/498 + */ + +module.exports = (settings) => { + + // replace pattern tokenizer with icu_tokenizer + _.set(settings, 'analysis.tokenizer.peliasTokenizer', { + 'type': 'icu_tokenizer' + }); + + // add ampersand_replacer filter + // replaces ampersand placeholders back to `&` (see `ampersand_mapper` char_filter) + _.set(settings, 'analysis.filter.ampersand_replacer', { + 'type': 'pattern_replace', + 'pattern': 'AMPERSANDPLACEHOLDER', + 'replacement': '&' + }); + + // add ampersand_mapper char_filter + // icu-tokenizer treats ampersands as a word boundary, so we replace them with a placeholder to avoid it, + // as we want to handle them separately, we replace them back after tokenization (see `ampersand_replacer` filter) + _.set(settings, 'analysis.char_filter.ampersand_mapper', { + 'type': 'pattern_replace', + 'pattern': '&', + 'replacement': ' AMPERSANDPLACEHOLDER ' + }); + + // prepend ampersand mapper/replacer to each analyzer + _.forEach(_.get(settings, 'analysis.analyzer'), (block) => { + if (block?.tokenizer !== 'peliasTokenizer') { return; } + block.filter.unshift('ampersand_replacer'); + block.char_filter.unshift('ampersand_mapper'); + }); + + return settings; +} \ No newline at end of file diff --git a/settings.js b/settings.js index 7e071761..e6b5a951 100644 --- a/settings.js +++ b/settings.js @@ -2,6 +2,7 @@ const _ = require('lodash'); const peliasConfig = require('pelias-config'); const punctuation = require('./punctuation'); const synonyms = require('./synonyms/loader').load(); +const settingsICU = require('./settings-icu'); require('./configValidation').validate(peliasConfig.generate()); @@ -10,32 +11,6 @@ require('./configValidation').validate(peliasConfig.generate()); function generate(){ const config = peliasConfig.generate(); - function tokenizer(config) { - if (config.schema.icuTokenizer) { - return { - "type": "icu_tokenizer" - }; - } - return { - "type": "pattern", - "pattern": "[\\s,/\\\\-]+" - }; - } - - function mayBeAmpersandMapper(config) { - if (config.schema.icuTokenizer) { - return ["ampersand_mapper"]; - } - return []; - } - - function mayBeAmpersandReplacer(config) { - if (config.schema.icuTokenizer) { - return ["ampersand_replacer"]; - } - return []; - } - // Default settings let settings = { "index": { @@ -49,15 +24,17 @@ function generate(){ }, "analysis": { "tokenizer": { - "peliasTokenizer": tokenizer(config) + "peliasTokenizer": { + "type": "pattern", + "pattern": "[\\s,/\\\\-]+" + } }, "analyzer": { "peliasAdmin": { "type": "custom", "tokenizer": "peliasTokenizer", - "char_filter" : [...mayBeAmpersandMapper(config), "punctuation", "nfkc_normalizer"], + "char_filter" : ["punctuation", "nfkc_normalizer"], "filter": [ - ...mayBeAmpersandReplacer(config), "lowercase", "trim", "synonyms/custom_admin/multiword", @@ -72,9 +49,8 @@ function generate(){ "peliasIndexOneEdgeGram" : { "type": "custom", "tokenizer" : "peliasTokenizer", - "char_filter" : [...mayBeAmpersandMapper(config), "punctuation", "nfkc_normalizer"], + "char_filter" : ["punctuation", "nfkc_normalizer"], "filter": [ - ...mayBeAmpersandReplacer(config), "lowercase", "trim", "synonyms/custom_name/multiword", @@ -93,9 +69,8 @@ function generate(){ "peliasQuery": { "type": "custom", "tokenizer": "peliasTokenizer", - "char_filter": [...mayBeAmpersandMapper(config), "punctuation", "nfkc_normalizer"], + "char_filter": ["punctuation", "nfkc_normalizer"], "filter": [ - ...mayBeAmpersandReplacer(config), "lowercase", "trim", "icu_folding", @@ -108,9 +83,8 @@ function generate(){ "peliasPhrase": { "type": "custom", "tokenizer":"peliasTokenizer", - "char_filter" : [...mayBeAmpersandMapper(config), "punctuation", "nfkc_normalizer"], + "char_filter" : ["punctuation", "nfkc_normalizer"], "filter": [ - ...mayBeAmpersandReplacer(config), "lowercase", "trim", "remove_duplicate_spaces", @@ -158,9 +132,8 @@ function generate(){ "peliasStreet": { "type": "custom", "tokenizer":"peliasTokenizer", - "char_filter" : [...mayBeAmpersandMapper(config), "punctuation", "nfkc_normalizer"], + "char_filter" : ["punctuation", "nfkc_normalizer"], "filter": [ - ...mayBeAmpersandReplacer(config), "lowercase", "trim", "remove_duplicate_spaces", @@ -177,9 +150,8 @@ function generate(){ "peliasIndexCountryAbbreviation": { "type": "custom", "tokenizer": "peliasTokenizer", - "char_filter": [...mayBeAmpersandMapper(config), "punctuation", "nfkc_normalizer"], + "char_filter": ["punctuation", "nfkc_normalizer"], "filter": [ - ...mayBeAmpersandReplacer(config), "lowercase", "trim", "icu_folding", @@ -192,9 +164,8 @@ function generate(){ "peliasIndexCountryAbbreviationOneEdgeGram": { "type": "custom", "tokenizer": "peliasTokenizer", - "char_filter": [...mayBeAmpersandMapper(config), "punctuation", "nfkc_normalizer"], + "char_filter": ["punctuation", "nfkc_normalizer"], "filter": [ - ...mayBeAmpersandReplacer(config), "lowercase", "trim", "icu_folding", @@ -207,12 +178,6 @@ function generate(){ }, }, "filter" : { - // replaces ampersand placeholders back to `&` (see `ampersand_mapper` char_filter) - "ampersand_replacer": { - "type": "pattern_replace", - "pattern": "AMPERSANDPLACEHOLDER", - "replacement": "&" - }, "street_synonyms_multiplexer": { "type": "multiplexer", "preserve_original": false, @@ -286,13 +251,6 @@ function generate(){ // more generated below }, "char_filter": { - // icu-tokenizer treats ampersands as a word boundary, so we replace them with a placeholder to avoid it, - // as we want to handle them separately, we replace them back after tokenization (see `ampersand_replacer` filter) - "ampersand_mapper": { - "type": "pattern_replace", - "pattern": "&", - "replacement": " AMPERSANDPLACEHOLDER " - }, "punctuation" : { "type" : "mapping", "mappings" : punctuation.blacklist.map(function(c){ @@ -344,6 +302,11 @@ function generate(){ }; }); + // Experimental ICU tokenizer + if (config.schema.icuTokenizer) { + settings = settingsICU(settings); + } + // Merge settings from pelias/config settings = _.merge({}, settings, _.get(config, 'elasticsearch.settings', {})); diff --git a/test/compile.js b/test/compile.js index 794e5f82..1beab8db 100644 --- a/test/compile.js +++ b/test/compile.js @@ -3,7 +3,6 @@ const path = require('path'); const schema = require('../'); const fixture = require('./fixtures/expected.json'); const fixtureICUTokenizer = require('./fixtures/expected-icu-tokenizer.json'); -const config = require('pelias-config'); const forEachDeep = (obj, cb) => _.forEach(obj, (val, key) => { diff --git a/test/fixtures/expected-icu-tokenizer.json b/test/fixtures/expected-icu-tokenizer.json index e7439549..fcfa2008 100644 --- a/test/fixtures/expected-icu-tokenizer.json +++ b/test/fixtures/expected-icu-tokenizer.json @@ -3056,4 +3056,4 @@ }, "dynamic": "strict" } -} \ No newline at end of file +} diff --git a/test/fixtures/expected.json b/test/fixtures/expected.json index 66946af1..8bddef1e 100644 --- a/test/fixtures/expected.json +++ b/test/fixtures/expected.json @@ -197,11 +197,6 @@ } }, "filter": { - "ampersand_replacer": { - "type": "pattern_replace", - "pattern": "AMPERSANDPLACEHOLDER", - "replacement": "&" - }, "street_synonyms_multiplexer": { "type": "multiplexer", "preserve_original": false, @@ -2276,11 +2271,6 @@ } }, "char_filter": { - "ampersand_mapper": { - "type": "pattern_replace", - "pattern": "&", - "replacement": " AMPERSANDPLACEHOLDER " - }, "punctuation": { "type": "mapping", "mappings": [