Skip to content

Commit

Permalink
feat(schema): add beta ICU tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
missinglink committed Feb 4, 2025
1 parent 72b4e2b commit eefcaaa
Show file tree
Hide file tree
Showing 10 changed files with 79 additions and 81 deletions.
4 changes: 1 addition & 3 deletions .github/workflows/_unit_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,4 @@ jobs:
jq -n '{ schema: { icuTokenizer: true } }' > $(pwd)/config-icu.json
export PELIAS_CONFIG=$(pwd)/config-icu.json
fi
npm run test

npm run test
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
node_modules
npm-debug.log
.DS_Store
config-icu.json
2 changes: 1 addition & 1 deletion integration/analyzer_peliasIndexOneEdgeGram.js
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ module.exports.tests.analyze = function(test, common){
if (config.schema.icuTokenizer) {
assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', [
'0:ซ', '0:ซอ', '0:ซอย',
'1:เพชรบุรี1', '1:เพชรบุรี', '1:เพชรบุร', '1:เพชรบุ', '1:เพชรบ', '1:เพชร', '1:เพช', '1:เพ', '1:เ',
'1:เพชรบุรี1', '1:เพชรบุรี', '1:เพชรบุร', '1:เพชรบุ', '1:เพชรบ', '1:เพชร', '1:เพช', '1:เพ', '1:เ',
'2:f', '2:fo', '2:foo'] );
} else {
// no ICU tokenization, so we split only on spaces
Expand Down
10 changes: 5 additions & 5 deletions integration/analyzer_peliasQuery.js
Original file line number Diff line number Diff line change
Expand Up @@ -55,19 +55,19 @@ module.exports.tests.functional = function(test, common){
assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] );
assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] );
assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]);
assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室',
assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室',
['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']);
// correct word by word split according to native speaker: 马来西亚 / 霹雳州 / 怡保 / 31400, 怡保花园 / 第5巷 / 45号
assertAnalysis('chinese_address2', '马来西亚霹雳州怡保31400怡保花园第5巷45号',
assertAnalysis('chinese_address2', '马来西亚霹雳州怡保31400怡保花园第5巷45号',
["马来", "西亚", "霹", "雳", "州", "怡", "保", "31400", "怡", "保", "花园", "第", "5", "巷", "45", "号"]);
// correct word by word split: 马来西亚 / 柔佛新山 / 81200 / , / 士古来路 / , / 百万时尚广场
assertAnalysis('chinese_address3', '马来西亚柔佛新山81200士古来路百万时尚广场',
assertAnalysis('chinese_address3', '马来西亚柔佛新山81200士古来路百万时尚广场',
["马来", "西亚", "柔", "佛", "新山", "81200", "士", "古来", "路", "百万", "时尚", "广场"]);
// correct word by word split: 马来西亚/ 槟城 / 亚依淡 / 11500 / , / 极乐寺 / , / 回返路
assertAnalysis('chinese_address4', '马来西亚槟城亚依淡11500极乐寺回返路',
assertAnalysis('chinese_address4', '马来西亚槟城亚依淡11500极乐寺回返路',
["马来", "西亚", "槟", "城", "亚", "依", "淡", "11500", "极乐", "寺", "回", "返", "路"]);
// correct word by word split: 马来西亚 / 吉隆坡 / 50000 / , / 茨厂街 / 123号
assertAnalysis('chinese_address5', '马来西亚吉隆坡50000茨厂街123号',
assertAnalysis('chinese_address5', '马来西亚吉隆坡50000茨厂街123号',
["马来", "西亚", "吉隆坡", "50000", "茨", "厂", "街", "123", "号"]);

assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]);
Expand Down
2 changes: 1 addition & 1 deletion integration/analyzer_peliasStreet.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ module.exports.tests.analyze = function(test, common){
assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] );
assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] );
assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]);
assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室',
assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室',
['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']);
assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]);
assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]);
Expand Down
49 changes: 49 additions & 0 deletions settings-icu.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
const _ = require('lodash');

/**
* This module contains modifications to the Pelias schema to adopt the elastic ICU tokenizer.
* This tokenizer improves word-splitting of non-latin alphabets (particularly Asian languages).
*
* It can be enabled by setting `config.schema.icuTokenizer` in your `pelias.json` config.
* Note: this must be set *before* you create your elasticsearch index or it will have no effect.
*
* This feature is considered beta; we encourage testing & feedback from the community in order
* to adopt the ICU tokenizer as our default.
*
* https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu-tokenizer.html
* https://github.com/pelias/schema/pull/498
*/

module.exports = (settings) => {

// replace pattern tokenizer with icu_tokenizer
_.set(settings, 'analysis.tokenizer.peliasTokenizer', {
'type': 'icu_tokenizer'
});

// add ampersand_replacer filter
// replaces ampersand placeholders back to `&` (see `ampersand_mapper` char_filter)
_.set(settings, 'analysis.filter.ampersand_replacer', {
'type': 'pattern_replace',
'pattern': 'AMPERSANDPLACEHOLDER',
'replacement': '&'
});

// add ampersand_mapper char_filter
// icu-tokenizer treats ampersands as a word boundary, so we replace them with a placeholder to avoid it,
// as we want to handle them separately, we replace them back after tokenization (see `ampersand_replacer` filter)
_.set(settings, 'analysis.char_filter.ampersand_mapper', {
'type': 'pattern_replace',
'pattern': '&',
'replacement': ' AMPERSANDPLACEHOLDER '
});

// prepend ampersand mapper/replacer to each analyzer
_.forEach(_.get(settings, 'analysis.analyzer'), (block) => {
if (block?.tokenizer !== 'peliasTokenizer') { return; }
block.filter.unshift('ampersand_replacer');
block.char_filter.unshift('ampersand_mapper');
});

return settings;
}
73 changes: 17 additions & 56 deletions settings.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,13 @@ const _ = require('lodash');
const peliasConfig = require('pelias-config');
const punctuation = require('./punctuation');
const synonyms = require('./synonyms/loader').load();
const settingsICU = require('./settings-icu');

require('./configValidation').validate(peliasConfig.generate());



function generate(){
const config = peliasConfig.generate();

function tokenizer(config) {
if (config.schema.icuTokenizer) {
return {
"type": "icu_tokenizer"
};
}
return {
"type": "pattern",
"pattern": "[\\s,/\\\\-]+"
};
}

function mayBeAmpersandMapper(config) {
if (config.schema.icuTokenizer) {
return ["ampersand_mapper"];
}
return [];
}

function mayBeAmpersandReplacer(config) {
if (config.schema.icuTokenizer) {
return ["ampersand_replacer"];
}
return [];
}

// Default settings
let settings = {
"index": {
Expand All @@ -49,15 +22,17 @@ function generate(){
},
"analysis": {
"tokenizer": {
"peliasTokenizer": tokenizer(config)
"peliasTokenizer": {
"type": "pattern",
"pattern": "[\\s,/\\\\-]+"
}
},
"analyzer": {
"peliasAdmin": {
"type": "custom",
"tokenizer": "peliasTokenizer",
"char_filter" : [...mayBeAmpersandMapper(config), "punctuation", "nfkc_normalizer"],
"char_filter" : ["punctuation", "nfkc_normalizer"],
"filter": [
...mayBeAmpersandReplacer(config),
"lowercase",
"trim",
"synonyms/custom_admin/multiword",
Expand All @@ -72,9 +47,8 @@ function generate(){
"peliasIndexOneEdgeGram" : {
"type": "custom",
"tokenizer" : "peliasTokenizer",
"char_filter" : [...mayBeAmpersandMapper(config), "punctuation", "nfkc_normalizer"],
"char_filter" : ["punctuation", "nfkc_normalizer"],
"filter": [
...mayBeAmpersandReplacer(config),
"lowercase",
"trim",
"synonyms/custom_name/multiword",
Expand All @@ -93,9 +67,8 @@ function generate(){
"peliasQuery": {
"type": "custom",
"tokenizer": "peliasTokenizer",
"char_filter": [...mayBeAmpersandMapper(config), "punctuation", "nfkc_normalizer"],
"char_filter": ["punctuation", "nfkc_normalizer"],
"filter": [
...mayBeAmpersandReplacer(config),
"lowercase",
"trim",
"icu_folding",
Expand All @@ -108,9 +81,8 @@ function generate(){
"peliasPhrase": {
"type": "custom",
"tokenizer":"peliasTokenizer",
"char_filter" : [...mayBeAmpersandMapper(config), "punctuation", "nfkc_normalizer"],
"char_filter" : ["punctuation", "nfkc_normalizer"],
"filter": [
...mayBeAmpersandReplacer(config),
"lowercase",
"trim",
"remove_duplicate_spaces",
Expand Down Expand Up @@ -158,9 +130,8 @@ function generate(){
"peliasStreet": {
"type": "custom",
"tokenizer":"peliasTokenizer",
"char_filter" : [...mayBeAmpersandMapper(config), "punctuation", "nfkc_normalizer"],
"char_filter" : ["punctuation", "nfkc_normalizer"],
"filter": [
...mayBeAmpersandReplacer(config),
"lowercase",
"trim",
"remove_duplicate_spaces",
Expand All @@ -177,9 +148,8 @@ function generate(){
"peliasIndexCountryAbbreviation": {
"type": "custom",
"tokenizer": "peliasTokenizer",
"char_filter": [...mayBeAmpersandMapper(config), "punctuation", "nfkc_normalizer"],
"char_filter": ["punctuation", "nfkc_normalizer"],
"filter": [
...mayBeAmpersandReplacer(config),
"lowercase",
"trim",
"icu_folding",
Expand All @@ -192,9 +162,8 @@ function generate(){
"peliasIndexCountryAbbreviationOneEdgeGram": {
"type": "custom",
"tokenizer": "peliasTokenizer",
"char_filter": [...mayBeAmpersandMapper(config), "punctuation", "nfkc_normalizer"],
"char_filter": ["punctuation", "nfkc_normalizer"],
"filter": [
...mayBeAmpersandReplacer(config),
"lowercase",
"trim",
"icu_folding",
Expand All @@ -207,12 +176,6 @@ function generate(){
},
},
"filter" : {
// replaces ampersand placeholders back to `&` (see `ampersand_mapper` char_filter)
"ampersand_replacer": {
"type": "pattern_replace",
"pattern": "AMPERSANDPLACEHOLDER",
"replacement": "&"
},
"street_synonyms_multiplexer": {
"type": "multiplexer",
"preserve_original": false,
Expand Down Expand Up @@ -286,13 +249,6 @@ function generate(){
// more generated below
},
"char_filter": {
// icu-tokenizer treats ampersands as a word boundary, so we replace them with a placeholder to avoid it,
// as we want to handle them separately, we replace them back after tokenization (see `ampersand_replacer` filter)
"ampersand_mapper": {
"type": "pattern_replace",
"pattern": "&",
"replacement": " AMPERSANDPLACEHOLDER "
},
"punctuation" : {
"type" : "mapping",
"mappings" : punctuation.blacklist.map(function(c){
Expand Down Expand Up @@ -344,6 +300,11 @@ function generate(){
};
});

// Experimental ICU tokenizer
if (config.schema.icuTokenizer) {
settings = settingsICU(settings);
}

// Merge settings from pelias/config
settings = _.merge({}, settings, _.get(config, 'elasticsearch.settings', {}));

Expand Down
7 changes: 3 additions & 4 deletions test/compile.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ const path = require('path');
const schema = require('../');
const fixture = require('./fixtures/expected.json');
const fixtureICUTokenizer = require('./fixtures/expected-icu-tokenizer.json');
const config = require('pelias-config');

const forEachDeep = (obj, cb) =>
_.forEach(obj, (val, key) => {
Expand Down Expand Up @@ -99,13 +98,13 @@ module.exports.tests.analyzers = function (test, common) {
};

function overridePeliasConfig(value, cb) {
const old_PELIAS_CONFIG = process.env.PELIAS_CONFIG;
const OLD_PELIAS_CONFIG = process.env.PELIAS_CONFIG;
process.env.PELIAS_CONFIG = value;

cb();

if (old_PELIAS_CONFIG) {
process.env.PELIAS_CONFIG = old_PELIAS_CONFIG;
if (OLD_PELIAS_CONFIG) {
process.env.PELIAS_CONFIG = OLD_PELIAS_CONFIG;
} else {
delete process.env.PELIAS_CONFIG;
}
Expand Down
2 changes: 1 addition & 1 deletion test/fixtures/expected-icu-tokenizer.json
Original file line number Diff line number Diff line change
Expand Up @@ -3056,4 +3056,4 @@
},
"dynamic": "strict"
}
}
}
10 changes: 0 additions & 10 deletions test/fixtures/expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -197,11 +197,6 @@
}
},
"filter": {
"ampersand_replacer": {
"type": "pattern_replace",
"pattern": "AMPERSANDPLACEHOLDER",
"replacement": "&"
},
"street_synonyms_multiplexer": {
"type": "multiplexer",
"preserve_original": false,
Expand Down Expand Up @@ -2276,11 +2271,6 @@
}
},
"char_filter": {
"ampersand_mapper": {
"type": "pattern_replace",
"pattern": "&",
"replacement": " AMPERSANDPLACEHOLDER "
},
"punctuation": {
"type": "mapping",
"mappings": [
Expand Down

0 comments on commit eefcaaa

Please sign in to comment.