Skip to content

Commit

Permalink
feat(schema): add beta ICU tokenizer
Browse files Browse the repository at this point in the history
* Use the ICU tokenizer to improve support for some Asian languages

* Remove unused import

* Add more Chinese test cases

* Add icuTokenizer flag

* Implement ICU tokenizer test

* Run unit tests for both ICU = true/false

* Run tests for both ICU = true/false

* add fixtures

* Fix bug in settings

* Fix tests

* Fix tests

* Fix tests

* Fix tests

* feat(schema): add beta ICU tokenizer

---------

Co-authored-by: Peter Johnson <insomnia@rcpt.at>
  • Loading branch information
SiarheiFedartsou and missinglink authored Feb 4, 2025
1 parent 41bd2d1 commit 1098354
Show file tree
Hide file tree
Showing 13 changed files with 3,272 additions and 23 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/_integration_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ jobs:
node-version: [18.x, 20.x, 22.x]
es-version: [7.6.1]
jdk-version: [oraclejdk11]
icuTokenizer: [true, false]
steps:
- uses: actions/checkout@v4
- name: Install node.js ${{ matrix.node-version }}
Expand All @@ -23,6 +24,10 @@ jobs:
run: ./scripts/setup_ci.sh
- name: Run integration tests
run: |
if [ "${{ matrix.icuTokenizer }}" = "true" ]; then
jq -n '{ schema: { icuTokenizer: true } }' > $(pwd)/config-icu.json
export PELIAS_CONFIG=$(pwd)/config-icu.json
fi
npm install
curl http://127.0.0.1:9200/
./bin/create_index
Expand Down
7 changes: 6 additions & 1 deletion .github/workflows/_unit_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ jobs:
os:
- ubuntu-22.04
node-version: [18.x, 20.x, 22.x]
icuTokenizer: [true, false]
steps:
- uses: actions/checkout@v4
- name: Install node.js ${{ matrix.node-version }}
Expand All @@ -17,4 +18,8 @@ jobs:
- name: Run unit tests
run: |
npm install
npm run test
if [ "${{ matrix.icuTokenizer }}" = "true" ]; then
jq -n '{ schema: { icuTokenizer: true } }' > $(pwd)/config-icu.json
export PELIAS_CONFIG=$(pwd)/config-icu.json
fi
npm run test
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
node_modules
npm-debug.log
.DS_Store
config-icu.json
2 changes: 2 additions & 0 deletions configValidation.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ const Joi = require('@hapi/joi');

// Schema Configuration
// schema.indexName: populated by defaults if not overridden
// schema.icuTokenizer: boolean, optional, defaults to false
// esclient: object, validation performed by elasticsearch module
const schema = Joi.object().required().keys({
schema: Joi.object().required().keys({
indexName: Joi.string().required(),
icuTokenizer: Joi.boolean().optional()
}),
esclient: Joi.object().required()
}).unknown(true);
Expand Down
14 changes: 12 additions & 2 deletions integration/analyzer_peliasIndexOneEdgeGram.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
// validate analyzer is behaving as expected

var tape = require('tape'),
const tape = require('tape'),
Suite = require('../test/elastictest/Suite'),
punctuation = require('../punctuation');
punctuation = require('../punctuation'),
config = require('pelias-config').generate();

module.exports.tests = {};

Expand Down Expand Up @@ -85,6 +86,15 @@ module.exports.tests.analyze = function(test, common){

assertAnalysis( 'british_american_english', 'town theatre', ['0:town', '1:theatre', '1:theater'] );
assertAnalysis( 'british_american_english', 'town theater', ['0:town', '1:theater', '1:theatre'] );
if (config.schema.icuTokenizer) {
assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', [
'0:ซ', '0:ซอ', '0:ซอย',
'1:เพชรบุรี1', '1:เพชรบุรี', '1:เพชรบุร', '1:เพชรบุ', '1:เพชรบ', '1:เพชร', '1:เพช', '1:เพ', '1:เ',
'2:f', '2:fo', '2:foo'] );
} else {
// no ICU tokenization, so we split only on spaces
assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', ['0:ซอยเพชรบุรี1foo']);
}

suite.run( t.end );
});
Expand Down
32 changes: 30 additions & 2 deletions integration/analyzer_peliasQuery.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
// validate analyzer is behaving as expected

var tape = require('tape'),
const tape = require('tape'),
Suite = require('../test/elastictest/Suite'),
punctuation = require('../punctuation');
punctuation = require('../punctuation'),
config = require('pelias-config').generate();

module.exports.tests = {};

Expand Down Expand Up @@ -49,6 +50,33 @@ module.exports.tests.functional = function(test, common){
assertAnalysis( 'place', 'Toys "R" Us!', [ 'toys', 'r', 'us' ]);
assertAnalysis( 'address', '101 mapzen place', [ '101', 'mapzen', 'place' ]);

// complicated tokenization for some Asian languages
if (config.schema.icuTokenizer) {
assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] );
assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] );
assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]);
assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室',
['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']);
// correct word by word split according to native speaker: 马来西亚 / 霹雳州 / 怡保 / 31400, 怡保花园 / 第5巷 / 45号
assertAnalysis('chinese_address2', '马来西亚霹雳州怡保31400怡保花园第5巷45号',
["马来", "西亚", "霹", "雳", "州", "怡", "保", "31400", "怡", "保", "花园", "第", "5", "巷", "45", "号"]);
// correct word by word split: 马来西亚 / 柔佛新山 / 81200 / , / 士古来路 / , / 百万时尚广场
assertAnalysis('chinese_address3', '马来西亚柔佛新山81200士古来路百万时尚广场',
["马来", "西亚", "柔", "佛", "新山", "81200", "士", "古来", "路", "百万", "时尚", "广场"]);
// correct word by word split: 马来西亚/ 槟城 / 亚依淡 / 11500 / , / 极乐寺 / , / 回返路
assertAnalysis('chinese_address4', '马来西亚槟城亚依淡11500极乐寺回返路',
["马来", "西亚", "槟", "城", "亚", "依", "淡", "11500", "极乐", "寺", "回", "返", "路"]);
// correct word by word split: 马来西亚 / 吉隆坡 / 50000 / , / 茨厂街 / 123号
assertAnalysis('chinese_address5', '马来西亚吉隆坡50000茨厂街123号',
["马来", "西亚", "吉隆坡", "50000", "茨", "厂", "街", "123", "号"]);

assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]);
assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]);
assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]);
} else {
// no ICU tokenization, so we split only on spaces
assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอยเพชรบุรี1'] );
}
suite.run( t.end );
});
};
Expand Down
15 changes: 15 additions & 0 deletions integration/analyzer_peliasStreet.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// validate analyzer is behaving as expected
const Suite = require('../test/elastictest/Suite')
const config = require('pelias-config').generate()

module.exports.tests = {};

Expand All @@ -22,6 +23,20 @@ module.exports.tests.analyze = function(test, common){
assertAnalysis( 'remove_ordinals', '1st 2nd 3rd 4th 5th', ['1','2','3','4','5'] );
assertAnalysis( 'remove_ordinals', 'Ast th 101st', ['ast','th','101'] );

// complicated tokenization for some Asian languages
if (config.schema.icuTokenizer) {
assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] );
assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] );
assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]);
assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室',
['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']);
assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]);
assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]);
assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]);
} else {
// no ICU tokenization, so we split only on spaces
assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอยเพชรบุรี1'] );
}
suite.run( t.end );
});
};
Expand Down
49 changes: 49 additions & 0 deletions settings-icu.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
const _ = require('lodash');

/**
* This module contains modifications to the Pelias schema to adopt the elastic ICU tokenizer.
* This tokenizer improves word-splitting of non-latin alphabets (particularly Asian languages).
*
* It can be enabled by setting `config.schema.icuTokenizer` in your `pelias.json` config.
* Note: this must be set *before* you create your elasticsearch index or it will have no effect.
*
* This feature is considered beta, we encourage testing & feedback from the community in order
* to adopt the ICU tokenizer as our default.
*
* https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu-tokenizer.html
* https://github.com/pelias/schema/pull/498
*/

module.exports = (settings) => {

// replace pattern tokenizer with icu_tokenizer
_.set(settings, 'analysis.tokenizer.peliasTokenizer', {
'type': 'icu_tokenizer'
});

// add ampersand_replacer filter
// replaces ampersand placeholders back to `&` (see `ampersand_mapper` char_filter)
_.set(settings, 'analysis.filter.ampersand_replacer', {
'type': 'pattern_replace',
'pattern': 'AMPERSANDPLACEHOLDER',
'replacement': '&'
});

// add ampersand_mapper char_filter
// icu-tokenizer treats ampersands as a word boundary, so we replace them with a placeholder to avoid it,
// as we want to handle them separately, we replace them back after tokenization (see `ampersand_replacer` filter)
_.set(settings, 'analysis.char_filter.ampersand_mapper', {
'type': 'pattern_replace',
'pattern': '&',
'replacement': ' AMPERSANDPLACEHOLDER '
});

// prepend ampersand mapper/replacer to each analyzer
_.forEach(_.get(settings, 'analysis.analyzer'), (block) => {
if (block?.tokenizer !== 'peliasTokenizer') { return; }
block.filter.unshift('ampersand_replacer');
block.char_filter.unshift('ampersand_mapper');
});

return settings;
}
10 changes: 8 additions & 2 deletions settings.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@ const _ = require('lodash');
const peliasConfig = require('pelias-config');
const punctuation = require('./punctuation');
const synonyms = require('./synonyms/loader').load();
const settingsICU = require('./settings-icu');

require('./configValidation').validate(peliasConfig.generate());

function generate(){
var config = peliasConfig.generate();
const config = peliasConfig.generate();

// Default settings
var settings = {
let settings = {
"index": {
"similarity": {
"peliasDefaultSimilarity": {
Expand Down Expand Up @@ -299,6 +300,11 @@ function generate(){
};
});

// Experimental ICU tokenizer
if (config.schema.icuTokenizer) {
settings = settingsICU(settings);
}

// Merge settings from pelias/config
settings = _.merge({}, settings, _.get(config, 'elasticsearch.settings', {}));

Expand Down
43 changes: 39 additions & 4 deletions test/compile.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ const _ = require('lodash');
const path = require('path');
const schema = require('../');
const fixture = require('./fixtures/expected.json');
const config = require('pelias-config').generate();
const fixtureICUTokenizer = require('./fixtures/expected-icu-tokenizer.json');

const forEachDeep = (obj, cb) =>
_.forEach(obj, (val, key) => {
Expand Down Expand Up @@ -97,6 +97,19 @@ module.exports.tests.analyzers = function (test, common) {
});
};

/**
 * Temporarily points the PELIAS_CONFIG environment variable at `value`
 * while `cb` runs, then restores the previous value (or removes the
 * variable entirely if it was previously unset).
 *
 * The restore happens in a `finally` block so an exception thrown by `cb`
 * cannot leak the override into subsequent tests.
 *
 * @param {string} value - path to the pelias config file to use
 * @param {Function} cb - synchronous callback executed under the override
 */
function overridePeliasConfig(value, cb) {
  const OLD_PELIAS_CONFIG = process.env.PELIAS_CONFIG;
  process.env.PELIAS_CONFIG = value;

  try {
    cb();
  } finally {
    if (OLD_PELIAS_CONFIG !== undefined) {
      process.env.PELIAS_CONFIG = OLD_PELIAS_CONFIG;
    } else {
      delete process.env.PELIAS_CONFIG;
    }
  }
}

// current schema (compiled) - requires schema to be copied and settings to
// be regenerated from a fixture in order to pass in CI environments.
module.exports.tests.current_schema = function(test, common) {
Expand All @@ -106,9 +119,9 @@ module.exports.tests.current_schema = function(test, common) {
var schemaCopy = JSON.parse( JSON.stringify( schema ) );

// use the pelias config fixture instead of the local config
process.env.PELIAS_CONFIG = path.resolve( __dirname + '/fixtures/config.json' );
schemaCopy.settings = require('../settings')();
delete process.env.PELIAS_CONFIG;
overridePeliasConfig(path.resolve( __dirname + '/fixtures/config.json' ), () => {
schemaCopy.settings = require('../settings')();
});

// code intentionally commented to allow quick debugging of expected.json
// common.diff(schemaCopy, fixture);
Expand All @@ -121,6 +134,28 @@ module.exports.tests.current_schema = function(test, common) {
t.deepEqual(schemaCopy, fixture);
t.end();
});

test('current schema vs. fixture with ICU tokenizer', function(t) {

// copy schema
var schemaCopy = JSON.parse( JSON.stringify( schema ) );

// use the pelias config fixture instead of the local config
overridePeliasConfig(path.resolve( __dirname + '/fixtures/config-icu-tokenizer.json' ), () => {
schemaCopy.settings = require('../settings')();
});

// code intentionally commented to allow quick debugging of expected.json
// common.diff(schemaCopy, fixtureICUTokenizer);
// console.error( JSON.stringify( schemaCopy, null, 2 ) );

// code to write expected output to the fixture
// const fs = require('fs');
// fs.writeFileSync(path.resolve( __dirname + '/fixtures/expected-icu-tokenizer.json' ), JSON.stringify(schemaCopy, null, 2));

t.deepEqual(schemaCopy, fixtureICUTokenizer);
t.end();
});
};

module.exports.all = function (tape, common) {
Expand Down
15 changes: 15 additions & 0 deletions test/fixtures/config-icu-tokenizer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"elasticsearch": {
"settings": {
"index": {
"number_of_replicas": "999",
"number_of_shards": "5",
"refresh_interval": "1m"
}
}
},
"schema": {
"icuTokenizer": true
}
}

Loading

0 comments on commit 1098354

Please sign in to comment.