Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use ICU tokenizer to improve some Asian languages support #498

Merged
merged 14 commits into from
Feb 4, 2025
5 changes: 5 additions & 0 deletions .github/workflows/_integration_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ jobs:
node-version: [18.x, 20.x, 22.x]
es-version: [7.6.1]
jdk-version: [oraclejdk11]
icuTokenizer: [true, false]
steps:
- uses: actions/checkout@v4
- name: Install node.js ${{ matrix.node-version }}
Expand All @@ -23,6 +24,10 @@ jobs:
run: ./scripts/setup_ci.sh
- name: Run integration tests
run: |
if [ "${{ matrix.icuTokenizer }}" = "true" ]; then
jq -n '{ schema: { icuTokenizer: true } }' > $(pwd)/config-icu.json
export PELIAS_CONFIG=$(pwd)/config-icu.json
fi
npm install
curl http://127.0.0.1:9200/
./bin/create_index
Expand Down
7 changes: 6 additions & 1 deletion .github/workflows/_unit_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ jobs:
os:
- ubuntu-22.04
node-version: [18.x, 20.x, 22.x]
icuTokenizer: [true, false]
steps:
- uses: actions/checkout@v4
- name: Install node.js ${{ matrix.node-version }}
Expand All @@ -17,4 +18,8 @@ jobs:
- name: Run unit tests
run: |
npm install
npm run test
if [ "${{ matrix.icuTokenizer }}" = "true" ]; then
jq -n '{ schema: { icuTokenizer: true } }' > $(pwd)/config-icu.json
export PELIAS_CONFIG=$(pwd)/config-icu.json
fi
npm run test
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
node_modules
npm-debug.log
.DS_Store
config-icu.json
2 changes: 2 additions & 0 deletions configValidation.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ const Joi = require('@hapi/joi');

// Schema Configuration
// schema.indexName: populated by defaults if not overridden
// schema.icuTokenizer: boolean, optional, defaults to false
// esclient: object, validation performed by elasticsearch module
const schema = Joi.object().required().keys({
schema: Joi.object().required().keys({
indexName: Joi.string().required(),
icuTokenizer: Joi.boolean().optional()
}),
esclient: Joi.object().required()
}).unknown(true);
Expand Down
14 changes: 12 additions & 2 deletions integration/analyzer_peliasIndexOneEdgeGram.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
// validate analyzer is behaving as expected

var tape = require('tape'),
const tape = require('tape'),
Suite = require('../test/elastictest/Suite'),
punctuation = require('../punctuation');
punctuation = require('../punctuation'),
config = require('pelias-config').generate();

module.exports.tests = {};

Expand Down Expand Up @@ -85,6 +86,15 @@ module.exports.tests.analyze = function(test, common){

assertAnalysis( 'british_american_english', 'town theatre', ['0:town', '1:theatre', '1:theater'] );
assertAnalysis( 'british_american_english', 'town theater', ['0:town', '1:theater', '1:theatre'] );
if (config.schema.icuTokenizer) {
assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', [
'0:ซ', '0:ซอ', '0:ซอย',
'1:เพชรบุรี1', '1:เพชรบุรี', '1:เพชรบุร', '1:เพชรบุ', '1:เพชรบ', '1:เพชร', '1:เพช', '1:เพ', '1:เ',
'2:f', '2:fo', '2:foo'] );
} else {
// no ICU tokenization, so we split only on spaces
assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', ['0:ซอยเพชรบุรี1foo']);
}

suite.run( t.end );
});
Expand Down
32 changes: 30 additions & 2 deletions integration/analyzer_peliasQuery.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
// validate analyzer is behaving as expected

var tape = require('tape'),
const tape = require('tape'),
Suite = require('../test/elastictest/Suite'),
punctuation = require('../punctuation');
punctuation = require('../punctuation'),
config = require('pelias-config').generate();

module.exports.tests = {};

Expand Down Expand Up @@ -49,6 +50,33 @@ module.exports.tests.functional = function(test, common){
assertAnalysis( 'place', 'Toys "R" Us!', [ 'toys', 'r', 'us' ]);
assertAnalysis( 'address', '101 mapzen place', [ '101', 'mapzen', 'place' ]);

// complicated tokenization for some Asian languages
if (config.schema.icuTokenizer) {
assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] );
assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] );
assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]);
assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室',
['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']);
Copy link

@kenil-cheng kenil-cheng Nov 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The correct splitting is:
北京市 - Beijing city
朝阳区 - The district
东三环中路 - East 3rd Ring Middle Road
1号 - Road number
国际大厦 - Building name
a座 - Block number
1001室 - Room number

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The full Chinese addresses are usually ...省 ...市 ...区 ...路 ...号 building_name ...座 ...楼 ...室

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, if it’s by tokens then they make sense. I feel single characters like '东', '三', '环', '中路' may be too small for search, but it’s not wrong.

// correct word by word split according to native speaker: 马来西亚 / 霹雳州 / 怡保 / 31400, 怡保花园 / 第5巷 / 45号
assertAnalysis('chinese_address2', '马来西亚霹雳州怡保31400怡保花园第5巷45号',
["马来", "西亚", "霹", "雳", "州", "怡", "保", "31400", "怡", "保", "花园", "第", "5", "巷", "45", "号"]);
// correct word by word split: 马来西亚 / 柔佛新山 / 81200 / , / 士古来路 / , / 百万时尚广场
assertAnalysis('chinese_address3', '马来西亚柔佛新山81200士古来路百万时尚广场',
["马来", "西亚", "柔", "佛", "新山", "81200", "士", "古来", "路", "百万", "时尚", "广场"]);
// correct word by word split: 马来西亚/ 槟城 / 亚依淡 / 11500 / , / 极乐寺 / , / 回返路
assertAnalysis('chinese_address4', '马来西亚槟城亚依淡11500极乐寺回返路',
["马来", "西亚", "槟", "城", "亚", "依", "淡", "11500", "极乐", "寺", "回", "返", "路"]);
// correct word by word split: 马来西亚 / 吉隆坡 / 50000 / , / 茨厂街 / 123号
assertAnalysis('chinese_address5', '马来西亚吉隆坡50000茨厂街123号',
["马来", "西亚", "吉隆坡", "50000", "茨", "厂", "街", "123", "号"]);

assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]);
assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]);
assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]);
} else {
// no ICU tokenization, so we split only on spaces
assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอยเพชรบุรี1'] );
}
suite.run( t.end );
});
};
Expand Down
15 changes: 15 additions & 0 deletions integration/analyzer_peliasStreet.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// validate analyzer is behaving as expected
const Suite = require('../test/elastictest/Suite')
const config = require('pelias-config').generate()

module.exports.tests = {};

Expand All @@ -22,6 +23,20 @@ module.exports.tests.analyze = function(test, common){
assertAnalysis( 'remove_ordinals', '1st 2nd 3rd 4th 5th', ['1','2','3','4','5'] );
assertAnalysis( 'remove_ordinals', 'Ast th 101st', ['ast','th','101'] );

// complicated tokenization for some Asian languages
if (config.schema.icuTokenizer) {
assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] );
assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] );
assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]);
assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室',
['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']);
assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]);
assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]);
assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]);
} else {
// no ICU tokenization, so we split only on spaces
assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอยเพชรบุรี1'] );
}
suite.run( t.end );
});
};
Expand Down
49 changes: 49 additions & 0 deletions settings-icu.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
const _ = require('lodash');

/**
* This module contains modifications to the Pelias schema to adopt the elastic ICU tokenizer.
* This tokenizer improves word-splitting of non-latin alphabets (particularly Asian languages).
*
* It can be enabled by setting `config.schema.icuTokenizer` in your `pelias.json` config.
* Note: this must be set *before* you create your elasticsearch index or it will have no effect.
*
* This feature is considered beta, we encourage testing & feedback from the community in order
* to adopt the ICU tokenizer as our default.
*
* https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu-tokenizer.html
* https://github.com/pelias/schema/pull/498
*/

module.exports = (settings) => {

// replace pattern tokenizer with icu_tokenizer
_.set(settings, 'analysis.tokenizer.peliasTokenizer', {
'type': 'icu_tokenizer'
});

// add ampersand_replacer filter
// replaces ampersand placeholders back to `&` (see `ampersand_mapper` char_filter)
_.set(settings, 'analysis.filter.ampersand_replacer', {
'type': 'pattern_replace',
'pattern': 'AMPERSANDPLACEHOLDER',
'replacement': '&'
});

// add ampersand_mapper char_filter
// icu-tokenizer treats ampersands as a word boundary, so we replace them with a placeholder to avoid it,
// as we want to handle them separately, we replace them back after tokenization (see `ampersand_replacer` filter)
_.set(settings, 'analysis.char_filter.ampersand_mapper', {
'type': 'pattern_replace',
'pattern': '&',
'replacement': ' AMPERSANDPLACEHOLDER '
});

// prepend ampersand mapper/replacer to each analyzer
_.forEach(_.get(settings, 'analysis.analyzer'), (block) => {
if (block?.tokenizer !== 'peliasTokenizer') { return; }
block.filter.unshift('ampersand_replacer');
block.char_filter.unshift('ampersand_mapper');
});

return settings;
}
10 changes: 8 additions & 2 deletions settings.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@ const _ = require('lodash');
const peliasConfig = require('pelias-config');
const punctuation = require('./punctuation');
const synonyms = require('./synonyms/loader').load();
const settingsICU = require('./settings-icu');

require('./configValidation').validate(peliasConfig.generate());

function generate(){
var config = peliasConfig.generate();
const config = peliasConfig.generate();

// Default settings
var settings = {
let settings = {
"index": {
"similarity": {
"peliasDefaultSimilarity": {
Expand Down Expand Up @@ -299,6 +300,11 @@ function generate(){
};
});

// Experimental ICU tokenizer
if (config.schema.icuTokenizer) {
settings = settingsICU(settings);
}

// Merge settings from pelias/config
settings = _.merge({}, settings, _.get(config, 'elasticsearch.settings', {}));

Expand Down
43 changes: 39 additions & 4 deletions test/compile.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ const _ = require('lodash');
const path = require('path');
const schema = require('../');
const fixture = require('./fixtures/expected.json');
const config = require('pelias-config').generate();
const fixtureICUTokenizer = require('./fixtures/expected-icu-tokenizer.json');

const forEachDeep = (obj, cb) =>
_.forEach(obj, (val, key) => {
Expand Down Expand Up @@ -97,6 +97,19 @@ module.exports.tests.analyzers = function (test, common) {
});
};

/**
 * Temporarily point the PELIAS_CONFIG environment variable at `value` while
 * running the supplied synchronous callback, then restore the prior state.
 *
 * @param {string} value - path to the pelias config file to use during `cb`
 * @param {Function} cb - synchronous callback executed while the override is active
 */
function overridePeliasConfig(value, cb) {
  const OLD_PELIAS_CONFIG = process.env.PELIAS_CONFIG;
  process.env.PELIAS_CONFIG = value;

  try {
    cb();
  } finally {
    // restore even when the callback throws, so one failing test cannot
    // leak its config override into subsequent tests in the same process
    if (OLD_PELIAS_CONFIG !== undefined) {
      // strict undefined check (not truthiness) so a previous empty-string
      // value is faithfully restored rather than deleted
      process.env.PELIAS_CONFIG = OLD_PELIAS_CONFIG;
    } else {
      delete process.env.PELIAS_CONFIG;
    }
  }
}

// current schema (compiled) - requires schema to be copied and settings to
// be regenerated from a fixture in order to pass in CI environments.
module.exports.tests.current_schema = function(test, common) {
Expand All @@ -106,9 +119,9 @@ module.exports.tests.current_schema = function(test, common) {
var schemaCopy = JSON.parse( JSON.stringify( schema ) );

// use the pelias config fixture instead of the local config
process.env.PELIAS_CONFIG = path.resolve( __dirname + '/fixtures/config.json' );
schemaCopy.settings = require('../settings')();
delete process.env.PELIAS_CONFIG;
overridePeliasConfig(path.resolve( __dirname + '/fixtures/config.json' ), () => {
schemaCopy.settings = require('../settings')();
});

// code intentionally commented to allow quick debugging of expected.json
// common.diff(schemaCopy, fixture);
Expand All @@ -121,6 +134,28 @@ module.exports.tests.current_schema = function(test, common) {
t.deepEqual(schemaCopy, fixture);
t.end();
});

test('current schema vs. fixture with ICU tokenizer', function(t) {

// copy schema
var schemaCopy = JSON.parse( JSON.stringify( schema ) );

// use the pelias config fixture instead of the local config
overridePeliasConfig(path.resolve( __dirname + '/fixtures/config-icu-tokenizer.json' ), () => {
schemaCopy.settings = require('../settings')();
});

// code intentionally commented to allow quick debugging of expected.json
// common.diff(schemaCopy, fixtureICUTokenizer);
// console.error( JSON.stringify( schemaCopy, null, 2 ) );

// code to write expected output to the fixture
// const fs = require('fs');
// fs.writeFileSync(path.resolve( __dirname + '/fixtures/expected-icu-tokenizer.json' ), JSON.stringify(schemaCopy, null, 2));

t.deepEqual(schemaCopy, fixtureICUTokenizer);
t.end();
});
};

module.exports.all = function (tape, common) {
Expand Down
15 changes: 15 additions & 0 deletions test/fixtures/config-icu-tokenizer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"elasticsearch": {
"settings": {
"index": {
"number_of_replicas": "999",
"number_of_shards": "5",
"refresh_interval": "1m"
}
}
},
"schema": {
"icuTokenizer": true
}
}

Loading