Skip to content

Commit

Permalink
feat(schema): add beta ICU tokenizer
Browse files Browse the repository at this point in the history
* Use the ICU tokenizer to improve support for some Asian languages

* Remove unused import

* Add more Chinese test cases

* Add icuTokenizer flag

* Implement ICU tokenizer test

* Run unit tests for both ICU = true/false

* Run tests for both ICU = true/false

* add fixtures

* Fix bug in settings

* Fix tests

* Fix tests

* Fix tests

* Fix tests

* feat(schema): add beta ICU tokenizer

---------

Co-authored-by: Peter Johnson <insomnia@rcpt.at>
  • Loading branch information
SiarheiFedartsou and missinglink authored Feb 4, 2025
1 parent 41bd2d1 commit 1098354
Show file tree
Hide file tree
Showing 13 changed files with 3,272 additions and 23 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/_integration_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ jobs:
node-version: [18.x, 20.x, 22.x]
es-version: [7.6.1]
jdk-version: [oraclejdk11]
icuTokenizer: [true, false]
steps:
- uses: actions/checkout@v4
- name: Install node.js ${{ matrix.node-version }}
Expand All @@ -23,6 +24,10 @@ jobs:
run: ./scripts/setup_ci.sh
- name: Run integration tests
run: |
if [ "${{ matrix.icuTokenizer }}" = "true" ]; then
jq -n '{ schema: { icuTokenizer: true } }' > $(pwd)/config-icu.json
export PELIAS_CONFIG=$(pwd)/config-icu.json
fi
npm install
curl http://127.0.0.1:9200/
./bin/create_index
Expand Down
7 changes: 6 additions & 1 deletion .github/workflows/_unit_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ jobs:
os:
- ubuntu-22.04
node-version: [18.x, 20.x, 22.x]
icuTokenizer: [true, false]
steps:
- uses: actions/checkout@v4
- name: Install node.js ${{ matrix.node-version }}
Expand All @@ -17,4 +18,8 @@ jobs:
- name: Run unit tests
run: |
npm install
npm run test
if [ "${{ matrix.icuTokenizer }}" = "true" ]; then
jq -n '{ schema: { icuTokenizer: true } }' > $(pwd)/config-icu.json
export PELIAS_CONFIG=$(pwd)/config-icu.json
fi
npm run test
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
node_modules
npm-debug.log
.DS_Store
config-icu.json
2 changes: 2 additions & 0 deletions configValidation.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ const Joi = require('@hapi/joi');

// Schema Configuration
// schema.indexName: populated by defaults if not overridden
// schema.icuTokenizer: boolean, optional, defaults to false
// esclient: object, validation performed by elasticsearch module
const schema = Joi.object().required().keys({
schema: Joi.object().required().keys({
indexName: Joi.string().required(),
icuTokenizer: Joi.boolean().optional()
}),
esclient: Joi.object().required()
}).unknown(true);
Expand Down
14 changes: 12 additions & 2 deletions integration/analyzer_peliasIndexOneEdgeGram.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
// validate analyzer is behaving as expected

var tape = require('tape'),
const tape = require('tape'),
Suite = require('../test/elastictest/Suite'),
punctuation = require('../punctuation');
punctuation = require('../punctuation'),
config = require('pelias-config').generate();

module.exports.tests = {};

Expand Down Expand Up @@ -85,6 +86,15 @@ module.exports.tests.analyze = function(test, common){

assertAnalysis( 'british_american_english', 'town theatre', ['0:town', '1:theatre', '1:theater'] );
assertAnalysis( 'british_american_english', 'town theater', ['0:town', '1:theater', '1:theatre'] );
if (config.schema.icuTokenizer) {
assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', [
'0:ซ', '0:ซอ', '0:ซอย',
'1:เพชรบุรี1', '1:เพชรบุรี', '1:เพชรบุร', '1:เพชรบุ', '1:เพชรบ', '1:เพชร', '1:เพช', '1:เพ', '1:เ',
'2:f', '2:fo', '2:foo'] );
} else {
// no ICU tokenization, so we split only on spaces
assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', ['0:ซอยเพชรบุรี1foo']);
}

suite.run( t.end );
});
Expand Down
32 changes: 30 additions & 2 deletions integration/analyzer_peliasQuery.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
// validate analyzer is behaving as expected

var tape = require('tape'),
const tape = require('tape'),
Suite = require('../test/elastictest/Suite'),
punctuation = require('../punctuation');
punctuation = require('../punctuation'),
config = require('pelias-config').generate();

module.exports.tests = {};

Expand Down Expand Up @@ -49,6 +50,33 @@ module.exports.tests.functional = function(test, common){
assertAnalysis( 'place', 'Toys "R" Us!', [ 'toys', 'r', 'us' ]);
assertAnalysis( 'address', '101 mapzen place', [ '101', 'mapzen', 'place' ]);

// complicated tokenization for some Asian languages
if (config.schema.icuTokenizer) {
assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] );
assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] );
assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]);
assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室',
['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']);
// correct word by word split according to native speaker: 马来西亚 / 霹雳州 / 怡保 / 31400, 怡保花园 / 第5巷 / 45号
assertAnalysis('chinese_address2', '马来西亚霹雳州怡保31400怡保花园第5巷45号',
["马来", "西亚", "霹", "雳", "州", "怡", "保", "31400", "怡", "保", "花园", "第", "5", "巷", "45", "号"]);
// correct word by word split: 马来西亚 / 柔佛新山 / 81200 / , / 士古来路 / , / 百万时尚广场
assertAnalysis('chinese_address3', '马来西亚柔佛新山81200士古来路百万时尚广场',
["马来", "西亚", "柔", "佛", "新山", "81200", "士", "古来", "路", "百万", "时尚", "广场"]);
// correct word by word split: 马来西亚/ 槟城 / 亚依淡 / 11500 / , / 极乐寺 / , / 回返路
assertAnalysis('chinese_address4', '马来西亚槟城亚依淡11500极乐寺回返路',
["马来", "西亚", "槟", "城", "亚", "依", "淡", "11500", "极乐", "寺", "回", "返", "路"]);
// correct word by word split: 马来西亚 / 吉隆坡 / 50000 / , / 茨厂街 / 123号
assertAnalysis('chinese_address5', '马来西亚吉隆坡50000茨厂街123号',
["马来", "西亚", "吉隆坡", "50000", "茨", "厂", "街", "123", "号"]);

assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]);
assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]);
assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]);
} else {
// no ICU tokenization, so we split only on spaces
assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอยเพชรบุรี1'] );
}
suite.run( t.end );
});
};
Expand Down
15 changes: 15 additions & 0 deletions integration/analyzer_peliasStreet.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// validate analyzer is behaving as expected
const Suite = require('../test/elastictest/Suite')
const config = require('pelias-config').generate()

module.exports.tests = {};

Expand All @@ -22,6 +23,20 @@ module.exports.tests.analyze = function(test, common){
assertAnalysis( 'remove_ordinals', '1st 2nd 3rd 4th 5th', ['1','2','3','4','5'] );
assertAnalysis( 'remove_ordinals', 'Ast th 101st', ['ast','th','101'] );

// complicated tokenization for some Asian languages
if (config.schema.icuTokenizer) {
assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] );
assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] );
assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]);
assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室',
['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']);
assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]);
assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]);
assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]);
} else {
// no ICU tokenization, so we split only on spaces
assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอยเพชรบุรี1'] );
}
suite.run( t.end );
});
};
Expand Down
49 changes: 49 additions & 0 deletions settings-icu.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
const _ = require('lodash');

/**
* This module contains modifications to the Pelias schema to adopt the elastic ICU tokenizer.
* This tokenizer improves word-splitting of non-latin alphabets (particularly Asian languages).
*
* It can be enabled by setting `config.schema.icuTokenizer` in your `pelias.json` config.
* Note: this must be set *before* you create your elasticsearch index or it will have no effect.
*
* This feature is considered beta, we encourage testing & feedback from the community in order
* to adopt the ICU tokenizer as our default.
*
* https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu-tokenizer.html
* https://github.com/pelias/schema/pull/498
*/

module.exports = (settings) => {

// replace pattern tokenizer with icu_tokenizer
_.set(settings, 'analysis.tokenizer.peliasTokenizer', {
'type': 'icu_tokenizer'
});

// add ampersand_replacer filter
// replaces ampersand placeholders back to `&` (see `ampersand_mapper` char_filter)
_.set(settings, 'analysis.filter.ampersand_replacer', {
'type': 'pattern_replace',
'pattern': 'AMPERSANDPLACEHOLDER',
'replacement': '&'
});

// add ampersand_mapper char_filter
// icu-tokenizer treats ampersands as a word boundary, so we replace them with a placeholder to avoid it,
// as we want to handle them separately, we replace them back after tokenization (see `ampersand_replacer` filter)
_.set(settings, 'analysis.char_filter.ampersand_mapper', {
'type': 'pattern_replace',
'pattern': '&',
'replacement': ' AMPERSANDPLACEHOLDER '
});

// prepend ampersand mapper/replacer to each analyzer
_.forEach(_.get(settings, 'analysis.analyzer'), (block) => {
if (block?.tokenizer !== 'peliasTokenizer') { return; }
block.filter.unshift('ampersand_replacer');
block.char_filter.unshift('ampersand_mapper');
});

return settings;
}
10 changes: 8 additions & 2 deletions settings.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@ const _ = require('lodash');
const peliasConfig = require('pelias-config');
const punctuation = require('./punctuation');
const synonyms = require('./synonyms/loader').load();
const settingsICU = require('./settings-icu');

require('./configValidation').validate(peliasConfig.generate());

function generate(){
var config = peliasConfig.generate();
const config = peliasConfig.generate();

// Default settings
var settings = {
let settings = {
"index": {
"similarity": {
"peliasDefaultSimilarity": {
Expand Down Expand Up @@ -299,6 +300,11 @@ function generate(){
};
});

// Experimental ICU tokenizer
if (config.schema.icuTokenizer) {
settings = settingsICU(settings);
}

// Merge settings from pelias/config
settings = _.merge({}, settings, _.get(config, 'elasticsearch.settings', {}));

Expand Down
43 changes: 39 additions & 4 deletions test/compile.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ const _ = require('lodash');
const path = require('path');
const schema = require('../');
const fixture = require('./fixtures/expected.json');
const config = require('pelias-config').generate();
const fixtureICUTokenizer = require('./fixtures/expected-icu-tokenizer.json');

const forEachDeep = (obj, cb) =>
_.forEach(obj, (val, key) => {
Expand Down Expand Up @@ -97,6 +97,19 @@ module.exports.tests.analyzers = function (test, common) {
});
};

/**
 * Temporarily points the PELIAS_CONFIG environment variable at `value`
 * while `cb` runs, then restores the previous value (or removes the
 * variable entirely if it was previously unset).
 *
 * The restore happens in a `finally` block so an exception thrown by `cb`
 * cannot leak the override into subsequent tests.
 *
 * @param {string} value - path to the pelias config file to use
 * @param {Function} cb - synchronous callback executed under the override
 */
function overridePeliasConfig(value, cb) {
  const OLD_PELIAS_CONFIG = process.env.PELIAS_CONFIG;
  process.env.PELIAS_CONFIG = value;

  try {
    cb();
  } finally {
    if (OLD_PELIAS_CONFIG !== undefined) {
      process.env.PELIAS_CONFIG = OLD_PELIAS_CONFIG;
    } else {
      delete process.env.PELIAS_CONFIG;
    }
  }
}

// current schema (compiled) - requires schema to be copied and settings to
// be regenerated from a fixture in order to pass in CI environments.
module.exports.tests.current_schema = function(test, common) {
Expand All @@ -106,9 +119,9 @@ module.exports.tests.current_schema = function(test, common) {
var schemaCopy = JSON.parse( JSON.stringify( schema ) );

// use the pelias config fixture instead of the local config
process.env.PELIAS_CONFIG = path.resolve( __dirname + '/fixtures/config.json' );
schemaCopy.settings = require('../settings')();
delete process.env.PELIAS_CONFIG;
overridePeliasConfig(path.resolve( __dirname + '/fixtures/config.json' ), () => {
schemaCopy.settings = require('../settings')();
});

// code intentionally commented to allow quick debugging of expected.json
// common.diff(schemaCopy, fixture);
Expand All @@ -121,6 +134,28 @@ module.exports.tests.current_schema = function(test, common) {
t.deepEqual(schemaCopy, fixture);
t.end();
});

test('current schema vs. fixture with ICU tokenizer', function(t) {

// copy schema
var schemaCopy = JSON.parse( JSON.stringify( schema ) );

// use the pelias config fixture instead of the local config
overridePeliasConfig(path.resolve( __dirname + '/fixtures/config-icu-tokenizer.json' ), () => {
schemaCopy.settings = require('../settings')();
});

// code intentionally commented to allow quick debugging of expected.json
// common.diff(schemaCopy, fixtureICUTokenizer);
// console.error( JSON.stringify( schemaCopy, null, 2 ) );

// code to write expected output to the fixture
// const fs = require('fs');
// fs.writeFileSync(path.resolve( __dirname + '/fixtures/expected-icu-tokenizer.json' ), JSON.stringify(schemaCopy, null, 2));

t.deepEqual(schemaCopy, fixtureICUTokenizer);
t.end();
});
};

module.exports.all = function (tape, common) {
Expand Down
15 changes: 15 additions & 0 deletions test/fixtures/config-icu-tokenizer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"elasticsearch": {
"settings": {
"index": {
"number_of_replicas": "999",
"number_of_shards": "5",
"refresh_interval": "1m"
}
}
},
"schema": {
"icuTokenizer": true
}
}

Loading

0 comments on commit 1098354

Please sign in to comment.