diff --git a/integration/analyzer_peliasAdmin.js b/integration/analyzer_peliasAdmin.js index e69afff2..d88d554b 100644 --- a/integration/analyzer_peliasAdmin.js +++ b/integration/analyzer_peliasAdmin.js @@ -15,6 +15,15 @@ module.exports.tests.analyze = function(test, common){ var assertAnalysis = analyze.bind( null, suite, t, 'peliasAdmin' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + // expand umlauts + assertAnalysis( 'umlaut', 'Häuser', ['haeuser']); + assertAnalysis( 'umlaut', 'Malmö', ['malmoe']); + assertAnalysis( 'umlaut', 'Bücher', ['buecher']); + assertAnalysis( 'umlaut', 'Äpfel', ['aepfel']); + assertAnalysis( 'umlaut', 'Österreich', ['oesterreich']); + assertAnalysis( 'umlaut', 'Übergröße', ['uebergroesse']); + assertAnalysis( 'umlaut', 'Straße', ['strasse']); + assertAnalysis( 'lowercase', 'F', ['f']); assertAnalysis( 'asciifolding', 'é', ['e']); assertAnalysis( 'asciifolding', 'ß', ['ss']); diff --git a/integration/analyzer_peliasIndexOneEdgeGram.js b/integration/analyzer_peliasIndexOneEdgeGram.js index f1334dd5..a1f37b3c 100644 --- a/integration/analyzer_peliasIndexOneEdgeGram.js +++ b/integration/analyzer_peliasIndexOneEdgeGram.js @@ -15,6 +15,15 @@ module.exports.tests.analyze = function(test, common){ var assertAnalysis = analyze.bind( null, suite, t, 'peliasIndexOneEdgeGram' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + // expand umlauts + assertAnalysis( 'umlaut', 'Häuser', ['h', 'ha', 'hae', 'haeu', 'haeus', 'haeuse', 'haeuser']); + assertAnalysis( 'umlaut', 'Malmö', ['m', 'ma', 'mal', 'malm', 'malmo', 'malmoe']); + assertAnalysis( 'umlaut', 'Bücher', ['b', 'bu', 'bue', 'buec', 'buech', 'bueche', 'buecher']); + assertAnalysis( 'umlaut', 'Äpfel', ['a', 'ae', 'aep', 'aepf', 'aepfe', 'aepfel']); + assertAnalysis( 'umlaut', 'Österreich', ['o', 'oe', 'oes', 'oest', 'oeste', 'oester', 'oesterr', 'oesterre', 'oesterrei', 'oesterreic', 'oesterreich']); + assertAnalysis( 'umlaut', 'Übergröße', ['u', 'ue', 'ueb', 'uebe', 'ueber', 'ueberg', 'uebergr', 'uebergro', 'uebergroe', 'uebergroes', 'uebergroess', 'uebergroesse']); + assertAnalysis( 'umlaut', 'Straße', ['s', 'st', 'str', 'stra', 'stras', 'strass', 'strasse']); + assertAnalysis( 'lowercase', 'F', ['f']); assertAnalysis( 'asciifolding', 'á', ['a']); assertAnalysis( 'asciifolding', 'ß', ['s','ss']); diff --git a/integration/analyzer_peliasIndexTwoEdgeGram.js b/integration/analyzer_peliasIndexTwoEdgeGram.js index daeacbb7..37c568af 100644 --- a/integration/analyzer_peliasIndexTwoEdgeGram.js +++ b/integration/analyzer_peliasIndexTwoEdgeGram.js @@ -15,6 +15,15 @@ module.exports.tests.analyze = function(test, common){ var assertAnalysis = analyze.bind( null, suite, t, 'peliasIndexTwoEdgeGram' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + // expand umlauts + assertAnalysis( 'umlaut', 'Häuser', ['ha', 'hae', 'haeu', 'haeus', 'haeuse', 'haeuser']); + assertAnalysis( 'umlaut', 'Malmö', ['ma', 'mal', 'malm', 'malmo', 'malmoe']); + assertAnalysis( 'umlaut', 'Bücher', ['bu', 'bue', 'buec', 'buech', 'bueche', 'buecher']); + assertAnalysis( 'umlaut', 'Äpfel', ['ae', 'aep', 'aepf', 'aepfe', 'aepfel']); + assertAnalysis( 'umlaut', 'Österreich', ['oe', 'oes', 'oest', 'oeste', 'oester', 'oesterr', 'oesterre', 'oesterrei', 'oesterreic', 'oesterreich']); + assertAnalysis( 'umlaut', 'Übergröße', ['ue', 'ueb', 'uebe', 'ueber', 'ueberg', 'uebergr', 'uebergro', 'uebergroe', 'uebergroes', 'uebergroess', 'uebergroesse']); + assertAnalysis( 'umlaut', 'Straße', ['st', 'str', 'stra', 'stras', 'strass', 'strasse']); + assertAnalysis( 'lowercase', 'FA', ['fa']); assertAnalysis( 'asciifolding', 'lé', ['le']); assertAnalysis( 'asciifolding', 'ß', ['ss']); diff --git a/integration/analyzer_peliasPhrase.js b/integration/analyzer_peliasPhrase.js index d19546cd..54156c0e 100644 --- a/integration/analyzer_peliasPhrase.js +++ b/integration/analyzer_peliasPhrase.js @@ -15,6 +15,15 @@ module.exports.tests.analyze = function(test, common){ var assertAnalysis = analyze.bind( null, suite, t, 'peliasPhrase' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + // expand umlauts + assertAnalysis( 'umlaut', 'Häuser', ['haeuser']); + assertAnalysis( 'umlaut', 'Malmö', ['malmoe']); + assertAnalysis( 'umlaut', 'Bücher', ['buecher']); + assertAnalysis( 'umlaut', 'Äpfel', ['aepfel']); + assertAnalysis( 'umlaut', 'Österreich', ['oesterreich']); + assertAnalysis( 'umlaut', 'Übergröße', ['uebergroesse']); + assertAnalysis( 'umlaut', 'Straße', ['strasse']); + assertAnalysis( 'lowercase', 'F', ['f']); assertAnalysis( 'asciifolding', 'é', ['e']); assertAnalysis( 'asciifolding', 'ß', ['ss']); diff --git a/integration/analyzer_peliasQueryFullToken.js b/integration/analyzer_peliasQueryFullToken.js index 3ac27e7b..14e7aecd 100644 --- a/integration/analyzer_peliasQueryFullToken.js +++ b/integration/analyzer_peliasQueryFullToken.js @@ -15,6 +15,15 @@ module.exports.tests.analyze = function(test, common){ var assertAnalysis = analyze.bind( null, suite, t, 'peliasQueryFullToken' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + // expand umlauts + assertAnalysis( 'umlaut', 'Häuser', ['haeuser']); + assertAnalysis( 'umlaut', 'Malmö', ['malmoe']); + assertAnalysis( 'umlaut', 'Bücher', ['buecher']); + assertAnalysis( 'umlaut', 'Äpfel', ['aepfel']); + assertAnalysis( 'umlaut', 'Österreich', ['oesterreich']); + assertAnalysis( 'umlaut', 'Übergröße', ['uebergroesse']); + assertAnalysis( 'umlaut', 'Straße', ['strasse']); + assertAnalysis( 'lowercase', 'F', ['f']); assertAnalysis( 'asciifolding', 'á', ['a']); assertAnalysis( 'asciifolding', 'ß', ['ss']); diff --git a/integration/analyzer_peliasQueryPartialToken.js b/integration/analyzer_peliasQueryPartialToken.js index 4c597b6c..6de155ed 100644 --- a/integration/analyzer_peliasQueryPartialToken.js +++ b/integration/analyzer_peliasQueryPartialToken.js @@ -15,6 +15,15 @@ module.exports.tests.analyze = function(test, common){ var assertAnalysis = analyze.bind( null, suite, t, 'peliasQueryPartialToken' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + // expand umlauts + assertAnalysis( 'umlaut', 'Häuser', ['haeuser']); + assertAnalysis( 'umlaut', 'Malmö', ['malmoe']); + assertAnalysis( 'umlaut', 'Bücher', ['buecher']); + assertAnalysis( 'umlaut', 'Äpfel', ['aepfel']); + assertAnalysis( 'umlaut', 'Österreich', ['oesterreich']); + assertAnalysis( 'umlaut', 'Übergröße', ['uebergroesse']); + assertAnalysis( 'umlaut', 'Straße', ['strasse']); + assertAnalysis( 'lowercase', 'F', ['f']); assertAnalysis( 'asciifolding', 'é', ['e']); assertAnalysis( 'asciifolding', 'ß', ['ss']); diff --git a/integration/analyzer_peliasStreet.js b/integration/analyzer_peliasStreet.js index d0e11f29..94829625 100644 --- a/integration/analyzer_peliasStreet.js +++ b/integration/analyzer_peliasStreet.js @@ -15,6 +15,15 @@ module.exports.tests.analyze = function(test, common){ var assertAnalysis = analyze.bind( null, suite, t, 'peliasStreet' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + // expand umlauts + assertAnalysis( 'umlaut', 'Häuser', ['haeuser']); + assertAnalysis( 'umlaut', 'Malmö', ['malmoe']); + assertAnalysis( 'umlaut', 'Bücher', ['buecher']); + assertAnalysis( 'umlaut', 'Äpfel', ['aepfel']); + assertAnalysis( 'umlaut', 'Österreich', ['oesterreich']); + assertAnalysis( 'umlaut', 'Übergröße', ['uebergroesse']); + assertAnalysis( 'umlaut', 'Straße', ['strasse']); + assertAnalysis( 'lowercase', 'F', ['f']); assertAnalysis( 'asciifolding', 'Max-Beer-Straße', ['max-beer-strasse']); assertAnalysis( 'trim', ' f ', ['f'] ); diff --git a/settings.js b/settings.js index 7387c08b..30e013ec 100644 --- a/settings.js +++ b/settings.js @@ -26,7 +26,7 @@ function generate(){ "peliasAdmin": { "type": "custom", "tokenizer": "peliasNameTokenizer", - "char_filter" : ["punctuation"], + "char_filter" : ["umlaut","punctuation"], "filter": [ "lowercase", "asciifolding", @@ -38,7 +38,7 @@ function generate(){ "peliasIndexOneEdgeGram" : { "type": "custom", "tokenizer" : "peliasNameTokenizer", - "char_filter" : ["punctuation"], + "char_filter" : ["umlaut","punctuation"], "filter": [ "lowercase", "asciifolding", @@ -61,7 +61,7 @@ function generate(){ "peliasIndexTwoEdgeGram" : { "type": "custom", "tokenizer" : "peliasNameTokenizer", - "char_filter" : ["punctuation"], + "char_filter" : ["umlaut","punctuation"], "filter": [ "lowercase", "asciifolding", @@ -81,7 +81,7 @@ function generate(){ "peliasQueryPartialToken" : { "type": "custom", "tokenizer" : "peliasNameTokenizer", - "char_filter" : ["punctuation"], + "char_filter" : ["umlaut","punctuation"], "filter": [ "lowercase", "asciifolding", @@ -97,7 +97,7 @@ function generate(){ "peliasQueryFullToken" : { "type": "custom", "tokenizer" : "peliasNameTokenizer", - "char_filter" : ["punctuation"], + "char_filter" : ["umlaut","punctuation"], "filter": [ "lowercase", "asciifolding", @@ -113,7 +113,7 @@ function generate(){ "peliasPhrase": { "type": "custom", "tokenizer":"peliasNameTokenizer", - "char_filter" : ["punctuation"], + "char_filter" : ["umlaut","punctuation"], "filter": [ "lowercase", "asciifolding", @@ -142,7 +142,7 @@ function generate(){ "peliasStreet": { "type": "custom", "tokenizer":"peliasStreetTokenizer", - "char_filter" : ["punctuation"], + "char_filter" : ["umlaut","punctuation"], "filter": [ "lowercase", "asciifolding", @@ -270,6 +270,18 @@ function generate(){ return c + '=>'; }) }, + "umlaut" : { + "type" : "mapping", + "mappings" : [ + "ä=>ae", + "ö=>oe", + "ü=>ue", + "Ä=>Ae", + "Ö=>Oe", + "Ü=>Ue", + "ß=>ss" + ] + }, "alphanumeric" : { "type" : "pattern_replace", "pattern": "[^a-zA-Z0-9]", diff --git a/test/fixtures/expected.json b/test/fixtures/expected.json index 5a1156f5..c593434b 100644 --- a/test/fixtures/expected.json +++ b/test/fixtures/expected.json @@ -16,6 +16,7 @@ "type": "custom", "tokenizer": "peliasNameTokenizer", "char_filter": [ + "umlaut", "punctuation" ], "filter": [ @@ -30,6 +31,7 @@ "type": "custom", "tokenizer": "peliasNameTokenizer", "char_filter": [ + "umlaut", "punctuation" ], "filter": [ @@ -55,6 +57,7 @@ "type": "custom", "tokenizer": "peliasNameTokenizer", "char_filter": [ + "umlaut", "punctuation" ], "filter": [ @@ -77,6 +80,7 @@ "type": "custom", "tokenizer": "peliasNameTokenizer", "char_filter": [ + "umlaut", "punctuation" ], "filter": [ @@ -95,6 +99,7 @@ "type": "custom", "tokenizer": "peliasNameTokenizer", "char_filter": [ + "umlaut", "punctuation" ], "filter": [ @@ -113,6 +118,7 @@ "type": "custom", "tokenizer": "peliasNameTokenizer", "char_filter": [ + "umlaut", "punctuation" ], "filter": [ @@ -148,6 +154,7 @@ "type": "custom", "tokenizer": "peliasStreetTokenizer", "char_filter": [ + "umlaut", "punctuation" ], "filter": [ @@ -1540,6 +1547,18 @@ "●=>" ] }, + "umlaut" : { + "type" : "mapping", + "mappings" : [ + "ä=>ae", + "ö=>oe", + "ü=>ue", + "Ä=>Ae", + "Ö=>Oe", + "Ü=>Ue", + "ß=>ss" + ] + }, "alphanumeric": { "type": "pattern_replace", "pattern": "[^a-zA-Z0-9]", diff --git a/test/settings.js b/test/settings.js index 30fa9d0c..e209d9f6 100644 --- a/test/settings.js +++ b/test/settings.js @@ -51,7 +51,7 @@ module.exports.tests.peliasIndexOneEdgeGramAnalyzer = function(test, common) { var analyzer = s.analysis.analyzer.peliasIndexOneEdgeGram; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ["punctuation"], 'punctuation filter specified'); + t.deepEqual(analyzer.char_filter, ["umlaut","punctuation"], 'punctuation filter specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); @@ -86,7 +86,7 @@ module.exports.tests.peliasIndexTwoEdgeGramAnalyzer = function(test, common) { var analyzer = s.analysis.analyzer.peliasIndexTwoEdgeGram; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ["punctuation"], 'punctuation filter specified'); + t.deepEqual(analyzer.char_filter, ["umlaut","punctuation"], 'punctuation filter specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); @@ -118,7 +118,7 @@ module.exports.tests.peliasPhraseAnalyzer = function(test, common) { var analyzer = s.analysis.analyzer.peliasPhrase; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ["punctuation"], 'punctuation filter specified'); + t.deepEqual(analyzer.char_filter, ["umlaut","punctuation"], 'punctuation filter specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); @@ -179,7 +179,7 @@ module.exports.tests.peliasStreetAnalyzer = function(test, common) { var analyzer = s.analysis.analyzer.peliasStreet; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ["punctuation"], 'punctuation filter specified'); + t.deepEqual(analyzer.char_filter, ["umlaut","punctuation"], 'punctuation filter specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); });