diff --git a/analysis/lang/es/analyzer_es.go b/analysis/lang/es/analyzer_es.go
index e6fcd080c..eea75568d 100644
--- a/analysis/lang/es/analyzer_es.go
+++ b/analysis/lang/es/analyzer_es.go
@@ -34,6 +34,10 @@ func AnalyzerConstructor(config map[string]interface{},
 	if err != nil {
 		return nil, err
 	}
+	normalizeEsFilter, err := cache.TokenFilterNamed(NormalizeName)
+	if err != nil {
+		return nil, err
+	}
 	stopEsFilter, err := cache.TokenFilterNamed(StopName)
 	if err != nil {
 		return nil, err
@@ -47,6 +51,7 @@ func AnalyzerConstructor(config map[string]interface{},
 		TokenFilters: []analysis.TokenFilter{
 			toLowerFilter,
 			stopEsFilter,
+			normalizeEsFilter,
 			lightStemmerEsFilter,
 		},
 	}
diff --git a/analysis/lang/es/light_stemmer_es.go b/analysis/lang/es/light_stemmer_es.go
index c1b4749ea..4be04a4bd 100644
--- a/analysis/lang/es/light_stemmer_es.go
+++ b/analysis/lang/es/light_stemmer_es.go
@@ -46,21 +46,6 @@ func stem(input []rune) []rune {
 		return input
 	}
 
-	for i, r := range input {
-		switch r {
-		case 'à', 'á', 'â', 'ä':
-			input[i] = 'a'
-		case 'ò', 'ó', 'ô', 'ö':
-			input[i] = 'o'
-		case 'è', 'é', 'ê', 'ë':
-			input[i] = 'e'
-		case 'ù', 'ú', 'û', 'ü':
-			input[i] = 'u'
-		case 'ì', 'í', 'î', 'ï':
-			input[i] = 'i'
-		}
-	}
-
 	switch input[l-1] {
 	case 'o', 'a', 'e':
 		return input[:l-1]
diff --git a/analysis/lang/es/spanish_normalize.go b/analysis/lang/es/spanish_normalize.go
new file mode 100644
index 000000000..a6f1964ad
--- /dev/null
+++ b/analysis/lang/es/spanish_normalize.go
@@ -0,0 +1,87 @@
+// Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package es
+
+import (
+	"bytes"
+	"unicode/utf8"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+// NormalizeName is the name under which the Spanish normalization token
+// filter is registered.
+const NormalizeName = "normalize_es"
+
+// SpanishNormalizeFilter is a token filter that strips the grave, acute,
+// circumflex and diaeresis marks from lowercase vowels. Uppercase letters are
+// left untouched, so it is meant to run after a lower-casing filter.
+type SpanishNormalizeFilter struct {
+}
+
+// NewSpanishNormalizeFilter returns a new SpanishNormalizeFilter.
+func NewSpanishNormalizeFilter() *SpanishNormalizeFilter {
+	return &SpanishNormalizeFilter{}
+}
+
+// Filter replaces the Term of every token in input with its normalized form
+// and returns the same stream.
+func (s *SpanishNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	for _, token := range input {
+		token.Term = normalize(token.Term)
+	}
+	return input
+}
+
+// normalize returns input with the lowercase vowels à á â ä, è é ê ë, ì í î ï,
+// ò ó ô ö and ù ú û ü replaced by a, e, i, o and u respectively.
+func normalize(input []byte) []byte {
+	// Every rune folded below is multi-byte in UTF-8, so a term containing only
+	// ASCII is already normalized; return it as-is instead of decoding it and
+	// allocating a copy.
+	if bytes.IndexFunc(input, func(r rune) bool { return r >= utf8.RuneSelf }) < 0 {
+		return input
+	}
+
+	runes := bytes.Runes(input)
+	for i, r := range runes {
+		switch r {
+		case 'à', 'á', 'â', 'ä':
+			runes[i] = 'a'
+		case 'ò', 'ó', 'ô', 'ö':
+			runes[i] = 'o'
+		case 'è', 'é', 'ê', 'ë':
+			runes[i] = 'e'
+		case 'ù', 'ú', 'û', 'ü':
+			runes[i] = 'u'
+		case 'ì', 'í', 'î', 'ï':
+			runes[i] = 'i'
+		}
+	}
+
+	return analysis.BuildTermFromRunes(runes)
+}
+
+// NormalizerFilterConstructor builds the Spanish normalization filter for the
+// registry; config and cache are not used.
+func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	return NewSpanishNormalizeFilter(), nil
+}
+
+// init registers the filter constructor under NormalizeName.
+func init() {
+	registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
+}
diff --git a/analysis/lang/es/spanish_normalize_test.go b/analysis/lang/es/spanish_normalize_test.go
new file mode 100644
index 000000000..b2f9df571
--- /dev/null
+++ b/analysis/lang/es/spanish_normalize_test.go
@@ -0,0 +1,125 @@
+// Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package es
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+)
+
+func TestSpanishNormalizeFilter(t *testing.T) {
+	tests := []struct {
+		input  analysis.TokenStream
+		output analysis.TokenStream
+	}{
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("Guía"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("Guia"),
+				},
+			},
+		},
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("Belcebú"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("Belcebu"),
+				},
+			},
+		},
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("Limón"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("Limon"),
+				},
+			},
+		},
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("agüero"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("aguero"),
+				},
+			},
+		},
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("laúd"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("laud"),
+				},
+			},
+		},
+		// plain ASCII terms pass through unchanged
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("hola"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("hola"),
+				},
+			},
+		},
+		// empty
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte(""),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte(""),
+				},
+			},
+		},
+	}
+
+	spanishNormalizeFilter := NewSpanishNormalizeFilter()
+	for _, test := range tests {
+		actual := spanishNormalizeFilter.Filter(test.input)
+		if !reflect.DeepEqual(actual, test.output) {
+			t.Errorf("expected %#v, got %#v", test.output, actual)
+			t.Errorf("expected %s(% x), got %s(% x)", test.output[0].Term, test.output[0].Term, actual[0].Term, actual[0].Term)
+		}
+	}
+}