Commit f8af28e

Merge branch 'origin/master' into '7.6-couchbase' (#1986)
```
* 5c7445c Abhinav Dangeti | Fix merge conflict
* a0cb65a Abhinav Dangeti | Merge remote-tracking branch 'origin/master' into 7.6-couchbase
|\
| * 5f1f45a Sergio Vera | Fixed spanish accents normalization (#1957)
| * e26eace Mohd Shaad Khan | MB-60207 fix facets merge (#1946)
| * c8e3daf Likith B | #1873: Added timeout option in the Search Handler (#1898)
| * 6dee5e9 Aditi Ahuja | Added missing nil check (#1905)
| * 907c83e Rahul Rampure | Added a document that demonstrates the performance benefits of docvalues (#1897)
* | 8b9206a Abhi Dangeti | MB-60739: Upgrade go-faiss & zapx/v16 (#1985)
```
1 parent 8b9206a commit f8af28e

File tree: 4 files changed, +184 -15 lines

analysis/lang/es/analyzer_es.go (+5)

```diff
@@ -34,6 +34,10 @@ func AnalyzerConstructor(config map[string]interface{},
 	if err != nil {
 		return nil, err
 	}
+	normalizeEsFilter, err := cache.TokenFilterNamed(NormalizeName)
+	if err != nil {
+		return nil, err
+	}
 	stopEsFilter, err := cache.TokenFilterNamed(StopName)
 	if err != nil {
 		return nil, err
@@ -47,6 +51,7 @@ func AnalyzerConstructor(config map[string]interface{},
 		TokenFilters: []analysis.TokenFilter{
 			toLowerFilter,
 			stopEsFilter,
+			normalizeEsFilter,
 			lightStemmerEsFilter,
 		},
 	}
```
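With this change, the built-in Spanish analyzer runs the new normalize_es filter between the stop-word and light-stemmer steps, so accented and unaccented spellings reduce to the same indexed term. The sketch below is not part of this commit; it shows one way the updated chain might be exercised through the public bleve v2 API, assuming the package's exported `AnalyzerName` ("es") constant and the in-memory index constructor.

```go
package main

import (
	"fmt"
	"log"

	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/analysis/lang/es"
)

func main() {
	// Use the built-in Spanish analyzer for every text field.
	mapping := bleve.NewIndexMapping()
	mapping.DefaultAnalyzer = es.AnalyzerName // "es"

	idx, err := bleve.NewMemOnly(mapping)
	if err != nil {
		log.Fatal(err)
	}
	defer idx.Close()

	if err := idx.Index("doc1", map[string]string{"body": "Guía de Belcebú"}); err != nil {
		log.Fatal(err)
	}

	// The unaccented query term goes through the same analyzer at search
	// time, so it should now match the accented source text.
	q := bleve.NewMatchQuery("guia")
	q.SetField("body")
	res, err := idx.Search(bleve.NewSearchRequest(q))
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("hits:", res.Total)
}
```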

analysis/lang/es/light_stemmer_es.go (-15)

```diff
@@ -46,21 +46,6 @@ func stem(input []rune) []rune {
 		return input
 	}
 
-	for i, r := range input {
-		switch r {
-		case 'à', 'á', 'â', 'ä':
-			input[i] = 'a'
-		case 'ò', 'ó', 'ô', 'ö':
-			input[i] = 'o'
-		case 'è', 'é', 'ê', 'ë':
-			input[i] = 'e'
-		case 'ù', 'ú', 'û', 'ü':
-			input[i] = 'u'
-		case 'ì', 'í', 'î', 'ï':
-			input[i] = 'i'
-		}
-	}
-
 	switch input[l-1] {
 	case 'o', 'a', 'e':
 		return input[:l-1]
```
analysis/lang/es/spanish_normalize.go (new file, +67)

```go
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package es

import (
	"bytes"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/registry"
)

const NormalizeName = "normalize_es"

type SpanishNormalizeFilter struct {
}

func NewSpanishNormalizeFilter() *SpanishNormalizeFilter {
	return &SpanishNormalizeFilter{}
}

func (s *SpanishNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {
		term := normalize(token.Term)
		token.Term = term
	}
	return input
}

func normalize(input []byte) []byte {
	runes := bytes.Runes(input)
	for i := 0; i < len(runes); i++ {
		switch runes[i] {
		case 'à', 'á', 'â', 'ä':
			runes[i] = 'a'
		case 'ò', 'ó', 'ô', 'ö':
			runes[i] = 'o'
		case 'è', 'é', 'ê', 'ë':
			runes[i] = 'e'
		case 'ù', 'ú', 'û', 'ü':
			runes[i] = 'u'
		case 'ì', 'í', 'î', 'ï':
			runes[i] = 'i'
		}
	}

	return analysis.BuildTermFromRunes(runes)
}

func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewSpanishNormalizeFilter(), nil
}

func init() {
	registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}
```
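Taken on its own, the new filter can be applied directly to a token stream. A minimal sketch (not part of this commit) using only the constructor and types introduced above; the expected terms come from the accompanying test cases:

```go
package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/lang/es"
)

func main() {
	filter := es.NewSpanishNormalizeFilter()
	stream := analysis.TokenStream{
		&analysis.Token{Term: []byte("Limón")},
		&analysis.Token{Term: []byte("agüero")},
	}
	// Filter folds the accented vowels in place and returns the same stream.
	for _, tok := range filter.Filter(stream) {
		fmt.Println(string(tok.Term)) // Limon, aguero
	}
}
```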
analysis/lang/es/spanish_normalize_test.go (new file, +112)
```go
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package es

import (
	"reflect"
	"testing"

	"github.com/blevesearch/bleve/v2/analysis"
)

func TestSpanishNormalizeFilter(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Guía"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Guia"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Belcebú"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Belcebu"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Limón"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Limon"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("agüero"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("aguero"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("laúd"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("laud"),
				},
			},
		},
		// empty
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
	}

	spanishNormalizeFilter := NewSpanishNormalizeFilter()
	for _, test := range tests {
		actual := spanishNormalizeFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %#v, got %#v", test.output, actual)
			t.Errorf("expected %s(% x), got %s(% x)", test.output[0].Term, test.output[0].Term, actual[0].Term, actual[0].Term)
		}
	}
}
```
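Because init() registers the filter under NormalizeName ("normalize_es"), it can also be referenced by name in a custom analyzer definition. A hedged sketch (not part of this commit), assuming bleve's usual custom-analyzer config keys ("type", "tokenizer", "token_filters") and the standard Name constants exported by the custom, unicode, and lowercase packages:

```go
package main

import (
	"log"

	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
	"github.com/blevesearch/bleve/v2/analysis/lang/es"
	"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
	"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
)

func main() {
	mapping := bleve.NewIndexMapping()

	// Accent folding without Spanish stop words or stemming:
	// unicode tokenizer, lowercase, then the registered "normalize_es" filter.
	err := mapping.AddCustomAnalyzer("es_accent_folding", map[string]interface{}{
		"type":          custom.Name,
		"tokenizer":     unicode.Name,
		"token_filters": []string{lowercase.Name, es.NormalizeName},
	})
	if err != nil {
		log.Fatal(err)
	}
	mapping.DefaultAnalyzer = "es_accent_folding"

	idx, err := bleve.NewMemOnly(mapping)
	if err != nil {
		log.Fatal(err)
	}
	defer idx.Close()
	log.Println("index ready, default analyzer:", mapping.DefaultAnalyzer)
}
```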
