Skip to content

Commit

Permalink
Add english plural stemmer as a new token filter
Browse files Browse the repository at this point in the history
+ This contribution was made by https://github.com/jgschis .
+ This has not been incorporated into the `en` analyzer.
+ The user will however be able to build a custom analyzer
  with the `en` components alongside this.
+ For: #1750
+ Also: https://issues.couchbase.com/browse/MB-56359
  • Loading branch information
abhinavdangeti committed Apr 6, 2023
1 parent 0c48a9e commit 1a66a57
Show file tree
Hide file tree
Showing 2 changed files with 220 additions and 0 deletions.
174 changes: 174 additions & 0 deletions analysis/lang/en/plural_stemmer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
/*
This code was ported from the Open Search Project
https://github.com/opensearch-project/OpenSearch/blob/main/modules/analysis-common/src/main/java/org/opensearch/analysis/common/EnglishPluralStemFilter.java
The algorithm itself was created by Mark Harwood
https://github.com/markharwood
*/

/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package en

import (
"strings"

"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)

const PluralStemmerName = "stemmer_en_plural"

type EnglishPluralStemmerFilter struct {
}

func NewEnglishPluralStemmerFilter() *EnglishPluralStemmerFilter {
return &EnglishPluralStemmerFilter{}
}

func (s *EnglishPluralStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
token.Term = []byte(stem(string(token.Term)))
}

return input
}

func EnglishPluralStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewEnglishPluralStemmerFilter(), nil
}

func init() {
registry.RegisterTokenFilter(PluralStemmerName, EnglishPluralStemmerFilterConstructor)
}

// ----------------------------------------------------------------------------

// Words ending in oes that retain the e when stemmed
var oesExceptions = []string{"shoes", "canoes", "oboes"}

// Words ending in ches that retain the e when stemmed
var chesExceptions = []string{
"cliches",
"avalanches",
"mustaches",
"moustaches",
"quiches",
"headaches",
"heartaches",
"porsches",
"tranches",
"caches",
}

func stem(word string) string {
runes := []rune(strings.ToLower(word))

if len(runes) < 3 || runes[len(runes)-1] != 's' {
return string(runes)
}

switch runes[len(runes)-2] {
case 'u':
fallthrough
case 's':
return string(runes)
case 'e':
// Modified ies->y logic from original s-stemmer - only work on strings > 4
// so spies -> spy still but pies->pie.
// The original code also special-cased aies and eies for no good reason as far as I can tell.
// ( no words of consequence - eg http://www.thefreedictionary.com/words-that-end-in-aies )
if len(runes) > 4 && runes[len(runes)-3] == 'i' {
runes[len(runes)-3] = 'y'
return string(runes[0 : len(runes)-2])
}

// Suffix rules to remove any dangling "e"
if len(runes) > 3 {
// xes (but >1 prefix so we can stem "boxes->box" but keep "axes->axe")
if len(runes) > 4 && runes[len(runes)-3] == 'x' {
return string(runes[0 : len(runes)-2])
}

// oes
if len(runes) > 3 && runes[len(runes)-3] == 'o' {
if isException(runes, oesExceptions) {
// Only remove the S
return string(runes[0 : len(runes)-1])
}
// Remove the es
return string(runes[0 : len(runes)-2])
}

if len(runes) > 4 {
// shes/sses
if runes[len(runes)-4] == 's' && (runes[len(runes)-3] == 'h' || runes[len(runes)-3] == 's') {
return string(runes[0 : len(runes)-2])
}

// ches
if len(runes) > 4 {
if runes[len(runes)-4] == 'c' && runes[len(runes)-3] == 'h' {
if isException(runes, chesExceptions) {
// Only remove the S
return string(runes[0 : len(runes)-1])
}
// Remove the es
return string(runes[0 : len(runes)-2])
}
}
}
}
fallthrough
default:
return string(runes[0 : len(runes)-1])
}
}

func isException(word []rune, exceptions []string) bool {
for _, exception := range exceptions {

exceptionRunes := []rune(exception)

exceptionPos := len(exceptionRunes) - 1
wordPos := len(word) - 1

matched := true
for exceptionPos >= 0 && wordPos >= 0 {
if exceptionRunes[exceptionPos] != word[wordPos] {
matched = false
break
}
exceptionPos--
wordPos--
}
if matched {
return true
}
}
return false
}
46 changes: 46 additions & 0 deletions analysis/lang/en/plural_stemmer_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package en

import "testing"

func TestEnglishPluralStemmer(t *testing.T) {
data := []struct {
In, Out string
}{
{"dresses", "dress"},
{"dress", "dress"},
{"axes", "axe"},
{"ad", "ad"},
{"ads", "ad"},
{"gas", "ga"},
{"sass", "sass"},
{"berries", "berry"},
{"dresses", "dress"},
{"spies", "spy"},
{"shoes", "shoe"},
{"headaches", "headache"},
{"computer", "computer"},
{"dressing", "dressing"},
{"clothes", "clothe"},
{"DRESSES", "dress"},
{"frog", "frog"},
{"dress", "dress"},
{"runs", "run"},
{"pies", "pie"},
{"foxes", "fox"},
{"axes", "axe"},
{"foes", "fo"},
{"dishes", "dish"},
{"snitches", "snitch"},
{"cliches", "cliche"},
{"forests", "forest"},
{"yes", "ye"},
}

for _, datum := range data {
stemmed := stem(datum.In)

if stemmed != datum.Out {
t.Errorf("expected %v but got %v", datum.Out, stemmed)
}
}
}

0 comments on commit 1a66a57

Please sign in to comment.