Add english plural stemmer as a new token filter

+ This contribution was made by https://github.com/jgschis . + This has not been incorporated into the `en` analyzer. + The user will however be able to build a custom analyzer with the `en` components alongside this. + For: #1750 + Also: https://issues.couchbase.com/browse/MB-56359
blevesearch · Apr 6, 2023 · 1a66a57 · 1a66a57
1 parent 0c48a9e
commit 1a66a57
Show file tree

Hide file tree

Showing 2 changed files with 220 additions and 0 deletions.
diff --git a/analysis/lang/en/plural_stemmer.go b/analysis/lang/en/plural_stemmer.go
@@ -0,0 +1,174 @@
+/*
+	This code was ported from the Open Search Project
+	https://github.com/opensearch-project/OpenSearch/blob/main/modules/analysis-common/src/main/java/org/opensearch/analysis/common/EnglishPluralStemFilter.java
+	The algorithm itself was created by Mark Harwood
+	https://github.com/markharwood
+*/
+
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package en
+
+import (
+	"strings"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+const PluralStemmerName = "stemmer_en_plural"
+
+type EnglishPluralStemmerFilter struct {
+}
+
+func NewEnglishPluralStemmerFilter() *EnglishPluralStemmerFilter {
+	return &EnglishPluralStemmerFilter{}
+}
+
+func (s *EnglishPluralStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	for _, token := range input {
+		token.Term = []byte(stem(string(token.Term)))
+	}
+
+	return input
+}
+
+func EnglishPluralStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	return NewEnglishPluralStemmerFilter(), nil
+}
+
+func init() {
+	registry.RegisterTokenFilter(PluralStemmerName, EnglishPluralStemmerFilterConstructor)
+}
+
+// ----------------------------------------------------------------------------
+
+// Words ending in oes that retain the e when stemmed
+var oesExceptions = []string{"shoes", "canoes", "oboes"}
+
+// Words ending in ches that retain the e when stemmed
+var chesExceptions = []string{
+	"cliches",
+	"avalanches",
+	"mustaches",
+	"moustaches",
+	"quiches",
+	"headaches",
+	"heartaches",
+	"porsches",
+	"tranches",
+	"caches",
+}
+
+func stem(word string) string {
+	runes := []rune(strings.ToLower(word))
+
+	if len(runes) < 3 || runes[len(runes)-1] != 's' {
+		return string(runes)
+	}
+
+	switch runes[len(runes)-2] {
+	case 'u':
+		fallthrough
+	case 's':
+		return string(runes)
+	case 'e':
+		// Modified ies->y logic from original s-stemmer - only work on strings > 4
+		// so spies -> spy still but pies->pie.
+		// The original code also special-cased aies and eies for no good reason as far as I can tell.
+		// ( no words of consequence - eg http://www.thefreedictionary.com/words-that-end-in-aies )
+		if len(runes) > 4 && runes[len(runes)-3] == 'i' {
+			runes[len(runes)-3] = 'y'
+			return string(runes[0 : len(runes)-2])
+		}
+
+		// Suffix rules to remove any dangling "e"
+		if len(runes) > 3 {
+			// xes (but >1 prefix so we can stem "boxes->box" but keep "axes->axe")
+			if len(runes) > 4 && runes[len(runes)-3] == 'x' {
+				return string(runes[0 : len(runes)-2])
+			}
+
+			// oes
+			if len(runes) > 3 && runes[len(runes)-3] == 'o' {
+				if isException(runes, oesExceptions) {
+					// Only remove the S
+					return string(runes[0 : len(runes)-1])
+				}
+				// Remove the es
+				return string(runes[0 : len(runes)-2])
+			}
+
+			if len(runes) > 4 {
+				// shes/sses
+				if runes[len(runes)-4] == 's' && (runes[len(runes)-3] == 'h' || runes[len(runes)-3] == 's') {
+					return string(runes[0 : len(runes)-2])
+				}
+
+				// ches
+				if len(runes) > 4 {
+					if runes[len(runes)-4] == 'c' && runes[len(runes)-3] == 'h' {
+						if isException(runes, chesExceptions) {
+							// Only remove the S
+							return string(runes[0 : len(runes)-1])
+						}
+						// Remove the es
+						return string(runes[0 : len(runes)-2])
+					}
+				}
+			}
+		}
+		fallthrough
+	default:
+		return string(runes[0 : len(runes)-1])
+	}
+}
+
+func isException(word []rune, exceptions []string) bool {
+	for _, exception := range exceptions {
+
+		exceptionRunes := []rune(exception)
+
+		exceptionPos := len(exceptionRunes) - 1
+		wordPos := len(word) - 1
+
+		matched := true
+		for exceptionPos >= 0 && wordPos >= 0 {
+			if exceptionRunes[exceptionPos] != word[wordPos] {
+				matched = false
+				break
+			}
+			exceptionPos--
+			wordPos--
+		}
+		if matched {
+			return true
+		}
+	}
+	return false
+}
diff --git a/analysis/lang/en/plural_stemmer_test.go b/analysis/lang/en/plural_stemmer_test.go
@@ -0,0 +1,46 @@
+package en
+
+import "testing"
+
+func TestEnglishPluralStemmer(t *testing.T) {
+	data := []struct {
+		In, Out string
+	}{
+		{"dresses", "dress"},
+		{"dress", "dress"},
+		{"axes", "axe"},
+		{"ad", "ad"},
+		{"ads", "ad"},
+		{"gas", "ga"},
+		{"sass", "sass"},
+		{"berries", "berry"},
+		{"dresses", "dress"},
+		{"spies", "spy"},
+		{"shoes", "shoe"},
+		{"headaches", "headache"},
+		{"computer", "computer"},
+		{"dressing", "dressing"},
+		{"clothes", "clothe"},
+		{"DRESSES", "dress"},
+		{"frog", "frog"},
+		{"dress", "dress"},
+		{"runs", "run"},
+		{"pies", "pie"},
+		{"foxes", "fox"},
+		{"axes", "axe"},
+		{"foes", "fo"},
+		{"dishes", "dish"},
+		{"snitches", "snitch"},
+		{"cliches", "cliche"},
+		{"forests", "forest"},
+		{"yes", "ye"},
+	}
+
+	for _, datum := range data {
+		stemmed := stem(datum.In)
+
+		if stemmed != datum.Out {
+			t.Errorf("expected %v but got %v", datum.Out, stemmed)
+		}
+	}
+}