From 1a66a575085502c95345663bd6f99f1287059bd1 Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Thu, 6 Apr 2023 12:41:07 -0600 Subject: [PATCH] Add english plural stemmer as a new token filter + This contribution was made by https://github.com/jgschis . + This has not been incorporated into the `en` analyzer. + The user will however be able to build a custom analyzer with the `en` components alongside this. + For: https://github.com/blevesearch/bleve/issues/1750 + Also: https://issues.couchbase.com/browse/MB-56359 --- analysis/lang/en/plural_stemmer.go | 174 ++++++++++++++++++++++++ analysis/lang/en/plural_stemmer_test.go | 46 +++++++ 2 files changed, 220 insertions(+) create mode 100644 analysis/lang/en/plural_stemmer.go create mode 100644 analysis/lang/en/plural_stemmer_test.go diff --git a/analysis/lang/en/plural_stemmer.go b/analysis/lang/en/plural_stemmer.go new file mode 100644 index 000000000..0de7c1bbf --- /dev/null +++ b/analysis/lang/en/plural_stemmer.go @@ -0,0 +1,174 @@ +/* + This code was ported from the Open Search Project + https://github.com/opensearch-project/OpenSearch/blob/main/modules/analysis-common/src/main/java/org/opensearch/analysis/common/EnglishPluralStemFilter.java + The algorithm itself was created by Mark Harwood + https://github.com/markharwood +*/ + +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package en
+
+import (
+	"strings"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+// PluralStemmerName is the name under which the English plural stemmer
+// token filter is registered with the registry.
+const PluralStemmerName = "stemmer_en_plural"
+
+// EnglishPluralStemmerFilter is a token filter that strips English plural
+// suffixes from each term. It holds no configuration or state.
+type EnglishPluralStemmerFilter struct {
+}
+
+// NewEnglishPluralStemmerFilter returns a new EnglishPluralStemmerFilter.
+func NewEnglishPluralStemmerFilter() *EnglishPluralStemmerFilter {
+	return &EnglishPluralStemmerFilter{}
+}
+
+// Filter replaces the Term of every token in input with its stemmed form and
+// returns input itself. Because stem lower-cases its argument, terms come
+// back lower-cased even when no suffix is removed.
+func (s *EnglishPluralStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	for _, token := range input {
+		token.Term = []byte(stem(string(token.Term)))
+	}
+
+	return input
+}
+
+// EnglishPluralStemmerFilterConstructor is the registry constructor for the
+// filter. Neither config nor cache is consulted and it never returns an error.
+func EnglishPluralStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	return NewEnglishPluralStemmerFilter(), nil
+}
+
+// init registers the constructor under PluralStemmerName.
+func init() {
+	registry.RegisterTokenFilter(PluralStemmerName, EnglishPluralStemmerFilterConstructor)
+}
+
+// ----------------------------------------------------------------------------
+
+// Words ending in oes that retain the e when stemmed
+var oesExceptions = []string{"shoes", "canoes", "oboes"}
+
+// Words ending in ches that retain the e when stemmed
+var chesExceptions = []string{
+	"cliches",
+	"avalanches",
+	"mustaches",
+	"moustaches",
+	"quiches",
+	"headaches",
+	"heartaches",
+	"porsches",
+	"tranches",
+	"caches",
+}
+
+// stem lower-cases word and removes an English plural suffix from it.
+//
+// Words shorter than three runes, words that do not end in "s", and words
+// ending in "us" or "ss" are returned unchanged (apart from lower-casing).
+// For words ending in "es" the suffix rules below decide whether "es", or
+// only the final "s", is removed; every other word just loses its final "s".
+func stem(word string) string {
+	runes := []rune(strings.ToLower(word))
+	n := len(runes)
+
+	if n < 3 || runes[n-1] != 's' {
+		return string(runes)
+	}
+
+	switch runes[n-2] {
+	case 'u', 's':
+		return string(runes)
+	case 'e':
+		// Modified ies->y logic from original s-stemmer - only work on strings > 4
+		// so spies -> spy still but pies->pie.
+		// The original code also special-cased aies and eies for no good reason as far as I can tell.
+		// ( no words of consequence - eg http://www.thefreedictionary.com/words-that-end-in-aies )
+		if n > 4 && runes[n-3] == 'i' {
+			runes[n-3] = 'y'
+			return string(runes[:n-2])
+		}
+
+		// Suffix rules to remove any dangling "e"
+		if n > 3 {
+			// xes (but >1 prefix so we can stem "boxes->box" but keep "axes->axe")
+			if n > 4 && runes[n-3] == 'x' {
+				return string(runes[:n-2])
+			}
+
+			// oes
+			if runes[n-3] == 'o' {
+				if isException(runes, oesExceptions) {
+					// Only remove the S
+					return string(runes[:n-1])
+				}
+				// Remove the es
+				return string(runes[:n-2])
+			}
+
+			if n > 4 {
+				// shes/sses
+				if runes[n-4] == 's' && (runes[n-3] == 'h' || runes[n-3] == 's') {
+					return string(runes[:n-2])
+				}
+
+				// ches
+				if runes[n-4] == 'c' && runes[n-3] == 'h' {
+					if isException(runes, chesExceptions) {
+						// Only remove the S
+						return string(runes[:n-1])
+					}
+					// Remove the es
+					return string(runes[:n-2])
+				}
+			}
+		}
+		// No "es" rule applied: drop just the final "s" like any other word.
+		fallthrough
+	default:
+		return string(runes[:n-1])
+	}
+}
+
+// isException reports whether word ends with one of the exceptions.
+//
+// The comparison walks both words from their last rune and stops as soon as
+// either runs out, so a word that is shorter than an exception and equal to
+// that exception's tail also matches (for example "hoes" against "shoes").
+func isException(word []rune, exceptions []string) bool {
+	for _, exception := range exceptions {
+		exceptionRunes := []rune(exception)
+
+		matched := true
+		for i, j := len(exceptionRunes)-1, len(word)-1; i >= 0 && j >= 0; i, j = i-1, j-1 {
+			if exceptionRunes[i] != word[j] {
+				matched = false
+				break
+			}
+		}
+		if matched {
+			return true
+		}
+	}
+	return false
+}
diff --git a/analysis/lang/en/plural_stemmer_test.go b/analysis/lang/en/plural_stemmer_test.go
new file mode 100644
index 
000000000..b6c0028e1
--- /dev/null
+++ b/analysis/lang/en/plural_stemmer_test.go
@@ -0,0 +1,46 @@
+package en
+
+import "testing"
+
+// TestEnglishPluralStemmer feeds each In word to stem and compares the result
+// with Out. Rows such as "gas" -> "ga", "foes" -> "fo" and "yes" -> "ye" pin
+// what the algorithm currently returns, not a dictionary singular.
+func TestEnglishPluralStemmer(t *testing.T) {
+	data := []struct {
+		In, Out string
+	}{
+		{"dresses", "dress"},
+		{"dress", "dress"},
+		{"axes", "axe"},
+		{"ad", "ad"},
+		{"ads", "ad"},
+		{"gas", "ga"},
+		{"sass", "sass"},
+		{"berries", "berry"},
+		{"spies", "spy"},
+		{"shoes", "shoe"},
+		{"headaches", "headache"},
+		{"computer", "computer"},
+		{"dressing", "dressing"},
+		{"clothes", "clothe"},
+		{"DRESSES", "dress"},
+		{"frog", "frog"},
+		{"runs", "run"},
+		{"pies", "pie"},
+		{"foxes", "fox"},
+		{"foes", "fo"},
+		{"dishes", "dish"},
+		{"snitches", "snitch"},
+		{"cliches", "cliche"},
+		{"forests", "forest"},
+		{"yes", "ye"},
+	}
+
+	for _, datum := range data {
+		stemmed := stem(datum.In)
+		if stemmed != datum.Out {
+			// Name the input: several rows share the same expected stem.
+			t.Errorf("stem(%q): expected %q but got %q", datum.In, datum.Out, stemmed)
+		}
+	}
+}