Skip to content

Commit

Permalink
stem takes in account ii suffix (close #238)
Browse files Browse the repository at this point in the history
  • Loading branch information
dimus committed Mar 7, 2023
1 parent 3b06115 commit 884acce
Show file tree
Hide file tree
Showing 5 changed files with 253 additions and 244 deletions.
60 changes: 35 additions & 25 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
PROJ_NAME = gnparser

# Build metadata. Simple (:=) assignment runs each shell command once, when the
# Makefile is parsed, so every reference sees the same version string and the
# same build timestamp (a recursive "=" would re-run the command on each use).
VERSION := $(shell git describe --tags)
VER := $(shell git describe --tags --abbrev=0)
DATE := $(shell date -u '+%Y-%m-%d_%H:%M:%S%Z')

FLAG_MODULE = GO111MODULE=on
FLAGS_SHARED = $(FLAG_MODULE) GOARCH=amd64
NO_C = CGO_ENABLED=0
FLAGS_SHARED = GOARCH=amd64
FLAGS_LINUX = $(FLAGS_SHARED) GOOS=linux
FLAGS_MAC = $(FLAGS_SHARED) GOOS=darwin
FLAGS_MAC_ARM = GO111MODULE=on $GOARCH=arm64 GOOS=darwin
# Environment for the Apple Silicon (darwin/arm64) build. GOARCH is written as
# a plain environment assignment with no leading dollar sign; with one, make
# expands a one-letter variable and the shell never sees GOARCH.
FLAGS_MAC_ARM = GOARCH=arm64 GOOS=darwin
FLAGS_WIN = $(FLAGS_SHARED) GOOS=windows
FLAGS_LD=-ldflags "-s -w -X github.com/gnames/gnparser.Build=${DATE} \
-X github.com/gnames/gnparser.Version=${VERSION}"
FLAGS_LD=-ldflags "-s -w -X github.com/gnames/$(PROJ_NAME).Build=$(DATE) \
-X github.com/gnames/$(PROJ_NAME).Version=$(VERSION)"
FLAGS_REL = -trimpath -ldflags "-s -w \
-X github.com/gnames/$(PROJ_NAME).Build=$(DATE)"

GOCMD = go
GOBUILD = $(GOCMD) build $(FLAGS_LD)
GOINSTALL = $(GOCMD) install $(FLAGS_LD)
GORELEASE = $(GOCMD) build $(FLAGS_REL)
GOCLEAN = $(GOCMD) clean
GOGET = $(GOCMD) get

Expand All @@ -24,7 +29,7 @@ CLIB_DIR ?= "."
all: install

test: deps install
$(FLAG_MODULE) go test -race ./...
$(FLAG_MODULE) go test -shuffle=on -race -count=1 ./...

test-build: deps build

Expand All @@ -33,7 +38,7 @@ deps:

tools: deps
@echo Installing tools from tools.go
@cat gnparser/tools.go | grep _ | awk -F'"' '{print $$2}' | xargs -tI % go install %
@cat $(PROJ_NAME)/tools.go | grep _ | awk -F'"' '{print $$2}' | xargs -tI % go install %

peg:
cd ent/parser; \
Expand All @@ -53,55 +58,60 @@ asset:
$(FLAGS_SHARED) go run -tags=dev assets_gen.go

build: peg
cd gnparser; \
cd $(PROJ_NAME); \
$(GOCLEAN); \
$(NO_C) $(GOBUILD) -o $(BUILD_DIR)

buildrel: peg
cd $(PROJ_NAME); \
$(GOCLEAN); \
$(NO_C) $(GORELEASE) -o $(BUILD_DIR)

install: peg
cd gnparser; \
cd $(PROJ_NAME); \
$(GOCLEAN); \
$(NO_C) $(GOINSTALL)

release: peg dockerhub
cd gnparser; \
cd $(PROJ_NAME); \
$(GOCLEAN); \
$(FLAGS_LINUX) $(NO_C) $(GOBUILD); \
tar zcf $(RELEASE_DIR)/gnparser-$(VER)-linux.tar.gz gnparser; \
tar zcf $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-linux.tar.gz $(PROJ_NAME); \
$(GOCLEAN); \
$(FLAGS_MAC) $(NO_C) $(GOBUILD); \
tar zcf $(RELEASE_DIR)/gnparser-$(VER)-mac.tar.gz gnparser; \
tar zcf $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-mac.tar.gz $(PROJ_NAME); \
$(GOCLEAN); \
$(FLAGS_MAC_ARM) $(NO_C) $(GOBUILD); \
tar zcf $(RELEASE_DIR)/gnparser-$(VER)-mac-arm64.tar.gz gnparser; \
tar zcf $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-mac-arm64.tar.gz $(PROJ_NAME); \
$(GOCLEAN); \
$(FLAGS_WIN) $(NO_C) $(GOBUILD); \
zip -9 $(RELEASE_DIR)/gnparser-$(VER)-win-64.zip gnparser.exe; \
zip -9 $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-win-64.zip $(PROJ_NAME).exe; \
$(GOCLEAN);

dc: asset build
docker-compose build;

docker: build
docker build -t gnames/gognparser:latest -t gnames/gognparser:$(VERSION) .; \
cd gnparser; \
docker build -t gnames/go$(PROJ_NAME):latest -t gnames/go$(PROJ_NAME):$(VERSION) .; \
cd $(PROJ_NAME); \
$(GOCLEAN);

dockerhub: docker
docker push gnames/gognparser; \
docker push gnames/gognparser:$(VERSION)
docker push gnames/go$(PROJ_NAME); \
docker push gnames/go$(PROJ_NAME):$(VERSION)

clib_darwin: peg
cd binding; \
$(GOCLEAN); \
CGO_ENABLED=1 GOOS=darwin GOARCH=arm64 $(GOBUILD) -buildmode=c-shared -o $(CLIB_DIR)/libgnparser_arm64.so; \
CGO_ENABLED=1 GOOS=darwin GOARCH=amd64 $(GOBUILD) -buildmode=c-shared -o $(CLIB_DIR)/libgnparser_amd64.so; \
rm libgnparser_amd64.h; \
mv libgnparser_arm64.h libgnparser.h; \
lipo -create -output $(CLIB_DIR)/libgnparser.so $(CLIB_DIR)/libgnparser_arm64.so $(CLIB_DIR)/libgnparser_amd64.so;
CGO_ENABLED=1 GOOS=darwin GOARCH=arm64 $(GOBUILD) -buildmode=c-shared -o $(CLIB_DIR)/lib$(PROJ_NAME)_arm64.so; \
CGO_ENABLED=1 GOOS=darwin GOARCH=amd64 $(GOBUILD) -buildmode=c-shared -o $(CLIB_DIR)/lib$(PROJ_NAME)_amd64.so; \
rm lib$(PROJ_NAME)_amd64.h; \
mv lib$(PROJ_NAME)_arm64.h lib$(PROJ_NAME).h; \
lipo -create -output $(CLIB_DIR)/lib$(PROJ_NAME).so $(CLIB_DIR)/lib$(PROJ_NAME)_arm64.so $(CLIB_DIR)/lib$(PROJ_NAME)_amd64.so;

clib: peg
cd binding; \
$(GOBUILD) -buildmode=c-shared -o $(CLIB_DIR)/libgnparser.so;
$(GOBUILD) -buildmode=c-shared -o $(CLIB_DIR)/lib$(PROJ_NAME).so;

quality:
cd tools;\
Expand All @@ -110,7 +120,7 @@ quality:

.PHONY: man
man: ronn
@ronn ./man/gnparser.1.ronn --style=dark
@ronn ./man/$(PROJ_NAME).1.ronn --style=dark

.PHONY: ronn
ronn:
Expand Down
73 changes: 37 additions & 36 deletions ent/stemmer/stemmer.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,48 +11,48 @@
//
// It has the feature that it stems each word to two forms, noun and verb. For example,
//
// NOUN VERB
// ---- ----
// aquila aquil aquila
// portat portat porta
// portis port por
// NOUN VERB
// ---- ----
// aquila aquil aquila
// portat portat porta
// portis port por
//
// Here (slightly reformatted) are the rules of the stemmer,
//
// 1. (start)
//
// 2. Convert all occurrences of the letters 'j' or 'v' to 'i' or 'u',
// 2. Convert all occurrences of the letters 'j' or 'v' to 'i' or 'u',
// respectively.
//
// 3. If the word ends in '-que' then
// if the word is on the list shown in Figure 4, then
// write the original word to both the noun-based and verb-based
// stem dictionaries and go to 8.
// else remove '-que'
// 3. If the word ends in '-que' then
// if the word is on the list shown in Figure 4, then
// write the original word to both the noun-based and verb-based
// stem dictionaries and go to 8.
// else remove '-que'
//
// [Figure 4 was
//
// atque quoque neque itaque absque apsque abusque adaeque adusque denique
// deque susque oblique peraeque plenisque quandoque quisque quaeque
// cuiusque cuique quemque quamque quaque quique quorumque quarumque
// quibusque quosque quasque quotusquisque quousque ubique undique usque
// uterque utique utroque utribique torque coque concoque contorque
// detorque decoque excoque extorque obtorque optorque retorque recoque
// attorque incoque intorque praetorque]
// atque quoque neque itaque absque apsque abusque adaeque adusque denique
// deque susque oblique peraeque plenisque quandoque quisque quaeque
// cuiusque cuique quemque quamque quaque quique quorumque quarumque
// quibusque quosque quasque quotusquisque quousque ubique undique usque
// uterque utique utroque utribique torque coque concoque contorque
// detorque decoque excoque extorque obtorque optorque retorque recoque
// attorque incoque intorque praetorque]
//
// 4. Match the end of the word against the suffix list show in Figure 6(a),
// 4. Match the end of the word against the suffix list shown in Figure 6(a),
// removing the longest matching suffix, (if any).
//
// [Figure 6(a) was
//
// -ibus -ius -ae -am -as -em -es -ia
// -is -nt -os -ud -um -us -a -e
// -i -o -u]
// -ibus -ius -ae -am -as -em -es -ia
// -is -nt -os -ud -um -us -a -e
// -i -o -u]
//
// 5. If the resulting stem contains at least two characters then write this stem
// 5. If the resulting stem contains at least two characters then write this stem
// to the noun-based stem dictionary.
//
// 6. Match the end of the word against the suffix list show in Figure 6(b),
// 6. Match the end of the word against the suffix list shown in Figure 6(b),
// identifying the longest matching suffix, (if any).
//
// [Figure 6(b) was
Expand All @@ -64,22 +64,24 @@
//
// If any of the following suffixes are found then convert them as shown:
//
// '-iuntur', '-erunt', '-untur', '-iunt', and '-unt', to '-i';
// '-beris', '-bor', and '-bo' to '-bi';
// '-ero' to '-eri'
// '-iuntur', '-erunt', '-untur', '-iunt', and '-unt', to '-i';
// '-beris', '-bor', and '-bo' to '-bi';
// '-ero' to '-eri'
//
// else remove the suffix in the normal way.
//
// 7. If the resulting stem contains at least two characters then write this stem
// 7. If the resulting stem contains at least two characters then write this stem
// to the verb-based stem dictionary.
//
// 8. (end)
//
// Addendum: adding -ii to Step 4.
package stemmer

import (
"github.com/gnames/gnparser/ent/str"
"strings"

"github.com/gnames/gnparser/ent/str"
)

var empty = struct{}{}
Expand All @@ -105,7 +107,7 @@ var nounSuffixes = []string{
"ibus", "ius", "ae", "am", "as",
"em", "es", "ia", "is",
"nt", "os", "ud", "um", "us",
"a", "e", "i", "o", "u",
"a", "e", "ii", "i", "o", "u",
}

// StemmedWord is the output of stemming algorithm applied to a word.
Expand All @@ -123,12 +125,11 @@ type StemmedWord struct {
// epithet.
// It assumes the following properties of a string:
//
// 1. There are no empty spaces over any side of a string.
// 2. All spaces within the string are single.
// 3. All characters in the string are ASCII with exception of the
// hybrid sign.
// 4. The string always starts with a capitalized word.
//
// 1. There are no leading or trailing spaces in the string.
// 2. All spaces within the string are single.
// 3. All characters in the string are ASCII with the exception of the
// hybrid sign.
// 4. The string always starts with a capitalized word.
func StemCanonical(c string) string {
graftChimeraFormulaParts := strings.Split(c, " + ")
for gci, gcv := range graftChimeraFormulaParts {
Expand Down
Loading

0 comments on commit 884acce

Please sign in to comment.