From 18622b4d80098995b5a651ec9063ec9e0f9b6f0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vytis=20Valentinavi=C4=8Dius?= Date: Fri, 20 Dec 2019 18:25:15 +0200 Subject: [PATCH 1/2] Added Lithuanian language analyzer --- analysis/lang/lt/analyzer_lt.go | 57 ++++++++++ analysis/lang/lt/analyzer_lt_test.go | 114 ++++++++++++++++++++ analysis/lang/lt/stemmer_lt.go | 49 +++++++++ analysis/lang/lt/stemmer_lt_test.go | 67 ++++++++++++ analysis/lang/lt/stop_filter_lt.go | 33 ++++++ analysis/lang/lt/stop_words_lt.go | 150 +++++++++++++++++++++++++++ 6 files changed, 470 insertions(+) create mode 100644 analysis/lang/lt/analyzer_lt.go create mode 100644 analysis/lang/lt/analyzer_lt_test.go create mode 100644 analysis/lang/lt/stemmer_lt.go create mode 100644 analysis/lang/lt/stemmer_lt_test.go create mode 100644 analysis/lang/lt/stop_filter_lt.go create mode 100644 analysis/lang/lt/stop_words_lt.go diff --git a/analysis/lang/lt/analyzer_lt.go b/analysis/lang/lt/analyzer_lt.go new file mode 100644 index 000000000..dbef4c2a9 --- /dev/null +++ b/analysis/lang/lt/analyzer_lt.go @@ -0,0 +1,57 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package lt + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" +) + +const AnalyzerName = "lt" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopLtFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerLtFilter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopLtFilter, + stemmerLtFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/lt/analyzer_lt_test.go b/analysis/lang/lt/analyzer_lt_test.go new file mode 100644 index 000000000..4dd358e33 --- /dev/null +++ b/analysis/lang/lt/analyzer_lt_test.go @@ -0,0 +1,114 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package lt + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestLithuanianAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + // stemming + { + input: []byte("kavytė"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("kav"), + }, + }, + }, + { + input: []byte("kavinukas"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("kavinuk"), + }, + }, + }, + // stop word + { + input: []byte("į"), + output: analysis.TokenStream{}, + }, + // digits safe + { + input: []byte("Šeši nuliai - 1000000"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("šeš"), + }, + &analysis.Token{ + Term: []byte("nul"), + }, + &analysis.Token{ + Term: []byte("1000000"), + }, + }, + }, + { + input: []byte("Tiek savaitgalį, tiek per šventes laukia rudeniški orai: sniego tikėtis neverta"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("savaitgal"), + }, + &analysis.Token{ + Term: []byte("švent"), + }, + &analysis.Token{ + Term: []byte("lauk"), + }, + &analysis.Token{ + Term: []byte("rudeniš"), + }, + &analysis.Token{ + Term: []byte("or"), + }, + &analysis.Token{ + Term: []byte("snieg"), + }, + &analysis.Token{ + Term: []byte("tik"), + }, + &analysis.Token{ + Term: []byte("nevert"), + }, + }, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if len(actual) != len(test.output) { + t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) + } + for i, tok := range actual { + if !reflect.DeepEqual(tok.Term, test.output[i].Term) { + t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) + } + } + } +} diff --git a/analysis/lang/lt/stemmer_lt.go b/analysis/lang/lt/stemmer_lt.go new 
file mode 100644 index 000000000..e21007240 --- /dev/null +++ b/analysis/lang/lt/stemmer_lt.go @@ -0,0 +1,49 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package lt + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/lithuanian" +) + +const SnowballStemmerName = "stemmer_lt_snowball" + +type LithuanianStemmerFilter struct { +} + +func NewLithuanianStemmerFilter() *LithuanianStemmerFilter { + return &LithuanianStemmerFilter{} +} + +func (s *LithuanianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + lithuanian.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func LithuanianStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewLithuanianStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, LithuanianStemmerFilterConstructor) +} diff --git a/analysis/lang/lt/stemmer_lt_test.go b/analysis/lang/lt/stemmer_lt_test.go new file mode 100644 index 000000000..058226163 --- /dev/null +++ b/analysis/lang/lt/stemmer_lt_test.go @@ -0,0 +1,67 @@ +// Copyright (c) 2019 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package lt + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestSnowballLithuanianStemmer(t *testing.T) { + tests := []struct { + input analysis.TokenStream + output analysis.TokenStream + }{ + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("aktorius"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("aktor"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("kilometrų"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("kilometr"), + }, + }, + }, + } + + cache := registry.NewCache() + filter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := filter.Filter(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) + } + } +} diff --git a/analysis/lang/lt/stop_filter_lt.go b/analysis/lang/lt/stop_filter_lt.go new file mode 100644 index 000000000..f5bb6c814 --- /dev/null +++ b/analysis/lang/lt/stop_filter_lt.go @@ -0,0 +1,33 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package lt + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/lt/stop_words_lt.go b/analysis/lang/lt/stop_words_lt.go new file mode 100644 index 000000000..66f16c0d3 --- /dev/null +++ b/analysis/lang/lt/stop_words_lt.go @@ -0,0 +1,150 @@ +package lt + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_lt" + +// this content was obtained from: +// https://github.com/apache/lucene-solr/blob/master/lucene/analysis/common/src/resources/org/apache/lucene/analysis/lt/stopwords.txt + +var LithuanianStopWords = []byte(`# Lithuanian stopwords list +ant +apie +ar +arba +aš +be +bei +bet +bus +būti +būtų +buvo +dėl +gali +į +iki +ir +iš +ja +ją +jai +jais +jam +jame +jas +jei +ji +jį +jie +jiedu +jiedvi +jiedviem +jiedviese +jiems +jis +jo +jodviem +jog +joje +jomis +joms +jos +jose +jų +judu +judvi +judviejų +jųdviejų +judviem +judviese +jumis +jums +jumyse +juo +juodu +juodviese +juos +juose +jus +jūs +jūsų +ką +kad +kai +kaip +kas +kiek +kol +kur +kurie +kuris +man +mane +manęs +manimi +mano +manyje +mes +metu 
+mudu +mudvi +mudviejų +mudviem +mudviese +mumis +mums +mumyse +mus +mūsų +nei +nes +net +nors +nuo +o +pat +per +po +prie +prieš +sau +save +savęs +savimi +savo +savyje +su +tačiau +tada +tai +taip +tas +tau +tave +tavęs +tavimi +tavyje +tiek +ten +to +todėl +tu +tuo +už +visi +yra +`) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(LithuanianStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} From 9d26a782cd78455e07839044559d61e15a81dad8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vytis=20Valentinavi=C4=8Dius?= Date: Mon, 6 Jan 2020 09:14:26 +0200 Subject: [PATCH 2/2] Added few more test sentences from public media --- analysis/lang/lt/analyzer_lt_test.go | 62 ++++++++++++++++++++++++++++ analysis/lang/lt/stemmer_lt_test.go | 13 ++++++ 2 files changed, 75 insertions(+) diff --git a/analysis/lang/lt/analyzer_lt_test.go b/analysis/lang/lt/analyzer_lt_test.go index 4dd358e33..d508d692f 100644 --- a/analysis/lang/lt/analyzer_lt_test.go +++ b/analysis/lang/lt/analyzer_lt_test.go @@ -93,6 +93,68 @@ func TestLithuanianAnalyzer(t *testing.T) { }, }, }, + { + input: []byte("Visą savaitę prognozuojami klastingi reiškiniai"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("vis"), + }, + &analysis.Token{ + Term: []byte("savait"), + }, + &analysis.Token{ + // verb. 
"prognozuo-ti" + Term: []byte("prognozuo"), + }, + &analysis.Token{ + Term: []byte("klast"), + }, + &analysis.Token{ + Term: []byte("reiškin"), + }, + }, + }, + { + input: []byte("Susirgęs Arūnas gyvenimui pasirinko šalį, kurioje įteisinta eutanazija: silpsta visos jo organizmo funkcijos"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("susirg"), + }, + &analysis.Token{ + Term: []byte("arūn"), + }, + &analysis.Token{ + Term: []byte("gyvenim"), + }, + &analysis.Token{ + Term: []byte("pasirink"), + }, + &analysis.Token{ + Term: []byte("šal"), + }, + &analysis.Token{ + Term: []byte("kur"), + }, + &analysis.Token{ + Term: []byte("įteis"), + }, + &analysis.Token{ + Term: []byte("eutanazij"), + }, + &analysis.Token{ + Term: []byte("silpst"), + }, + &analysis.Token{ + Term: []byte("vis"), + }, + &analysis.Token{ + Term: []byte("organizm"), + }, + &analysis.Token{ + Term: []byte("funkcij"), + }, + }, + }, } cache := registry.NewCache() diff --git a/analysis/lang/lt/stemmer_lt_test.go b/analysis/lang/lt/stemmer_lt_test.go index 058226163..784db729f 100644 --- a/analysis/lang/lt/stemmer_lt_test.go +++ b/analysis/lang/lt/stemmer_lt_test.go @@ -51,6 +51,19 @@ func TestSnowballLithuanianStemmer(t *testing.T) { }, }, }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("prognozuojami"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + // verb. "prognozuo-ti" + Term: []byte("prognozuo"), + }, + }, + }, } cache := registry.NewCache()