diff --git a/analysis/lang/cjk/cjk_bigram.go b/analysis/lang/cjk/cjk_bigram.go index fcb80151d..14fe6594d 100644 --- a/analysis/lang/cjk/cjk_bigram.go +++ b/analysis/lang/cjk/cjk_bigram.go @@ -78,9 +78,11 @@ func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStrea if itemsInRing < 2 { itemsInRing++ } + builtUnigram := false if itemsInRing > 1 && s.outputUnigram { unigram := s.buildUnigram(r, &itemsInRing, outputPos) if unigram != nil { + builtUnigram = true rv = append(rv, unigram) } } @@ -89,6 +91,11 @@ func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStrea rv = append(rv, bigramToken) outputPos++ } + + // prev token should be removed if unigram was built + if builtUnigram { + itemsInRing-- + } } } else { diff --git a/analysis/lang/cjk/cjk_bigram_test.go b/analysis/lang/cjk/cjk_bigram_test.go index d1b5d0008..81bb3c8c9 100644 --- a/analysis/lang/cjk/cjk_bigram_test.go +++ b/analysis/lang/cjk/cjk_bigram_test.go @@ -309,6 +309,147 @@ func TestCJKBigramFilter(t *testing.T) { }, }, }, + { + // Assuming that `、` is removed by unicode tokenizer from `こんにちは、世界` + outputUnigram: true, + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("こ"), + Type: analysis.Ideographic, + Position: 1, + Start: 0, + End: 3, + }, + &analysis.Token{ + Term: []byte("ん"), + Type: analysis.Ideographic, + Position: 2, + Start: 3, + End: 6, + }, + &analysis.Token{ + Term: []byte("に"), + Type: analysis.Ideographic, + Position: 3, + Start: 6, + End: 9, + }, + &analysis.Token{ + Term: []byte("ち"), + Type: analysis.Ideographic, + Position: 4, + Start: 9, + End: 12, + }, + &analysis.Token{ + Term: []byte("は"), + Type: analysis.Ideographic, + Position: 5, + Start: 12, + End: 15, + }, + &analysis.Token{ + Term: []byte("世"), + Type: analysis.Ideographic, + Position: 7, + Start: 18, + End: 21, + }, + &analysis.Token{ + Term: []byte("界"), + Type: analysis.Ideographic, + Position: 8, + Start: 21, + End: 24, + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("こ"), + Type: analysis.Single, + Position: 1, + Start: 0, + End: 3, + }, + &analysis.Token{ + Term: []byte("こん"), + Type: analysis.Double, + Position: 1, + Start: 0, + End: 6, + }, + &analysis.Token{ + Term: []byte("ん"), + Type: analysis.Single, + Position: 2, + Start: 3, + End: 6, + }, + &analysis.Token{ + Term: []byte("んに"), + Type: analysis.Double, + Position: 2, + Start: 3, + End: 9, + }, + &analysis.Token{ + Term: []byte("に"), + Type: analysis.Single, + Position: 3, + Start: 6, + End: 9, + }, + &analysis.Token{ + Term: []byte("にち"), + Type: analysis.Double, + Position: 3, + Start: 6, + End: 12, + }, + &analysis.Token{ + Term: []byte("ち"), + Type: analysis.Single, + Position: 4, + Start: 9, + End: 12, + }, + &analysis.Token{ + Term: []byte("ちは"), + Type: analysis.Double, + Position: 4, + Start: 9, + End: 15, + }, + &analysis.Token{ + Term: []byte("は"), + Type: analysis.Single, + Position: 5, + Start: 12, + End: 15, + }, + &analysis.Token{ + Term: []byte("世"), + Type: analysis.Single, + Position: 6, + Start: 18, + End: 21, + }, + &analysis.Token{ + Term: []byte("世界"), + Type: analysis.Double, + Position: 6, + Start: 18, + End: 24, + }, + &analysis.Token{ + Term: []byte("界"), + Type: analysis.Single, + Position: 7, + Start: 21, + End: 24, + }, + }, + }, { outputUnigram: false, input: analysis.TokenStream{