Skip to content

Commit 5728b8a

Browse files
authored
When using the cjk analyzer with output_unigram, a unigram before Japanese punctuation is not indexed (#1724)
fix: make the value of itemsInRing strict
1 parent 60dd873 commit 5728b8a

File tree

2 files changed

+148
-0
lines changed

2 files changed

+148
-0
lines changed

analysis/lang/cjk/cjk_bigram.go

+7
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,11 @@ func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStrea
7878
if itemsInRing < 2 {
7979
itemsInRing++
8080
}
81+
builtUnigram := false
8182
if itemsInRing > 1 && s.outputUnigram {
8283
unigram := s.buildUnigram(r, &itemsInRing, outputPos)
8384
if unigram != nil {
85+
builtUnigram = true
8486
rv = append(rv, unigram)
8587
}
8688
}
@@ -89,6 +91,11 @@ func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStrea
8991
rv = append(rv, bigramToken)
9092
outputPos++
9193
}
94+
95+
// If a unigram was built, remove the previous token from the ring by decrementing itemsInRing.
96+
if builtUnigram {
97+
itemsInRing--
98+
}
9299
}
93100

94101
} else {

analysis/lang/cjk/cjk_bigram_test.go

+141
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,147 @@ func TestCJKBigramFilter(t *testing.T) {
309309
},
310310
},
311311
},
312+
{
313+
// Assumes the unicode tokenizer has already removed `、` from `こんにちは、世界`.
314+
outputUnigram: true,
315+
input: analysis.TokenStream{
316+
&analysis.Token{
317+
Term: []byte("こ"),
318+
Type: analysis.Ideographic,
319+
Position: 1,
320+
Start: 0,
321+
End: 3,
322+
},
323+
&analysis.Token{
324+
Term: []byte("ん"),
325+
Type: analysis.Ideographic,
326+
Position: 2,
327+
Start: 3,
328+
End: 6,
329+
},
330+
&analysis.Token{
331+
Term: []byte("に"),
332+
Type: analysis.Ideographic,
333+
Position: 3,
334+
Start: 6,
335+
End: 9,
336+
},
337+
&analysis.Token{
338+
Term: []byte("ち"),
339+
Type: analysis.Ideographic,
340+
Position: 4,
341+
Start: 9,
342+
End: 12,
343+
},
344+
&analysis.Token{
345+
Term: []byte("は"),
346+
Type: analysis.Ideographic,
347+
Position: 5,
348+
Start: 12,
349+
End: 15,
350+
},
351+
&analysis.Token{
352+
Term: []byte("世"),
353+
Type: analysis.Ideographic,
354+
Position: 7,
355+
Start: 18,
356+
End: 21,
357+
},
358+
&analysis.Token{
359+
Term: []byte("界"),
360+
Type: analysis.Ideographic,
361+
Position: 8,
362+
Start: 21,
363+
End: 24,
364+
},
365+
},
366+
output: analysis.TokenStream{
367+
&analysis.Token{
368+
Term: []byte("こ"),
369+
Type: analysis.Single,
370+
Position: 1,
371+
Start: 0,
372+
End: 3,
373+
},
374+
&analysis.Token{
375+
Term: []byte("こん"),
376+
Type: analysis.Double,
377+
Position: 1,
378+
Start: 0,
379+
End: 6,
380+
},
381+
&analysis.Token{
382+
Term: []byte("ん"),
383+
Type: analysis.Single,
384+
Position: 2,
385+
Start: 3,
386+
End: 6,
387+
},
388+
&analysis.Token{
389+
Term: []byte("んに"),
390+
Type: analysis.Double,
391+
Position: 2,
392+
Start: 3,
393+
End: 9,
394+
},
395+
&analysis.Token{
396+
Term: []byte("に"),
397+
Type: analysis.Single,
398+
Position: 3,
399+
Start: 6,
400+
End: 9,
401+
},
402+
&analysis.Token{
403+
Term: []byte("にち"),
404+
Type: analysis.Double,
405+
Position: 3,
406+
Start: 6,
407+
End: 12,
408+
},
409+
&analysis.Token{
410+
Term: []byte("ち"),
411+
Type: analysis.Single,
412+
Position: 4,
413+
Start: 9,
414+
End: 12,
415+
},
416+
&analysis.Token{
417+
Term: []byte("ちは"),
418+
Type: analysis.Double,
419+
Position: 4,
420+
Start: 9,
421+
End: 15,
422+
},
423+
&analysis.Token{
424+
Term: []byte("は"),
425+
Type: analysis.Single,
426+
Position: 5,
427+
Start: 12,
428+
End: 15,
429+
},
430+
&analysis.Token{
431+
Term: []byte("世"),
432+
Type: analysis.Single,
433+
Position: 6,
434+
Start: 18,
435+
End: 21,
436+
},
437+
&analysis.Token{
438+
Term: []byte("世界"),
439+
Type: analysis.Double,
440+
Position: 6,
441+
Start: 18,
442+
End: 24,
443+
},
444+
&analysis.Token{
445+
Term: []byte("界"),
446+
Type: analysis.Single,
447+
Position: 7,
448+
Start: 21,
449+
End: 24,
450+
},
451+
},
452+
},
312453
{
313454
outputUnigram: false,
314455
input: analysis.TokenStream{

0 commit comments

Comments
 (0)