Commit 5728b8a 1 parent 60dd873 commit 5728b8a Copy full SHA for 5728b8a
File tree 2 files changed +148
-0
lines changed
2 files changed +148
-0
lines changed Original file line number Diff line number Diff line change @@ -78,9 +78,11 @@ func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStrea
78
78
if itemsInRing < 2 {
79
79
itemsInRing ++
80
80
}
81
+ builtUnigram := false
81
82
if itemsInRing > 1 && s .outputUnigram {
82
83
unigram := s .buildUnigram (r , & itemsInRing , outputPos )
83
84
if unigram != nil {
85
+ builtUnigram = true
84
86
rv = append (rv , unigram )
85
87
}
86
88
}
@@ -89,6 +91,11 @@ func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStrea
89
91
rv = append (rv , bigramToken )
90
92
outputPos ++
91
93
}
94
+
95
+ // If a unigram was built above, the previous token should be removed from the ring, so decrement itemsInRing.
96
+ if builtUnigram {
97
+ itemsInRing --
98
+ }
92
99
}
93
100
94
101
} else {
Original file line number Diff line number Diff line change @@ -309,6 +309,147 @@ func TestCJKBigramFilter(t *testing.T) {
309
309
},
310
310
},
311
311
},
312
+ {
313
+ // Assumes the unicode tokenizer has already removed `、` from `こんにちは、世界`, which leaves a gap in the input between `は` and `世` (Position 5 -> 7, byte offsets 15 -> 18).
314
+ outputUnigram : true ,
315
+ input : analysis.TokenStream {
316
+ & analysis.Token {
317
+ Term : []byte ("こ" ),
318
+ Type : analysis .Ideographic ,
319
+ Position : 1 ,
320
+ Start : 0 ,
321
+ End : 3 ,
322
+ },
323
+ & analysis.Token {
324
+ Term : []byte ("ん" ),
325
+ Type : analysis .Ideographic ,
326
+ Position : 2 ,
327
+ Start : 3 ,
328
+ End : 6 ,
329
+ },
330
+ & analysis.Token {
331
+ Term : []byte ("に" ),
332
+ Type : analysis .Ideographic ,
333
+ Position : 3 ,
334
+ Start : 6 ,
335
+ End : 9 ,
336
+ },
337
+ & analysis.Token {
338
+ Term : []byte ("ち" ),
339
+ Type : analysis .Ideographic ,
340
+ Position : 4 ,
341
+ Start : 9 ,
342
+ End : 12 ,
343
+ },
344
+ & analysis.Token {
345
+ Term : []byte ("は" ),
346
+ Type : analysis .Ideographic ,
347
+ Position : 5 ,
348
+ Start : 12 ,
349
+ End : 15 ,
350
+ },
351
+ & analysis.Token {
352
+ Term : []byte ("世" ),
353
+ Type : analysis .Ideographic ,
354
+ Position : 7 ,
355
+ Start : 18 ,
356
+ End : 21 ,
357
+ },
358
+ & analysis.Token {
359
+ Term : []byte ("界" ),
360
+ Type : analysis .Ideographic ,
361
+ Position : 8 ,
362
+ Start : 21 ,
363
+ End : 24 ,
364
+ },
365
+ },
366
+ output : analysis.TokenStream {
367
+ & analysis.Token {
368
+ Term : []byte ("こ" ),
369
+ Type : analysis .Single ,
370
+ Position : 1 ,
371
+ Start : 0 ,
372
+ End : 3 ,
373
+ },
374
+ & analysis.Token {
375
+ Term : []byte ("こん" ),
376
+ Type : analysis .Double ,
377
+ Position : 1 ,
378
+ Start : 0 ,
379
+ End : 6 ,
380
+ },
381
+ & analysis.Token {
382
+ Term : []byte ("ん" ),
383
+ Type : analysis .Single ,
384
+ Position : 2 ,
385
+ Start : 3 ,
386
+ End : 6 ,
387
+ },
388
+ & analysis.Token {
389
+ Term : []byte ("んに" ),
390
+ Type : analysis .Double ,
391
+ Position : 2 ,
392
+ Start : 3 ,
393
+ End : 9 ,
394
+ },
395
+ & analysis.Token {
396
+ Term : []byte ("に" ),
397
+ Type : analysis .Single ,
398
+ Position : 3 ,
399
+ Start : 6 ,
400
+ End : 9 ,
401
+ },
402
+ & analysis.Token {
403
+ Term : []byte ("にち" ),
404
+ Type : analysis .Double ,
405
+ Position : 3 ,
406
+ Start : 6 ,
407
+ End : 12 ,
408
+ },
409
+ & analysis.Token {
410
+ Term : []byte ("ち" ),
411
+ Type : analysis .Single ,
412
+ Position : 4 ,
413
+ Start : 9 ,
414
+ End : 12 ,
415
+ },
416
+ & analysis.Token {
417
+ Term : []byte ("ちは" ),
418
+ Type : analysis .Double ,
419
+ Position : 4 ,
420
+ Start : 9 ,
421
+ End : 15 ,
422
+ },
423
+ & analysis.Token {
424
+ Term : []byte ("は" ),
425
+ Type : analysis .Single ,
426
+ Position : 5 ,
427
+ Start : 12 ,
428
+ End : 15 ,
429
+ },
430
+ & analysis.Token {
431
+ Term : []byte ("世" ),
432
+ Type : analysis .Single ,
433
+ Position : 6 ,
434
+ Start : 18 ,
435
+ End : 21 ,
436
+ },
437
+ & analysis.Token {
438
+ Term : []byte ("世界" ),
439
+ Type : analysis .Double ,
440
+ Position : 6 ,
441
+ Start : 18 ,
442
+ End : 24 ,
443
+ },
444
+ & analysis.Token {
445
+ Term : []byte ("界" ),
446
+ Type : analysis .Single ,
447
+ Position : 7 ,
448
+ Start : 21 ,
449
+ End : 24 ,
450
+ },
451
+ },
452
+ },
312
453
{
313
454
outputUnigram : false ,
314
455
input : analysis.TokenStream {
You can’t perform that action at this time.
0 commit comments