-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexample_test.go
121 lines (98 loc) · 4.27 KB
/
example_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
package ngramindex_test
import (
"fmt"
"sort"
"strings"
"github.com/michurin/ngramindex"
)
// Example_basic indexes a few verses under string identifiers and runs a
// plain search, printing the identifiers in the order Search returns them.
func Example_basic() {
	// The corpus: verse reference -> verse text.
	verses := map[string]string{
		"Luke_22:35": `Then Jesus asked them, "When I sent you without purse, bag or sandals, did you lack anything?" "Nothing," they answered.`,
		"Luke_22:36": `He said to them, "But now if you have a purse, take it, and also a bag; and if you don’t have a sword, sell your cloak and buy one.`,
		"Luke_22:37": `It is written: 'And he was numbered with the transgressors'; and I tell you that this must be fulfilled in me. Yes, what is written about me is reaching its fulfillment."`,
		"Luke_22:38": `The disciples said, "See, Lord, here are two swords." "That’s enough!" he replied.`,
	}

	// Document identifiers are strings in this example (T=string); an
	// integer, a path, an [os.DirEntry] or any other type would do as well.
	index := ngramindex.StringIndex[string]()

	// Register every verse text under its reference.
	for ref, text := range verses {
		index.Add(ref, text)
	}

	// Query for "sword". 22:38 is ranked first because "Lord" contributes
	// an extra hit on "ord" (the tail of "sword").
	for _, ref := range index.Search("sword") {
		fmt.Println(ref)
	}

	// output:
	// Luke_22:38
	// Luke_22:36
}
// Example_textNormalization shows how to install a custom normalizer so that
// lookups ignore letter case and treat "ú" and "u" as the same letter.
func Example_textNormalization() {
	// Our documents
	docs := map[string]string{
		"Luke_22:35": `Then Jesus asked them, "When I sent you without purse, bag or sandals, did you lack anything?" "Nothing," they answered.`,
		"Luke_22:36": `He said to them, "But now if you have a purse, take it, and also a bag; and if you don’t have a sword, sell your cloak and buy one.`,
		"Luke_22:37": `It is written: 'And he was numbered with the transgressors'; and I tell you that this must be fulfilled in me. Yes, what is written about me is reaching its fulfillment."`,
		"Luke_22:38": `The disciples said, "See, Lord, here are two swords." "That’s enough!" he replied.`,
	}
	// Looking up is case insensitive and considers u and ú
	// as the same letter.
	// Obviously, it is useful for things like
	// spaces normalization, punctuation skipping and so on.
	ngIdx := ngramindex.StringIndex(
		ngramindex.WithNormolizer[string](func(s string) [][]rune {
			// Lowercase first: otherwise an uppercase "Ú" would survive the
			// replacement and come out of ToLower as an unfolded "ú".
			folded := strings.ReplaceAll(strings.ToLower(s), "ú", "u")
			return [][]rune{[]rune(folded)}
		}),
	)
	// Associate texts (v) with document's indexes (k)
	for k, v := range docs {
		ngIdx.Add(k, v)
	}
	results := ngIdx.Search("JESÚS") // we will find "Jesus" in 22:35
	for _, v := range results {
		fmt.Println(v)
	}
	// output:
	// Luke_22:35
}
// Example_customOrdeing uses Lookup to obtain per-document match details and
// then applies its own ordering before printing them.
func Example_customOrdeing() {
	// The corpus: verse reference -> verse text.
	verses := map[string]string{
		"Luke_22:35": `Then Jesus asked them, "When I sent you without purse, bag or sandals, did you lack anything?" "Nothing," they answered.`,
		"Luke_22:36": `He said to them, "But now if you have a purse, take it, and also a bag; and if you don’t have a sword, sell your cloak and buy one.`,
		"Luke_22:37": `It is written: 'And he was numbered with the transgressors'; and I tell you that this must be fulfilled in me. Yes, what is written about me is reaching its fulfillment."`,
		"Luke_22:38": `The disciples said, "See, Lord, here are two swords." "That’s enough!" he replied.`,
	}

	// Document identifiers are strings in this example (T=string); an
	// integer, a path, an [os.DirEntry] or any other type would do as well.
	index := ngramindex.StringIndex[string]()

	// Register every verse text under its reference.
	for ref, text := range verses {
		index.Add(ref, text)
	}

	// Unlike Search, Lookup leaves the ordering to the caller and reports
	// detailed matching information for each document.
	// 22:38 has the higher rate because "Lord" contributes an extra hit on
	// "ord" (the tail of "sword").
	matches := index.Lookup("sword")

	// Ascending by match rate, i.e. the best match is printed last.
	byRateAscending := func(a, b int) bool {
		return matches[a].MatchRate < matches[b].MatchRate
	}
	sort.Slice(matches, byRateAscending)

	for _, m := range matches {
		fmt.Printf("%d/%3d = %.3f %q\n", m.MatchedNgrams, m.TotalNgrams, m.MatchRate, m.Document)
	}

	// output:
	// 3/129 = 0.023 "Luke_22:36"
	// 4/ 80 = 0.050 "Luke_22:38"
}
// Example_indexSettingsWithNgramLen configures the underlying n-gram index
// with an n-gram length of 2 and uses integer document identifiers.
func Example_indexSettingsWithNgramLen() {
	// Build the low-level index with an n-gram length of 2, then wrap it in a string index.
	bigrams := ngramindex.Index[int](ngramindex.WithNgramLen(2))
	index := ngramindex.StringIndex(ngramindex.WithNgramIndex(bigrams))

	index.Add(1, "what")
	index.Add(2, "that")

	// "with" shares the bigram "th" with "that" but nothing with "what".
	found := index.Search("with")
	fmt.Println(found)

	// output:
	// [2]
}