Skip to content

Commit

Permalink
added simple tiktoken implementation.
Browse files Browse the repository at this point in the history
  • Loading branch information
guocuimi committed Dec 28, 2023
1 parent 358bfbb commit cdc11ab
Show file tree
Hide file tree
Showing 7 changed files with 535 additions and 0 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ find_package(gflags CONFIG REQUIRED)
# Third-party packages located in Config mode (CONFIG: each package must provide
# its own CMake package configuration file). REQUIRED makes configuration stop
# with an error if a package cannot be found.
find_package(absl CONFIG REQUIRED)
find_package(Protobuf CONFIG REQUIRED)
find_package(gRPC CONFIG REQUIRED)
find_package(re2 CONFIG REQUIRED)
find_package(folly CONFIG REQUIRED)
find_package(GTest CONFIG REQUIRED)
find_package(benchmark CONFIG REQUIRED)
Expand Down
5 changes: 5 additions & 0 deletions src/tokenizer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,33 @@ cc_library(
tokenizer
HDRS
tokenizer.h
tiktoken_tokenizer.h
sentencepiece_tokenizer.h
hf_tokenizer.h
SRCS
tiktoken_tokenizer.cpp
sentencepiece_tokenizer.cpp
hf_tokenizer.cpp
DEPS
:sentencepiece
tokenizers
glog::glog
re2::re2
)

# Test target for the tokenizer library declared in this directory.
# NOTE(review): cc_test is a project-defined helper; the keyword semantics
# described below are inferred from the keyword names, so confirm against its
# definition.
cc_test(
  NAME
    tokenizer_test
  # Test translation units compiled into the test target.
  SRCS
    sentencepiece_tokenizer_test.cpp
    tiktoken_tokenizer_test.cpp
  # Targets the test depends on, including the library under test (:tokenizer).
  DEPS
    :tokenizer
    tokenizers
    GTest::gtest_main
  # Fixture files used by the tests (presumably made available at test run time).
  DATA
    data/tokenizer.model
    data/test.tiktoken
)

# build huggingface tokenizers (rust)
Expand Down
300 changes: 300 additions & 0 deletions src/tokenizer/data/test.tiktoken
Original file line number Diff line number Diff line change
@@ -0,0 +1,300 @@
IQ== 0
Ig== 1
Iw== 2
JA== 3
JQ== 4
Jg== 5
Jw== 6
KA== 7
KQ== 8
Kg== 9
Kw== 10
LA== 11
LQ== 12
Lg== 13
Lw== 14
MA== 15
MQ== 16
Mg== 17
Mw== 18
NA== 19
NQ== 20
Ng== 21
Nw== 22
OA== 23
OQ== 24
Og== 25
Ow== 26
PA== 27
PQ== 28
Pg== 29
Pw== 30
QA== 31
QQ== 32
Qg== 33
Qw== 34
RA== 35
RQ== 36
Rg== 37
Rw== 38
SA== 39
SQ== 40
Sg== 41
Sw== 42
TA== 43
TQ== 44
Tg== 45
Tw== 46
UA== 47
UQ== 48
Ug== 49
Uw== 50
VA== 51
VQ== 52
Vg== 53
Vw== 54
WA== 55
WQ== 56
Wg== 57
Ww== 58
XA== 59
XQ== 60
Xg== 61
Xw== 62
YA== 63
YQ== 64
Yg== 65
Yw== 66
ZA== 67
ZQ== 68
Zg== 69
Zw== 70
aA== 71
aQ== 72
ag== 73
aw== 74
bA== 75
bQ== 76
bg== 77
bw== 78
cA== 79
cQ== 80
cg== 81
cw== 82
dA== 83
dQ== 84
dg== 85
dw== 86
eA== 87
eQ== 88
eg== 89
ew== 90
fA== 91
fQ== 92
fg== 93
oQ== 94
og== 95
ow== 96
pA== 97
pQ== 98
pg== 99
pw== 100
qA== 101
qQ== 102
qg== 103
qw== 104
rA== 105
rg== 106
rw== 107
sA== 108
sQ== 109
sg== 110
sw== 111
tA== 112
tQ== 113
tg== 114
tw== 115
uA== 116
uQ== 117
ug== 118
uw== 119
vA== 120
vQ== 121
vg== 122
vw== 123
wA== 124
wQ== 125
wg== 126
ww== 127
xA== 128
xQ== 129
xg== 130
xw== 131
yA== 132
yQ== 133
yg== 134
yw== 135
zA== 136
zQ== 137
zg== 138
zw== 139
0A== 140
0Q== 141
0g== 142
0w== 143
1A== 144
1Q== 145
1g== 146
1w== 147
2A== 148
2Q== 149
2g== 150
2w== 151
3A== 152
3Q== 153
3g== 154
3w== 155
4A== 156
4Q== 157
4g== 158
4w== 159
5A== 160
5Q== 161
5g== 162
5w== 163
6A== 164
6Q== 165
6g== 166
6w== 167
7A== 168
7Q== 169
7g== 170
7w== 171
8A== 172
8Q== 173
8g== 174
8w== 175
9A== 176
9Q== 177
9g== 178
9w== 179
+A== 180
+Q== 181
+g== 182
+w== 183
/A== 184
/Q== 185
/g== 186
/w== 187
AA== 188
AQ== 189
Ag== 190
Aw== 191
BA== 192
BQ== 193
Bg== 194
Bw== 195
CA== 196
CQ== 197
Cg== 198
Cw== 199
DA== 200
DQ== 201
Dg== 202
Dw== 203
EA== 204
EQ== 205
Eg== 206
Ew== 207
FA== 208
FQ== 209
Fg== 210
Fw== 211
GA== 212
GQ== 213
Gg== 214
Gw== 215
HA== 216
HQ== 217
Hg== 218
Hw== 219
IA== 220
fw== 221
gA== 222
gQ== 223
gg== 224
gw== 225
hA== 226
hQ== 227
hg== 228
hw== 229
iA== 230
iQ== 231
ig== 232
iw== 233
jA== 234
jQ== 235
jg== 236
jw== 237
kA== 238
kQ== 239
kg== 240
kw== 241
lA== 242
lQ== 243
lg== 244
lw== 245
mA== 246
mQ== 247
mg== 248
mw== 249
nA== 250
nQ== 251
ng== 252
nw== 253
oA== 254
rQ== 255
ICA= 256
ICAgIA== 257
aW4= 258
IHQ= 259
ICAgICAgICA= 260
ZXI= 261
ICAg 262
b24= 263
IGE= 264
cmU= 265
YXQ= 266
c3Q= 267
ZW4= 268
b3I= 269
IHRo 270
Cgo= 271
IGM= 272
bGU= 273
IHM= 274
aXQ= 275
YW4= 276
YXI= 277
YWw= 278
IHRoZQ== 279
Owo= 280
IHA= 281
IGY= 282
b3U= 283
ID0= 284
aXM= 285
ICAgICAgIA== 286
aW5n 287
ZXM= 288
IHc= 289
aW9u 290
ZWQ= 291
aWM= 292
IGI= 293
IGQ= 294
ZXQ= 295
IG0= 296
IG8= 297
CQk= 298
cm8= 299
Loading

0 comments on commit cdc11ab

Please sign in to comment.