-
Notifications
You must be signed in to change notification settings - Fork 1
/
ngram.coffee
79 lines (69 loc) · 2.34 KB
/
ngram.coffee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
{ Langcode } = require './langcode/character.js'
ngramTokenize = (text) ->
  # Split `text` into sentence segments and count the n-grams of each one.
  #
  # Characters whose Langcode type is CONTROL, FORMAT or OTHER_PUNCTUATION
  # end the current segment and are not kept; every other character is
  # appended to it. Each non-empty segment is passed to tokenizeSentence,
  # which records its n-grams in the shared statistics object.
  #
  # Returns the statistics object, keyed by n-gram string.
  stats = {}
  pending = ''

  # Hand the buffered segment (if any) to the tokenizer and start a new one.
  flush = ->
    tokenizeSentence pending, stats if pending.length > 0
    pending = ''

  for ch in text
    code = getUnicode ch
    continue if code is -1
    if Langcode.getType(code) in [Langcode.CONTROL, Langcode.FORMAT, Langcode.OTHER_PUNCTUATION]
      flush()
    else
      pending += ch

  # The text may end without a terminating separator.
  flush()
  stats
tokenizeSentence = (seg, wordsStat) ->
  # Tokenize one sentence segment and add its n-grams to `wordsStat`.
  #
  # seg       - the sentence segment (a string) to tokenize.
  # wordsStat - object mapping n-gram string -> occurrence count; it is
  #             updated in place.
  #
  # Returns the array of n-grams produced for this segment.
  grams = ngram_scan extractWords seg
  for gram in grams
    # Only trust counts stored directly on wordsStat. A plain `wordsStat[gram]`
    # lookup also finds inherited members (e.g. a gram spelling 'valueOf'
    # resolves to Object.prototype.valueOf), which would turn the count into
    # a string instead of a number.
    seen = if Object::hasOwnProperty.call(wordsStat, gram) then wordsStat[gram] else 0
    wordsStat[gram] = (seen or 0) + 1
  grams
ngram_scan = (words) ->
  # Produce every 2-, 3-, 4- and 5-word window over `words`, each joined
  # into a single string. Results are ordered by window size first, then by
  # starting position.
  #
  # Inside a window, every word after the first that is longer than one
  # character and is not the literal '_END' is prefixed with a space; all
  # other words are concatenated directly. `words` itself is not modified.
  grams = []
  return grams unless words.length

  for size in [2..5] when words.length >= size
    for start in [0..words.length - size]
      pieces = for word, idx in words.slice(start, start + size)
        if idx > 0 and word.length > 1 and word isnt '_END'
          ' ' + word
        else
          word
      grams.push pieces.join('')
  grams
extractWords = (seg) ->
  # Split a sentence segment into an array of words.
  #
  # Characters are classified with Langcode.getType:
  #   * UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER,
  #     MODIFIER_LETTER and DECIMAL_DIGIT_NUMBER accumulate into the
  #     current word;
  #   * OTHER_LETTER ends the current word and is emitted as a one-character
  #     word of its own (presumably ideographs and similar scripts - confirm
  #     against Langcode);
  #   * anything else ends the current word and is discarded.
  #
  # Returns the words in their original order and original letter case.
  words = []
  wordBuf = ''

  # Emit the buffered word, if any, and reset the buffer.
  flushWord = ->
    if wordBuf.length > 0
      words.push wordBuf
      wordBuf = ''

  for ch in seg
    unicode = getUnicode ch
    continue if unicode is -1
    switch Langcode.getType unicode
      when Langcode.UPPERCASE_LETTER, Langcode.LOWERCASE_LETTER, Langcode.TITLECASE_LETTER, Langcode.MODIFIER_LETTER, Langcode.DECIMAL_DIGIT_NUMBER
        wordBuf += ch
      when Langcode.OTHER_LETTER
        flushWord()
        words.push ch
      else
        flushWord()

  # A segment that ends on a letter or digit still has its last word in the
  # buffer; without this flush that word would be silently dropped.
  flushWord()
  words
getUnicode = (ch) ->
  # Return the UTF-16 code unit of the lower-cased form of `ch` (only the
  # first unit when lower-casing yields several), or -1 for an empty string.
  return -1 if ch is ''
  lowered = ch.toLowerCase()
  lowered.charCodeAt 0
# Public API: ngramTokenize is the only function exported from this module.
exports.ngramTokenize = ngramTokenize