-
Notifications
You must be signed in to change notification settings - Fork 1
/
token_main.coffee
92 lines (73 loc) · 2.6 KB
/
token_main.coffee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
fs = require 'fs'
ngram = require './ngram.js'
main = ->
corpusDir = "data/corpus/"
token0Dir = "data/token_0/"
# Read the file
_histo = processCorpus corpusDir, token0Dir
words = sortAndSave _histo, 'data/raw_token.data'
# create the universal words dictionary
_wordsIndex = {}
_wordCount = 0
output = []
start = 5 #parseInt 0.0002 * _word_index.length
words = words[start..words.length]
for word in words
if _histo[word] > 1
output.push "#{word}:#{_histo[word]}"
_wordsIndex[word] = output.length - 1
_wordCount += 1
fs.writeFileSync 'data/token.data', output.join('\n')
console.log "Found total #{_wordCount} words."
createDocumentArray _wordCount, _wordsIndex, token0Dir
console.log "Create the documents array data file."
processCorpus = (corpusDir, token0Dir) ->
# Process the corpus files
console.log "Start process the files"
histo = {}
files = fs.readdirSync corpusDir
p_count = 0
for fileId in files
p_count += 1
if p_count >= 50 and p_count % 50 == 0
console.log "* process iteration #{p_count}"
filename = "#{corpusDir}#{fileId}"
content = fs.readFileSync filename, 'UTF-8'
wordsStat = ngram.ngramTokenize content
for word of wordsStat
histo[word] = (histo[word] or= 0) + wordsStat[word]
token0file = "#{token0Dir}#{fileId}"
sortAndSave wordsStat, token0file
histo
createDocumentArray = (wordCount, wordsIndex, token0Dir) ->
# create the document array
docArrayFile = 'data/documents.data'
fd = fs.openSync docArrayFile, 'w'
fs.writeSync fd, "#{wordCount}\n"
docFiles = fs.readdirSync token0Dir
p_count = 0
for fileId in docFiles
p_count += 1
if p_count >= 50 and p_count % 50 == 0
console.log "* process iteration #{p_count}"
docArray = []
docFile = "#{token0Dir}#{fileId}"
content = fs.readFileSync docFile, 'UTF-8'
words = content.split '\n'
for wordCombine in words
[word, freq] = wordCombine.split ':'
if word of wordsIndex
docArray.push wordsIndex[word] for i in [1..freq]
docArray = docArray.join(' ')
fs.writeSync fd, "#{docArray}\n"
fs.closeSync fd
sortAndSave = (wordMap, filename) ->
a_words = (word for word of wordMap)
a_words.sort (a, b) =>
wordMap[b] - wordMap[a]
output = []
for word in a_words
output.push "#{word}:#{wordMap[word]}"
fs.writeFileSync filename, output.join('\n')
a_words
main()