-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcross-validation.py
executable file
·149 lines (127 loc) · 4.64 KB
/
cross-validation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import codecs
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from shutil import copyfile
import subprocess
# Sentence buffers for the two input corpora: each `sentences*` list holds
# completed sentences, each `sentence*` list holds the tokens of the sentence
# currently being assembled.
sentences, sentence = [], []
sentences2, sentence2 = [], []
def write(path, sts):
    """Write sentences to ``path`` as space-separated columns.

    Each token is emitted on its own line as its first two fields joined by
    a single space; an empty line is written after every sentence.

    Args:
        path: Destination file. It is created or truncated and written as
            UTF-8.
        sts: Iterable of sentences, each an iterable of indexable tokens
            whose items 0 and 1 are strings. Any further fields are ignored.
    """
    # The context manager closes the handle even when a malformed token
    # raises part-way through, which a manual open()/close() pair does not.
    with open(path, 'w', encoding='utf-8') as f:
        for s in sts:
            f.writelines(w[0] + ' ' + w[1] + '\n' for w in s)
            f.write('\n')
def read_sentences(path):
    """Load a column-formatted corpus as a list of sentences.

    Non-blank lines are split on whitespace into a token (a list of fields);
    consecutive tokens form a sentence, and blank lines separate sentences.

    Args:
        path: UTF-8 encoded input file.

    Returns:
        A list of sentences, each a non-empty list of tokens, each token a
        list of at least two strings.

    Raises:
        ValueError: If a non-blank line has fewer than two fields.
    """
    collected = []
    current = []
    with codecs.open(path, 'r', 'utf8') as handle:
        for line_no, line in enumerate(handle, 1):
            line = line.rstrip()
            if not line:
                # Blank line: close the running sentence, if there is one.
                if current:
                    collected.append(current)
                    current = []
                continue
            word = line.split()
            # Explicit check instead of `assert`, which vanishes under -O.
            if len(word) < 2:
                raise ValueError(
                    "{}:{}: expected at least 2 columns, got {!r}".format(
                        path, line_no, line))
            current.append(word)
    # The file may end without a trailing blank line.
    if current:
        collected.append(current)
    return collected


def to_object_array(items):
    """Box ``items`` into a 1-D object array so it can be fancy-indexed.

    Filling element by element keeps each sentence as one array cell.
    ``np.array(items)`` would instead raise on ragged input with recent
    NumPy, or build a multi-dimensional array when every sentence happens to
    have the same shape.
    """
    boxed = np.empty(len(items), dtype=object)
    for position, item in enumerate(items):
        boxed[position] = item
    return boxed


def split_train_dev(train_index):
    """Carve the last tenth of ``train_index`` off as the dev indices.

    Returns:
        ``(train_part, dev_part)``. An explicit cut position is used so that
        a dev size of zero yields an empty dev part; negative slicing with
        ``-0`` would hand the whole array to dev and leave train empty.
    """
    numb_dev = len(train_index) // 10
    cut = len(train_index) - numb_dev
    return train_index[:cut], train_index[cut:]


def run_stage(script, label, iteration, log_path='results/output.log'):
    """Run ``python3 <script>`` and append its output to the log file.

    stdout and stderr of the child are merged; every line is stripped and
    written to ``log_path`` as it arrives, framed by a "Beginning"/"Finished"
    pair naming ``label`` and ``iteration``.

    Returns:
        The child's exit code. It is only logged here; a non-zero code does
        not stop the caller.
    """
    with open(log_path, 'a+') as out:
        # Trailing newline keeps the header from fusing with the first line
        # of child output.
        out.write("Beginning {} for CV iteration:{}\n".format(label, iteration))
        out.flush()
        proc = subprocess.Popen(['python3', script],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                universal_newlines=True)
        with proc:
            for output in proc.stdout:
                out.write(output.strip() + '\n')
                out.flush()
        retval = proc.wait()
        out.write("Finished {}. exit code:{}\n".format(label, retval))
        out.flush()
    return retval


def main():
    """Create 10-fold CV splits of DS-1 and build/train/evaluate each fold."""
    sentences = read_sentences('data/celikkaya2013/input.txt')
    num_sentences = len(sentences)
    print("Found %i sentences in DS-1" % num_sentences)

    # Each corpus is read with its own state, so the end-of-file flush of
    # the news data can no longer depend on what the first corpus left over.
    sentences2 = read_sentences('data/news/input.txt')

    # Randomly shuffle sentences
    np.random.shuffle(sentences)
    np.random.shuffle(sentences2)

    # Need numpy array so that we can 'extract' using indices
    sentences = to_object_array(sentences)

    rs = KFold(n_splits=10)
    # Folds are numbered downwards from the number of splits.
    count = rs.get_n_splits()

    # News data: the whole set is written out, with no held-out part.
    write('data/train2.tmp', sentences2)

    # Generate n-fold CV files & build, train, eval
    for train_index, test_index in rs.split(sentences):
        # Find dev index as well...
        train_index, dev_index = split_train_dev(train_index)

        # Extract sentences from indices
        train_sentences = sentences[train_index]
        dev_sentences = sentences[dev_index]
        test_sentences = sentences[test_index]
        print("Splitted dataset into 3 parts.")

        # Write to respective files
        filename_train = 'data/tr.train{}.tmp'.format(count)
        filename_dev = 'data/tr.testa{}.tmp'.format(count)
        filename_test = 'data/tr.testb{}.tmp'.format(count)
        write(filename_train, train_sentences)
        write(filename_dev, dev_sentences)
        write(filename_test, test_sentences)
        print("Created train, dev and test sets of iteration: %i" % count)

        # The per-fold files are copied to fixed names before the stages run.
        copyfile(filename_train, 'data/train.tmp')
        copyfile(filename_dev, 'data/dev.tmp')
        copyfile(filename_test, 'data/test.tmp')

        # Build
        run_stage('build_data.py', 'building', count)
        print("Built model.")

        # Train
        run_stage('train.py', 'training', count)
        print("Trained model.")

        # Evaluate
        run_stage('evaluate.py', 'eval', count)
        print("Evaluated model.")

        count -= 1


if __name__ == '__main__':
    main()