This repository has been archived by the owner on Oct 10, 2018. It is now read-only.
forked from guillaumegenthial/sequence_tagging
-
Notifications
You must be signed in to change notification settings - Fork 7
/
cross-validation.py
executable file
·125 lines (106 loc) · 4 KB
/
cross-validation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import codecs
import numpy as np
from sklearn.model_selection import KFold
from shutil import copyfile
import subprocess
# Accumulators for the input parser further down:
#   sentences - every completed sentence read so far
#   sentence  - the token rows of the sentence currently being read
sentences, sentence = [], []
def write(path, sts):
    """Write sentences to *path* as UTF-8, one token per line.

    Each token is emitted as its first two fields joined by a single space.
    Every sentence, including the last one, is followed by one blank line.

    Args:
        path: Destination file path; an existing file is overwritten.
        sts: Iterable of sentences, each an iterable of token rows whose
            first two items are strings.
    """
    # `with` guarantees the handle is closed even if a write fails midway.
    with open(path, 'w', encoding='utf-8') as f:
        for s in sts:
            for w in s:
                f.write(w[0] + ' ' + w[1] + '\n')
            # Blank line terminates the sentence.
            f.write('\n')
# Parse the input file: each non-blank line is one token row made of
# whitespace-separated fields (at least two are required); a blank line ends
# the current sentence.
# `with` closes the input handle once parsing is done or fails.
with codecs.open('data/celikkaya2013/input.txt', 'r', 'utf8') as infile:
    for line_no, line in enumerate(infile, 1):
        line = line.rstrip()
        if not line:
            # Sentence boundary; consecutive blank lines add nothing.
            if sentence:
                sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            # Explicit check instead of `assert`, which is stripped under -O.
            if len(word) < 2:
                raise ValueError(
                    "line %i: expected at least 2 fields, got %r"
                    % (line_no, line))
            sentence.append(word)
# The file may not end with a blank line; keep the trailing sentence.
if sentence:
    sentences.append(sentence)
num_sentences = len(sentences)
print("Found %i sentences" % num_sentences)
# Randomly shuffle sentences (in place, while `sentences` is still a list).
# NOTE(review): the RNG is not seeded here, so folds differ between runs.
np.random.shuffle(sentences)
# Need numpy array so that we can 'extract' using indices.
# Sentences have different lengths, so build a 1-D object array explicitly:
# a plain np.array() on ragged nested lists raises ValueError on
# NumPy >= 1.24 and would build an N-D array if all lengths happened to match.
shuffled = np.empty(len(sentences), dtype=object)
for position, item in enumerate(sentences):
    shuffled[position] = item
sentences = shuffled
rs = KFold(n_splits=10)
# Used as the fold label; the CV loop counts it down from n_splits.
count = rs.get_n_splits()
def _run_stage(script, stage, iteration, log_path='results/output.log'):
    """Run ``python3 <script>`` and append its output to the log file.

    The child's stdout and stderr are merged; each output line is stripped
    of surrounding whitespace and flushed to the log as soon as it arrives.

    Args:
        script: Script file name passed to ``python3``.
        stage: Stage label used in the log messages (e.g. 'building').
        iteration: CV iteration number used in the log messages.
        log_path: Log file to append to.

    Returns:
        The child's exit code. It is logged; a non-zero code is not raised.
    """
    with open(log_path, 'a+') as out:
        # Trailing newline keeps the first child line off the header line.
        out.write("Beginning {} for CV iteration:{}\n".format(stage, iteration))
        # Argument list (no shell); leaving the `with` closes the pipe and
        # waits for the child to exit.
        with subprocess.Popen(['python3', script],
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT,
                              universal_newlines=True) as proc:
            # Iterating the pipe blocks until EOF, so there is no busy
            # polling if the stream ends before the process exits.
            for output in proc.stdout:
                out.write(output.strip() + '\n')
                out.flush()
        retval = proc.returncode
        out.write("Finished {}. exit code:{}\n".format(stage, retval))
        out.flush()
    return retval


# Generate n-fold CV files & build, train, eval
for train_index, test_index in rs.split(sentences):
    # Find dev index as well: the last tenth of the training indices.
    numb_dev = len(train_index) // 10
    # Slice from an explicit split point: with numb_dev == 0 the negative
    # slices [-0:] / [:-0] would put everything in dev and leave train empty.
    split_at = len(train_index) - numb_dev
    dev_index = train_index[split_at:]
    train_index = train_index[:split_at]

    # Extract sentences from indices
    train_sentences = sentences[train_index]
    dev_sentences = sentences[dev_index]
    test_sentences = sentences[test_index]
    print("Splitted dataset into 3 parts.")

    # Write to respective files
    filename_train = 'data/tr.train{}.tmp'.format(count)
    filename_dev = 'data/tr.testa{}.tmp'.format(count)
    filename_test = 'data/tr.testb{}.tmp'.format(count)
    write(filename_train, train_sentences)
    write(filename_dev, dev_sentences)
    write(filename_test, test_sentences)
    print("Created train, dev and test sets of iteration: %i" % count)

    # Copy this fold's files to the fixed names; presumably these are what
    # the build/train/evaluate scripts read - TODO confirm.
    copyfile(filename_train, 'data/train.tmp')
    copyfile(filename_dev, 'data/dev.tmp')
    copyfile(filename_test, 'data/test.tmp')

    # Build, train, evaluate - in that order.
    # NOTE(review): exit codes are only logged; a failed stage does not stop
    # the following stages.
    for script, stage, done_message in (
            ('build_data.py', 'building', "Built model."),
            ('train.py', 'training', "Trained model."),
            ('evaluate.py', 'eval', "Evaluated model."),
    ):
        _run_stage(script, stage, count)
        print(done_message)

    # Folds are labelled n_splits down to 1.
    count -= 1