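"""
Utilities for command-line parsing, dataset splitting and minibatching,
(optionally zipped) pickling, label indexing and one-hot encoding,
Theano-based gradient-descent training, and ASCII export of MLP parameters.
"""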
import logging
import math
import sys
from zipfile import ZipFile, ZIP_DEFLATED
from io import BytesIO
import cPickle as pickle
from time import time
from datetime import date

import numpy as np
import theano

log = logging.getLogger(__name__)


def argv(flag, default=None, formatter=None):
    """
    Looks for a program argument flag in sys.argv and
    returns its value. If the flag is not present, the
    default value is returned.

    :param flag: A string flag, for example '-a'.
    :param default: Value that is returned if the flag
        is not present in sys.argv.
    :param formatter: A callable to be applied to the
        value if present. If no formatter is given, the
        value is returned in its original (string) form.
    """
    if flag not in sys.argv:
        return default

    value = sys.argv[sys.argv.index(flag) + 1]
    if formatter is not None:
        value = formatter(value)
    return value
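
# Usage sketch (illustrative only, not part of the original module): running
# `python train.py -eps 0.05` and calling
#   eps = argv('-eps', default=0.1, formatter=float)
# would yield eps == 0.05; without the flag, the default 0.1 is returned.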


def dataset_split(x, validation=0.05, test=0.05, rng=None):
    """
    Splits a dataset into train, validation and test subsets.
    The dataset is split on the zeroth axis.

    :param x: The dataset, of shape (N, ...).
    :param validation: A float in range (0, 1) indicating the
        desired validation set size (N * validation), or an int
        giving the exact size.
    :param test: A float in range (0, 1) indicating the desired
        test set size (N * test), or an int giving the exact size.
    :param rng: A numpy random number generator, or an integer
        seed for one, or None (a freshly seeded generator is created).
    """
    if rng is None:
        rng = np.random.RandomState()
    elif isinstance(rng, int):
        rng = np.random.RandomState(rng)

    # if sizes are given as proportions, convert to actual sizes
    if isinstance(validation, float):
        validation = int(x.shape[0] * validation)
    if isinstance(test, float):
        test = int(x.shape[0] * test)
    train = x.shape[0] - validation - test

    # shuffle data in place before splitting
    rng.shuffle(x)

    log.info("Performing dataset split, sizes are: "
             "train: %d, valid: %d, test: %d", train, validation, test)

    return x[:train], x[train:train + validation], x[train + validation:]
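
# Usage sketch (illustrative only): splitting 1000 samples 90/5/5 with a
# fixed seed for reproducibility:
#   data = np.arange(1000).reshape(1000, 1)
#   train, valid, test = dataset_split(data, 0.05, 0.05, rng=1234)
# Note that the input array is shuffled in place.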


def create_minibatches(x, y, size, shuffle=True):
    """
    Default implementation for batching the
    data, override for finer control.

    Returns batched data in the form of a
    list of (x, y) batches if y is not None.
    Otherwise, if y is None, it returns a list
    of x batches.

    :type x: list
    :param x: A list of sentences. Each sentence
        is a list of indices of vocabulary terms.

    :type y: list
    :param y: A list of sentence labels, when using
        the RAE in a supervised fashion, or None when purely
        unsupervised.

    :type size: int, float
    :param size: Desired size of the minibatches. If
        int, taken as the desired size. If float in (0, 1),
        taken as the desired percentage of x.

    :type shuffle: boolean
    :param shuffle: Whether the trainset should be
        shuffled prior to splitting into batches. If the trainset
        is ordered with regards to classes, shuffling will ensure
        that classes are approximately uniformly represented in
        each minibatch. Note that when y is given, x and y must be
        numpy arrays so they can be permuted together.
    """
    # convert float size to int size
    if isinstance(size, float):
        size = int(math.ceil(len(x) * size))

    # if size is out of range, clamp it to a sensible value
    size = min(size, len(x))
    size = max(1, size)
    log.debug('Creating minibatches, size: %d', size)

    # shuffle trainset
    if shuffle:
        if y is not None:
            assert len(x) == len(y)
            # permute x and y with the same permutation
            p = np.random.permutation(len(x))
            x = x[p]
            y = y[p]
        else:
            np.random.shuffle(x)

    # split x and y into batches
    batches_x = []
    batches_y = []
    while True:
        low_ind = len(batches_x) * size
        high_ind = min(low_ind + size, len(x))
        batches_x.append(x[low_ind:high_ind])
        if y is not None:
            batches_y.append(y[low_ind:high_ind])
        if high_ind >= len(x):
            break
    log.debug('Created %d minibatches', len(batches_x))

    if y is not None:
        return batches_x, batches_y
    else:
        return batches_x
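
# Usage sketch (illustrative only): batching 100 samples into minibatches
# of 10, keeping inputs and labels aligned:
#   x = np.random.uniform(size=(100, 8))
#   y = np.random.randint(0, 2, size=100)
#   batches_x, batches_y = create_minibatches(x, y, 10)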


def try_pickle_load(file_name, zip=None):
    """
    Tries to load pickled data from a file with
    the given name. If unsuccessful, returns None.
    Can decompress Zip archives.

    :param file_name: File path/name.
    :param zip: Whether the file is zipped.
        If None, determined from the file name.
    """
    if zip is None:
        zip = file_name.lower().endswith(".zip")

    try:
        if zip:
            file = ZipFile(file_name, 'r')
            # the pickle is assumed to be the first entry in the archive
            entry = file.namelist()[0]
            data = pickle.load(BytesIO(file.read(entry)))
        else:
            file = open(file_name, "rb")
            data = pickle.load(file)
        log.info('Successfully loaded pickle %s', file_name)
        return data
    except IOError:
        log.info('Failed to load pickle %s', file_name)
        return None
    finally:
        if 'file' in locals():
            file.close()


def try_pickle_dump(data, file_name, zip=None, entry_name="Data.pkl"):
    """
    Pickles given data to the given file name.
    Returns True if successful, False otherwise.

    :param data: The object to pickle.
    :param file_name: Name of the file to pickle to.
    :param zip: Whether the file should be zipped.
        If None, determined from the file name.
    :param entry_name: If zipping, the name to be used
        for the ZIP entry.
    """
    if zip is None:
        zip = file_name.lower().endswith(".zip")

    try:
        log.info('Attempting to pickle data to %s', file_name)
        if zip:
            file = ZipFile(file_name, 'w', ZIP_DEFLATED)
            file.writestr(entry_name, pickle.dumps(data, -1))
        else:
            # bind the handle so the finally clause can close it
            file = open(file_name, "wb")
            pickle.dump(data, file, -1)
        return True
    except IOError:
        log.info('Failed to pickle data to %s', file_name)
        return False
    finally:
        if 'file' in locals():
            file.close()
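
# Usage sketch (illustrative only): a pickle round-trip through a zipped
# file, relying on the ".zip" suffix to trigger compression:
#   try_pickle_dump({'a': 1}, 'cache.zip')
#   data = try_pickle_load('cache.zip')  # -> {'a': 1}, or None on failure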


def unique_with_counts(array):
    """
    Wrapper around numpy.unique(return_counts=True) that does
    the same thing (but less efficiently) on numpy versions prior
    to 1.9.x, when that flag was introduced. If running numpy
    version 1.9.x or higher, the numpy implementation is used.
    """
    # determine (and cache) whether numpy supports return_counts
    if "_NP_RETURN_COUNTS" not in globals():
        np_ver = map(int, np.version.version.split('.'))
        globals()["_NP_RETURN_COUNTS"] = np_ver[0] > 1 or np_ver[1] > 8

    if globals()["_NP_RETURN_COUNTS"]:
        return np.unique(array, return_counts=True)
    else:
        # manual fallback: count occurrences of each unique value
        values = np.unique(array)
        counts = np.zeros(values.size, dtype='uint32')
        for ind, value in enumerate(values):
            counts[ind] = (array == value).sum()
        return (values, counts)
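
# Usage sketch (illustrative only):
#   values, counts = unique_with_counts(np.array([1, 1, 2, 5, 5, 5]))
#   # values -> array([1, 2, 5]), counts -> array([2, 1, 3])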


def labels_to_indices(labels):
    """
    Converts an iterable of labels into
    a numpy vector of label indices (zero-based).
    Returns a tuple (indices, vocabulary) so that
    vocabulary[index] == label.
    """
    vocab = sorted(set(labels))
    indices = np.array([vocab.index(lab) for lab in labels], dtype=np.int)
    return indices, vocab
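
# Usage sketch (illustrative only):
#   indices, vocab = labels_to_indices(['cat', 'dog', 'cat'])
#   # vocab -> ['cat', 'dog'], indices -> array([0, 1, 0])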


def one_hot(indices, count=None):
    """
    Takes a vector of zero-based indices (numpy array) and
    converts it into a matrix of one-hot-encoded
    indices (each index becomes one row).

    For example, if 'indices' is [2, 3], the
    result is:
    [
        [0, 0, 1, 0],
        [0, 0, 0, 1]
    ]

    :param indices: The indices to convert.
    :param count: The number of elements each one-hot-encoded
        vector should have. If None, it is assumed to be
        (indices.max() + 1).
    """
    # ensure indices is a vector
    indices = indices.reshape(indices.size)

    # get the max size
    if count is None:
        count = indices.max() + 1
    else:
        assert indices.max() < count

    encoded = np.zeros((indices.size, count), dtype=np.uint8)
    encoded[range(indices.size), indices] = 1
    return encoded
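
# Usage sketch (illustrative only): combining with labels_to_indices to
# one-hot-encode string labels:
#   indices, vocab = labels_to_indices(['cat', 'dog', 'cat'])
#   encoded = one_hot(indices, count=len(vocab))
#   # encoded -> [[1, 0], [0, 1], [1, 0]]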


def cost_minimization(inputs, cost, params, epochs, eps, x_mnb, y_mnb):
    """
    Generic cost minimization function (gradient descent) for a
    supervised setting where each input has a desired output.

    :type inputs: iterable of Theano symbolic vars, 2 elements.
    :param inputs: Symbolic variables that are inputs to the cost
        function. The iterable needs to consist of two elements: the
        first is a symbolic variable for minibatch inputs (x), and
        the second is a symbolic variable for minibatch outputs (y).

    :type cost: Theano symbolic variable.
    :param cost: The cost function which needs to be minimized.

    :type params: iterable of Theano symbolic vars
    :param params: All the parameters which need to be optimized with
        gradient descent.

    :type epochs: int
    :param epochs: Number of epochs (int) of training.

    :type eps: float
    :param eps: Learning rate.

    :param x_mnb: Trainset split into minibatches. Thus,
        x_mnb is an iterable containing numpy arrays of
        (mnb_N, n_vis) shape, where mnb_N is the number of
        samples in the minibatch.

    :param y_mnb: Trainset label indices split into minibatches. Thus,
        y_mnb is an iterable containing numpy arrays of
        (mnb_N, ) shape, where mnb_N is the number of
        samples in the minibatch.
    """
    # gradients and param updates
    grads = [(p, theano.tensor.grad(cost=cost, wrt=p)) for p in params]
    updates = [(p, p - eps * grad_p) for (p, grad_p) in grads]

    # compiled training function
    train_model = theano.function(
        inputs=inputs,
        outputs=cost,
        updates=updates
    )

    # things we'll track through training, for reporting
    epoch_costs = []
    epoch_times = []

    # iterate through the epochs
    for epoch in range(epochs):
        log.info('Starting epoch %d', epoch)
        epoch_t0 = time()

        # iterate through the minibatches
        batch_costs = []
        for batch_ind, (x_batch, y_batch) in enumerate(zip(x_mnb, y_mnb)):
            batch_costs.append(train_model(x_batch, y_batch))

        epoch_costs.append(np.array(batch_costs).mean())
        epoch_times.append(time() - epoch_t0)
        log.info(
            'Epoch cost %.5f, duration %.2f sec',
            epoch_costs[-1],
            epoch_times[-1]
        )

    log.info('Training duration %.2f min',
             (sum(epoch_times)) / 60.0)

    return epoch_costs, epoch_times
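
# Usage sketch (illustrative only): minimizing a logistic-regression cost,
# with the symbolic x/y pair passed as `inputs`. Here n_vis, n_cls and the
# minibatch lists (e.g. from create_minibatches) are assumed to exist:
#   x = theano.tensor.matrix('x')
#   y = theano.tensor.ivector('y')
#   W = theano.shared(np.zeros((n_vis, n_cls), dtype=theano.config.floatX))
#   b = theano.shared(np.zeros(n_cls, dtype=theano.config.floatX))
#   p_y = theano.tensor.nnet.softmax(theano.tensor.dot(x, W) + b)
#   cost = -theano.tensor.mean(
#       theano.tensor.log(p_y)[theano.tensor.arange(y.shape[0]), y])
#   costs, times = cost_minimization([x, y], cost, [W, b], epochs=10,
#                                    eps=0.1, x_mnb=batches_x, y_mnb=batches_y)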


def write_ndarray(ndarray, file, formatter=None, separators=None):
    """
    Writes a numpy array into a file.

    :param ndarray: The array to write to file.
    :param file: File object in which to write.
    :param formatter: Formatting string to be used on each
        numpy array element. If None (default), '{}' is used.
    :param separators: A list of separator tokens (one per array
        dimension) to be used between array elements.
    """
    shape = ndarray.shape

    # get cumulative sizes of each dimension
    dim_sizes = [
        np.prod(shape[(i + 1):], dtype=int) for i in range(0, len(shape))]

    # prepare the separators
    if separators is None:
        separators = ['\n'] * len(shape)
        separators[-1] = ' '

    # default formatter
    if formatter is None:
        formatter = "{}"

    # write all the array elements, separated by the token of the
    # highest dimension that completes at each element boundary
    for i, n in enumerate(ndarray.reshape(ndarray.size, )):
        if i != 0:
            sep_ind = [i % ds for ds in dim_sizes].index(0)
            file.write(separators[sep_ind])
        file.write(formatter.format(n))
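
# Usage sketch (illustrative only): writing a 2x3 array with 6-decimal
# formatting, rows separated by newlines and elements by spaces:
#   write_ndarray(np.arange(6).reshape(2, 3), sys.stdout, "{:.06f}")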


def store_mlp_ascii(mlp, file_path):
    """
    Stores an MLP into an ASCII file.

    :param mlp: An MLP instance to store.
    :param file_path: File path to store it to.
    """
    log.info("Storing MLP to file: %s", file_path)

    # first info in the ascii file is the layer sizes
    # (the visible-layer size is hardcoded to 32 * 24)
    layer_sizes = [32 * 24]
    for hid_lay in mlp.hidden_layers:
        layer_sizes.append(hid_lay.b.get_value().size)
    layer_sizes.append(mlp.regression_layer.b.get_value().size)

    with open(file_path, "w") as file:

        def ln(string):
            file.write(string + '\n')

        ln("# Multilayer-perceptron, exported from Theano+Python DBN-MLP")
        ln("# Author: Florijan Stamenkovic (florijan.stameknovic@gmail.com)")
        ln("# Date: {}".format(date.today()))
        ln("#")
        ln("# Non-comment lines are organized as follows:")
        ln("# - first come layer sizes (visible -> hidden -> softmax)")
        ln("# - then for each layer (except visible):")
        ln("#   - first the weights to previous layer in N lines where N "
           "is the number of neurons of the previous layer")
        ln("#   - then biases for that layer (in a single line)")
        ln("# Enjoy!!!")

        file.write(" ".join([str(ls) for ls in layer_sizes]))
        for hl in mlp.hidden_layers:
            file.write('\n')
            write_ndarray(hl.W.get_value(), file, "{:.06f}")
            file.write('\n')
            write_ndarray(hl.b.get_value(), file, "{:.06f}")
        file.write('\n')
        write_ndarray(mlp.regression_layer.W.get_value(), file, "{:.06f}")
        file.write('\n')
        write_ndarray(mlp.regression_layer.b.get_value(), file, "{:.06f}")
        file.write('\n')
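
# Usage sketch (illustrative only; assumes `mlp` is a trained MLP instance
# with `hidden_layers` and `regression_layer` attributes, as used above):
#   store_mlp_ascii(mlp, 'mlp_params.txt')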