-
Notifications
You must be signed in to change notification settings - Fork 43
/
Copy pathutils.py
161 lines (133 loc) · 4.9 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import time
import datetime
import numpy as np
import scipy
import tensorflow as tf
from sklearn import preprocessing as prep
class timer(object):
    """Wall-clock timer for coarse timing of code sections.

    Not intended for micro-benchmarking (uses time.time()).

    Usage:
        $ timer = utils.timer('name').tic()
        $ timer.toc('process A').tic()
    """

    def __init__(self, name='default'):
        """
        :param name: label printed with every toc() message
        """
        self._start_time = None
        self._name = name
        self.tic()

    def tic(self):
        """Start (or restart) the timer; returns self for chaining."""
        self._start_time = time.time()
        return self

    def toc(self, message=None):
        """Print elapsed time since the last tic().

        :param message: optional label for this measurement (defaults to '');
            made optional here since the body already handles None
        :return: self, so calls can be chained, e.g. timer.toc('A').tic()
        """
        # Will raise TypeError if reset() was called without a following tic()
        elapsed = time.time() - self._start_time
        message = '' if message is None else message
        print('[{0:s}] {1:s} elapsed [{2:s}]'.format(self._name, message, timer._format(elapsed)))
        return self

    def reset(self):
        """Clear the start time; tic() must be called before the next toc()."""
        self._start_time = None
        return self

    @staticmethod
    def _format(s):
        """Format a duration in seconds, e.g. '1 days 2 hr 3 min 4 s'.

        Fix: the original concatenated the unit strings with no separator,
        producing output like '1 days2 hr3 min4 s'; parts are now joined
        with single spaces.
        """
        delta = datetime.timedelta(seconds=s)
        # Adding the delta to datetime(1,1,1) decomposes the duration into
        # day/hour/minute/second fields (day is 1-based, hence the -1).
        d = datetime.datetime(1, 1, 1) + delta
        parts = []
        if (d.day - 1) > 0:
            parts.append('{:d} days'.format(d.day - 1))
        if d.hour > 0:
            parts.append('{:d} hr'.format(d.hour))
        if d.minute > 0:
            parts.append('{:d} min'.format(d.minute))
        parts.append('{:d} s'.format(d.second))
        return ' '.join(parts)
def batch(iterable, _n=1, drop=True):
    """
    Yield consecutive batches of size _n from iterable.

    Fix: the original test `ndx + _n < it_len` (strict inequality) dropped
    the final batch even when it was exactly full -- e.g. len 10, _n 5
    yielded only one batch. `<=` keeps every complete batch; `drop` now
    only affects a genuinely partial trailing batch, as documented.

    :param iterable: sequence supporting len() and slicing
    :param _n: batch size
    :param drop: if true, drop extra if batch size does not divide evenly,
        otherwise keep them (last batch might be shorter)
    :return: generator over slices of iterable
    """
    it_len = len(iterable)
    for ndx in range(0, it_len, _n):
        if ndx + _n <= it_len:
            yield iterable[ndx:ndx + _n]
        elif drop is False:
            yield iterable[ndx:it_len]
def tfidf(x):
"""
compute tfidf of numpy array x
:param x: input array, document by terms
:return:
"""
x_idf = np.log(x.shape[0] - 1) - np.log(1 + np.asarray(np.sum(x > 0, axis=0)).ravel())
x_idf = np.asarray(x_idf)
x_idf_diag = scipy.sparse.lil_matrix((len(x_idf), len(x_idf)))
x_idf_diag.setdiag(x_idf)
x_tf = x.tocsr()
x_tf.data = np.log(x_tf.data + 1)
x_tfidf = x_tf * x_idf_diag
return x_tfidf
def prep_standardize(x):
    """
    takes sparse input and compute standardized version

    The scaler is fitted on the rows that contain at least one nonzero
    value; zero rows are left untouched. Scaled values are capped at
    +/-5 std and entries below 1e-5 in magnitude are snapped to 0.

    :param x: 2D scipy sparse data array to standardize (column-wise), must support row indexing
    :return: the object to perform scale (stores mean/std) for inference, as well as the scaled x
    """
    # rows with at least one nonzero entry; only these are fitted/transformed
    mask = x.any(axis=1)
    scaler = prep.StandardScaler().fit(x[mask, :])
    out = np.copy(x)
    out[mask, :] = scaler.transform(out[mask, :])
    # cap at 5 std in either direction, then zero out negligible values
    np.clip(out, -5, 5, out=out)
    out[np.abs(out) < 1e-5] = 0
    return scaler, out
def prep_standardize_dense(x):
    """
    takes dense input and compute standardized version

    Scaled values are capped at +/-5 std and entries below 1e-5 in
    magnitude are snapped to 0.

    :param x: 2D numpy data array to standardize (column-wise)
    :return: the object to perform scale (stores mean/std) for inference, as well as the scaled x
    """
    scaler = prep.StandardScaler().fit(x)
    out = scaler.transform(x)
    # cap at 5 std in either direction, then zero out negligible values
    np.clip(out, -5, 5, out=out)
    out[np.abs(out) < 1e-5] = 0
    return scaler, out
def batch_eval_recall(_sess, tf_eval, eval_feed_dict, recall_k, eval_data):
    """
    given EvalData and DropoutNet compute graph in TensorFlow, runs batch evaluation

    :param _sess: tf session
    :param tf_eval: the evaluate output symbol in tf
    :param eval_feed_dict: method to parse tf, pick from EvalData method
    :param recall_k: list of thresholds to compute recall at (information retrieval recall)
    :param eval_data: EvalData instance
    :return: recall array at thresholds matching recall_k
    """
    # Run the eval op batch by batch and concatenate the per-batch prediction
    # arrays into one matrix (one row per evaluated user).
    # NOTE(review): assumes eval_data.eval_batch yields (start, stop) index
    # pairs -- confirm against the EvalData implementation.
    tf_eval_preds_batch = []
    for (batch, (eval_start, eval_stop)) in enumerate(eval_data.eval_batch):
        tf_eval_preds = _sess.run(tf_eval,
                                  feed_dict=eval_feed_dict(
                                      batch, eval_start, eval_stop, eval_data))
        tf_eval_preds_batch.append(tf_eval_preds)
    tf_eval_preds = np.concatenate(tf_eval_preds_batch)
    # NOTE(review): TF1-style reset of local variables; requires a default
    # session to be active. It is unclear from this file which local
    # variables this serves -- confirm it is still needed.
    tf.local_variables_initializer().run()
    # filter non-zero targets: keep only rows (users) that have at least one
    # test item in the R_test_inf lil_matrix
    y_nz = [len(x) > 0 for x in eval_data.R_test_inf.rows]
    y_nz = np.arange(len(eval_data.R_test_inf.rows))[y_nz]
    preds_all = tf_eval_preds[y_nz, :]
    recall = []
    for at_k in recall_k:
        # First at_k predicted columns per row. NOTE(review): this treats the
        # tf_eval output as item indices already ranked by score -- TODO
        # confirm against the evaluation graph.
        preds_k = preds_all[:, :at_k]
        y = eval_data.R_test_inf[y_nz, :]
        # Build a sparse 0/1 indicator of the top-k predictions by writing
        # directly into lil_matrix internals: row i gets ones at the columns
        # listed in preds_k[i].
        x = scipy.sparse.lil_matrix(y.shape)
        x.rows = preds_k
        x.data = np.ones_like(preds_k)
        # z keeps only the ground-truth entries that appear in the top-k
        z = y.multiply(x)
        # recall@k averaged over users: hits / number of true items per user
        recall.append(np.mean(np.divide((np.sum(z, 1)), np.sum(y, 1))))
    return recall