oei.py
from .bo import BO
import numpy as np
import scipy.linalg as la
import logging
from .sdp import sdp, solution_derivative
import tensorflow as tf
from gpflow.param import AutoFlow
from gpflow._settings import settings
float_type = settings.dtypes.float_type


class OEI(BO):
    '''
    This class implements the Optimistic Expected Improvement acquisition function.
    '''

    def __init__(self, options):
        super(OEI, self).__init__(options)

    def acquisition(self, x):
        '''
        The acquisition function and its gradient
        Input: x: flattened ndarray [batch_size x self.dim] containing the evaluation points
        Output[0]: Value of the acquisition function
        Output[1]: Gradient of the acquisition function
        '''
        # Calculate the minimum value so far
        fmin = np.min(self.predict_f(self.X.value)[0])

        X = x.reshape((-1, self.dim))
        X_, V = self.project(X)  # See comments in self.project
        if len(X_) > 0:
            # Solve the SDP
            value, M_, _, _ = sdp(self.omega(X_), fmin,
                                  warm_start=(len(X) == len(X_)))
            _, gradient_ = self.acquisition_tf(X_, M_)
            gradient = V.T.dot(gradient_)
        else:
            # All evaluation points duplicate dataset points; return a random
            # search direction (see self.project)
            value = 0
            gradient = np.random.rand(len(x))

        return np.array([value]), gradient.flatten()

    def acquisition_hessian(self, x):
        '''
        The Hessian of the acquisition function
        Input: x: flattened ndarray [batch_size x self.dim] containing the evaluation points
        Output: Hessian of the acquisition function:
                [(batch_size * self.dim) x (batch_size * self.dim)] ndarray
        '''
        # Calculate the minimum value so far
        fmin = np.min(self.predict_f(self.X.value)[0])

        X = x.reshape((-1, self.dim))
        # See comments on self.project
        X_, _ = self.project(X)
        if len(X) != len(X_):
            # Duplicate points were removed; return a zero Hessian (see self.project)
            return np.zeros((x.shape[0], x.shape[0]))

        # Solve the SDP
        _, M, Y, C = sdp(self.omega(X), fmin)
        # Calculate domega/dx and dM/dx
        domega = self.domega(X.flatten())
        dM = solution_derivative(M, Y, C, domega)
        # Perform the chain rule in TensorFlow
        return self.acquisition_hessian_tf(X.flatten(), M, dM, domega)

    @AutoFlow((float_type, [None, None]), (float_type, [None, None]))
    def acquisition_tf(self, X, M):
        '''
        Calculates the acquisition function, given M, the optimizer of the SDP.
        The calculation is simply a matrix inner product.
        Input: X: ndarray [batch_size x self.dim] containing the evaluation points
               M: [(batch_size + 1) x (batch_size + 1)] ndarray containing the solution of the SDP
        Output[0]: Value of the acquisition function
        Output[1]: Gradient of the acquisition function
        '''
        f = tf.tensordot(self.omega_tf(X), M, axes=2)
        df = tf.gradients(f, X)[0]
        return f, df

    def omega_tf(self, X):
        '''
        Calculates the second order moment matrix in TensorFlow.
        Input: X: ndarray [batch_size x self.dim] containing the evaluation points
        Output: [Sigma(X) + mu(X)*mu(X).T  mu(X); mu(X).T  1]
                where mu(X) and Sigma(X) are the mean and variance of the GP posterior at X.
        '''
        mean, var = self.build_predict(X, full_cov=True)
        var = var[:, :, 0] + \
            tf.eye(tf.shape(var)[0], dtype=float_type) * self.likelihood.variance

        # Create omega
        omega = var + tf.matmul(mean, mean, transpose_b=True)
        omega = tf.concat([omega, mean], axis=1)
        omega = tf.concat([omega,
                           tf.concat([tf.transpose(mean), [[1]]], axis=1)],
                          axis=0)
        return omega
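
    # In block form, omega_tf returns the (q+1) x (q+1) second order moment matrix
    #     Omega(X) = [ Sigma(X) + mu(X) mu(X)^T    mu(X) ]
    #                [ mu(X)^T                         1 ]
    # (q = batch_size; the likelihood variance is added to the diagonal of Sigma),
    # which acquisition_tf pairs with the SDP optimizer M through a matrix inner product.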

    @AutoFlow((float_type, [None, None]))
    def omega(self, X):
        '''
        Just an AutoFlow wrapper for omega_tf
        '''
        return self.omega_tf(X)

    @AutoFlow((float_type, [None]), (float_type, [None, None]),
              (float_type, [None, None, None]), (float_type, [None, None, None]))
    def acquisition_hessian_tf(self, x, M, dM, domega):
        '''
        Input: x: flattened ndarray [batch_size x self.dim] containing the evaluation points
               M: [(batch_size + 1) x (batch_size + 1)] ndarray
                  containing the solution of the SDP
               dM: [(batch_size + 1) x (batch_size + 1) x (batch_size * self.dim)] ndarray
                   containing the 'jacobian' of M w.r.t. x
               domega: [(batch_size + 1) x (batch_size + 1) x (batch_size * self.dim)] ndarray
                       containing the 'jacobian' of the second order moment Omega with respect to x
        Output: Hessian of the acquisition function:
                [(batch_size * self.dim) x (batch_size * self.dim)] ndarray
        '''
        X = tf.reshape(x, [self.options['batch_size'], -1])
        f = tf.tensordot(self.omega_tf(X), M, axes=2)
        d2f_a = tf.hessians(f, x)[0]
        d2f_b = tf.tensordot(tf.transpose(dM), domega, axes=2)
        return d2f_a + d2f_b
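
    # Note on acquisition_hessian_tf: writing the acquisition value as
    # f(x) = <Omega(x), M(x)>, its Hessian is assembled from the term with M held
    # fixed (d2f_a, via tf.hessians of <Omega(x), M>) plus the cross term that
    # accounts for the dependence of the SDP optimizer M on x (d2f_b, contracting
    # dM/dx with dOmega/dx).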

    @AutoFlow((float_type, [None]))
    def domega(self, x):
        '''
        Input: x: flattened ndarray [batch_size x self.dim] containing the evaluation points
        Output: domega: [(batch_size + 1) x (batch_size + 1) x (batch_size * self.dim)] ndarray
                containing the 'jacobian' of the second order moment Omega with respect to x
        '''
        X = tf.reshape(x, [self.options['batch_size'], -1])
        omega = self.omega_tf(X)
        domega = self.jacobian(tf.reshape(omega, [-1]), x)
        return tf.reshape(domega, [omega.shape[0], omega.shape[1], -1])

    def jacobian(self, y_flat, x):
        '''
        Calculates the jacobian of y_flat with respect to x.
        Code taken from jeisses' comment on
        https://github.com/tensorflow/tensorflow/issues/675
        '''
        n = y_flat.shape[0]

        loop_vars = [
            tf.constant(0, tf.int32),
            tf.TensorArray(float_type, size=n),
        ]

        _, jacobian = tf.while_loop(
            lambda j, _: j < n,
            lambda j, result: (j + 1, result.write(j, tf.gradients(y_flat[j], x))),
            loop_vars)

        return jacobian.stack()
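
    # Each loop iteration writes tf.gradients(y_flat[j], x) into the TensorArray;
    # stack() then returns these per-element gradients row by row, which domega
    # reshapes into the [(batch_size + 1) x (batch_size + 1) x (batch_size * self.dim)]
    # 'jacobian' tensor described above.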

    def project(self, X):
        '''
        According to Proposition 8, OEI and QEI are not differentiable when the kernel is
        noiseless and there are duplicates in X. In these cases calculating M, the SDP's
        optimizer, becomes increasingly hard, as the SDP problem becomes ill conditioned.
        We get around this problem by solving a smaller (projected) SDP problem and
        providing a descent direction that causes the duplicates to separate and, if
        requested, a Hessian equal to zero.
        Input: X: ndarray [batch_size x self.dim] containing the evaluation points
        Output:
            If self.likelihood.variance > 1e-4:
                Returns X (same as the input) and V, an identity matrix
            If the kernel is noiseless:
                X_u: ndarray [q x self.dim] containing the unique evaluation points
                     (it also removes duplicates of the dataset)
                V: ndarray [q x batch_size] projection matrix that, given the solution M of
                   the smaller SDP problem, can be used to calculate an appropriate descent
                   direction
        '''
        if self.likelihood.variance.value > 1e-4:
            return X, np.eye(X.shape[0])

        l = self.kern.lengthscales.value
        # Find duplicates of the dataset
        # See https://stackoverflow.com/questions/11903083/find-the-set-difference-between-two-large-arrays-matrices-in-python
        idx = np.where(np.all(np.abs((X/l)[:, None, :] - self.X.value/l) < 1e-2,
                              axis=2))[0]
        idx_ = np.setdiff1d(np.arange(X.shape[0]), idx)
        # Remove duplicates of the dataset
        X_u = X[idx_]
        V = np.eye(X.shape[0])[idx_]
        random_direction = np.random.rand(V.shape[0], len(idx))
        V[:, idx] = random_direction/np.linalg.norm(random_direction, axis=0, keepdims=True)

        if X_u.shape[0] > 0:
            # Remove duplicates within X itself
            _, idx = np.unique(np.round(X_u / l, decimals=2), return_index=True, axis=0)
            X_u = X_u[idx]
            V = V[idx]

        # If no points were removed, return the original X to preserve the order of its rows
        if len(X_u) == len(X):
            X_u = X
            V = np.eye(X.shape[0])

        return X_u, V
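

# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the class above). It mirrors, with plain
# NumPy, the block structure that omega_tf builds and the matrix inner product
# that acquisition_tf evaluates. The values of `mu`, `Sigma` and the
# placeholder matrix `M` below are made up for illustration only; in the class,
# M comes from the sdp() solver.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    mu = np.array([[0.3], [-0.1]])                      # toy posterior mean, q x 1
    Sigma = np.array([[1.0, 0.2], [0.2, 0.5]])          # toy posterior covariance, q x q
    omega_top = np.hstack([Sigma + mu.dot(mu.T), mu])   # [Sigma + mu mu^T, mu]
    omega = np.vstack([omega_top,
                       np.hstack([mu.T, [[1.0]]])])     # append the row [mu^T, 1]
    # The acquisition value is the inner product <Omega, M>; the identity
    # stands in for the SDP optimizer here.
    M = np.eye(3)
    print('Omega =\n', omega)
    print('<Omega, M> =', np.tensordot(omega, M, axes=2))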