forked from google/active-learning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimulate_batch.py
261 lines (230 loc) · 10 KB
/
simulate_batch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Select a new batch based on results of simulated trajectories."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import math
import numpy as np
from sampling_methods.wrapper_sampler_def import AL_MAPPING
from sampling_methods.wrapper_sampler_def import WrapperSamplingMethod
class SimulateBatchSampler(WrapperSamplingMethod):
"""Creates batch based on trajectories simulated using smaller batch sizes.
Current support use case: simulate smaller batches than the batch size
actually indicated to emulate which points would be selected in a
smaller batch setting. This method can do better than just selecting
a batch straight out if smaller batches perform better and the simulations
are informative enough and are not hurt too much by labeling noise.
"""
def __init__(self,
X,
y,
seed,
samplers=[{'methods': ('margin', 'uniform'),'weight': (1, 0)}],
n_sims=10,
train_per_sim=10,
return_type='best_sim'):
""" Initialize sampler with options.
Args:
X: training data
y: labels may be used by base sampling methods
seed: seed for np.random
samplers: list of dicts with two fields
'samplers': list of named samplers
'weights': percentage of batch to allocate to each sampler
n_sims: number of total trajectories to simulate
train_per_sim: number of minibatches to split the batch into
return_type: two return types supported right now
best_sim: return points selected by the best trajectory
frequency: returns points selected the most over all trajectories
"""
self.name = 'simulate_batch'
self.X = X
self.y = y
self.seed = seed
self.n_sims = n_sims
self.train_per_sim = train_per_sim
self.return_type = return_type
self.samplers_list = samplers
self.initialize_samplers(self.samplers_list)
self.trace = []
self.selected = []
np.random.seed(seed)
def simulate_batch(self, sampler, N, already_selected, y, model, X_test,
y_test, **kwargs):
"""Simulates smaller batches by using hallucinated y to select next batch.
Assumes that select_batch is only dependent on already_selected and not on
any other states internal to the sampler. i.e. this would not work with
BanditDiscreteSampler but will work with margin, hierarchical, and uniform.
Args:
sampler: dict with two fields
'samplers': list of named samplers
'weights': percentage of batch to allocate to each sampler
N: batch size
already_selected: indices already labeled
y: y to use for training
model: model to use for margin calc
X_test: validaiton data
y_test: validation labels
Returns:
- mean accuracy
- indices selected by best hallucinated trajectory
- best accuracy achieved by one of the trajectories
"""
minibatch = max(int(math.ceil(N / self.train_per_sim)), 1)
results = []
best_acc = 0
best_inds = []
self.selected = []
n_minibatch = int(N/minibatch) + (N % minibatch > 0)
for _ in range(self.n_sims):
inds = []
hallucinated_y = []
# Copy these objects to make sure they are not modified while simulating
# trajectories as they are used later by the main run_experiment script.
kwargs['already_selected'] = copy.copy(already_selected)
kwargs['y'] = copy.copy(y)
# Assumes that model has already by fit using all labeled data so
# the probabilities can be used immediately to hallucinate labels
kwargs['model'] = copy.deepcopy(model)
for _ in range(n_minibatch):
batch_size = min(minibatch, N-len(inds))
if batch_size > 0:
kwargs['N'] = batch_size
new_inds = sampler.select_batch(**kwargs)
inds.extend(new_inds)
# All models need to have predict_proba method
probs = kwargs['model'].predict_proba(self.X[new_inds])
# Hallucinate labels for selected datapoints to be label
# using class probabilities from model
try:
classes = kwargs['model'].best_estimator_.classes_
except:
classes = kwargs['model'].classes_
new_y = ([
np.random.choice(classes, p=probs[i, :])
for i in range(batch_size)
])
hallucinated_y.extend(new_y)
# Not saving already_selected here, if saving then should sort
# only for the input to fit but preserve ordering of indices in
# already_selected
kwargs['already_selected'] = sorted(kwargs['already_selected']
+ new_inds)
kwargs['y'][new_inds] = new_y
kwargs['model'].fit(self.X[kwargs['already_selected']],
kwargs['y'][kwargs['already_selected']])
acc_hallucinated = kwargs['model'].score(X_test, y_test)
if acc_hallucinated > best_acc:
best_acc = acc_hallucinated
best_inds = inds
kwargs['model'].fit(self.X[kwargs['already_selected']],
y[kwargs['already_selected']])
# Useful to know how accuracy compares for model trained on hallucinated
# labels vs trained on true labels. But can remove this train to speed
# up simulations. Won't speed up significantly since many more models
# are being trained inside the loop above.
acc_true = kwargs['model'].score(X_test, y_test)
results.append([acc_hallucinated, acc_true])
print('Hallucinated acc: %.3f, Actual Acc: %.3f' % (acc_hallucinated,
acc_true))
# Save trajectory for reference
t = {}
t['arm'] = sampler
t['data_size'] = len(kwargs['already_selected'])
t['inds'] = inds
t['y_hal'] = hallucinated_y
t['acc_hal'] = acc_hallucinated
t['acc_true'] = acc_true
self.trace.append(t)
self.selected.extend(inds)
# Delete created copies
del kwargs['model']
del kwargs['already_selected']
results = np.array(results)
return np.mean(results, axis=0), best_inds, best_acc
def sampler_select_batch(self, sampler, N, already_selected, y, model, X_test, y_test, **kwargs):
"""Calculate the performance of the model if the batch had been selected using the base method without simulation.
Args:
sampler: dict with two fields
'samplers': list of named samplers
'weights': percentage of batch to allocate to each sampler
N: batch size
already_selected: indices already selected
y: labels to use for training
model: model to use for training
X_test, y_test: validation set
Returns:
- indices selected by base method
- validation accuracy of model trained on new batch
"""
m = copy.deepcopy(model)
kwargs['y'] = y
kwargs['model'] = m
kwargs['already_selected'] = copy.copy(already_selected)
inds = []
kwargs['N'] = N
inds.extend(sampler.select_batch(**kwargs))
kwargs['already_selected'] = sorted(kwargs['already_selected'] + inds)
m.fit(self.X[kwargs['already_selected']], y[kwargs['already_selected']])
acc = m.score(X_test, y_test)
del m
del kwargs['already_selected']
return inds, acc
def select_batch_(self, N, already_selected, y, model,
X_test, y_test, **kwargs):
""" Returns a batch of size N selected by using the best sampler in simulation
Args:
samplers: list of sampling methods represented by dict with two fields
'samplers': list of named samplers
'weights': percentage of batch to allocate to each sampler
N: batch size
already_selected: indices of datapoints already labeled
y: actual labels, used to compare simulation with actual
model: training model to use to evaluate different samplers. Model must
have a predict_proba method with same signature as that in sklearn
n_sims: the number of simulations to perform for each sampler
minibatch: batch size to use for simulation
"""
results = []
# THE INPUTS CANNOT BE MODIFIED SO WE MAKE COPIES FOR THE CHECK LATER
# Should check model but kernel_svm does not have coef_ so need better
# handling here
copy_selected = copy.copy(already_selected)
copy_y = copy.copy(y)
for s in self.samplers:
sim_results, sim_inds, sim_acc = self.simulate_batch(
s, N, already_selected, y, model, X_test, y_test, **kwargs)
real_inds, acc = self.sampler_select_batch(
s, N, already_selected, y, model, X_test, y_test, **kwargs)
print('Best simulated acc: %.3f, Actual acc: %.3f' % (sim_acc, acc))
results.append([sim_results, sim_inds, real_inds, acc])
best_s = np.argmax([r[0][0] for r in results])
# Make sure that model object fed in did not change during simulations
assert all(y == copy_y)
assert all([copy_selected[i] == already_selected[i]
for i in range(len(already_selected))])
# Return indices based on return type specified
if self.return_type == 'best_sim':
return results[best_s][1]
elif self.return_type == 'frequency':
unique, counts = np.unique(self.selected, return_counts=True)
argcount = np.argsort(-counts)
return list(unique[argcount[0:N]])
return results[best_s][2]
def to_dict(self):
output = {}
output['simulated_trajectories'] = self.trace
return output