import collections
import copy
import json
import os
import sys
import traceback
import types
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from experiment_logger import get_logger
from help_get_net_components import check_params, nested_getattr
def get_constant_schedule_with_warmup(optimizer: optim.Optimizer, num_warmup_steps: int, last_epoch: int = -1):
"""
Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate
increases linearly between 0 and the initial lr set in the optimizer.
Args:
optimizer (:class:`~torch.optim.Optimizer`):
The optimizer for which to schedule the learning rate.
num_warmup_steps (:obj:`int`):
The number of steps for the warmup phase.
last_epoch (:obj:`int`, `optional`, defaults to -1):
The index of the last epoch when resuming training.
Return:
:obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
"""
def lr_lambda(current_step: int):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1.0, num_warmup_steps))
return 1.0
return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
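# Illustrative sketch of how the warmup schedule above is typically used; the
# model and step counts below are placeholders, not part of this module.
#
#   model = nn.Linear(16, 16)
#   opt = optim.Adam(model.parameters(), lr=1e-3)
#   sched = get_constant_schedule_with_warmup(opt, num_warmup_steps=100)
#   for _ in range(200):
#       opt.step()
#       sched.step()  # lr ramps linearly for the first 100 steps, then stays constant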
def default_getattr(o, k, default=None):
if not hasattr(o, k):
return default
return getattr(o, k)
def nested_getattr(o, k):
k_lst = k.split('.')
for i in range(len(k_lst)):
o = getattr(o, k_lst[i])
return o
def nested_hasattr(o, k):
try:
_ = nested_getattr(o, k)
return True
    except AttributeError:
return False
def nested_setattr(o, k, v):
k_lst = k.split('.')
if len(k_lst) > 1:
new_k = '.'.join(k_lst[:-1])
o = nested_getattr(o, new_k)
setattr(o, k_lst[-1], v)
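# Illustrative usage of the attribute helpers above; the objects below are
# throwaway placeholders.
#
#   class _Leaf: pass
#   class _Root: pass
#   root = _Root(); root.leaf = _Leaf(); root.leaf.value = 3
#   nested_getattr(root, 'leaf.value')          # -> 3
#   nested_hasattr(root, 'leaf.missing')        # -> False
#   nested_setattr(root, 'leaf.value', 7)       # root.leaf.value is now 7
#   default_getattr(root, 'absent', default=0)  # -> 0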
class Net(nn.Module):
    def __init__(self, embed=None, diora=None, loss_funcs=(), size=None):
super(Net, self).__init__()
self.embed = embed
self.diora = diora
self.size = size
self.loss_func_names = []
self.add_loss_funcs(loss_funcs)
@property
def device(self):
return next(self.parameters()).device
def add_loss_funcs(self, loss_funcs):
for m in loss_funcs:
# Assign.
setattr(self, m.name, m)
self.loss_func_names.append(m.name)
def compute_loss(self, batch, diora, info, embed, name_filter=None):
device = self.device
info['net'] = self
ret, loss = {}, []
func_name_lst = self.loss_func_names
if name_filter is not None:
func_name_lst = [x for x in func_name_lst if x in name_filter]
# Loss
for func_name in func_name_lst:
func = getattr(self, func_name)
if default_getattr(func, 'skip', False):
continue
subloss, desc = func(batch, diora, info, embed)
loss.append(subloss.view(1, 1))
for k, v in desc.items():
ret[k] = v
if len(loss) > 0:
loss = torch.cat(loss, 1)
else:
loss = torch.full((1, 1), 1, dtype=torch.float, device=device)
return ret, loss
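    # Sketch of the loss-function interface compute_loss expects. The module
    # below is hypothetical (not defined in this repository): each loss func
    # needs a `name` attribute, may set `skip`, and returns (loss, desc_dict).
    #
    #   class ExampleLoss(nn.Module):
    #       name = 'example_loss'
    #       def forward(self, batch, diora, info, embed):
    #           loss = embed.sum() * 0.0  # placeholder scalar loss
    #           return loss, {'example_loss': loss.detach()}
    #
    #   net = Net(embed=embed_layer, diora=diora, loss_funcs=[ExampleLoss()], size=400)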
    def forward(self, batch, compute_loss=True, info=None):
        # Avoid a shared mutable default; callers may still pass their own dict.
        if info is None:
            info = {}
        device = self.device
# Embed
embed = self.embed(batch, info)
# Run DIORA
diora = self.diora
chart = diora(embed, info=info)
# Compute Loss
if isinstance(compute_loss, (tuple, list, str)):
if isinstance(compute_loss, str):
compute_loss = [compute_loss]
ret, loss = self.compute_loss(batch, diora=diora, info=info, embed=embed, name_filter=compute_loss)
elif compute_loss:
ret, loss = self.compute_loss(batch, diora=diora, info=info, embed=embed)
else:
ret, loss = {}, torch.full((1, 1), 1, dtype=torch.float, device=device)
# Results
ret['chart'] = chart
ret['total_loss'] = loss
return ret
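# Illustrative call patterns for Net.forward; the batch and loss name below are
# placeholders.
#
#   out = net(batch)                               # run every registered loss func
#   out = net(batch, compute_loss=False)           # build the chart, skip the losses
#   out = net(batch, compute_loss='example_loss')  # only the named loss func(s)
#   chart, total_loss = out['chart'], out['total_loss']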
class Trainer(object):
def __init__(self, net=None, cuda=None, word2idx=None, mlp_reg=0, clip_threshold=5.0):
super(Trainer, self).__init__()
self.net = net
        self.optimizer = None
        self.scheduler = None
self.optimizer_step = 0
self.optimizer_epoch = 0
self.optimizer_epoch_step = 0
self.optimizer_cls = None
self.optimizer_kwargs = None
self.cuda = cuda
self.word2idx = word2idx
self._embedding_keys = None
self.mlp_reg = mlp_reg
self.clip_threshold = clip_threshold
self.logger = get_logger()
def save_init_parameters(self, net):
d = {}
for n, p in net.named_parameters():
if not p.requires_grad:
continue
phat = p.clone().detach()
phat.requires_grad = False
d[n] = phat
self.init_parameters = d
@staticmethod
def get_single_net(net):
if isinstance(net, torch.nn.parallel.DistributedDataParallel):
return net.module
return net
    def embedding_keys(self, net):
        # Cache these keys because the embedding weights may be deleted later
        # (they are stripped from checkpoints in save_model).
        if self._embedding_keys is not None:
            return self._embedding_keys
        suffix = '.weight'
        keys = [k[:-len(suffix)] for k in net.state_dict().keys() if 'embeddings.weight' in k]
        self._embedding_keys = keys
        return keys
    def init_optimizer(self, options):
        optimizer_cls, optimizer_kwargs = None, None
        if options.opt == 'adam':
            optimizer_cls = optim.Adam
            optimizer_kwargs = dict(lr=options.lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=options.weight_decay)
        elif options.opt == 'sgd':
            optimizer_cls = optim.SGD
            optimizer_kwargs = dict(lr=options.lr, weight_decay=options.weight_decay)
        # Fall back to a previously configured optimizer if no known option matched.
        if optimizer_cls is None:
            optimizer_cls = self.optimizer_cls
        if optimizer_kwargs is None:
            optimizer_kwargs = self.optimizer_kwargs
trainable_params = [p for (name, p) in self.net.named_parameters() if p.requires_grad]
optimizer = optimizer_cls(trainable_params, **optimizer_kwargs)
self.optimizer = optimizer
self.scheduler = None
if options.num_warmup_steps > 0:
self.scheduler = get_constant_schedule_with_warmup(optimizer, options.num_warmup_steps)
def save_model(self, model_file):
state_dict = self.net.state_dict()
todelete = []
for k in state_dict.keys():
if 'embeddings' in k:
todelete.append(k)
for k in todelete:
del state_dict[k]
torch.save({
'state_dict': state_dict,
'word2idx': self.word2idx,
}, model_file)
@staticmethod
def load_model(net, model_file):
save_dict = torch.load(model_file, map_location=lambda storage, loc: storage)
state_dict_toload = save_dict['state_dict']
state_dict_net = Trainer.get_single_net(net).state_dict()
print('[load] state dict keys: {}'.format(state_dict_toload.keys()))
print('[net] state dict keys: {}'.format(state_dict_net.keys()))
# Rename some layers.
keys = list(state_dict_toload.keys())
for k in keys:
if k.startswith('module'):
newk = k[len('module.'):]
state_dict_toload[newk] = state_dict_toload[k]
del state_dict_toload[k]
# Deprecated MLP.
keys = list(state_dict_toload.keys())
for k in keys:
newk = None
if k == 'diora.inside_compose_func.W_0':
newk = 'diora.inside_compose_func.W'
if k == 'diora.outside_compose_func.W_0':
newk = 'diora.outside_compose_func.W'
if k == 'diora.atten_func.mat':
newk = 'diora.atten_func.0.mat'
if newk is not None:
state_dict_toload[newk] = state_dict_toload[k]
del state_dict_toload[k]
# Remove extra keys.
keys = list(state_dict_toload.keys())
for k in keys:
# print('state_dict[load]', k, k in state_dict_net)
if k not in state_dict_net:
print('deleting (missing from state_dict) {}'.format(k))
del state_dict_toload[k]
# Hack to support embeddings.
for k in state_dict_net.keys():
if 'embeddings' in k:
print('restoring {}'.format(k))
state_dict_toload[k] = state_dict_net[k]
# Add any missing modules.
for k in state_dict_net.keys():
if k not in state_dict_toload:
print('add missing {}'.format(k))
state_dict_toload[k] = state_dict_net[k]
Trainer.get_single_net(net).load_state_dict(state_dict_toload)
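    # Note on the checkpoint round trip: save_model strips all 'embeddings'
    # weights from the saved state dict, and load_model restores them from the
    # freshly constructed net while renaming a few deprecated keys. Sketch
    # (the path is a placeholder):
    #
    #   trainer.save_model('model.pt')
    #   Trainer.load_model(new_net, 'model.pt')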
    def run_net(self, batch_map, compute_loss=True, info=None):
        if info is None:
            info = {}
        batch = batch_map['sentences']
        for k, v in self.prepare_info(batch_map).items():
            info[k] = v
if self.cuda:
batch = batch.cuda()
net = self.net
out = net(batch, compute_loss=compute_loss, info=info)
return out
def backward_and_maybe_update(self, loss, update=True, zero_grad=True):
if zero_grad:
self.optimizer.zero_grad()
loss.backward()
if update:
trainable_params = [p for p in self.net.parameters() if p.requires_grad]
clip_threshold = np.inf if self.clip_threshold == 0 else self.clip_threshold
grad_norm = torch.nn.utils.clip_grad_norm_(trainable_params, clip_threshold)
self.optimizer.step()
if self.scheduler is not None:
self.scheduler.step()
# Number of gradient updates.
self.optimizer_step += 1
self.optimizer_epoch_step += 1
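    # Sketch of gradient accumulation using the flags above; the loader and
    # accumulation factor are placeholders, and the loss reduction is one
    # reasonable choice rather than the repository's prescribed one.
    #
    #   accum = 4
    #   for i, batch_map in enumerate(loader):
    #       out = trainer.step(batch_map, train=True)
    #       loss = out['model_output']['total_loss'].sum()
    #       trainer.backward_and_maybe_update(
    #           loss, update=(i + 1) % accum == 0, zero_grad=(i % accum == 0))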
def prepare_result(self, batch_map, model_output):
batch_size, length = batch_map['sentences'].shape
result = {}
result['batch_size'] = batch_size
result['length'] = length
result['inside_root'] = model_output.get('inside_root', None)
result['scores'] = model_output.get('scores', None)
result['samples'] = model_output.get('samples', None)
metrics = collections.OrderedDict()
for k, v in model_output.items():
if 'loss' in k:
metrics[k] = v.mean(dim=0).sum().item()
elif 'metric' in k:
metrics[k] = v
elif 'acc' in k:
metrics[k] = v.mean(dim=0).item()
result['metrics'] = metrics
return result
    def prepare_info(self, batch_map):
        # Forward any per-batch annotations that downstream components may read.
        keys = [
            'raw_parse', 'constituency_tags', 'entity_labels', 'pos_tags',
            'binary_tree', 'spans', 'span_siblings', 'vector_target',
            'sentence_idx', 'sibling_idx', 'sibling', 'labels',
            'example_ids', 'constraints', 'ner_labels',
        ]
        return {k: batch_map[k] for k in keys if k in batch_map}
def cleanup(self, batch_map=None):
if batch_map is not None:
            batch_map.clear()
self.net.diora.reset()
    def single_step(self, batch_map, train=True, compute_loss=True, info=None):
        if info is None:
            info = {}
        if train:
            self.net.train()
        else:
            self.net.eval()
        info['trainer'] = self
with torch.set_grad_enabled(train):
model_output = self.run_net(batch_map, compute_loss=compute_loss, info=info)
result = self.prepare_result(batch_map, model_output)
trainer_output = {}
trainer_output['result'] = result
trainer_output['model_output'] = model_output
return trainer_output
    def step(self, batch_map, train=True, compute_loss=True, info=None):
        return self.single_step(batch_map=batch_map, train=train, compute_loss=compute_loss, info=info)
def end_of_epoch(self, best_dict):
self.optimizer_epoch += 1
self.optimizer_epoch_step = 0
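# Sketch of one training epoch driven by this Trainer; the data loader and the
# loss reduction are placeholders, not part of this file.
#
#   for batch_map in train_loader:
#       out = trainer.step(batch_map, train=True, compute_loss=True)
#       loss = out['model_output']['total_loss'].sum()
#       trainer.backward_and_maybe_update(loss)
#       trainer.cleanup(batch_map)  # frees batch entries and resets DIORA's internal state
#   trainer.end_of_epoch(best_dict={})  # advances optimizer_epoch, resets the per-epoch step count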
def build_net(options, context=dict(), net_components=dict()):
logger = get_logger()
logger.info('build net')
# Components.
projection_layer = net_components['projection_layer']
diora = net_components['diora']
loss_funcs = net_components['loss_funcs']
# Context.
cuda = context['cuda']
lr = options.lr
word2idx = context['word2idx']
# Net
net = Net(projection_layer, diora, loss_funcs=loss_funcs, size=options.hidden_dim)
# Load model.
if options.load_model_path is not None:
logger.info('Loading model: {}'.format(options.load_model_path))
Trainer.load_model(net, options.load_model_path)
# CUDA-support
if cuda:
net.cuda()
diora.cuda()
# Trainer
trainer_kwargs = {}
trainer_kwargs['net'] = net
trainer_kwargs['cuda'] = cuda
trainer_kwargs['mlp_reg'] = options.mlp_reg
trainer_kwargs['clip_threshold'] = options.clip_threshold
trainer_kwargs['word2idx'] = word2idx
trainer = Trainer(**trainer_kwargs)
trainer.experiment_name = options.experiment_name
trainer.init_optimizer(options)
logger.info('Architecture =\n {}'.format(str(trainer.net)))
return trainer
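# Sketch of how build_net is expected to be wired up; the component objects and
# option fields below are assumptions read off this file, not a documented API.
#
#   net_components = {
#       'projection_layer': projection_layer,  # embedding / projection module
#       'diora': diora,                        # DIORA chart model
#       'loss_funcs': loss_funcs,              # list of loss modules (see Net.add_loss_funcs)
#   }
#   context = {'cuda': torch.cuda.is_available(), 'word2idx': word2idx}
#   trainer = build_net(options, context=context, net_components=net_components)
#   # options must provide: opt, lr, weight_decay, num_warmup_steps, hidden_dim,
#   # load_model_path, mlp_reg, clip_threshold, experiment_name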