diff --git a/README.md b/README.md
index f4453ca..b63aa67 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,9 @@ If you use this code or our results in your research, please cite as appropriate
   year={2018}
 }
 ```
+## Update (06/13)
+
+The codebase is now PyTorch 0.4 compatible for most use cases (a big shoutout to @shawntan for a fairly comprehensive PR: https://github.com/salesforce/awd-lstm-lm/pull/43). Mild readjustments to hyperparameters may be necessary to obtain quoted performance. If you desire exact reproducibility (or wish to run on PyTorch 0.3 or lower), we suggest using an older commit of this repository. We are still working on `finetune` and `generate` functionality.
 
 ## Software Requirements
 
diff --git a/embed_regularize.py b/embed_regularize.py
index 386e1ec..b0a40c5 100644
--- a/embed_regularize.py
+++ b/embed_regularize.py
@@ -1,12 +1,10 @@
 import numpy as np
 
 import torch
-from torch.autograd import Variable
 
 def embedded_dropout(embed, words, dropout=0.1, scale=None):
   if dropout:
     mask = embed.weight.data.new().resize_((embed.weight.size(0), 1)).bernoulli_(1 - dropout).expand_as(embed.weight) / (1 - dropout)
-    mask = Variable(mask)
     masked_embed_weight = mask * embed.weight
   else:
     masked_embed_weight = embed.weight
@@ -16,7 +14,8 @@ def embedded_dropout(embed, words, dropout=0.1, scale=None):
   padding_idx = embed.padding_idx
   if padding_idx is None:
       padding_idx = -1
-  X = embed._backend.Embedding.apply(words, masked_embed_weight,
+
+  X = torch.nn.functional.embedding(words, masked_embed_weight,
     padding_idx, embed.max_norm, embed.norm_type,
     embed.scale_grad_by_freq, embed.sparse
   )
@@ -32,7 +31,6 @@ def embedded_dropout(embed, words, dropout=0.1, scale=None):
 
   words = np.random.random_integers(low=0, high=V-1, size=(batch_size, bptt))
   words = torch.LongTensor(words)
-  words = Variable(words)
 
   origX = embed(words)
   X = embedded_dropout(embed, words)
diff --git a/finetune.py b/finetune.py
index c320cd5..5afdbc3 100644
--- a/finetune.py
+++ b/finetune.py
@@ -5,7 +5,6 @@ np.random.seed(331)
 
 import torch
 import torch.nn as nn
-from torch.autograd import Variable
 
 import data
 import model
diff --git a/main.py b/main.py
index bda3e3d..afd6e6e 100644
--- a/main.py
+++ b/main.py
@@ -1,10 +1,10 @@
+import setGPU
 import argparse
 import time
 import math
 import numpy as np
 import torch
 import torch.nn as nn
-from torch.autograd import Variable
 
 import data
 import model
@@ -166,7 +166,7 @@ def evaluate(data_source, batch_size=10):
         output, hidden = model(data, hidden)
         total_loss += len(data) * criterion(model.decoder.weight, model.decoder.bias, output, targets).data
         hidden = repackage_hidden(hidden)
-    return total_loss[0] / len(data_source)
+    return total_loss.item() / len(data_source)
 
 
 def train():
@@ -205,13 +205,13 @@ def train():
         loss.backward()
 
         # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
-        if args.clip: torch.nn.utils.clip_grad_norm(params, args.clip)
+        if args.clip: torch.nn.utils.clip_grad_norm_(params, args.clip)
         optimizer.step()
 
         total_loss += raw_loss.data
         optimizer.param_groups[0]['lr'] = lr2
         if batch % args.log_interval == 0 and batch > 0:
-            cur_loss = total_loss[0] / args.log_interval
+            cur_loss = total_loss.item() / args.log_interval
             elapsed = time.time() - start_time
             print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                     'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
@@ -249,7 +249,7 @@ def train():
             print('-' * 89)
             print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                 'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
-              epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss), val_loss / math.log(2)))
+              epoch, (time.time() - epoch_start_time), val_loss2, math.exp(val_loss2), val_loss2 / math.log(2)))
             print('-' * 89)
 
             if val_loss2 < stored_loss:
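The `main.py` hunks above capture the core of the 0.4 migration: `Tensor` and `Variable` are merged, a zero-dimensional loss is read with `.item()` rather than indexed with `[0]`, and the in-place gradient-clipping utility gained a trailing underscore (`clip_grad_norm` → `clip_grad_norm_`). A minimal sketch of the new idioms, using a toy linear model rather than the repository's LSTM:

```python
import torch
import torch.nn as nn

# Toy stand-in for the repository's LSTM, just to show the 0.4 idioms.
model = nn.Linear(10, 1)
criterion = nn.MSELoss()

x, y = torch.randn(4, 10), torch.randn(4, 1)
loss = criterion(model(x), y)
loss.backward()

# 0.4: a scalar loss is a zero-dim tensor; read it with .item(), not loss[0].
print('loss = {:.4f}'.format(loss.item()))

# 0.4: in-place operations carry a trailing underscore, hence the rename.
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
```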
diff --git a/model.py b/model.py
index 3ef853c..a704b3f 100644
--- a/model.py
+++ b/model.py
@@ -1,6 +1,5 @@
 import torch
 import torch.nn as nn
-from torch.autograd import Variable
 
 from embed_regularize import embedded_dropout
 from locked_dropout import LockedDropout
@@ -99,9 +98,9 @@ def forward(self, input, hidden, return_h=False):
     def init_hidden(self, bsz):
         weight = next(self.parameters()).data
         if self.rnn_type == 'LSTM':
-            return [(Variable(weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_()),
-                    Variable(weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_()))
+            return [(weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_(),
+                    weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_())
                     for l in range(self.nlayers)]
         elif self.rnn_type == 'QRNN' or self.rnn_type == 'GRU':
-            return [Variable(weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_())
+            return [weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_()
                     for l in range(self.nlayers)]
diff --git a/splitcross.py b/splitcross.py
index c590b18..cda17dd 100644
--- a/splitcross.py
+++ b/splitcross.py
@@ -37,7 +37,7 @@ def logprob(self, weight, bias, hiddens, splits=None, softmaxed_head_res=None, verbose=False):
         # Perform the softmax calculation for the word vectors in the head for all splits
         # We need to guard against empty splits as torch.cat does not like random lists
         head_res = torch.nn.functional.linear(hiddens, head_weight, bias=head_bias)
-        softmaxed_head_res = torch.nn.functional.log_softmax(head_res)
+        softmaxed_head_res = torch.nn.functional.log_softmax(head_res, dim=-1)
 
         if splits is None:
             splits = list(range(self.nsplits))
@@ -62,7 +62,7 @@ def logprob(self, weight, bias, hiddens, splits=None, softmaxed_head_res=None, verbose=False):
                 # Then we calculate p(tombstone) * p(word in tombstone)
                 # Adding is equivalent to multiplication in log space
                 head_entropy = (softmaxed_head_res[:, -idx]).contiguous()
-                tail_entropy = torch.nn.functional.log_softmax(tail_res)
+                tail_entropy = torch.nn.functional.log_softmax(tail_res, dim=-1)
                 results.append(head_entropy.view(-1, 1) + tail_entropy)
 
         if len(results) > 1:
@@ -129,7 +129,7 @@ def forward(self, weight, bias, hiddens, targets, verbose=False):
         combo = torch.cat([split_hiddens[i] for i in range(self.nsplits) if len(split_hiddens[i])])
         ###
         all_head_res = torch.nn.functional.linear(combo, head_weight, bias=head_bias)
-        softmaxed_all_head_res = torch.nn.functional.log_softmax(all_head_res)
+        softmaxed_all_head_res = torch.nn.functional.log_softmax(all_head_res, dim=-1)
 
         if self.verbose or verbose:
             self.stats[0].append(combo.size()[0] * head_weight.size()[0])
@@ -160,7 +160,7 @@ def forward(self, weight, bias, hiddens, targets, verbose=False):
                 # All indices are shifted - if the first split handles [0,...,499] then the 500th in the second split will be 0 indexed
                 indices = (split_targets[idx] - self.splits[idx]).view(-1, 1)
                 # Warning: if you don't squeeze, you get an N x 1 return, which acts oddly with broadcasting
-                tail_entropy = torch.gather(torch.nn.functional.log_softmax(tail_res), dim=1, index=indices).squeeze()
+                tail_entropy = torch.gather(torch.nn.functional.log_softmax(tail_res, dim=-1), dim=1, index=indices).squeeze()
             entropy = -(head_entropy + tail_entropy)
             ###
             running_offset += len(split_hiddens[idx])
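Every `splitcross.py` edit is the same fix: from 0.4 onward, calling `log_softmax` without `dim` emits a deprecation warning because the implicit dimension choice was ambiguous for multi-dimensional input, so the patch pins `dim=-1`, the vocabulary dimension. A small illustration of why the explicit dimension matters (shapes here are made up for the example):

```python
import torch
import torch.nn.functional as F

# Three hidden states scored against a five-word vocabulary (made-up shapes).
scores = torch.randn(3, 5)

# Normalise over the vocabulary (last) dimension, as the patch now does:
log_probs = F.log_softmax(scores, dim=-1)
print(log_probs.exp().sum(dim=-1))  # each row sums to ~1.0

# dim=0 would normalise across the batch instead -- a silent logic error,
# which is why 0.4 deprecated relying on the implicit default.
```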
diff --git a/utils.py b/utils.py
index 82bb157..20f70ec 100644
--- a/utils.py
+++ b/utils.py
@@ -1,12 +1,15 @@
-from torch.autograd import Variable
+import torch
+
 
 def repackage_hidden(h):
-    """Wraps hidden states in new Variables, to detach them from their history."""
-    if type(h) == Variable:
-        return Variable(h.data)
+    """Wraps hidden states in new Tensors,
+    to detach them from their history."""
+    if isinstance(h, torch.Tensor):
+        return h.detach()
     else:
         return tuple(repackage_hidden(v) for v in h)
 
+
 def batchify(data, bsz, args):
     # Work out how cleanly we can divide the dataset into bsz parts.
     nbatch = data.size(0) // bsz
@@ -18,8 +21,9 @@ def batchify(data, bsz, args):
         data = data.cuda()
     return data
 
+
 def get_batch(source, i, args, seq_len=None, evaluation=False):
     seq_len = min(seq_len if seq_len else args.bptt, len(source) - 1 - i)
-    data = Variable(source[i:i+seq_len], volatile=evaluation)
-    target = Variable(source[i+1:i+1+seq_len].view(-1))
+    data = source[i:i+seq_len]
+    target = source[i+1:i+1+seq_len].view(-1)
     return data, target
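The `utils.py` hunks show the remaining two idioms: `Variable(h.data)` becomes `h.detach()`, and the `volatile=evaluation` flag disappears, since inference in 0.4 is disabled at the call site with a `torch.no_grad()` context instead (the now-unused `evaluation` argument stays in `get_batch` for API compatibility). A hedged sketch of what the caller side looks like, with `model`, `data`, and `hidden` as placeholder names rather than the repository's actual evaluation loop:

```python
import torch

def evaluate_step(model, data, hidden):
    # 0.4: instead of volatile=True Variables, disable autograd for
    # inference with the no_grad context manager.
    with torch.no_grad():
        output, hidden = model(data, hidden)
    # detach() replaces Variable(h.data): same storage, no autograd
    # history, so truncated backprop stops at the batch boundary.
    hidden = tuple(h.detach() for h in hidden)
    return output, hidden
```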