
PyTorch 0.4 compatible
Nitish Keskar committed Jun 13, 2018
1 parent 9205e9b commit 457a422
Showing 7 changed files with 27 additions and 24 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -37,6 +37,9 @@ If you use this code or our results in your research, please cite as appropriate
year={2018}
}
```
## Update (06/13)

The codebase is now PyTorch 0.4 compatible for most use cases (a big shoutout to @shawntan for a fairly comprehensive PR https://github.com/salesforce/awd-lstm-lm/pull/43). Mild readjustments to hyperparameters may be necessary to obtain quoted performance. If you desire exact reproducibility (or wish to run on PyTorch 0.3 or lower), we suggest using an older commit of this repository. We are still working on `finetune` and `generate` functionality.

## Software Requirements

6 changes: 2 additions & 4 deletions embed_regularize.py
@@ -1,12 +1,10 @@
import numpy as np

import torch
from torch.autograd import Variable

def embedded_dropout(embed, words, dropout=0.1, scale=None):
if dropout:
mask = embed.weight.data.new().resize_((embed.weight.size(0), 1)).bernoulli_(1 - dropout).expand_as(embed.weight) / (1 - dropout)
mask = Variable(mask)
masked_embed_weight = mask * embed.weight
else:
masked_embed_weight = embed.weight
@@ -16,7 +14,8 @@ def embedded_dropout(embed, words, dropout=0.1, scale=None):
padding_idx = embed.padding_idx
if padding_idx is None:
padding_idx = -1
X = embed._backend.Embedding.apply(words, masked_embed_weight,

X = torch.nn.functional.embedding(words, masked_embed_weight,
padding_idx, embed.max_norm, embed.norm_type,
embed.scale_grad_by_freq, embed.sparse
)
@@ -32,7 +31,6 @@ def embedded_dropout(embed, words, dropout=0.1, scale=None):

words = np.random.random_integers(low=0, high=V-1, size=(batch_size, bptt))
words = torch.LongTensor(words)
words = Variable(words)

origX = embed(words)
X = embedded_dropout(embed, words)
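For readers porting their own forks: below is a minimal, self-contained sketch of the 0.4-style `embedded_dropout` shown in the diff above, with the private `embed._backend.Embedding.apply` call replaced by the public `torch.nn.functional.embedding` and no `Variable` wrapping anywhere. The small usage demo at the bottom (vocabulary of 50, batch of 2) is illustrative only, not taken from the repository.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def embedded_dropout(embed, words, dropout=0.1, scale=None):
    if dropout:
        # Drop entire embedding rows; plain tensors, no Variable needed in 0.4.
        mask = embed.weight.data.new().resize_((embed.weight.size(0), 1)) \
            .bernoulli_(1 - dropout).expand_as(embed.weight) / (1 - dropout)
        masked_embed_weight = mask * embed.weight
    else:
        masked_embed_weight = embed.weight
    if scale:
        masked_embed_weight = scale.expand_as(masked_embed_weight) * masked_embed_weight

    padding_idx = embed.padding_idx
    if padding_idx is None:
        padding_idx = -1

    # Public functional API instead of the removed embed._backend hook.
    return F.embedding(words, masked_embed_weight,
                       padding_idx, embed.max_norm, embed.norm_type,
                       embed.scale_grad_by_freq, embed.sparse)

# Illustrative usage with made-up sizes:
embed = nn.Embedding(50, 8)
words = torch.randint(0, 50, (10, 2), dtype=torch.long)
out = embedded_dropout(embed, words, dropout=0.1)
```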
1 change: 0 additions & 1 deletion finetune.py
@@ -5,7 +5,6 @@
np.random.seed(331)
import torch
import torch.nn as nn
from torch.autograd import Variable

import data
import model
10 changes: 5 additions & 5 deletions main.py
@@ -1,10 +1,10 @@
import setGPU
import argparse
import time
import math
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable

import data
import model
@@ -166,7 +166,7 @@ def evaluate(data_source, batch_size=10):
output, hidden = model(data, hidden)
total_loss += len(data) * criterion(model.decoder.weight, model.decoder.bias, output, targets).data
hidden = repackage_hidden(hidden)
return total_loss[0] / len(data_source)
return total_loss.item() / len(data_source)


def train():
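The `total_loss[0]` → `total_loss.item()` change reflects that, from 0.4 on, reduction losses come back as zero-dimensional tensors and indexing them with `[0]` is deprecated; `.item()` extracts the Python number. A tiny stand-alone illustration (the tensors are placeholders, not the model's outputs):

```python
import torch
import torch.nn.functional as F

loss = F.mse_loss(torch.randn(4), torch.zeros(4))
print(loss.dim())   # 0 -- the loss is a zero-dimensional tensor
print(loss.item())  # Python float; replaces the old loss[0] / loss.data[0]
```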
@@ -205,13 +205,13 @@ def train():
loss.backward()

# `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
if args.clip: torch.nn.utils.clip_grad_norm(params, args.clip)
if args.clip: torch.nn.utils.clip_grad_norm_(params, args.clip)
optimizer.step()

total_loss += raw_loss.data
optimizer.param_groups[0]['lr'] = lr2
if batch % args.log_interval == 0 and batch > 0:
cur_loss = total_loss[0] / args.log_interval
cur_loss = total_loss.item() / args.log_interval
elapsed = time.time() - start_time
print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
@@ -249,7 +249,7 @@ def train():
print('-' * 89)
print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss), val_loss / math.log(2)))
epoch, (time.time() - epoch_start_time), val_loss2, math.exp(val_loss2), val_loss2 / math.log(2)))
print('-' * 89)

if val_loss2 < stored_loss:
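The other rename in this file is `torch.nn.utils.clip_grad_norm` → `clip_grad_norm_`, PyTorch 0.4's convention for in-place operations (the old name still runs but warns). A hedged sketch of the pattern; the linear model and learning rate are stand-ins, not the repository's configuration:

```python
import torch
import torch.nn as nn

model = nn.Linear(10, 1)  # stand-in for the AWD-LSTM model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

loss = model(torch.randn(8, 10)).sum()
loss.backward()

# Trailing underscore marks the in-place variant introduced in 0.4.
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.25)
optimizer.step()
```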
7 changes: 3 additions & 4 deletions model.py
@@ -1,6 +1,5 @@
import torch
import torch.nn as nn
from torch.autograd import Variable

from embed_regularize import embedded_dropout
from locked_dropout import LockedDropout
@@ -99,9 +98,9 @@ def forward(self, input, hidden, return_h=False):
def init_hidden(self, bsz):
weight = next(self.parameters()).data
if self.rnn_type == 'LSTM':
return [(Variable(weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_()),
Variable(weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_()))
return [(weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_(),
weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_())
for l in range(self.nlayers)]
elif self.rnn_type == 'QRNN' or self.rnn_type == 'GRU':
return [Variable(weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_())
return [weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_()
for l in range(self.nlayers)]
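Since Tensors and Variables were merged in 0.4, hidden states can be allocated as plain tensors straight from an existing parameter, exactly as the updated `init_hidden` does. A minimal sketch with made-up layer sizes (not the repository's defaults):

```python
import torch
import torch.nn as nn

rnn = nn.LSTM(input_size=8, hidden_size=16, num_layers=1)
weight = next(rnn.parameters()).data
bsz = 4

# No Variable(...) wrapper any more: weight.new(...).zero_() yields ordinary
# zero tensors with the same dtype and device as the model's parameters.
hidden = (weight.new(1, bsz, 16).zero_(),
          weight.new(1, bsz, 16).zero_())

output, hidden = rnn(torch.randn(5, bsz, 8), hidden)
```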
8 changes: 4 additions & 4 deletions splitcross.py
@@ -37,7 +37,7 @@ def logprob(self, weight, bias, hiddens, splits=None, softmaxed_head_res=None, v
# Perform the softmax calculation for the word vectors in the head for all splits
# We need to guard against empty splits as torch.cat does not like random lists
head_res = torch.nn.functional.linear(hiddens, head_weight, bias=head_bias)
softmaxed_head_res = torch.nn.functional.log_softmax(head_res)
softmaxed_head_res = torch.nn.functional.log_softmax(head_res, dim=-1)

if splits is None:
splits = list(range(self.nsplits))
@@ -62,7 +62,7 @@ def logprob(self, weight, bias, hiddens, splits=None, softmaxed_head_res=None, v
# Then we calculate p(tombstone) * p(word in tombstone)
# Adding is equivalent to multiplication in log space
head_entropy = (softmaxed_head_res[:, -idx]).contiguous()
tail_entropy = torch.nn.functional.log_softmax(tail_res)
tail_entropy = torch.nn.functional.log_softmax(tail_res, dim=-1)
results.append(head_entropy.view(-1, 1) + tail_entropy)

if len(results) > 1:
@@ -129,7 +129,7 @@ def forward(self, weight, bias, hiddens, targets, verbose=False):
combo = torch.cat([split_hiddens[i] for i in range(self.nsplits) if len(split_hiddens[i])])
###
all_head_res = torch.nn.functional.linear(combo, head_weight, bias=head_bias)
softmaxed_all_head_res = torch.nn.functional.log_softmax(all_head_res)
softmaxed_all_head_res = torch.nn.functional.log_softmax(all_head_res, dim=-1)
if self.verbose or verbose:
self.stats[0].append(combo.size()[0] * head_weight.size()[0])

@@ -160,7 +160,7 @@ def forward(self, weight, bias, hiddens, targets, verbose=False):
# All indices are shifted - if the first split handles [0,...,499] then the 500th in the second split will be 0 indexed
indices = (split_targets[idx] - self.splits[idx]).view(-1, 1)
# Warning: if you don't squeeze, you get an N x 1 return, which acts oddly with broadcasting
tail_entropy = torch.gather(torch.nn.functional.log_softmax(tail_res), dim=1, index=indices).squeeze()
tail_entropy = torch.gather(torch.nn.functional.log_softmax(tail_res, dim=-1), dim=1, index=indices).squeeze()
entropy = -(head_entropy + tail_entropy)
###
running_offset += len(split_hiddens[idx])
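Every `log_softmax` call in this file gains an explicit `dim=-1`; PyTorch 0.4 deprecates the implicit-dimension form because the default it guessed depended on the input's rank. A short illustration with placeholder logits:

```python
import torch
import torch.nn.functional as F

logits = torch.randn(3, 5)                 # placeholder (batch, vocab) scores

# Normalize explicitly over the last (vocabulary) axis.
log_probs = F.log_softmax(logits, dim=-1)

print(log_probs.exp().sum(dim=-1))         # each row sums to ~1.0
```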
16 changes: 10 additions & 6 deletions utils.py
@@ -1,12 +1,15 @@
from torch.autograd import Variable
import torch


def repackage_hidden(h):
"""Wraps hidden states in new Variables, to detach them from their history."""
if type(h) == Variable:
return Variable(h.data)
"""Wraps hidden states in new Tensors,
to detach them from their history."""
if isinstance(h, torch.Tensor):
return h.detach()
else:
return tuple(repackage_hidden(v) for v in h)


def batchify(data, bsz, args):
# Work out how cleanly we can divide the dataset into bsz parts.
nbatch = data.size(0) // bsz
@@ -18,8 +21,9 @@ def batchify(data, bsz, args):
data = data.cuda()
return data


def get_batch(source, i, args, seq_len=None, evaluation=False):
seq_len = min(seq_len if seq_len else args.bptt, len(source) - 1 - i)
data = Variable(source[i:i+seq_len], volatile=evaluation)
target = Variable(source[i+1:i+1+seq_len].view(-1))
data = source[i:i+seq_len]
target = source[i+1:i+1+seq_len].view(-1)
return data, target
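Two 0.4 idioms show up in this file: `repackage_hidden` now detaches tensors instead of re-wrapping Variables, and `get_batch` drops the `volatile=evaluation` flag, whose job is taken over by the `torch.no_grad()` context at the call site. The sketch below mirrors the new code but simplifies the signature (a plain `bptt` argument instead of the repository's `args` object), and the evaluation snippet at the end is an assumed usage pattern, not code from this commit.

```python
import torch

def repackage_hidden(h):
    """Detach hidden states from their history (tensors only, no Variables)."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)

def get_batch(source, i, bptt=70, seq_len=None):
    # volatile= is gone in 0.4; grad tracking is disabled at the call site instead.
    seq_len = min(seq_len if seq_len else bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target

# Assumed evaluation pattern replacing volatile=True:
source = torch.arange(0, 200, dtype=torch.long).view(-1, 2)  # toy batchified data
with torch.no_grad():
    data, targets = get_batch(source, 0, bptt=10)
```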
