
PyTorch 0.4 compatible
Nitish Keskar committed Jun 13, 2018
1 parent 9205e9b commit 457a422
Showing 7 changed files with 27 additions and 24 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -37,6 +37,9 @@ If you use this code or our results in your research, please cite as appropriate
year={2018}
}
```
## Update (06/13)

The codebase is now PyTorch 0.4 compatible for most use cases (a big shoutout to @shawntan for a fairly comprehensive PR https://github.com/salesforce/awd-lstm-lm/pull/43). Mild readjustments to hyperparameters may be necessary to obtain quoted performance. If you desire exact reproducibility (or wish to run on PyTorch 0.3 or lower), we suggest using an older commit of this repository. We are still working on `finetune` and `generate` functionality.

## Software Requirements

6 changes: 2 additions & 4 deletions embed_regularize.py
@@ -1,12 +1,10 @@
import numpy as np

import torch
from torch.autograd import Variable

def embedded_dropout(embed, words, dropout=0.1, scale=None):
if dropout:
mask = embed.weight.data.new().resize_((embed.weight.size(0), 1)).bernoulli_(1 - dropout).expand_as(embed.weight) / (1 - dropout)
mask = Variable(mask)
masked_embed_weight = mask * embed.weight
else:
masked_embed_weight = embed.weight
@@ -16,7 +14,8 @@ def embedded_dropout(embed, words, dropout=0.1, scale=None):
padding_idx = embed.padding_idx
if padding_idx is None:
padding_idx = -1
X = embed._backend.Embedding.apply(words, masked_embed_weight,

X = torch.nn.functional.embedding(words, masked_embed_weight,
padding_idx, embed.max_norm, embed.norm_type,
embed.scale_grad_by_freq, embed.sparse
)
@@ -32,7 +31,6 @@ def embedded_dropout(embed, words, dropout=0.1, scale=None):

words = np.random.random_integers(low=0, high=V-1, size=(batch_size, bptt))
words = torch.LongTensor(words)
words = Variable(words)

origX = embed(words)
X = embedded_dropout(embed, words)
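For readers porting their own forks: below is a minimal, self-contained sketch of the 0.4-style `embedded_dropout` shown in the diff above, with the private `embed._backend.Embedding.apply` call replaced by the public `torch.nn.functional.embedding` and no `Variable` wrapping anywhere. The small usage demo at the bottom (vocabulary of 50, batch of 2) is illustrative only, not taken from the repository.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def embedded_dropout(embed, words, dropout=0.1, scale=None):
    if dropout:
        # Drop entire embedding rows; plain tensors, no Variable needed in 0.4.
        mask = embed.weight.data.new().resize_((embed.weight.size(0), 1)) \
            .bernoulli_(1 - dropout).expand_as(embed.weight) / (1 - dropout)
        masked_embed_weight = mask * embed.weight
    else:
        masked_embed_weight = embed.weight
    if scale:
        masked_embed_weight = scale.expand_as(masked_embed_weight) * masked_embed_weight

    padding_idx = embed.padding_idx
    if padding_idx is None:
        padding_idx = -1

    # Public functional API instead of the removed embed._backend hook.
    return F.embedding(words, masked_embed_weight,
                       padding_idx, embed.max_norm, embed.norm_type,
                       embed.scale_grad_by_freq, embed.sparse)

# Illustrative usage with made-up sizes:
embed = nn.Embedding(50, 8)
words = torch.randint(0, 50, (10, 2), dtype=torch.long)
out = embedded_dropout(embed, words, dropout=0.1)
```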
1 change: 0 additions & 1 deletion finetune.py
@@ -5,7 +5,6 @@
np.random.seed(331)
import torch
import torch.nn as nn
from torch.autograd import Variable

import data
import model
10 changes: 5 additions & 5 deletions main.py
@@ -1,10 +1,10 @@
import setGPU
import argparse
import time
import math
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable

import data
import model
@@ -166,7 +166,7 @@ def evaluate(data_source, batch_size=10):
output, hidden = model(data, hidden)
total_loss += len(data) * criterion(model.decoder.weight, model.decoder.bias, output, targets).data
hidden = repackage_hidden(hidden)
return total_loss[0] / len(data_source)
return total_loss.item() / len(data_source)


def train():
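The `total_loss[0]` → `total_loss.item()` change reflects that, from 0.4 on, reduction losses come back as zero-dimensional tensors and indexing them with `[0]` is deprecated; `.item()` extracts the Python number. A tiny stand-alone illustration (the tensors are placeholders, not the model's outputs):

```python
import torch
import torch.nn.functional as F

loss = F.mse_loss(torch.randn(4), torch.zeros(4))
print(loss.dim())   # 0 -- the loss is a zero-dimensional tensor
print(loss.item())  # Python float; replaces the old loss[0] / loss.data[0]
```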
@@ -205,13 +205,13 @@ def train():
loss.backward()

# `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
if args.clip: torch.nn.utils.clip_grad_norm(params, args.clip)
if args.clip: torch.nn.utils.clip_grad_norm_(params, args.clip)
optimizer.step()

total_loss += raw_loss.data
optimizer.param_groups[0]['lr'] = lr2
if batch % args.log_interval == 0 and batch > 0:
cur_loss = total_loss[0] / args.log_interval
cur_loss = total_loss.item() / args.log_interval
elapsed = time.time() - start_time
print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
@@ -249,7 +249,7 @@ def train():
print('-' * 89)
print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss), val_loss / math.log(2)))
epoch, (time.time() - epoch_start_time), val_loss2, math.exp(val_loss2), val_loss2 / math.log(2)))
print('-' * 89)

if val_loss2 < stored_loss:
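The other rename in this file is `torch.nn.utils.clip_grad_norm` → `clip_grad_norm_`, PyTorch 0.4's convention for in-place operations (the old name still runs but warns). A hedged sketch of the pattern; the linear model and learning rate are stand-ins, not the repository's configuration:

```python
import torch
import torch.nn as nn

model = nn.Linear(10, 1)  # stand-in for the AWD-LSTM model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

loss = model(torch.randn(8, 10)).sum()
loss.backward()

# Trailing underscore marks the in-place variant introduced in 0.4.
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.25)
optimizer.step()
```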
7 changes: 3 additions & 4 deletions model.py
@@ -1,6 +1,5 @@
import torch
import torch.nn as nn
from torch.autograd import Variable

from embed_regularize import embedded_dropout
from locked_dropout import LockedDropout
@@ -99,9 +98,9 @@ def forward(self, input, hidden, return_h=False):
def init_hidden(self, bsz):
weight = next(self.parameters()).data
if self.rnn_type == 'LSTM':
return [(Variable(weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_()),
Variable(weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_()))
return [(weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_(),
weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_())
for l in range(self.nlayers)]
elif self.rnn_type == 'QRNN' or self.rnn_type == 'GRU':
return [Variable(weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_())
return [weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_()
for l in range(self.nlayers)]
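Since Tensors and Variables were merged in 0.4, hidden states can be allocated as plain tensors straight from an existing parameter, exactly as the updated `init_hidden` does. A minimal sketch with made-up layer sizes (not the repository's defaults):

```python
import torch
import torch.nn as nn

rnn = nn.LSTM(input_size=8, hidden_size=16, num_layers=1)
weight = next(rnn.parameters()).data
bsz = 4

# No Variable(...) wrapper any more: weight.new(...).zero_() yields ordinary
# zero tensors with the same dtype and device as the model's parameters.
hidden = (weight.new(1, bsz, 16).zero_(),
          weight.new(1, bsz, 16).zero_())

output, hidden = rnn(torch.randn(5, bsz, 8), hidden)
```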
8 changes: 4 additions & 4 deletions splitcross.py
@@ -37,7 +37,7 @@ def logprob(self, weight, bias, hiddens, splits=None, softmaxed_head_res=None, v
# Perform the softmax calculation for the word vectors in the head for all splits
# We need to guard against empty splits as torch.cat does not like random lists
head_res = torch.nn.functional.linear(hiddens, head_weight, bias=head_bias)
softmaxed_head_res = torch.nn.functional.log_softmax(head_res)
softmaxed_head_res = torch.nn.functional.log_softmax(head_res, dim=-1)

if splits is None:
splits = list(range(self.nsplits))
@@ -62,7 +62,7 @@ def logprob(self, weight, bias, hiddens, splits=None, softmaxed_head_res=None, v
# Then we calculate p(tombstone) * p(word in tombstone)
# Adding is equivalent to multiplication in log space
head_entropy = (softmaxed_head_res[:, -idx]).contiguous()
tail_entropy = torch.nn.functional.log_softmax(tail_res)
tail_entropy = torch.nn.functional.log_softmax(tail_res, dim=-1)
results.append(head_entropy.view(-1, 1) + tail_entropy)

if len(results) > 1:
@@ -129,7 +129,7 @@ def forward(self, weight, bias, hiddens, targets, verbose=False):
combo = torch.cat([split_hiddens[i] for i in range(self.nsplits) if len(split_hiddens[i])])
###
all_head_res = torch.nn.functional.linear(combo, head_weight, bias=head_bias)
softmaxed_all_head_res = torch.nn.functional.log_softmax(all_head_res)
softmaxed_all_head_res = torch.nn.functional.log_softmax(all_head_res, dim=-1)
if self.verbose or verbose:
self.stats[0].append(combo.size()[0] * head_weight.size()[0])

@@ -160,7 +160,7 @@ def forward(self, weight, bias, hiddens, targets, verbose=False):
# All indices are shifted - if the first split handles [0,...,499] then the 500th in the second split will be 0 indexed
indices = (split_targets[idx] - self.splits[idx]).view(-1, 1)
# Warning: if you don't squeeze, you get an N x 1 return, which acts oddly with broadcasting
tail_entropy = torch.gather(torch.nn.functional.log_softmax(tail_res), dim=1, index=indices).squeeze()
tail_entropy = torch.gather(torch.nn.functional.log_softmax(tail_res, dim=-1), dim=1, index=indices).squeeze()
entropy = -(head_entropy + tail_entropy)
###
running_offset += len(split_hiddens[idx])
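Every `log_softmax` call in this file gains an explicit `dim=-1`; PyTorch 0.4 deprecates the implicit-dimension form because the default it guessed depended on the input's rank. A short illustration with placeholder logits:

```python
import torch
import torch.nn.functional as F

logits = torch.randn(3, 5)                 # placeholder (batch, vocab) scores

# Normalize explicitly over the last (vocabulary) axis.
log_probs = F.log_softmax(logits, dim=-1)

print(log_probs.exp().sum(dim=-1))         # each row sums to ~1.0
```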
16 changes: 10 additions & 6 deletions utils.py
@@ -1,12 +1,15 @@
from torch.autograd import Variable
import torch


def repackage_hidden(h):
"""Wraps hidden states in new Variables, to detach them from their history."""
if type(h) == Variable:
return Variable(h.data)
"""Wraps hidden states in new Tensors,
to detach them from their history."""
if isinstance(h, torch.Tensor):
return h.detach()
else:
return tuple(repackage_hidden(v) for v in h)


def batchify(data, bsz, args):
# Work out how cleanly we can divide the dataset into bsz parts.
nbatch = data.size(0) // bsz
@@ -18,8 +21,9 @@ def batchify(data, bsz, args):
data = data.cuda()
return data


def get_batch(source, i, args, seq_len=None, evaluation=False):
seq_len = min(seq_len if seq_len else args.bptt, len(source) - 1 - i)
data = Variable(source[i:i+seq_len], volatile=evaluation)
target = Variable(source[i+1:i+1+seq_len].view(-1))
data = source[i:i+seq_len]
target = source[i+1:i+1+seq_len].view(-1)
return data, target
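Two 0.4 idioms show up in this file: `repackage_hidden` now detaches tensors instead of re-wrapping Variables, and `get_batch` drops the `volatile=evaluation` flag, whose job is taken over by the `torch.no_grad()` context at the call site. The sketch below mirrors the new code but simplifies the signature (a plain `bptt` argument instead of the repository's `args` object), and the evaluation snippet at the end is an assumed usage pattern, not code from this commit.

```python
import torch

def repackage_hidden(h):
    """Detach hidden states from their history (tensors only, no Variables)."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)

def get_batch(source, i, bptt=70, seq_len=None):
    # volatile= is gone in 0.4; grad tracking is disabled at the call site instead.
    seq_len = min(seq_len if seq_len else bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target

# Assumed evaluation pattern replacing volatile=True:
source = torch.arange(0, 200, dtype=torch.long).view(-1, 2)  # toy batchified data
with torch.no_grad():
    data, targets = get_batch(source, 0, bptt=10)
```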
