diff --git a/README.md b/README.md
index f4453ca..b63aa67 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,9 @@ If you use this code or our results in your research, please cite as appropriate
   year={2018}
 }
 ```
+## Update (06/13)
+
+The codebase is now PyTorch 0.4 compatible for most use cases (a big shoutout to @shawntan for a fairly comprehensive PR: https://github.com/salesforce/awd-lstm-lm/pull/43). Mild readjustments to hyperparameters may be necessary to obtain quoted performance. If you desire exact reproducibility (or wish to run on PyTorch 0.3 or lower), we suggest using an older commit of this repository. We are still working on `finetune` and `generate` functionality.
 
 ## Software Requirements
 
diff --git a/embed_regularize.py b/embed_regularize.py
index 386e1ec..b0a40c5 100644
--- a/embed_regularize.py
+++ b/embed_regularize.py
@@ -1,12 +1,10 @@
 import numpy as np
 
 import torch
-from torch.autograd import Variable
 
 def embedded_dropout(embed, words, dropout=0.1, scale=None):
   if dropout:
     mask = embed.weight.data.new().resize_((embed.weight.size(0), 1)).bernoulli_(1 - dropout).expand_as(embed.weight) / (1 - dropout)
-    mask = Variable(mask)
     masked_embed_weight = mask * embed.weight
   else:
     masked_embed_weight = embed.weight
@@ -16,7 +14,8 @@ def embedded_dropout(embed, words, dropout=0.1, scale=None):
   padding_idx = embed.padding_idx
   if padding_idx is None:
       padding_idx = -1
-  X = embed._backend.Embedding.apply(words, masked_embed_weight,
+
+  X = torch.nn.functional.embedding(words, masked_embed_weight,
     padding_idx, embed.max_norm, embed.norm_type,
     embed.scale_grad_by_freq, embed.sparse
   )
@@ -32,7 +31,6 @@ def embedded_dropout(embed, words, dropout=0.1, scale=None):
 
   words = np.random.random_integers(low=0, high=V-1, size=(batch_size, bptt))
   words = torch.LongTensor(words)
-  words = Variable(words)
 
   origX = embed(words)
   X = embedded_dropout(embed, words)
diff --git a/finetune.py b/finetune.py
index c320cd5..5afdbc3 100644
--- a/finetune.py
+++ b/finetune.py
@@ -5,7 +5,6 @@ np.random.seed(331)
 
 import torch
 import torch.nn as nn
-from torch.autograd import Variable
 
 import data
 import model
diff --git a/main.py b/main.py
index bda3e3d..afd6e6e 100644
--- a/main.py
+++ b/main.py
@@ -1,10 +1,10 @@
+import setGPU
 import argparse
 import time
 import math
 import numpy as np
 import torch
 import torch.nn as nn
-from torch.autograd import Variable
 
 import data
 import model
@@ -166,7 +166,7 @@ def evaluate(data_source, batch_size=10):
         output, hidden = model(data, hidden)
         total_loss += len(data) * criterion(model.decoder.weight, model.decoder.bias, output, targets).data
         hidden = repackage_hidden(hidden)
-    return total_loss[0] / len(data_source)
+    return total_loss.item() / len(data_source)
 
 
 def train():
@@ -205,13 +205,13 @@ def train():
         loss.backward()
 
         # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
-        if args.clip: torch.nn.utils.clip_grad_norm(params, args.clip)
+        if args.clip: torch.nn.utils.clip_grad_norm_(params, args.clip)
         optimizer.step()
 
         total_loss += raw_loss.data
         optimizer.param_groups[0]['lr'] = lr2
         if batch % args.log_interval == 0 and batch > 0:
-            cur_loss = total_loss[0] / args.log_interval
+            cur_loss = total_loss.item() / args.log_interval
             elapsed = time.time() - start_time
             print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                     'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
@@ -249,7 +249,7 @@ def train():
             print('-' * 89)
             print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                 'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
-              epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss), val_loss / math.log(2)))
+              epoch, (time.time() - epoch_start_time), val_loss2, math.exp(val_loss2), val_loss2 / math.log(2)))
             print('-' * 89)
 
             if val_loss2 < stored_loss:
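The `main.py` hunks above capture the core of the 0.4 migration: `Tensor` and `Variable` are merged, a zero-dimensional loss is read with `.item()` rather than indexed with `[0]`, and the in-place gradient-clipping utility gained a trailing underscore (`clip_grad_norm` → `clip_grad_norm_`). A minimal sketch of the new idioms, using a toy linear model rather than the repository's LSTM:

```python
import torch
import torch.nn as nn

# Toy stand-in for the repository's LSTM, just to show the 0.4 idioms.
model = nn.Linear(10, 1)
criterion = nn.MSELoss()

x, y = torch.randn(4, 10), torch.randn(4, 1)
loss = criterion(model(x), y)
loss.backward()

# 0.4: a scalar loss is a zero-dim tensor; read it with .item(), not loss[0].
print('loss = {:.4f}'.format(loss.item()))

# 0.4: in-place operations carry a trailing underscore, hence the rename.
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
```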
diff --git a/model.py b/model.py
index 3ef853c..a704b3f 100644
--- a/model.py
+++ b/model.py
@@ -1,6 +1,5 @@
 import torch
 import torch.nn as nn
-from torch.autograd import Variable
 
 from embed_regularize import embedded_dropout
 from locked_dropout import LockedDropout
@@ -99,9 +98,9 @@ def forward(self, input, hidden, return_h=False):
     def init_hidden(self, bsz):
         weight = next(self.parameters()).data
         if self.rnn_type == 'LSTM':
-            return [(Variable(weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_()),
-                    Variable(weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_()))
+            return [(weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_(),
+                    weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_())
                     for l in range(self.nlayers)]
         elif self.rnn_type == 'QRNN' or self.rnn_type == 'GRU':
-            return [Variable(weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_())
+            return [weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid)).zero_()
                     for l in range(self.nlayers)]
diff --git a/splitcross.py b/splitcross.py
index c590b18..cda17dd 100644
--- a/splitcross.py
+++ b/splitcross.py
@@ -37,7 +37,7 @@ def logprob(self, weight, bias, hiddens, splits=None, softmaxed_head_res=None, verbose=False):
         # Perform the softmax calculation for the word vectors in the head for all splits
         # We need to guard against empty splits as torch.cat does not like random lists
         head_res = torch.nn.functional.linear(hiddens, head_weight, bias=head_bias)
-        softmaxed_head_res = torch.nn.functional.log_softmax(head_res)
+        softmaxed_head_res = torch.nn.functional.log_softmax(head_res, dim=-1)
 
         if splits is None:
             splits = list(range(self.nsplits))
@@ -62,7 +62,7 @@ def logprob(self, weight, bias, hiddens, splits=None, softmaxed_head_res=None, verbose=False):
                 # Then we calculate p(tombstone) * p(word in tombstone)
                 # Adding is equivalent to multiplication in log space
                 head_entropy = (softmaxed_head_res[:, -idx]).contiguous()
-                tail_entropy = torch.nn.functional.log_softmax(tail_res)
+                tail_entropy = torch.nn.functional.log_softmax(tail_res, dim=-1)
                 results.append(head_entropy.view(-1, 1) + tail_entropy)
 
         if len(results) > 1:
@@ -129,7 +129,7 @@ def forward(self, weight, bias, hiddens, targets, verbose=False):
         combo = torch.cat([split_hiddens[i] for i in range(self.nsplits) if len(split_hiddens[i])])
         ###
         all_head_res = torch.nn.functional.linear(combo, head_weight, bias=head_bias)
-        softmaxed_all_head_res = torch.nn.functional.log_softmax(all_head_res)
+        softmaxed_all_head_res = torch.nn.functional.log_softmax(all_head_res, dim=-1)
 
         if self.verbose or verbose:
             self.stats[0].append(combo.size()[0] * head_weight.size()[0])
@@ -160,7 +160,7 @@ def forward(self, weight, bias, hiddens, targets, verbose=False):
                 # All indices are shifted - if the first split handles [0,...,499] then the 500th in the second split will be 0 indexed
                 indices = (split_targets[idx] - self.splits[idx]).view(-1, 1)
                 # Warning: if you don't squeeze, you get an N x 1 return, which acts oddly with broadcasting
-                tail_entropy = torch.gather(torch.nn.functional.log_softmax(tail_res), dim=1, index=indices).squeeze()
+                tail_entropy = torch.gather(torch.nn.functional.log_softmax(tail_res, dim=-1), dim=1, index=indices).squeeze()
             entropy = -(head_entropy + tail_entropy)
             ###
             running_offset += len(split_hiddens[idx])
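Every `splitcross.py` edit is the same fix: from 0.4 onward, calling `log_softmax` without `dim` emits a deprecation warning because the implicit dimension choice was ambiguous for multi-dimensional input, so the patch pins `dim=-1`, the vocabulary dimension. A small illustration of why the explicit dimension matters (shapes here are made up for the example):

```python
import torch
import torch.nn.functional as F

# Three hidden states scored against a five-word vocabulary (made-up shapes).
scores = torch.randn(3, 5)

# Normalise over the vocabulary (last) dimension, as the patch now does:
log_probs = F.log_softmax(scores, dim=-1)
print(log_probs.exp().sum(dim=-1))  # each row sums to ~1.0

# dim=0 would normalise across the batch instead -- a silent logic error,
# which is why 0.4 deprecated relying on the implicit default.
```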
diff --git a/utils.py b/utils.py
index 82bb157..20f70ec 100644
--- a/utils.py
+++ b/utils.py
@@ -1,12 +1,15 @@
-from torch.autograd import Variable
+import torch
+
 
 def repackage_hidden(h):
-    """Wraps hidden states in new Variables, to detach them from their history."""
-    if type(h) == Variable:
-        return Variable(h.data)
+    """Wraps hidden states in new Tensors,
+    to detach them from their history."""
+    if isinstance(h, torch.Tensor):
+        return h.detach()
     else:
         return tuple(repackage_hidden(v) for v in h)
 
+
 def batchify(data, bsz, args):
     # Work out how cleanly we can divide the dataset into bsz parts.
     nbatch = data.size(0) // bsz
@@ -18,8 +21,9 @@ def batchify(data, bsz, args):
         data = data.cuda()
     return data
 
+
 def get_batch(source, i, args, seq_len=None, evaluation=False):
     seq_len = min(seq_len if seq_len else args.bptt, len(source) - 1 - i)
-    data = Variable(source[i:i+seq_len], volatile=evaluation)
-    target = Variable(source[i+1:i+1+seq_len].view(-1))
+    data = source[i:i+seq_len]
+    target = source[i+1:i+1+seq_len].view(-1)
     return data, target
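The `utils.py` hunks show the remaining two idioms: `Variable(h.data)` becomes `h.detach()`, and the `volatile=evaluation` flag disappears, since inference in 0.4 is disabled at the call site with a `torch.no_grad()` context instead (the now-unused `evaluation` argument stays in `get_batch` for API compatibility). A hedged sketch of what the caller side looks like, with `model`, `data`, and `hidden` as placeholder names rather than the repository's actual evaluation loop:

```python
import torch

def evaluate_step(model, data, hidden):
    # 0.4: instead of volatile=True Variables, disable autograd for
    # inference with the no_grad context manager.
    with torch.no_grad():
        output, hidden = model(data, hidden)
    # detach() replaces Variable(h.data): same storage, no autograd
    # history, so truncated backprop stops at the batch boundary.
    hidden = tuple(h.detach() for h in hidden)
    return output, hidden
```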