makemore_part_2.py
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
import random
# read in all the words
words = open('names.txt', 'r').read().splitlines()
print(words[:8])
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
print(itos)
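# 'a'..'z' map to indices 1..26; '.' (index 0) doubles as padding and end-of-word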
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?
def build_dataset(words):
    X, Y = [], []
    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            #print(''.join(itos[i] for i in context), '--->', itos[ix])
            context = context[1:] + [ix] # crop and append
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y
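
# With block_size = 3, each word unrolls into (context, target) pairs, e.g.
# "emma" -> ('...', 'e'), ('..e', 'm'), ('.em', 'm'), ('emm', 'a'), ('mma', '.')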
random.seed(42)
random.shuffle(words)
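# 80/10/10 split: train / validation / test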
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))
Xtrain, Ytrain = build_dataset(words[:n1])
Xvalidation, Yvalidation = build_dataset(words[n1:n2])
Xtest, Ytest = build_dataset(words[n2:])
### Neural network
embedding_dimensions = 10
embedding_in_size = block_size * embedding_dimensions
hidden_layer_size = 200
vocab_size = 27 # 26 letters + the '.' token
g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((vocab_size, embedding_dimensions), generator=g)
W1 = torch.randn((embedding_in_size, hidden_layer_size), generator=g)
b1 = torch.randn(hidden_layer_size, generator=g)
W2 = torch.randn((hidden_layer_size, vocab_size), generator=g)
b2 = torch.randn(vocab_size, generator=g)
parameters = [C, W1, b1, W2, b2]
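# Optional sanity check (an addition, not in the original script): total
# parameter count, 11,897 with the sizes above (270 + 6000 + 200 + 5400 + 27).
print(sum(p.nelement() for p in parameters))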
### Train
batch_size = 32
for p in parameters:
    p.requires_grad = True
learning_rate = 0.1
for i in range(10000):
    ## Get batch
    # Sample batch_size random indices into the training set
    ix = torch.randint(0, Xtrain.shape[0], (batch_size,))
    ## Forward pass
    # Embedding layer: (batch_size, block_size, embedding_dimensions)
    emb = C[Xtrain[ix]]
    # Hidden layer: flatten the block_size embeddings into one vector, then tanh
    hidden_layer = torch.tanh(emb.view(-1, embedding_in_size) @ W1 + b1)
    # Logits layer
    logits = hidden_layer @ W2 + b2
    # Loss
    loss = F.cross_entropy(logits, Ytrain[ix])
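    # F.cross_entropy fuses softmax and negative log-likelihood; it is
    # equivalent to (but more stable and efficient than) the manual version:
    #   counts = logits.exp()
    #   probs = counts / counts.sum(1, keepdim=True)
    #   loss = -probs[torch.arange(batch_size), Ytrain[ix]].log().mean()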
    ## Backward pass
    # Zero the gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()
    # Update parameters (plain SGD)
    for p in parameters:
        p.data += -learning_rate * p.grad
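    # A possible refinement (a suggestion, not in the original script): decay
    # the learning rate partway through training, e.g.
    #   learning_rate = 0.1 if i < 5000 else 0.01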
## Final loss on the full training set
emb = C[Xtrain]
h = torch.tanh(emb.view(-1, embedding_in_size) @ W1 + b1)
logits = h @ W2 + b2
final_loss = F.cross_entropy(logits, Ytrain)
print(final_loss)
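
# An addition, not in the original script: the same forward pass on the
# validation split, to compare against the training loss.
emb = C[Xvalidation]
h = torch.tanh(emb.view(-1, embedding_in_size) @ W1 + b1)
logits = h @ W2 + b2
validation_loss = F.cross_entropy(logits, Yvalidation)
print(validation_loss)

# A minimal sampling sketch (also an addition): roll a context window through
# the network and draw the next character from the softmax until the '.'
# token is produced.
g = torch.Generator().manual_seed(2147483647 + 10)
for _ in range(5):
    out = []
    context = [0] * block_size  # start with all '.' padding
    while True:
        emb = C[torch.tensor([context])]  # (1, block_size, embedding_dimensions)
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1)
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        context = context[1:] + [ix]
        if ix == 0:  # '.' ends the word
            break
        out.append(itos[ix])
    print(''.join(out))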