# q_learning_model.py
import math
import random
import numpy as np
from collections import namedtuple, deque
from itertools import count
import game
import torch
import torch.nn as nn
import torch.optim as optim
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
# Uncomment the following if you want to watch the AI play (requires matplotlib):
# import matplotlib
# import matplotlib.pyplot as plt
# is_ipython = 'inline' in matplotlib.get_backend()
# if is_ipython:
#     from IPython import display
#
# plt.ion()
# Use the GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# release all unoccupied GPU cache
if torch.cuda.is_available():
torch.cuda.empty_cache()
# Number of GPUs to use (only applied if GPU count > 1)
gpus_per_trial = 2
# Instantiate board
connect = game.connect4()
## Define Replay Memory
# Define a named tuple that represents a single transition on the board
Transition = namedtuple("Transition",
("state", "action", "next_state", "reward"))
# Define a cyclic buffer of bounded size that holds the most recently observed transitions; it will later
# also serve as the source of training batches
class ReplayMemory(object):
def __init__(self, capacity):
self.memory = deque([], maxlen=capacity) # list-like sequence
# Method for saving a transition
def push(self, *args):
"""Save a transition"""
self.memory.append(Transition(*args))
def sample(self, batch_size):
"""Function that returns a batch of randomly sampled previously saved transition of size batch_size"""
return random.sample(self.memory, batch_size)
# Len method
def __len__(self):
return len(self.memory)
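# A minimal usage sketch of ReplayMemory (kept commented out, like the plotting code above; the tensors
# here are placeholder dummies, not real board states):
# demo_memory = ReplayMemory(capacity=100)
# demo_memory.push(torch.zeros(1, 3, 8, 8), torch.tensor([[0]]),
#                  torch.zeros(1, 3, 8, 8), torch.tensor([1.0]))
# batch = demo_memory.sample(1)  # a list of Transition namedtuples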
# Specify if the model should be saved
save = True
### Description
# The aim is to approximate the optimal Q-value function, denoted Q'(s,a), where s is the state at time
# step t (i.e. the current board) and a is the action taken (i.e. the column selected to drop a chip into).
# If we knew Q'(s,a) we could easily infer the optimal policy P'(s) = argmax_a Q'(s,a). To approximate
# Q'(s,a) we make use of the Bellman equation: Q^P(s,a) = r + g * Q^P(s', P(s')), where g is a discount factor.
# We do the following: create a CNN that takes the current board as input and outputs Q(s, c), with
# c in {1,...,7} being the selectable columns. The network thus outputs the expected return of taking each
# action given the current board, and we can select an action as the argmax of the output.
# To approximate the optimal Q'(s,a) we use the temporal-difference error (temporal in the sense that we
# start in s and move to s'): error = Q(s,a) - (r + g * max_a Q(s',a)), where Q(s,a) comes from our network
# (from now on called the policy network) and max_a Q(s',a) from a copy of it (from now on called the target
# network) whose weights are frozen during training and only refreshed periodically from the policy network.
# We compute the Huber loss of this error and optimize the policy network based on it.
# To train our DQN (Deep Q-Network) we use an epsilon-greedy policy; the probability of choosing a random
# action decays exponentially over time.
# Since Connect 4 is a two-player game we mainly train one player and then copy the network weights to the
# other player after each episode (i.e. one game). This saves memory and speeds up training.
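# As a concrete sketch of the temporal-difference target described above (commented out; `state`,
# `action`, `next_state` and `reward` stand for tensors of the shapes used further below):
# with torch.no_grad():
#     next_q = target_net(next_state).max(1)[0]    # max_a Q(s', a) from the frozen target net
#     td_target = reward + GAMMA * next_q          # r + g * max_a Q(s', a)
#     td_error = policy_net(state).gather(1, action) - td_target.unsqueeze(1)
# The optimize_model function below implements exactly this, plus the masking of final states.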
## Create the Q-Network
# (h, w): image height and width; outputs: number of actions
class DQN(nn.Module):
def __init__(self, h, w, outputs, l1=200, l2=100):
super(DQN, self).__init__()
        self.conv_layer = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=5, stride=2),   # First convolutional layer
            nn.BatchNorm2d(16),                          # Normalise layer output
            nn.ReLU(),                                   # ReLU activation function
            nn.Conv2d(16, 32, kernel_size=5, stride=2),  # Second convolutional layer
            nn.BatchNorm2d(32),                          # Normalise layer output
            nn.ReLU(),                                   # ReLU activation function
            nn.Conv2d(32, 32, kernel_size=5, stride=2),  # Third convolutional layer
            nn.BatchNorm2d(32),                          # Normalise layer output
            nn.ReLU()                                    # ReLU activation function
        )
# Calculate conv_layer output size to be able to pass it through the final linear layer
def conv2d_size_out(size, kernel_size=5, stride=2):
return (size - (kernel_size - 1) - 1) // stride + 1
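        # For example, conv2d_size_out(32) == (32 - 4 - 1) // 2 + 1 == 14, which matches a 32-pixel
        # input passed through one Conv2d(kernel_size=5, stride=2) layer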
# Calculate width and height of conv_layer output based on input w and h
convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
self.linear_input_size = convw * convh * 32
# Final linear layers
self.linear_layers = nn.Sequential(
nn.Linear(self.linear_input_size, l1),
nn.ReLU(),
nn.Linear(l1, l2),
nn.ReLU(),
nn.Linear(l2, outputs)
)
    def forward(self, x):
        x = x.to(device)        # Move the input to the selected device (GPU or CPU)
        x = self.conv_layer(x)  # Pass through the convolutional stack
        # Flatten the conv output so it can be passed through the final linear layers
        return self.linear_layers(x.view(x.size(0), -1))
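# A quick shape sanity check for the network above (commented out; 32x32 is an arbitrary input size for
# which three stride-2 convolutions still leave a positive spatial extent):
# _net = DQN(32, 32, 7)
# _net.eval()  # eval mode, since BatchNorm cannot compute batch statistics on a single 1x1 activation
# print(_net(torch.zeros(1, 3, 32, 32)).shape)  # expected: torch.Size([1, 7])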
### Training
## Hyperparameters and utilities
# Define the epsilon decay and when the target net will be updated
BATCH_SIZE = 80
GAMMA = 0.99 # Discount factor
EPS_START = 0.9  # Starting value of epsilon
EPS_END = 0.05  # Final value of epsilon
EPS_DECAY = 300  # Controls the rate of the exponential decay of epsilon (larger = slower decay)
TARGET_UPDATE = 10
steps_done = 0 # Helper for epsilon decay
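# Purely for illustration (this helper is not used elsewhere), the exploration probability used in
# select_action below follows this schedule:
def eps_at(step):
    """Return the epsilon threshold after `step` action selections."""
    return EPS_END + (EPS_START - EPS_END) * math.exp(-1. * step / EPS_DECAY)
# e.g. eps_at(0) == 0.9, eps_at(300) is roughly 0.36 and eps_at(1000) is roughly 0.08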
# Reward dictionary
rewards = {"win": 50, "draw": 5, "lose": -50, "surviving": 0}
# Get image size (will be constant for all iterations)
_, _, screen_height, screen_width = connect.export_board().shape
# Get the number of actions
n_actions = torch.tensor(connect.actions_performable().shape).item()
# Instantiate networks
policy_net = DQN(screen_height, screen_width, n_actions).to(device)
policy_net_player_2 = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
# Multi GPU
if torch.cuda.device_count() > 1:
policy_net = nn.DataParallel(policy_net)
policy_net_player_2 = nn.DataParallel(policy_net_player_2)
target_net = nn.DataParallel(target_net)
# Give target_net the same parameters as policy_net and set it to eval mode (fixed BatchNorm statistics;
# its weights are only refreshed from policy_net every TARGET_UPDATE episodes). Do the same for player 2.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
policy_net_player_2.load_state_dict(policy_net.state_dict())
policy_net_player_2.eval()
# Create the optimizer; here we use RMSprop with momentum
optimizer = optim.RMSprop(policy_net.parameters(), lr=0.01, momentum=0.90)
# Create a scheduler that reduces the learning rate when the loss stops decreasing for a while
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.05, patience=TARGET_UPDATE)
# Instantiate the replay memory (a single buffer; only the trained player's transitions are stored)
memory = ReplayMemory(10000)
# Initialize counters for player 2's accumulated loss, wins, draws and reward
loss_player_2 = 0
wins_player_2 = 0
draws_player_2 = 0
reward_player_2 = 0
# Define the action selection function based on an epsilon greedy approach
def select_action(state, conn, model):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # The network may propose a column that is already full. We therefore rank all columns by
            # their expected return (topk) and pick the highest-ranked column that is still available.
            actions_proposed = model(state).topk(k=7)[1][0]  # Columns proposed by the network, ordered by value
            available_columns = torch.tensor(np.flatnonzero(np.any(conn.board == " ", 0)))  # Available columns
            # Cycle through the proposed actions until one hits an available column
            avail = False
            k = 0
            while not avail:
                action_proposed = actions_proposed[k].item()
                avail = any(action_proposed == available_columns)
                k += 1
            return actions_proposed[k - 1].view(1, 1)
    else:
        # Random exploration: choose uniformly among the columns that are still available
        return torch.tensor([[random.choice(np.flatnonzero(np.any(conn.board == " ", 0)))]],
                            device=device, dtype=torch.long)
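# Example call (commented out; `state` is the screen-difference tensor built in the main loop below):
# action = select_action(state, connect, policy_net)  # a 1x1 LongTensor holding the chosen column index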
## Training Loop
## Optimization function
# We now define a function that optimizes the model on a batch of transitions, provided enough transitions
# are stored in the memory. Only non-final next states (i.e. states after which the game continued)
# contribute to the bootstrap term; for final states we set the value function to 0.
def optimize_model():
global loss_player_2
if len(memory) < BATCH_SIZE:
return
# Get a Batch of transitions
transitions = memory.sample(BATCH_SIZE)
# Transpose the batch. This converts batch-array of Transitions to Transition of batch-arrays.
batch = Transition(*zip(*transitions))
# Compute a mask of non-final states and concatenate the batch elements
non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)),
device=device, dtype=torch.bool)
non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
state_batch = torch.cat(batch.state)
action_batch = torch.cat(batch.action)
reward_batch = torch.cat(batch.reward)
    # Compute Q(s_t, a): the model computes Q(s_t) for every action, then we select the columns of the
    # actions that were actually taken in each stored transition
state_action_values = policy_net(state_batch).gather(1, action_batch)
# Compute V(s_{t+1}) for all next states. Expected values of actions for non_final_next_states are computed based
# on the "older" target_net; selecting their best reward with max(1)[0]. This is merged based on the mask, such that
# we'll have either the expected state value or 0 in case the state was final.
next_state_values = torch.zeros(BATCH_SIZE, device=device)
next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
# Compute the expected Q values
expected_state_action_values = (next_state_values * GAMMA) + reward_batch
# Compute Huber loss
criterion = nn.SmoothL1Loss()
loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))
    # Accumulate the loss for logging and LR scheduling
loss_player_2 += loss.item()
# Optimize model
optimizer.zero_grad()
loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)  # Clip gradients element-wise to [-1, 1] for stability
optimizer.step()
## Reward calculation
# We now write a function that plays one round, i.e. places a marker based on the action, and then returns
# the reward defined in the rewards dictionary above: +50 for a win, +5 for a draw and 0 for a simple
# placement (the penalty for losing is applied in the main loop).
def reward_getter(action, connect, marker_type):
    connect, win = game.connect_4_train(action, connect, marker_type)
    draw = game.draw_check(connect.board)
    done = any([draw, win])
    if win:
        reward = rewards["win"]
    elif draw:
        reward = rewards["draw"]
    else:
        reward = rewards["surviving"]
    return reward, connect, done
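# Example call (commented out; assumes column 3 is still open on the current board):
# reward_1, connect, done = reward_getter(3, connect, "B")  # player "B" drops a chip into column 3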
# Main loop
num_episodes = 1000
perf_update = 50 # When to receive performance updates during training
episode_durations = [] # count number of moves per game
if __name__ == "__main__":
for i_episode in range(num_episodes):
# Initialize environment and state
connect.reset()
last_screen = connect.export_board()
current_screen = connect.export_board()
state = current_screen - last_screen
for t in count():
# Player 1
# action_player_1 = select_action(state, connect, policy_net_player_2) # Player 1 AI performs action
# reward_1, connect, done = reward_getter(action_player_1, connect, "B") # Player 1 AI gets reward
# reward_1 = torch.tensor([reward_1], device=device) # convert reward to tensor
            # Monkey (random) player 1: choose uniformly among the available columns
            action_player_1 = random.choice(np.flatnonzero(np.any(connect.board == " ", axis=0)))
            reward_1, connect, done = reward_getter(action_player_1, connect, "B")  # Player 1 gets its reward
            reward_1 = torch.tensor([reward_1], device=device)  # Convert reward to tensor
# Player_2 observes new state
last_screen = current_screen
current_screen = connect.export_board()
if not done:
next_state = current_screen - last_screen
else:
next_state = None
# Move to next state
state = next_state
            # If player 1 has already ended the game, player 2 does not need to select an action. Since
            # player 2 could not earn a strictly positive reward in this case, skipping its action
            # selection loses no useful experience.
if done:
episode_durations.append(t + 1)
                if reward_1.item() == rewards["draw"]:  # Safeguard only; a draw cannot follow a player 1 move (see note below)
draws_player_2 += 1 # Increase draws player 2 by one
elif reward_1.item() == rewards["win"]: # If player 1 wins, we punish player 2
reward_player_2 += rewards["lose"]
break
# If player_1 already ended the game this will not get executed
# Player 2
action_player_2 = select_action(state, connect, policy_net) # Player 2 AI performs action
reward_2, connect, done = reward_getter(action_player_2, connect, "R") # Player 2 AI gets reward
reward_2 = torch.tensor([reward_2], device=device) # convert reward to tensor
# Player_1 observes new state
last_screen = current_screen
current_screen = connect.export_board()
if not done:
next_state = current_screen - last_screen
else:
next_state = None
# Store the transition of player 2 in memory
memory.push(state, action_player_2, next_state, reward_2)
# memory_player_2.push(state, action_player_2, next_state, reward_2)
# Move to next state
state = next_state
# Perform one step of the optimization (player 2) and retrieve loss
optimize_model()
if done:
                # Reaching this point means player 2 just ended the game, i.e. player 2 won or the game
                # ended in a draw
                episode_durations.append(t + 1)
                reward_player_2 += reward_2.item()  # Accumulate player 2's reward
if reward_2.item() == rewards["win"]:
wins_player_2 += 1 # Increase wins player 2 by one
elif reward_2.item() == rewards["draw"]:
draws_player_2 += 1 # Increase draws player 2 by one
break
            # Note: since the board has 42 cells and the players alternate (player 1 moves on odd turns,
            # player 2 on even turns), the 42nd and final move is always made by player 2, so a draw can
            # only occur after a player 2 action; that move also sets done = True in the reward_getter
            # function.
        # Update the learning rate if necessary (loss_player_2 accumulates one term per optimizer step,
        # and steps only start once the memory holds BATCH_SIZE transitions, hence the offset; guard
        # against a non-positive divisor in early episodes)
        scheduler.step(loss_player_2 / max(1, sum(episode_durations) - (BATCH_SIZE - 1)))
        # Update the target network, copying all weights and biases, every TARGET_UPDATE episodes
        if (i_episode + 1) % TARGET_UPDATE == 0:
target_net.load_state_dict(policy_net.state_dict())
            # Update player 2's AI (disabled while player 1 plays randomly)
# policy_net_player_2.load_state_dict(policy_net.state_dict())
        # Report cumulative statistics every perf_update episodes: mean loss per optimizer step, mean
        # reward per episode and the win and draw ratios of player 2
        if (i_episode + 1) % perf_update == 0:
            # Guard against a non-positive divisor before any optimizer steps have happened
            div_loss = max(1, sum(episode_durations) - (BATCH_SIZE - 1))
            print("Games played: {}; Mean loss over {} moves: {};\n"
                  "Mean reward: {}; Win ratio of player 2: {}\n"
                  "Draw ratio of player 2: {}; Learning rate: {}"
                  .format(i_episode + 1, sum(episode_durations),
                          round(loss_player_2 / div_loss, 4),
                          round(reward_player_2 / (i_episode + 1), 4),
                          round(wins_player_2 / (i_episode + 1), 4),
                          round(draws_player_2 / (i_episode + 1), 4),
                          optimizer.state_dict()["param_groups"][0]["lr"]))
print("Complete")
# Save the model if specified (note: the path below is machine-specific; adjust it for your setup)
if save:
torch.save(policy_net.state_dict(), "C:/Users/TonyG/Documents/GitHub/connect4/q_learning_mod")