CartPoleTrainExample.py (forked from tjmccue00/Thesis-Code)
import gym
import numpy as np
import matplotlib.pyplot as plt
import time

# This script targets the gym >= 0.26 API: env.reset() returns (obs, info) and
# env.step() returns a 5-tuple. Pass render_mode='human' to gym.make() if you
# want the env.render() calls below to actually draw anything.
env = gym.make('CartPole-v1')

def Qtable(state_space, action_space, bin_size=30):
    # Build bin_size evenly spaced bin edges per state variable. The cart and
    # pole velocities are unbounded in CartPole, so they are clipped to [-4, 4].
    bins = []
    state_bounds = [[-4.8, 4.8], [-4, 4], [-0.418, 0.418], [-4, 4]]
    for i in range(state_space):
        bins.append(np.linspace(state_bounds[i][0], state_bounds[i][1], bin_size))
    # One Q-value per (discretized state, action), initialized uniformly in [-1, 1).
    q_table = np.random.uniform(low=-1, high=1, size=([bin_size] * state_space + [action_space]))
    return q_table, bins
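
# For example (an illustrative note, not in the original script): for CartPole,
# Qtable(4, 2) returns a q_table of shape (30, 30, 30, 30, 2) -- 1,620,000
# entries -- plus a list of four arrays of 30 bin edges each.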

def Discrete(state, bins):
    # Map a continuous observation to a tuple of bin indices usable as a
    # q_table index. np.digitize returns 1-based interval indices, hence the -1.
    disc_states = []
    for i in range(len(state)):
        disc_states.append(np.digitize(state[i], bins[i]) - 1)
    return tuple(disc_states)
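
# For illustration (a sketch, not in the original script): CartPole resets near
# the zero state, so a fresh observation lands in the middle bins, e.g.
#   q_table, bins = Qtable(4, 2)
#   Discrete(env.reset()[0], bins)   # -> roughly (14, 14, 14, 14)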

def Q_learning(q_table, bins, episodes=5000, gamma=0.95, lr=0.1, timestep=100, epsilon=0.2):
    rewards = 0
    solved = False
    steps = 0
    runs = [0]
    data = {'max': [0], 'avg': [0]}
    start = time.time()
    ep = list(range(0, episodes + 1, timestep))  # x-axis points for the reward plot
    for episode in range(1, episodes + 1):
        current_state = Discrete(env.reset()[0], bins)  # discretize the initial observation
        score = 0
        done = False
        while not done:
            steps += 1
            if episode % timestep == 0:
                env.render()  # no-op unless the env was created with a render_mode
            # epsilon-greedy action selection
            if np.random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[current_state])
            observation, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated  # count the 500-step time limit as episode end
            next_state = Discrete(observation, bins)
            score += reward
            if not done:
                # Q-learning update: Q(s,a) <- (1-lr)*Q(s,a) + lr*(r + gamma*max_a' Q(s',a'))
                max_future_q = np.max(q_table[next_state])
                current_q = q_table[current_state + (action,)]
                q_table[current_state + (action,)] = (1 - lr) * current_q + lr * (reward + gamma * max_future_q)
                current_state = next_state
            else:
                # Terminal transition: there is no future value beyond this step.
                current_q = q_table[current_state + (action,)]
                q_table[current_state + (action,)] = (1 - lr) * current_q + lr * reward
        # End-of-episode bookkeeping
        rewards += score
        runs.append(score)
        if score > 195 and steps >= 100 and not solved:  # considered solved
            solved = True
            print('Solved in episode : {} in time {}'.format(episode, time.time() - start))
        # Periodic progress report every `timestep` episodes
        if episode % timestep == 0:
            print('Episode : {} | Reward -> {} | Max reward : {} | Time : {}'.format(
                episode, rewards / timestep, max(runs), time.time() - start))
            data['max'].append(max(runs))
            data['avg'].append(rewards / timestep)
            if rewards / timestep >= 195:
                print('Solved in episode : {}'.format(episode))
            rewards, runs = 0, [0]
    if len(ep) == len(data['max']):
        plt.plot(ep, data['max'], label='Max')
        plt.plot(ep, data['avg'], label='Avg')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.legend(loc="upper left")
        plt.show()
    env.close()

q_table, bins = Qtable(len(env.observation_space.low), env.action_space.n)
Q_learning(q_table, bins, lr=0.15, gamma=0.995, episodes=5**3, timestep=100)  # 5**3 = 125 episodes
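
# --- Illustration only: a minimal greedy-rollout sketch, not part of the original
# --- script. It assumes the training call above has filled q_table, and it uses
# --- the same gym >= 0.26 API as the rest of the file (reset() -> (obs, info),
# --- step() -> 5-tuple). The name eval_env is introduced here for the example.
eval_env = gym.make('CartPole-v1')
obs, _ = eval_env.reset()
state = Discrete(obs, bins)
total_reward, done = 0.0, False
while not done:
    action = int(np.argmax(q_table[state]))  # always exploit the learned values
    obs, reward, terminated, truncated, _ = eval_env.step(action)
    state = Discrete(obs, bins)
    total_reward += reward
    done = terminated or truncated  # stop on failure or at the 500-step cap
print('Greedy rollout reward: {}'.format(total_reward))
eval_env.close()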