This repository has been archived by the owner on Mar 17, 2019. It is now read-only.
forked from openai/gym
circuit2_turtlebot_lidar_qlearn.py
#!/usr/bin/env python
import gym
import gym_gazebo
import time
import numpy
import random
import qlearn
import liveplot
# reduce is a builtin on Python 2; importing it from functools keeps the
# script working on Python 3 as well.
from functools import reduce

def render():
    # Uses the globals x (episode index) and env defined in the __main__ block below.
    render_skip = 0       # Skip first X episodes.
    render_interval = 50  # Show render every Y episodes.
    render_episodes = 10  # Show Z episodes every rendering.

    if (x % render_interval == 0) and (x != 0) and (x > render_skip):
        env.render()
    elif ((x - render_episodes) % render_interval == 0) and (x != 0) and (x > render_skip) and (render_episodes < x):
        env.render(close=True)

if __name__ == '__main__':

    env = gym.make('GazeboCircuit2TurtlebotLidar-v0')

    outdir = '/tmp/gazebo_gym_experiments'
    env = gym.wrappers.Monitor(env, outdir, force=True)
    plotter = liveplot.LivePlot(outdir)

    last_time_steps = numpy.ndarray(0)

    qlearn = qlearn.QLearn(actions=range(env.action_space.n),
                           alpha=0.2, gamma=0.8, epsilon=0.9)
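    # alpha is the learning rate, gamma the discount factor and epsilon the
    # initial exploration rate. qlearn.QLearn is presumably a plain tabular
    # Q-learner keyed by the stringified observation built below; only its
    # chooseAction() and learn() methods are used here.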
    initial_epsilon = qlearn.epsilon

    epsilon_discount = 0.9986

    start_time = time.time()
    total_episodes = 10000
    highest_reward = 0
    for x in range(total_episodes):
        done = False

        cumulated_reward = 0  # Should going forward give more reward than L/R?

        observation = env.reset()

        if qlearn.epsilon > 0.05:
            qlearn.epsilon *= epsilon_discount
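        # Epsilon decays geometrically (0.9 * 0.9986^episode) and stops once it
        # reaches the 0.05 floor, which happens after roughly 2000 of the 10000
        # episodes; the remaining episodes act almost greedily.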
        #render() #defined above, not env.render()

        state = ''.join(map(str, observation))

        for i in range(1500):

            # Pick an action based on the current state
            action = qlearn.chooseAction(state)

            # Execute the action and get feedback
            observation, reward, done, info = env.step(action)
            cumulated_reward += reward

            if highest_reward < cumulated_reward:
                highest_reward = cumulated_reward

            nextState = ''.join(map(str, observation))

            qlearn.learn(state, action, reward, nextState)
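            # The learn() call above is assumed to apply the standard tabular
            # Q-learning update:
            #   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))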
            env._flush(force=True)

            if not done:
                state = nextState
            else:
                last_time_steps = numpy.append(last_time_steps, [int(i + 1)])
                break

        if x % 100 == 0:
            plotter.plot(env)

        m, s = divmod(int(time.time() - start_time), 60)
        h, m = divmod(m, 60)
        print ("EP: "+str(x+1)+" - [alpha: "+str(round(qlearn.alpha,2))+" - gamma: "+str(round(qlearn.gamma,2))+" - epsilon: "+str(round(qlearn.epsilon,2))+"] - Reward: "+str(cumulated_reward)+" Time: %d:%02d:%02d" % (h, m, s))
    # Github table content
    print ("\n|"+str(total_episodes)+"|"+str(qlearn.alpha)+"|"+str(qlearn.gamma)+"|"+str(initial_epsilon)+"*"+str(epsilon_discount)+"|"+str(highest_reward)+"| PICTURE |")

    l = last_time_steps.tolist()
    l.sort()

    #print("Parameters: a="+str)
    print("Overall score: {:0.2f}".format(last_time_steps.mean()))
    print("Best 100 score: {:0.2f}".format(reduce(lambda x, y: x + y, l[-100:]) / len(l[-100:])))

    env.close()