"""
Implementation of the Deep Deterministic Policy Gradient (DDPG) algorithm [1] with TensorFlow for continuous-state and continuous-action spaces.
[1] Lillicrap, Timothy P., et al. "Continuous control with deep reinforcement learning." arXiv preprint arXiv:1509.02971 (2015).
Author: Arthur Bouton [arthur.bouton@gadz.org]
Dependency:
tensorflow 1.13.1
"""
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import numpy as np
from collections import deque
import random
from tqdm import trange

def actor_network_def( states, a_dim ) :
    """ A feedforward neural network for the synthesis of the policy """

    s_dim = states.get_shape().as_list()[1]

    with tf.variable_scope( 'layer1' ) :
        n_units_1 = 400
        wmax = 1/np.sqrt( s_dim )
        bmax = 1/np.sqrt( s_dim )
        w1 = tf.get_variable( 'kernel', [s_dim, n_units_1], tf.float32, tf.initializers.random_uniform( -wmax, wmax ) )
        b1 = tf.get_variable( 'bias', [n_units_1], tf.float32, tf.initializers.random_uniform( -bmax, bmax ) )
        o1 = tf.add( tf.matmul( states, w1 ), b1 )
        a1 = tf.nn.relu( o1 )

    with tf.variable_scope( 'layer2' ) :
        n_units_2 = 300
        wmax = 1/np.sqrt( n_units_1 )
        bmax = 1/np.sqrt( n_units_1 )
        w2 = tf.get_variable( 'kernel', [n_units_1, n_units_2], tf.float32, tf.initializers.random_uniform( -wmax, wmax ) )
        b2 = tf.get_variable( 'bias', [n_units_2], tf.float32, tf.initializers.random_uniform( -bmax, bmax ) )
        o2 = tf.add( tf.matmul( a1, w2 ), b2 )
        a2 = tf.nn.relu( o2 )

    with tf.variable_scope( 'layer3' ) :
        wmax = 0.003
        bmax = 0.003
        w3 = tf.get_variable( 'kernel', [n_units_2, a_dim], tf.float32, tf.initializers.random_uniform( -wmax, wmax ) )
        b3 = tf.get_variable( 'bias', [a_dim], tf.float32, tf.initializers.random_uniform( -bmax, bmax ) )
        o3 = tf.add( tf.matmul( a2, w3 ), b3 )
        action = tf.nn.tanh( o3 )

    return action
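
# Illustrative note: the layer sizes (400 and 300 units), the uniform fan-in
# initialization of the hidden layers and the +/-3e-3 initialization of the
# output layer above follow the architecture described in [1]. A minimal sketch
# of how the actor graph can be built on its own, assuming a hypothetical 3-D
# state and 1-D action:
#
#     states = tf.placeholder( tf.float32, [None, 3], 'States' )
#     with tf.variable_scope( 'Actor' ) :
#         action = actor_network_def( states, 1 )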

def critic_network_def( states, actions ) :
    """ A feedforward neural network for the approximation of the Q-value """

    s_dim = states.get_shape().as_list()[1]
    a_dim = actions.get_shape().as_list()[1]

    with tf.variable_scope( 'layer1' ) :
        n_units_1 = 400
        wmax = 1/np.sqrt( s_dim )
        bmax = 1/np.sqrt( s_dim )
        w1 = tf.get_variable( 'kernel', [s_dim, n_units_1], tf.float32, tf.initializers.random_uniform( -wmax, wmax ) )
        b1 = tf.get_variable( 'bias', [n_units_1], tf.float32, tf.initializers.random_uniform( -bmax, bmax ) )
        o1 = tf.add( tf.matmul( states, w1 ), b1 )
        a1 = tf.nn.relu( o1 )

    with tf.variable_scope( 'layer2' ) :
        n_units_2 = 300
        wmax = 1/np.sqrt( n_units_1 + a_dim )
        bmax = 1/np.sqrt( n_units_1 + a_dim )
        w2 = tf.get_variable( 'kernel', [n_units_1 + a_dim, n_units_2], tf.float32, tf.initializers.random_uniform( -wmax, wmax ) )
        b2 = tf.get_variable( 'bias', [n_units_2], tf.float32, tf.initializers.random_uniform( -bmax, bmax ) )
        o2 = tf.add( tf.matmul( tf.concat( [ a1, actions ], 1 ), w2 ), b2 )
        a2 = tf.nn.relu( o2 )

    with tf.variable_scope( 'layer3' ) :
        wmax = 0.003
        bmax = 0.003
        w3 = tf.get_variable( 'kernel', [n_units_2, 1], tf.float32, tf.initializers.random_uniform( -wmax, wmax ) )
        b3 = tf.get_variable( 'bias', [1], tf.float32, tf.initializers.random_uniform( -bmax, bmax ) )
        Q_value = tf.add( tf.matmul( a2, w3 ), b3 )

    return Q_value
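
# Note that, as in [1], the actions do not enter the critic at the first layer:
# they are concatenated with the first hidden layer's output and only feed the
# second layer, and the Q-value output layer is left linear (no activation).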

class DDPG() :
    """
    Deep Deterministic Policy Gradient algorithm.

    Parameters
    ----------
    s_dim : int
        Dimension of the state space.
    a_dim : int
        Dimension of the action space.
    state_scale : float or list of floats, optional, default: None
        A scalar or a vector to normalize the state.
    action_scale : float or list of floats, optional, default: None
        A scalar or a vector to scale the actions.
    gamma : float, optional, default: 0.99
        Discount factor applied to the reward.
    tau : float, optional, default: 1e-3
        Soft target update factor.
    buffer_size : int, optional, default: 1e6
        Maximal size of the replay buffer.
    minibatch_size : int, optional, default: 64
        Size of each minibatch.
    actor_lr : float, optional, default: 1e-4
        Learning rate of the actor network.
    critic_lr : float, optional, default: 1e-3
        Learning rate of the critic network.
    beta_L2 : float, optional, default: 0
        Ridge regularization coefficient.
    actor_def : function, optional, default: actor_network_def
        Function defining the actor network.
        It has to take the state tensor and the dimension of the
        action space as inputs and return the action tensor.
    critic_def : function, optional, default: critic_network_def
        Function defining the critic network.
        It has to take the state and action tensors as inputs and
        return the Q-value tensor.
    summary_dir : str, optional, default: None
        Directory in which to save the summaries.
        If None, no summaries are created.
    seed : int, optional, default: None
        Random seed for the initialization of random generators.
    sess : tf.Session, optional, default: None
        A TensorFlow session already initialized.
        If None, a new session is created.
    single_thread : bool, optional, default: False
        Whether to force the execution on a single core in order to
        have a deterministic behavior (only effective when sess is None).

    Examples
    --------
    # Fill the replay buffer with transitions:
    ddpg.replay_buffer.append(( state, action, reward, is_terminal, next_state ))

    # Train the networks:
    loss = ddpg.train( nb_iterations )

    # Infer the actions from the actor network:
    action = ddpg.get_action( state )
    """

    def __init__( self, s_dim, a_dim, state_scale=None, action_scale=None,
                  gamma=0.99, tau=1e-3, buffer_size=1e6, minibatch_size=64, actor_lr=1e-4, critic_lr=1e-3, beta_L2=0,
                  actor_def=actor_network_def, critic_def=critic_network_def,
                  summary_dir=None, seed=None, sess=None, single_thread=False ) :

        self.gamma = gamma
        self.minibatch_size = minibatch_size
        self.summaries = summary_dir is not None
        self.n_iter = 0

        # Instantiation of the replay buffer:
        self.replay_buffer = deque( maxlen=int( buffer_size ) )

        random.seed( seed )

        ######################
        # Building the graph #
        ######################

        # Set the graph-level random seed:
        tf.set_random_seed( seed )

        self.states = tf.placeholder( tf.float32, [None, s_dim], 'States' )
        self.actions = tf.placeholder( tf.float32, [None, a_dim], 'Actions' )

        # Scaling of the inputs:
        if state_scale is not None :
            state_scale = tf.constant( state_scale, tf.float32, name='state_scale' )
            scaled_states = tf.divide( self.states, state_scale, 'scale_states' )
        else :
            scaled_states = self.states

        # Declaration of the actor network:
        with tf.variable_scope( 'Actor' ) :
            self.mu_actions = actor_def( scaled_states, a_dim )
            if action_scale is not None :
                action_scale = tf.constant( action_scale, tf.float32, name='action_scale' )
                self.mu_actions = tf.multiply( self.mu_actions, action_scale, 'scale_actions' )
            actor_params = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope=tf.get_variable_scope().name )
            tf.identity( self.mu_actions, name='Actor_Output' )

        # Declaration of the critic network:
        with tf.variable_scope( 'Critic' ) :
            if action_scale is not None :
                scaled_actions = tf.divide( self.actions, action_scale, 'scale_actions' )
            else :
                scaled_actions = self.actions
            self.Q_value = critic_def( scaled_states, scaled_actions )
            critic_params = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope=tf.get_variable_scope().name )

        # Declaration of the target networks:
        with tf.variable_scope( 'Target_Networks' ) :
            with tf.variable_scope( 'Target_Actor' ) :
                target_mu_actions = actor_def( scaled_states, a_dim )
                target_actor_params = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope=tf.get_variable_scope().name )
            with tf.variable_scope( 'Target_Critic' ) :
                self.target_Q_value = critic_def( scaled_states, target_mu_actions )
                target_critic_params = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope=tf.get_variable_scope().name )

        # Soft update of the target network parameters ( P' <- tau*P + ( 1 - tau )*P' ):
        with tf.name_scope( 'Update_Target_Networks' ) :
            sync_target_networks = [ tP.assign( P ) for P, tP in zip( actor_params + critic_params, target_actor_params + target_critic_params ) ]
            self.update_target_actor = [ tP.assign( P*tau + tP*( 1 - tau ) ) for P, tP in zip( actor_params, target_actor_params ) ]
            self.update_target_critic = [ tP.assign( P*tau + tP*( 1 - tau ) ) for P, tP in zip( critic_params, target_critic_params ) ]

        # Backpropagation in the critic network of the target errors:
        self.y = tf.placeholder( tf.float32, [None, 1], 'Targets' )
        with tf.name_scope( 'Critic_Backprop' ) :
            self.L = tf.losses.mean_squared_error( self.y, self.Q_value )
            if beta_L2 > 0 :
                with tf.name_scope( 'L2_regularization' ) :
                    L2 = beta_L2*tf.reduce_mean( [ tf.nn.l2_loss( v ) for v in critic_params if 'kernel' in v.name ] )
                    self.L += L2
            critic_optimizer = tf.train.AdamOptimizer( critic_lr )
            #self.train_critic = critic_optimizer.minimize( self.L, name='critic_backprop' )
            critic_grads_and_vars = critic_optimizer.compute_gradients( self.L, critic_params )
            self.train_critic = critic_optimizer.apply_gradients( critic_grads_and_vars, name='apply_backprop' )

        # Application of the deterministic policy gradient to the actor network,
        # i.e. the batch average of grad_a Q( s, a )|a=mu( s ) * grad_theta mu( s ):
        with tf.name_scope( 'Policy_Gradient' ) :
            gradQ_a = tf.gradients( self.Q_value, self.actions, name='gradQ_a' )
            gradQ_a_N = tf.divide( gradQ_a[0], tf.constant( minibatch_size, tf.float32, name='minibatch_size' ), 'normalize_over_batch' )
            policy_gradients = tf.gradients( self.mu_actions, actor_params, -gradQ_a_N, name='policy_gradients' )
            self.train_actor = tf.train.AdamOptimizer( actor_lr ).apply_gradients( zip( policy_gradients, actor_params ), name='apply_policy_gradients' )

        #######################
        # Setting the session #
        #######################

        if sess is not None :
            self.sess = sess
        else :
            if single_thread :
                sess_config = tf.ConfigProto( intra_op_parallelism_threads=1, inter_op_parallelism_threads=1 )
                self.sess = tf.Session( config=sess_config )
            else :
                self.sess = tf.Session()

        # Initialize variables and saver:
        self.sess.run( tf.global_variables_initializer() )
        self.sess.run( sync_target_networks )
        self.saver = tf.train.Saver()

        # Create the summaries:
        if self.summaries :

            def param_histogram( params ) :
                for var in params :
                    name = var.name.split( ':' )[0]
                    tf.summary.histogram( name, var )

            param_histogram( actor_params )
            param_histogram( critic_params )
            self.wb_summary_op = tf.summary.merge_all()

            self.reward_eval = tf.placeholder( tf.float32, name='reward_eval' )
            reward_summary = tf.summary.scalar( 'Reward', self.reward_eval )
            self.reward_summary_op = tf.summary.merge( [ reward_summary ] )

            L_summary = tf.summary.scalar( 'L', self.L )
            cg_summaries = []
            with tf.name_scope( 'critic_gradient_norms' ) :
                for grad, var in critic_grads_and_vars :
                    name = 'Critic_Gradient/' + var.name.split( '/', 1 )[1].split( ':' )[0]
                    cg_summaries.append( tf.summary.scalar( name, tf.norm( grad ) ) )
            self.critic_summary_op = tf.summary.merge( [ L_summary ] + cg_summaries )

            ag_summaries = []
            with tf.name_scope( 'policy_gradient_norms' ) :
                for grad, var in zip( policy_gradients, actor_params ) :
                    name = 'Policy_Gradient/' + var.name.split( '/', 1 )[1].split( ':' )[0]
                    ag_summaries.append( tf.summary.scalar( name, tf.norm( grad ) ) )
            self.actor_summary_op = tf.summary.merge( ag_summaries )

            self.writer = tf.summary.FileWriter( summary_dir, self.sess.graph )

    def reward_summary( self, reward ) :
        if self.summaries :
            self.writer.add_summary( self.sess.run( self.reward_summary_op, {self.reward_eval: reward} ), self.n_iter )

    def sample_batch( self, batch_size ) :
        batch = random.sample( self.replay_buffer, min( len( self.replay_buffer ), batch_size ) )

        s_batch = np.array( [ _[0] for _ in batch ] )
        a_batch = np.array( [ _[1] if np.shape( _[1] ) else [ _[1] ] for _ in batch ] )
        r_batch = np.array( [ [ _[2] ] for _ in batch ] )
        t_batch = np.array( [ [ _[3] ] for _ in batch ] )
        s2_batch = np.array( [ _[4] for _ in batch ] )

        return s_batch, a_batch, r_batch, t_batch, s2_batch

    def train( self, iterations=1 ) :

        if len( self.replay_buffer ) < self.minibatch_size :
            return 0

        Lt = 0

        for _ in trange( iterations, desc='Training the networks', leave=False ) :

            self.n_iter += 1

            # Randomly pick samples in the replay buffer:
            s, a, r, terminal, s2 = self.sample_batch( self.minibatch_size )

            # Predict the future discounted rewards with the target networks:
            target_q = self.sess.run( self.target_Q_value, {self.states: s2} )

            # Compute the targets for the Q-value, i.e. y = r + gamma*Q'( s', mu'( s' ) ) for non-terminal transitions:
            y = r + self.gamma*target_q*( 1 - terminal )

            # Optimize the critic network according to the targets:
            if self.summaries :
                L, _, critic_summary = self.sess.run( [ self.L, self.train_critic, self.critic_summary_op ], {self.states: s, self.actions: a, self.y: y} )
                self.writer.add_summary( critic_summary, self.n_iter )
            else :
                L, _ = self.sess.run( [ self.L, self.train_critic ], {self.states: s, self.actions: a, self.y: y} )
            Lt += L

            # Apply the sampled policy gradient to the actor network:
            mu_a = self.sess.run( self.mu_actions, {self.states: s} )
            if self.summaries :
                _, actor_summary = self.sess.run( [ self.train_actor, self.actor_summary_op ], {self.states: s, self.actions: mu_a} )
                self.writer.add_summary( actor_summary, self.n_iter )
            else :
                self.sess.run( self.train_actor, {self.states: s, self.actions: mu_a} )

            # Update the target networks:
            self.sess.run( self.update_target_actor )
            self.sess.run( self.update_target_critic )

            if self.summaries and self.wb_summary_op is not None :
                self.writer.add_summary( self.sess.run( self.wb_summary_op ), self.n_iter )
                #self.writer.flush()

        return Lt/iterations

    def get_action( self, s ) :
        mu_a = self.sess.run( self.mu_actions, {self.states: s[np.newaxis, :] if s.ndim < 2 else s} )
        return mu_a.squeeze()

    def get_Q_value( self, s, a ) :
        Q_value = self.sess.run( self.Q_value, {self.states: s[np.newaxis, :] if s.ndim < 2 else s,
                                                self.actions: a[:, np.newaxis] if isinstance( a, np.ndarray ) and a.ndim < 2 else a} )
        return Q_value.squeeze()

    def get_V_value( self, s ) :
        mu_a = self.sess.run( self.mu_actions, {self.states: s[np.newaxis, :] if s.ndim < 2 else s} )
        V_value = self.sess.run( self.Q_value, {self.states: s[np.newaxis, :] if s.ndim < 2 else s, self.actions: mu_a} )
        return V_value.squeeze()

    def save_model( self, filename ) :
        self.saver.save( self.sess, filename )

    def load_model( self, filename ) :
        self.saver.restore( self.sess, filename )

    def save_replay_buffer( self, filename ) :
        with open( filename, 'wb' ) as f :
            import pickle
            pickle.dump( self.replay_buffer, f )

    def load_replay_buffer( self, filename ) :
        try :
            with open( filename, 'rb' ) as f :
                import pickle
                temp_buf = pickle.load( f )
                self.replay_buffer = temp_buf
            return True
        except IOError :
            return False

    def __enter__( self ) :
        return self

    def __exit__( self, type, value, traceback ) :
        self.sess.close()
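

if __name__ == '__main__' :

    # Minimal usage sketch, not part of the original API: everything below (the
    # toy task, the noise scale and the episode lengths) is an illustrative
    # assumption. The agent is trained on a trivial one-dimensional problem in
    # which the reward -( s + a )^2 encourages the actor to output the opposite
    # of the scalar state, so the optimal policy is mu( s ) = -s. Simple Gaussian
    # noise stands in for the Ornstein-Uhlenbeck exploration process used in [1].
    with DDPG( s_dim=1, a_dim=1, seed=0, single_thread=True ) as ddpg :

        for episode in range( 20 ) :

            s = np.random.uniform( -1, 1, 1 )
            for step in range( 50 ) :
                # Perturb the deterministic policy for exploration:
                a = ddpg.get_action( s ) + np.random.normal( 0, 0.1 )
                r = -float( ( s[0] + float( a ) )**2 )
                s2 = np.random.uniform( -1, 1, 1 )
                # Store the transition ( s, a, r, is_terminal, s' ):
                ddpg.replay_buffer.append(( s, a, r, False, s2 ))
                s = s2

            L = ddpg.train( 20 )
            print( 'Episode %i: average critic loss %f' % ( episode, L ) )

        print( 'mu( 0.5 ) =', ddpg.get_action( np.array([ 0.5 ]) ) )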