run-wann-gym.py
#!/usr/bin/env python3
"""
Evolve a weight-agnostic network on a gym environment.
"""
import numpy as np
import joblib
import time
import logging
import copy
import os
import pickle
from networks import WeightAgnosticNetwork
from evolution import reproduce_tournament, rank_by_dominance
from datasets import make_env
import utils
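
# Note: networks, evolution, datasets and utils are assumed to be local modules of this
# repository (providing the WANN model, the evolutionary operators, environment creation and
# bookkeeping helpers), not pip-installable packages.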
# Set up parameters and output dir.
train_weight_values = [-2, -1, -0.5, 0.5, 1, 2]
params = utils.load_params(mode='wann')  # based on command-line input
params['script'] = 'run-wann-gym.py'
writer, out_dir = utils.init_output(params)
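
# The params dict is expected to provide (among others) the keys used throughout this script:
# 'env_name', 'num_inputs', 'num_outputs', 'p_initial_connection_enabled', 'p_change_activation',
# 'p_add_connection', 'p_add_node', 'population_size', 'num_workers', 'num_generations',
# 'num_trials', 'take_argmax_action', 'test_every', 'p_complexity_objective', 'tournament_size',
# 'cull_ratio', 'elite_ratio' and 'num_mutations_per_generation'.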
# Define evaluate function used during evolution.
def evaluate(net, weight_values, num_trials, env_seed=None, return_std=False,
             verbose=False, render=False):
    """
    Evaluate the fitness of the network by running it on a gym environment
    for multiple trials.

    Args:
        net (WeightAgnosticNetwork): The network to evaluate.
        weight_values (iterable of float): The weight values to evaluate on.
        num_trials (int): How many trials to run on the environment per weight value.
        env_seed (int, optional): The base seed for the gym environment. If given, each trial
            still uses a different seed, but the seeds are synced across weight values, i.e. the
            first trials for all weight values use the same seed, the second trials all share
            another seed, etc. If None, the environments aren't seeded (default: None).
        return_std (bool, optional): If True, return the standard deviations across trials
            in addition to the averages across trials (default: False).
        verbose (bool, optional): Verbose mode (default: False).
        render (bool, optional): Render the gym environments (default: False).

    Returns:
        Cumulative reward in mean (mean of all weight values) and max (best weight value) setting,
        averaged over trials. If return_std is True, the standard deviations across trials (again
        in mean and max setting) are returned as well.
    """
    env = make_env(params['env_name'])

    # For each weight value, run multiple trials and record all (cumulative) rewards.
    rewards = np.zeros((len(weight_values), num_trials))
    for i, weight_value in enumerate(weight_values):
        for trial in range(num_trials):
            if env_seed is not None:
                env.seed(env_seed + trial)  # different per trial but synced across weight values
            observation = env.reset()
            done = False
            while not done:
                if render:
                    env.render()
                output = net.forward(observation.reshape(1, -1),
                                     weight_value)  # add (empty) batch dimension
                output = output.reshape(-1)  # get rid of batch dimension again
                if params['take_argmax_action']:
                    action = np.argmax(output)
                else:
                    action = output
                observation, reward, done, info = env.step(action)
                rewards[i, trial] += reward
                if done:
                    env.close()

    # Average rewards over trials.
    reward_per_weight_value = rewards.mean(axis=1)
    reward_per_weight_value_std = rewards.std(axis=1)

    # Mean setting: Average over all weight values.
    mean_reward = reward_per_weight_value.mean()
    mean_reward_std = reward_per_weight_value_std.mean()

    # Max setting: Use best performing weight value.
    max_index = reward_per_weight_value.argmax()
    max_reward = reward_per_weight_value[max_index]
    max_reward_std = reward_per_weight_value_std[max_index]

    if verbose:
        logging.info(
            f'Evaluating net ({net}): mean reward: {mean_reward:.2f} +- {mean_reward_std:.2f}, '
            f'max reward: {max_reward:.2f} +- {max_reward_std:.2f}')

    if return_std:
        return mean_reward, max_reward, mean_reward_std, max_reward_std
    else:
        return mean_reward, max_reward
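
# Example usage of evaluate (a sketch; assumes `net` is a WeightAgnosticNetwork instance and
# that params['env_name'] names an environment that supports rendering):
#
#     mean_reward, max_reward = evaluate(net, weight_values=[-1.0, 0.5, 1.0], num_trials=5,
#                                        env_seed=0, verbose=True, render=True)
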
# Create initial population.
population = [
    WeightAgnosticNetwork(params['num_inputs'], params['num_outputs'],
                          params['p_initial_connection_enabled'], params['p_change_activation'],
                          params['p_add_connection'], params['p_add_node'])
    for _ in range(params['population_size'])]
# Keep track of best network of all generations (= champion).
champion = {'mean_reward': -np.inf}
# Evolution loop.
with joblib.Parallel(n_jobs=params['num_workers']) as parallel:
    for generation in range(params['num_generations']):
        start_time_generation = time.time()

        # Evaluate fitness of all networks. The environment seed is drawn once per generation so
        # that every network is evaluated on the same trials; this makes different mutations
        # directly comparable and significantly improves performance.
        start_time_evaluation = time.time()
        generation_seed = np.random.randint(1000)
        objectives = parallel(
            joblib.delayed(evaluate)(net, train_weight_values, params['num_trials'],
                                     env_seed=generation_seed)
            for net in population)
        objectives = np.array(objectives)  # shape: (population_size, 2)
        mean_rewards = objectives[:, 0]
        max_rewards = objectives[:, 1]
        complexities = np.array([net.get_num_connections() for net in population])
        complexities = np.maximum(complexities, 1)  # prevent division by zero
        time_evaluation = time.time() - start_time_evaluation
        # Pick best net from this generation (based on mean reward) and check
        # if it's better than the previously observed best net (= champion).
        start_time_champion_evaluation = time.time()
        best_index = mean_rewards.argmax()
        if mean_rewards[best_index] > champion['mean_reward']:
            # Check again by running for 96 trials (16 per weight value) to make sure it wasn't a
            # lucky shot. This is a hacky implementation to parallelize across weight values.
            # TODO: Use seed here or not?
            # TODO: Think about making this nicer.
            exact_rewards = parallel(
                joblib.delayed(evaluate)(population[best_index], [weight_value], 16,
                                         return_std=True) for weight_value in train_weight_values)
            exact_rewards = np.array(exact_rewards)
            exact_mean_reward = exact_rewards[:, 0].mean()
            if exact_mean_reward > champion['mean_reward']:
                max_index = exact_rewards[:, 0].argmax()
                champion = {'net': copy.deepcopy(population[best_index]),
                            'mean_reward': exact_mean_reward,
                            'mean_reward_std': exact_rewards[:, 2].mean(),
                            'max_reward': exact_rewards[max_index, 0],
                            'max_reward_std': exact_rewards[max_index, 2]}

                # Save new champion net to file.
                with open(os.path.join(out_dir, 'champion_network.pkl'), 'wb') as f:
                    pickle.dump(champion['net'], f)
        time_champion_evaluation = time.time() - start_time_champion_evaluation
        # Write metrics to log and tensorboard.
        logging.info(
            f'{generation} - Best net: mean reward: {mean_rewards[best_index]:.2f}, '
            f'max reward: {max_rewards[best_index]:.2f} - evaluation: {time_evaluation:.1f} s, '
            f'champion evaluation: {time_champion_evaluation:.1f} s')
        writer.add_scalar('best/mean_reward', mean_rewards[best_index], generation)
        writer.add_scalar('best/max_reward', max_rewards[best_index], generation)

        if generation % params['test_every'] == 0:
            # Run champion net in "test" setting, i.e. with many more trials per weight value.
            if 'test_mean_reward' not in champion:
                # Test evaluation: each weight value used in training, 100 trials per weight value.
                champion['test_mean_reward'], champion['test_max_reward'], \
                    champion['test_mean_reward_std'], champion['test_max_reward_std'] = evaluate(
                        champion['net'], train_weight_values, 100, return_std=True)
            logging.info(f'All-time champion net in train setting: '
                         f'mean reward: {champion["mean_reward"]:.2f} '
                         f'+- {champion["mean_reward_std"]:.2f}, '
                         f'max reward: {champion["max_reward"]:.2f} '
                         f'+- {champion["max_reward_std"]:.2f}')
            logging.info(f'    ...in test setting (100 trials): '
                         f'mean reward: {champion["test_mean_reward"]:.2f} '
                         f'+- {champion["test_mean_reward_std"]:.2f}, '
                         f'max reward: {champion["test_max_reward"]:.2f} '
                         f'+- {champion["test_max_reward_std"]:.2f}')
            writer.add_scalar('champion/mean_reward', champion['mean_reward'], generation)
            writer.add_scalar('champion/max_reward', champion['max_reward'], generation)
            writer.add_scalar('champion/test_mean_reward', champion['test_mean_reward'],
                              generation)
            writer.add_scalar('champion/test_max_reward', champion['test_max_reward'], generation)
            writer.add_scalar('champion/connections', champion['net'].get_num_connections(),
                              generation)
            writer.add_scalar('champion/neurons', champion['net'].num_neurons, generation)
            writer.add_scalar('champion/layers', len(champion['net'].neurons_per_layer),
                              generation)
            utils.log_network_stats(population, writer, generation)

        logging.info('')
        # Rank networks based on the evaluation metrics.
        start_time_ranking = time.time()
        ranks = rank_by_dominance(mean_rewards, max_rewards, complexities,
                                  p_complexity_objective=params['p_complexity_objective'])
        time_ranking = time.time() - start_time_ranking

        # Make new population by picking networks via tournament selection and mutating them.
        start_time_reproduction = time.time()
        new_population = reproduce_tournament(population, ranks, params['tournament_size'],
                                              cull_ratio=params['cull_ratio'],
                                              elite_ratio=params['elite_ratio'],
                                              num_mutations=params['num_mutations_per_generation'])
        population = new_population
        time_reproduction = time.time() - start_time_reproduction

        time_generation = time.time() - start_time_generation
        writer.add_scalar('times/complete_generation', time_generation, generation)
        writer.add_scalar('times/evaluation', time_evaluation, generation)
        writer.add_scalar('times/champion_evaluation', time_champion_evaluation, generation)
        writer.add_scalar('times/ranking', time_ranking, generation)
        writer.add_scalar('times/reproduction', time_reproduction, generation)

writer.close()
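
# Optional follow-up (a sketch): reload the pickled champion and re-evaluate it on the training
# weight values. Assumes at least one generation has run so that 'champion_network.pkl' exists
# in out_dir.
#
#     with open(os.path.join(out_dir, 'champion_network.pkl'), 'rb') as f:
#         reloaded_champion = pickle.load(f)
#     final_mean, final_max = evaluate(reloaded_champion, train_weight_values, num_trials=10,
#                                      verbose=True)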