forked from smearle/control-pcgrl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.py
111 lines (102 loc) · 3.55 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# Prerequisites:
#   pip install tensorflow==1.15
#   Install stable-baselines as described in its documentation.
import model
from model import FullyConvPolicy, FullyConvPolicySmallMap, CustomPolicyBigMap, CustomPolicySmallMap
from utils import get_exp_name, max_exp_idx, load_model, make_vec_envs
#from stable_baselines import PPO2
from policy import PPO2
from stable_baselines.results_plotter import load_results, ts2xy
import tensorflow as tf
import numpy as np
import os
# Module-level state shared by `callback` and `main`.
log_dir = './'               # where `callback` loads monitor results from; rebound by `main`
n_steps = 0                  # number of completed `callback` invocations
best_mean_reward = -np.inf   # highest mean reward `callback` has observed so far
def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2).

    On every 10th invocation the monitor results under the module-level
    ``log_dir`` are loaded. With more than 100 entries, the mean of the last
    100 reward values is compared against ``best_mean_reward`` and the agent
    is saved as either the new best model or the latest model; otherwise only
    the number of monitor entries is printed.

    :param _locals: (dict) must contain the agent under the key 'self'
    :param _globals: (dict) unused
    :return: (bool) always True (returning False would stop training early)
    """
    global n_steps, best_mean_reward

    # Only every 10th call does any reporting / checkpointing work.
    if (n_steps + 1) % 10 == 0:
        timesteps, rewards = ts2xy(load_results(log_dir), 'timesteps')
        if len(timesteps) <= 100:
            # Not enough history yet for a 100-entry average.
            print('{} monitor entries'.format(len(timesteps)))
        else:
            recent_mean = np.mean(rewards[-100:])
            print(timesteps[-1], 'timesteps')
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(best_mean_reward, recent_mean))

            # Decide which checkpoint this call writes: a new best, or just the latest.
            if recent_mean > best_mean_reward:
                best_mean_reward = recent_mean
                notice, checkpoint = "Saving new best model", 'best_model.pkl'
            else:
                notice, checkpoint = "Saving latest model", 'latest_model.pkl'
            print(notice)
            _locals['self'].save(os.path.join(log_dir, checkpoint))

    n_steps += 1
    # Returning False will stop training early
    return True
def main(game, representation, experiment, steps, n_cpu, render, logging, **kwargs):
    """
    Train a PPO2 agent on the ``'{game}-{representation}-v0'`` environment.

    Rebinds the module-level ``log_dir`` to ``runs/<exp_name>_<n>_log`` so that
    ``callback`` reads monitor results from, and saves checkpoints to, the
    directory of the current run.

    :param game: (str) game name; 'binary', 'zelda' and 'sokoban' get a preset ``cropped_size``
    :param representation: (str) representation name; 'wide' selects the FullyConv policies,
        anything else the Custom policies
    :param experiment: (str) experiment label forwarded to ``get_exp_name``
    :param steps: (int or float) total timesteps; truncated with ``int()``
    :param n_cpu: (int) stored as ``kwargs['n_cpu']`` and forwarded to the helpers
    :param render: (bool) forwarded to ``make_vec_envs`` as ``render``
    :param logging: (bool) when true, ``callback`` is passed to ``learn``
    :param kwargs: extra options forwarded to ``get_exp_name`` and ``make_vec_envs``;
        ``resume`` (default False) reuses the latest run directory and tries to
        load its model instead of starting a new run
    """
    global log_dir

    kwargs['n_cpu'] = n_cpu
    env_name = '{}-{}-v0'.format(game, representation)
    exp_name = get_exp_name(game, representation, experiment, **kwargs)
    resume = kwargs.get('resume', False)

    # sokoban uses the small-map variant of whichever policy family applies.
    small_map = game == "sokoban"
    if representation == 'wide':
        policy = FullyConvPolicySmallMap if small_map else FullyConvPolicy
    else:
        policy = CustomPolicySmallMap if small_map else CustomPolicyBigMap

    # Per-game cropped_size; other games leave kwargs untouched.
    cropped_sizes = {"binary": 28, "zelda": 22, "sokoban": 10}
    if game in cropped_sizes:
        kwargs['cropped_size'] = cropped_sizes[game]

    # A fresh run takes the next free index; resuming reuses the highest existing one.
    n = max_exp_idx(exp_name)
    if not resume:
        n += 1
    log_dir = 'runs/{}_{}_{}'.format(exp_name, n, 'log')

    # Named `agent` so the imported `model` module is not shadowed.
    agent = None
    if resume:
        agent = load_model(log_dir)
    else:
        # makedirs also creates a missing 'runs' parent (os.mkdir raised
        # FileNotFoundError on a fresh checkout); it still raises
        # FileExistsError if the run directory already exists.
        os.makedirs(log_dir)

    kwargs = {
        **kwargs,
        'render_rank': 0,
        'render': render,
    }
    # NOTE(review): the previous code computed a `used_dir` that was None when
    # `logging` was false but never passed it anywhere; `log_dir` is what
    # make_vec_envs has always received. Confirm whether disabling monitor
    # output was intended before changing this argument.
    env = make_vec_envs(env_name, representation, log_dir, **kwargs)

    # Build a new agent for fresh runs, or when resuming found nothing to load.
    if agent is None:
        agent = PPO2(policy, env, verbose=1, tensorboard_log="./runs")
    else:
        agent.set_env(env)

    learn_kwargs = {'total_timesteps': int(steps), 'tb_log_name': exp_name}
    if logging:
        learn_kwargs['callback'] = callback
    agent.learn(**learn_kwargs)
################################## MAIN ########################################
# Default experiment configuration used when this file is run as a script.
game, representation, experiment = 'sokoban', 'wide', 'LongConv'
steps = 1e8                     # main() truncates this with int()
n_cpu = 50                      # main() stores this in kwargs['n_cpu']
render, logging = False, True   # logging=True makes main() pass `callback` to learn()
kwargs = dict(resume=False)     # main() reads this via kwargs.get('resume', False)

if __name__ == '__main__':
    main(
        game,
        representation,
        experiment,
        steps,
        n_cpu,
        render,
        logging,
        **kwargs
    )