train_ppo_selfplay.py

import shootout
import os
import gym
import time
import random
import numpy as np
from stable_baselines.ppo1 import PPO1
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import logger
from stable_baselines.common.callbacks import EvalCallback
from shutil import copyfile # keep track of generations
LOGDIR = "models/ppo1_selfplay"

NUM_TIMESTEPS = int(1e9)
EVAL_FREQ = int(1e4)      # evaluate every 10k training steps
EVAL_EPISODES = int(1e2)  # average the evaluation over 100 episodes
BEST_THRESHOLD = 1.6      # mean reward needed before the agent replaces its previous best self

RENDER_MODE = True
class ShootoutSelfPlayEnv(shootout.ShootoutEnv):
  # wrapper over the normal single-player env that loads the best self-play model as the opponent
  def __init__(self):
    super(ShootoutSelfPlayEnv, self).__init__()
    self.policy = self
    self.best_model = None
    self.best_model_filename = None
    self.random_mode = 0

  def predict(self, obs): # the opponent policy
    if self.best_model is None or self.random_mode <= 0.05:
      # no best model yet, or ~5% of episodes: act randomly (round every dim except index 4)
      out = self.action_space.sample()
      for i in range(len(out)):
        if i != 4:
          out[i] = round(out[i])
      return out
    elif self.random_mode <= 0.075:
      # ~2.5% of episodes: do nothing
      return np.array([0, 0, 0, 0, 0, 0, 0, 0], dtype=np.float32)
    elif self.random_mode <= 0.1:
      # ~2.5% of episodes: a fixed action pattern (random value in dim 4, dim 7 held at 1)
      return np.array([0, 0, 0, 0, random.random(), 0, 0, 1], dtype=np.float32)
    else:
      # otherwise play against the current best version of ourselves
      action, _ = self.best_model.predict(obs)
      return action

  def reset(self):
    # load the latest "best" model if a newer one has appeared
    self.random_mode = random.random()
    modellist = [f for f in os.listdir(LOGDIR) if f.startswith("history")]
    modellist.sort()
    # (an alternative, left disabled here: sample a random historical opponent from
    #  LOGDIR/random/ part of the time instead of always taking the latest)
    if len(modellist) > 0:
      filename = os.path.join(LOGDIR, modellist[-1]) # the latest best model
      if filename != self.best_model_filename:
        print("loading model as best: ", filename)
        self.best_model_filename = filename
        if self.best_model is not None:
          del self.best_model
        self.best_model = PPO1.load(filename, env=self)
    return super(ShootoutSelfPlayEnv, self).reset()
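
# Illustrative sketch (not called anywhere by default): how the self-play wrapper is meant
# to be exercised. It assumes the base shootout.ShootoutEnv queries self.policy.predict(obs)
# for the opponent's action on every step, mirroring the slimevolleygym self-play convention;
# the helper name below is hypothetical.
def _selfplay_smoke_test(n_steps=50):
  env = ShootoutSelfPlayEnv()
  obs = env.reset()  # reset() also reloads the newest history_* checkpoint, if any
  for _ in range(n_steps):
    _opponent_action = env.predict(obs)  # what the opponent side would do for this obs
    obs, reward, done, _ = env.step(env.action_space.sample())  # random learner-side action
    if done:
      obs = env.reset()
  env.close()
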
class SelfPlayCallback(EvalCallback):
  # hacked EvalCallback: only saves a new version of the best model if it beats the previous
  # self by BEST_THRESHOLD score; after saving, the best score is reset to BEST_THRESHOLD
  def __init__(self, *args, **kwargs):
    super(SelfPlayCallback, self).__init__(*args, **kwargs)
    self.best_mean_reward = BEST_THRESHOLD
    self.generation = 0

  def _on_step(self) -> bool:
    result = super(SelfPlayCallback, self)._on_step()
    if result and self.best_mean_reward > BEST_THRESHOLD:
      self.generation += 1
      # play one rendered game against the new best model as a sanity check
      env = ShootoutSelfPlayEnv()
      rollout(env, None)
      env.close()
      print("SELFPLAY: mean_reward achieved:", self.best_mean_reward)
      print("SELFPLAY: new best model, bumping up generation to", self.generation)
      source_file = os.path.join(LOGDIR, "best_model.zip")
      backup_file = os.path.join(LOGDIR, "history_" + str(self.generation).zfill(8) + ".zip")
      copyfile(source_file, backup_file)
      self.best_mean_reward = BEST_THRESHOLD
    return result
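
# For reference, the checkpoint layout the callback produces looks like this (a sketch,
# assuming the default LOGDIR):
#   models/ppo1_selfplay/best_model.zip        written by the parent EvalCallback
#   models/ppo1_selfplay/history_00000001.zip  copied in when generation 1 is reached
#   models/ppo1_selfplay/history_00000002.zip  generation 2, and so on
# The hypothetical helper below (not used by the training loop) shows the same "latest
# checkpoint" lookup that ShootoutSelfPlayEnv.reset() and rollout() perform; the zero-padded
# zfill(8) suffix is what keeps a plain string sort in generation order.
def _latest_history_checkpoint(logdir=LOGDIR):
  history = sorted(f for f in os.listdir(logdir) if f.startswith("history"))
  return os.path.join(logdir, history[-1]) if history else None
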
def rollout(env, policy):
  """Play one agent vs the other in a modified gym-style loop."""
  obs = env.reset()
  if policy is None:
    # load the latest self-play model if one exists
    modellist = [f for f in os.listdir(LOGDIR) if f.startswith("history")]
    modellist.sort()
    if len(modellist) > 0:
      filename = os.path.join(LOGDIR, modellist[-1]) # the latest best model
      print("loading model for rollout: ", filename)
      policy = PPO1.load(filename, env=env)
  if policy is None:
    # no model available to roll out with yet
    return 0
  done = False
  total_reward = 0
  while not done:
    action, _states = policy.predict(obs)
    obs, reward, done, _ = env.step(action)
    total_reward += reward
    if RENDER_MODE:
      env.render()
      time.sleep(0.05)
  return total_reward
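
# Illustrative only: averaging rollout() over a handful of episodes to eyeball how the
# latest checkpoint fares against the self-play opponent. The helper name is hypothetical
# and nothing calls it by default.
def _evaluate_latest(n_episodes=5):
  env = ShootoutSelfPlayEnv()
  scores = [rollout(env, None) for _ in range(n_episodes)]
  env.close()
  return float(np.mean(scores))
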
def train(env):
  # train the self-play agent
  logger.configure(folder=LOGDIR)

  model = None
  resume_from_latest = False  # set True to resume from the latest history_* checkpoint
  if resume_from_latest:
    modellist = [f for f in os.listdir(LOGDIR) if f.startswith("history")]
    modellist.sort()
    if len(modellist) > 0:
      filename = os.path.join(LOGDIR, modellist[-1]) # the latest best model
      print("loading model: ", filename)
      model = PPO1.load(filename, env=env)
  if model is None:
    # take mujoco hyperparams (but doubled timesteps_per_actorbatch to cover more steps)
    model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
                 optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)

  eval_callback = SelfPlayCallback(env,
                                   best_model_save_path=LOGDIR,
                                   log_path=LOGDIR,
                                   eval_freq=EVAL_FREQ,
                                   n_eval_episodes=EVAL_EPISODES,
                                   deterministic=False)

  model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

  model.save(os.path.join(LOGDIR, "final_model")) # probably never get to this point.

  env.close()


if __name__ == "__main__":
  env = ShootoutSelfPlayEnv()
  #rollout(env, None)
  train(env)
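
# Usage note (a sketch, assuming stable-baselines 2.x with TensorFlow 1.x and an installed
# `shootout` package providing ShootoutEnv):
#   python train_ppo_selfplay.py
# trains effectively forever (NUM_TIMESTEPS = 1e9), evaluating every EVAL_FREQ steps and
# copying a new history_* checkpoint into models/ppo1_selfplay/ whenever the evaluation
# mean reward against the current best self exceeds BEST_THRESHOLD.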