# visualizer.py (forked from tensorflow/models)
from __future__ import print_function
from builtins import range
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import tensorflow as tf
# import moviepy.editor as mpy
import time, os, traceback, multiprocessing, portalocker, sys
import envwrap
import util
import valuerl, worldmodel
from config import config
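
# Hyperparameters are read from the shared `config` dict; the constants below
# simply alias its entries.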
MODEL_NAME = config["name"]
LOG_PATH = util.create_directory("output/" + config["env"] + "/" + MODEL_NAME + "/" + config["log_path"]) + "/" + MODEL_NAME
LOAD_PATH = util.create_directory("output/" + config["env"] + "/" + MODEL_NAME + "/" + config["save_model_path"])
OBS_DIM = np.prod(config["obs_dims"])
HIDDEN_DIM = config["hidden_dim"]
ACTION_DIM = config["action_dim"]
MAX_FRAMES = config["max_frames"]
REWARD_SCALE = config["reward_scale"]
DISCOUNT = config["discount"]
ALGO = config["policy_config"]["algo"]
AGENT_BATCH_SIZE = config["agent_config"]["batch_size"]
EVALUATOR_BATCH_SIZE = config["evaluator_config"]["batch_size"]
RELOAD_EVERY_N = config["agent_config"]["reload_every_n"]
FRAMES_BEFORE_LEARNING = config["policy_config"]["frames_before_learning"]
FRAMES_PER_UPDATE = config["policy_config"]["frames_per_update"]
LEARNER_EPOCH_N = config["policy_config"]["epoch_n"]
SYNC_UPDATES = config["policy_config"]["frames_per_update"] >= 0
POLICY_BAYESIAN_CONFIG = config["policy_config"]["bayesian"]
AUX_CONFIG = config["aux_config"]
DDPG_EXPLORE_CHANCE = config["policy_config"]["explore_chance"] if ALGO == "ddpg" else 0.
MODEL_AUGMENTED = config["model_config"] is not False
if MODEL_AUGMENTED: MODEL_BAYESIAN_CONFIG = config["model_config"]["bayesian"]
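
# Path of the saved model to visualize. sys.argv[1] and sys.argv[2] are
# presumably consumed elsewhere (e.g. when config.py builds `config`);
# this script only reads argv[3].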
FILENAME = sys.argv[3]

if __name__ == '__main__':
  # Build the policy network and an exploit-mode (greedy) action op for a
  # single observation.
  oprl = valuerl.ValueRL(MODEL_NAME, ALGO, OBS_DIM, ACTION_DIM, HIDDEN_DIM,
                         REWARD_SCALE, DISCOUNT, POLICY_BAYESIAN_CONFIG,
                         AUX_CONFIG, DDPG_EXPLORE_CHANCE)
  obs_loader = tf.placeholder(tf.float32, [1, OBS_DIM])
  # (sic: "evalution" matches the method name this codebase defines in valuerl.py)
  policy_actions, _ = oprl.build_evalution_graph(obs_loader, mode="exploit")

  if MODEL_AUGMENTED:
    # Placeholders for a single (next_obs, reward, done) transition.
    next_obs_loader = tf.placeholder(tf.float32, [1, OBS_DIM])
    reward_loader = tf.placeholder(tf.float32, [1])
    done_loader = tf.placeholder(tf.float32, [1])
    # Note: this rebinds the module name `worldmodel` to the model instance;
    # every later use of `worldmodel` refers to the instance.
    worldmodel = worldmodel.DeterministicWorldModel(MODEL_NAME, OBS_DIM, ACTION_DIM,
                                                    HIDDEN_DIM, REWARD_SCALE,
                                                    DISCOUNT, MODEL_BAYESIAN_CONFIG)
    _, _, _, _, _, confidence, _ = oprl.build_Q_expansion_graph(
        next_obs_loader, reward_loader, done_loader, worldmodel,
        rollout_len=3, model_ensembling=True)
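  # Assumption (inferred from the 4-way loop below): `confidence` evaluates to
  # an array indexable as [0, 0, h] for h in 0..3, one weight per candidate
  # horizon of the Q expansion (rollout_len=3 plus the model-free estimate).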

  sess = tf.Session()
  sess.run(tf.global_variables_initializer())
  oprl.load(sess, FILENAME)
  if MODEL_AUGMENTED: worldmodel.load(sess, FILENAME)
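
  # Run 10 evaluation episodes. When model-augmented, also accumulate a
  # histogram of the confidence weights: one row per horizon (4) and one
  # column per confidence decile (10).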
  env = envwrap.get_env(config["env"])
  hist = np.zeros([4, 10])
  for _ in range(10):
    ts = 0
    rgb_frames = []
    obs, reward, done, reset = env.reset(), 0, False, False
    while not reset:
      # env.internal_env.render()
      # rgb_frames.append(env.internal_env.render(mode='rgb_array'))
      # action = env.action_space.sample()
      # Query the learned policy for a greedy action and clip to [-1, 1].
      all_actions = sess.run(policy_actions, feed_dict={obs_loader: np.array([obs])})
      all_actions = np.clip(all_actions, -1., 1.)
      action = all_actions[0]
      obs, _reward, done, reset = env.step(action)
      if MODEL_AUGMENTED:
        _confidences = sess.run(confidence,
                                feed_dict={next_obs_loader: np.expand_dims(obs, 0),
                                           reward_loader: np.expand_dims(_reward, 0),
                                           done_loader: np.expand_dims(done, 0)})
        # print("%.02f %.02f %.02f %.02f" % tuple(_confidences[0, 0]))
        for h in range(4):
          # Map a confidence in [0, 1] to a decile bucket in 0..9; the 1e-5
          # offset keeps a value of exactly 1.0 in the top bucket.
          bucket = int((_confidences[0, 0, h] - 1e-5) * 10)
          hist[h, bucket] += 1
      reward += _reward
      ts += 1
    # print(ts, _reward, reward)
    print(ts, reward)

  # Normalize each horizon's counts into a distribution over deciles, then print
  # one row per decile (highest confidence first) with one column per horizon.
  hist /= np.sum(hist, axis=1, keepdims=True)
  for row in reversed(hist.T):
    print(' '.join(["%.02f"] * 4) % tuple(row))

  # clip = mpy.ImageSequenceClip(rgb_frames, fps=100)
  # clip.write_videofile(FILENAME + "/movie.mp4")
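
# Example invocation (a sketch; the exact CLI depends on how config.py parses
# sys.argv[1] and sys.argv[2], and <checkpoint_path> is a placeholder):
#   python visualizer.py <config> <config_overrides> <checkpoint_path>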