rollout.py
"""
Storing and manipulating trajectories from an agent.
"""
import random
import numpy as np
class Rollout:
    """
    A batch of trajectory segments. These segments record
    what an agent saw, what actions it took, what rewards
    it received, etc.

    Rollouts provide a useful representation which can be
    passed to or produced from anywhere that deals with
    trajectories. For example, a Rollout can be produced
    from a series of demonstrations, or it can be produced
    by running an agent on a batch of environments for a
    certain number of timesteps. You can pass a Rollout to
    an RL algorithm like PPO, or you can feed it to a
    discriminator like the one in GAIL.

    The dones, obses, states, and model_outs are one
    timestep longer than the other sequences, for
    bootstrapping off the value function.

    The model_outs will vary by the type of model that
    generated the rollouts. Typically each entry will have
    these keys:

      critic: outputs from the value function.
      actor: logits from the policy.
      actions: actions sampled from the policy.
      log_probs: log probabilities of the sampled actions.

    Members have shape [num_steps x batch_size x ...] or
    [(num_steps + 1) x batch_size x ...].
    """

    def __init__(self, states, obses, rews, dones, infos, model_outs):
        self.states = np.array(states, dtype=np.float32)
        self.obses = np.array(obses)
        self.rews = np.array(rews, dtype=np.float32)
        self.dones = np.array(dones, dtype=np.float32)
        self.infos = infos
        self.model_outs = model_outs

    def copy(self):
        return Rollout(self.states, self.obses, self.rews, self.dones, self.infos,
                       self.model_outs)

    @property
    def num_steps(self):
        return len(self.rews)

    @property
    def batch_size(self):
        return len(self.rews[0])

    def value_predictions(self):
        """
        Get the value predictions from the model at each
        timestep.
        """
        return np.array([m['critic'] for m in self.model_outs], dtype=np.float32)

    def actions(self):
        """
        Get the integer actions from the model at each
        timestep.
        """
        return np.array([m['actions'] for m in self.model_outs], dtype=np.int32)

    def log_probs(self):
        """
        Get the initial log probabilities from the model
        at each timestep.
        """
        return np.array([m['log_probs'] for m in self.model_outs], dtype=np.float32)

    def logits(self):
        """
        Get the actor logits from the model at each
        timestep.
        """
        return np.array([m['actor'] for m in self.model_outs], dtype=np.float32)

    def advantages(self, gamma, lam):
        """
        Generate a [num_steps x batch_size] array of
        generalized advantages using GAE.
        """
        values = self.value_predictions()
        result = np.zeros([self.num_steps, self.batch_size], dtype=np.float32)
        current = np.zeros([self.batch_size], dtype=np.float32)
        for t in range(self.num_steps - 1, -1, -1):
            # TD residual: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),
            # with the bootstrap term masked out at episode boundaries.
            delta = self.rews[t] - values[t]
            delta += (1 - self.dones[t + 1]) * gamma * values[t + 1]
            # GAE recursion: A_t = delta_t + gamma * lam * A_{t+1}.
            current *= gamma * lam
            current += delta
            result[t] = current
            # Do not let advantages propagate across episode boundaries.
            current *= (1 - self.dones[t])
        return result

    def batches(self, batch_size, count):
        """
        Yield `count` batches, where each batch is a list
        of (timestep, batch_idx) tuples.
        """
        entries = self.entries()
        for _ in range(count):
            yield [next(entries) for _ in range(batch_size)]

    def entries(self):
        """
        Yield an infinite, shuffled stream of
        (timestep, batch_idx) tuples.
        """
        entries = []
        for t in range(self.num_steps):
            for b in range(self.batch_size):
                entries.append((t, b))
        while True:
            random.shuffle(entries)
            for entry in entries:
                yield entry
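

# A minimal usage sketch, illustrative only: the placeholder data, sizes, and
# hyperparameters below are assumptions, not part of this module. It builds a
# Rollout with the shapes described in the class docstring, computes GAE
# advantages, and samples a few (timestep, batch_idx) batches.
if __name__ == '__main__':
    num_steps, num_envs, obs_size, num_actions = 8, 4, 16, 5

    # Sequences of length num_steps + 1, for bootstrapping.
    states = np.zeros([num_steps + 1, num_envs, 1], dtype=np.float32)
    obses = np.random.randn(num_steps + 1, num_envs, obs_size).astype(np.float32)
    dones = np.zeros([num_steps + 1, num_envs], dtype=np.float32)
    model_outs = [{'critic': np.random.randn(num_envs).astype(np.float32),
                   'actor': np.random.randn(num_envs, num_actions).astype(np.float32),
                   'actions': np.random.randint(num_actions, size=num_envs),
                   'log_probs': np.random.randn(num_envs).astype(np.float32)}
                  for _ in range(num_steps + 1)]

    # Sequences of length num_steps.
    rews = np.random.randn(num_steps, num_envs).astype(np.float32)
    infos = [[{} for _ in range(num_envs)] for _ in range(num_steps)]

    rollout = Rollout(states, obses, rews, dones, infos, model_outs)
    advs = rollout.advantages(gamma=0.99, lam=0.95)
    print('advantages shape:', advs.shape)  # (num_steps, num_envs)
    for batch in rollout.batches(batch_size=6, count=2):
        print('sampled (timestep, batch_idx) pairs:', batch)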