[gym/examples] Improve documentation of RLLib tools.
Alexis Duburcq committed May 16, 2021
1 parent 9719720 commit 2dfd37a
Showing 3 changed files with 98 additions and 23 deletions.
115 changes: 96 additions & 19 deletions python/gym_jiminy/examples/rllib/tools/utilities.py
@@ -99,7 +99,7 @@ def build_callbacks(*callback_mixins: Type) -> DefaultCallbacks:
return type("UnifiedCallbacks", (*callback_mixins, DefaultCallbacks), {})


-def flatten_dict(dt, delimiter="/", prevent_delimiter=False):
+def _flatten_dict(dt, delimiter="/", prevent_delimiter=False):
"""Must be patched to use copy instead of deepcopy to prevent memory
allocation, significantly impeding computational efficiency of `TBXLogger`,
and slowing down the optimization by about 25%.
@@ -130,7 +130,7 @@ def flatten_dict(dt, delimiter="/", prevent_delimiter=False):
return dt


-ray.tune.logger.flatten_dict = flatten_dict
+ray.tune.logger.flatten_dict = _flatten_dict
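The patched body is collapsed in this diff. For reference, a minimal sketch of the shallow-copy variant, modeled on `ray.tune.logger.flatten_dict` under the assumption that the only behavioral change is `copy` instead of `deepcopy` (the `prevent_delimiter` validation is omitted for brevity):

```python
from copy import copy


def _flatten_dict(dt, delimiter="/", prevent_delimiter=False):
    # Shallow copy only: leaf values are shared rather than duplicated,
    # which avoids the per-iteration allocations incurred by `deepcopy`.
    dt = copy(dt)
    while any(isinstance(v, dict) for v in dt.values()):
        add, remove = {}, []
        for key, value in dt.items():
            if isinstance(value, dict):
                for subkey, subvalue in value.items():
                    add[f"{key}{delimiter}{subkey}"] = subvalue
                remove.append(key)
        dt.update(add)
        for key in remove:
            del dt[key]
    return dt
```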


def initialize(num_cpus: int = 0,
@@ -144,10 +144,24 @@ def initialize(num_cpus: int = 0,
It will be used later for almost everything, from the dashboard and
remote/client management to multithreaded environments.
:param num_cpus: Maximum number of CPU threads that can be executed in
parallel. Note that it does not actually reserve any CPU, so several
processes may each reserve up to the total number of threads available
on the system at the same time. 0 for unlimited.
Optional: Unlimited by default.
:param num_gpu: Maximum number of GPU units that can be used, which can be
fractional to allocate only part of the resource. Note that, contrary to
CPU resources, GPU memory is likely to be actually reserved and allocated
by the process, in particular when using the Tensorflow backend. 0 for
unlimited.
Optional: Unlimited by default.
:param log_root_path: Fullpath of the root log directory.
Optional: location of this file / log by default.
:param log_name: Name of the subdirectory in which to save data.
Optional: full date _ hostname by default.
:param debug: Whether or not to display the debugging trace.
Optional: Disabled by default.
:param verbose: Whether or not to print information about what is going on.
Optional: True by default.
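A hypothetical call with explicit resource limits (all values illustrative):

```python
# Cap Ray at 8 logical CPUs and one GPU, and name the log subdirectory
# explicitly instead of the default "<date>_<hostname>".
initialize(num_cpus=8,
           num_gpu=1,
           log_name="cartpole_ppo_seed0",
           verbose=True)
```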
@@ -208,6 +222,7 @@ def train(train_agent: Trainer,
max_timesteps: int = 0,
max_iters: int = 0,
evaluation_period: int = 0,
record_video: bool = True,
verbose: bool = True) -> str:
"""Train a model on a specific environment using a given agent.
@@ -224,11 +239,16 @@ def train(train_agent: Trainer,
Optional: Disabled by default.
:param max_iters: Maximum number of training iterations. 0 to disable.
Optional: Disabled by default.
-:param evaluation_period: Run one simulation without exploration every
-given number of training steps, and save a video
-of the esult in log folder. 0 to disable.
+:param evaluation_period: Run one simulation (without exploration) every
+given number of training steps, and save the log
+file and a video of the result in the log folder
+if requested. 0 to disable.
Optional: Disabled by default.
-:param verbose: Whether or not to print information about what is going on.
+:param record_video: Whether or not to enable video recording during
+evaluation.
+Optional: True by default.
+:param verbose: Whether or not to print high-level information after each
+training iteration.
Optional: True by default.
:returns: Fullpath of agent's final state dump. Note that it also contains
@@ -286,7 +306,10 @@ def train(train_agent: Trainer,
# Record video and log data of the result if requested
iter = result["training_iteration"]
if evaluation_period > 0 and iter % evaluation_period == 0:
record_video_path = f"{train_agent.logdir}/iter_{iter}.mp4"
if record_video:
record_video_path = f"{train_agent.logdir}/iter_{iter}.mp4"
else:
record_video_path = None
env, _ = test(train_agent, explore=False, viewer_kwargs={
"record_video_path": record_video_path,
"scene_name": f"iter_{iter}"})
@@ -310,24 +333,33 @@ def train(train_agent: Trainer,


def compute_action(policy: Policy,
-dist_class: ActionDistribution,
input_dict: Dict[str, np.ndarray],
-explore: bool) -> np.ndarray:
-"""TODO Write documentation.
+explore: bool) -> Any:
+"""Compute the action predicted by the policy.
.. note::
It supports both Pytorch and Tensorflow backends (both eager and
compiled graph modes).
:param policy: `rllib.policy.Policy` to use to predict the action, which is
a thin wrapper around the actual policy model.
:param input_dict: Input dictionary to forward to the policy.
:param explore: Whether or not to enable exploration during sampling of the
action.
"""
if policy.framework == 'torch':
with torch.no_grad():
input_dict = policy._lazy_tensor_dict(input_dict)
action_logits, _ = policy.model(input_dict)
-action_dist = dist_class(action_logits, policy.model)
+action_dist = policy.dist_class(action_logits, policy.model)
if explore:
action_torch = action_dist.sample()
else:
action_torch = action_dist.deterministic_sample()
action = action_torch.cpu().numpy()
elif tf.compat.v1.executing_eagerly():
action_logits, _ = policy.model(input_dict)
-action_dist = dist_class(action_logits, policy.model)
+action_dist = policy.dist_class(action_logits, policy.model)
if explore:
action_tf = action_dist.sample()
else:
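A hypothetical single-step use of this helper, assuming an unbatched observation array `obs` (names illustrative, not from this diff):

```python
import numpy as np

# Forward-pass input expected by the policy model (batch of one).
input_dict = {"obs": np.asarray(obs, dtype=np.float32)[np.newaxis]}

# Deterministic prediction, i.e. without exploration noise.
action = compute_action(policy, input_dict, explore=False)
```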
@@ -350,7 +382,6 @@ def compute_action(policy: Policy,

def evaluate(env: gym.Env,
policy: Policy,
-dist_class: ActionDistribution,
obs_filter_fn: Optional[
Callable[[np.ndarray], np.ndarray]] = None,
n_frames_stack: int = 1,
Expand All @@ -360,7 +391,37 @@ def evaluate(env: gym.Env,
enable_stats: bool = True,
enable_replay: bool = True,
viewer_kwargs: Optional[Dict[str, Any]] = None) -> gym.Env:
"""TODO Write documentation.
"""Evaluate a policy on a given environment over a complete episode.
:param env: Environment on which to evaluate the policy. Note that the
environment must be already instantiated and ready-to-use.
:param policy: Policy to evaluate.
:param obs_filter_fn: Observation filter to apply on the (flattened)
observation from the environment, usually used for moving-average
normalization. `None` to disable.
Optional: Disabled by default.
:param n_frames_stack: Number of frames to stack in the input to provide
to the policy. Note that previous observations, actions, and rewards
will be stacked.
Optional: 1 by default.
:param horizon: Horizon of the simulation, namely the maximum number of
steps before termination. `None` to disable.
Optional: Disabled by default.
:param clip_action: Whether or not to clip the action to make sure the
prediction of the policy is not out-of-bounds.
Optional: Disabled by default.
:param explore: Whether or not to enable exploration during sampling of the
actions predicted by the policy.
Optional: Disabled by default.
:param enable_stats: Whether or not to print high-level statistics after
the simulation.
Optional: Enabled by default.
:param enable_replay: Whether or not to enable replay of the simulation,
and possibly recording through `viewer_kwargs`.
Optional: Enabled by default.
:param viewer_kwargs: Extra keyword arguments to forward to the viewer if
replay has been requested.
"""
# Handling of default arguments
if viewer_kwargs is None:
@@ -392,7 +453,7 @@ def evaluate(env: gym.Env,
if obs_filter_fn is not None:
obs = obs_filter_fn(obs)
input_dict["obs"][0] = obs
-action = compute_action(policy, dist_class, input_dict, explore)
+action = compute_action(policy, input_dict, explore)
if clip_action:
action = clip(action_space, action)
input_dict["prev_n_obs"][0, -1] = input_dict["obs"][0]
@@ -432,11 +493,29 @@ def test(test_agent: Trainer,
test_env: Optional[gym.Env] = None,
viewer_kwargs: Optional[Dict[str, Any]] = None,
**kwargs: Any) -> gym.Env:
"""Test a model on a specific environment using a given agent. It will
render the result in the default viewer.
"""Test a model on a specific environment using a given agent.
.. note::
This function can be terminated early using CTRL+C.
:param test_agent: Agent to evaluate on a single simulation.
:param explore: Whether or not to enable exploration during sampling of the
actions predicted by the policy.
Optional: Disabled by default.
:param n_frames_stack: Number of frames to stack in the input to provide
to the policy. Note that previous observations, actions, and rewards
will be stacked.
Optional: 1 by default.
:param enable_stats: Whether or not to print high-level statistics after
the simulation.
Optional: Enabled by default.
:param enable_replay: Whether or not to enable replay of the simulation,
and possibly recording through `viewer_kwargs`.
Optional: Enabled by default.
:param test_env: Environment on which to evaluate the policy. It must be
already instantiated and ready-to-use.
:param viewer_kwargs: Extra keyword arguments to forward to the viewer if
replay has been requested.
"""
# Instantiate the environment if not provided
if test_env is None:
@@ -445,7 +524,6 @@ def test(test_agent: Trainer,

# Get policy model
policy = test_agent.get_policy()
-dist_class = policy.dist_class
obs_filter = test_agent.workers.local_worker().filters["default_policy"]
if isinstance(obs_filter, NoFilter):
obs_filter_fn = None
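The collapsed `else` branch presumably converts the fitted filter into a plain function. A speculative sketch, assuming a `MeanStdFilter`-like object exposing its running statistics through `obs_filter.rs` (not shown in this diff):

```python
# Speculative: turn a fitted mean/std filter into a stateless
# normalization function consumable by `evaluate`.
obs_mean, obs_std = obs_filter.rs.mean, obs_filter.rs.std
obs_filter_fn = lambda obs: (obs - obs_mean) / (obs_std + 1e-8)
```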
@@ -459,7 +537,6 @@ def test(test_agent: Trainer,

return evaluate(test_env,
policy,
-dist_class,
obs_filter_fn,
n_frames_stack=n_frames_stack,
horizon=test_agent.config["horizon"],
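End to end, a hypothetical session tying these helpers together (the checkpoint path is illustrative):

```python
# Restore a trained agent and replay one episode without exploration,
# recording the result to disk.
test_agent.restore(checkpoint_path)
env = test(test_agent,
           explore=False,
           enable_stats=True,
           viewer_kwargs={"record_video_path": "/tmp/eval.mp4"})
```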
5 changes: 1 addition & 4 deletions python/jiminy_py/src/jiminy_py/state.py
@@ -1,11 +1,8 @@
-from copy import deepcopy
-from collections import defaultdict
-from typing import Optional, Union, Sequence, Dict
+from typing import Optional, Sequence

import numpy as np

-from pinocchio import Force, StdVec_Force


class State:
"""Store the kinematics and dynamics data of the robot at a given time.
1 change: 1 addition & 0 deletions python/jiminy_py/src/jiminy_py/viewer/viewer.py
@@ -1853,6 +1853,7 @@ def display(self,
:param q: Configuration of the robot.
:param v: Velocity of the robot. Used only to update velocity
dependent markers such as DCM. `None` if undefined.
Optional: `None` by default.
:param xyz_offset: Freeflyer position offset. Note that it does not
check whether the robot actually has a freeflyer.
:param update_hook: Callable that will be called right after updating
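A hypothetical call to the documented method, assuming an already-instantiated `viewer` and a configuration vector `q` (illustrative only):

```python
import numpy as np

# Display a robot configuration; `v` may stay `None` when no
# velocity-dependent markers (e.g. the DCM) need updating.
viewer.display(q, v=None, xyz_offset=np.array([0.0, 0.0, 0.1]))
```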
