[all] Fix reproducibility and performance comparison issues. (#399)
* [core] Fix log buffer not being refreshed after calling 'stop', making the last datapoint inaccessible. (#396)
* [python/robot] Fix non-reproducible simulation results.
* [jiminy_py/simulator] Fix running simulation not being stopped before updating the seed.
* [python/simulator] Fix unexpected error if 'replay' is called without data.
* [gym/common] Do not set the seed by default in '_setup' method.
* [gym/common] Add train and eval modes to allow evaluation-specific behaviors (see the usage sketch after this list).
* [gym/common] Rename '_is_training' to 'is_training'. Fix 'is_training' being False by default. (#397)
* [gym/common] 'seed' method now returns the actual sequence of seeds instead of only the first one.
* [gym/common] Remove useless conversion of iterator to list.
* [gym/common] Do not automatically cast the action space to float32. (#400)
* [gym/common] Ensure the internal '_action' buffer has dtype float64. (#400)
* [gym/toolbox] Add 'quat_to_yaw_cos_sin' math helper.
* [gym/rllib] Add L2 regularization loss to PPO.
* [gym/rllib] Use noisy samples instead of the true ones to compute global smoothness.
* [gym/rllib] Make sure the validation is always the same, so that performance is easier to benchmark.
* [gym/rllib] Enable evaluating the policy over multiple trials and reporting statistics. Export data from the best trial only.
* [gym/rllib] Relax ray dependency version requirement.
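
For reference, a minimal usage sketch of the reworked seeding and the new train/eval API. The environment class is an illustrative choice; only 'seed', 'train', 'eval' and 'is_training' come from this changeset.

from gym_jiminy.envs import CartPoleJiminyEnv  # illustrative environment

env = CartPoleJiminyEnv()

# 'seed' now returns the full list of uint32 seeds actually used, not just
# the first one, so a run can be reproduced exactly from the returned value.
seeds = env.seed(42)

# Training mode is enabled by default; switch to evaluation mode to trigger
# evaluation-specific behaviors such as action clipping or filtering.
env.eval()
obs = env.reset()
assert not env.is_training
env.train()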

Co-authored-by: Alexis Duburcq <alexis.duburcq@wandercraft.eu>
duburcqa and Alexis Duburcq authored Aug 18, 2021
1 parent 28449b9 commit 6243e43
Showing 14 changed files with 212 additions and 88 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -2,7 +2,7 @@
cmake_minimum_required(VERSION 3.10)

# Set the build version
set(BUILD_VERSION 1.6.28)
set(BUILD_VERSION 1.6.29)

# Set compatibility
if(CMAKE_VERSION VERSION_GREATER "3.11.0")
3 changes: 3 additions & 0 deletions core/src/engine/EngineMultiRobot.cc
@@ -2162,6 +2162,9 @@ namespace jiminy
// Log current buffer content as final point of the log data
updateTelemetry();

// Clear log data buffer one last time, now that the final point has been added
logData_ = nullptr;

/* Reset the telemetry.
Note that calling `stop` or `reset` does NOT clear
the internal data buffer of telemetryRecorder_.
64 changes: 37 additions & 27 deletions python/gym_jiminy/common/gym_jiminy/common/envs/env_generic.py
@@ -6,12 +6,12 @@
import tempfile
from copy import deepcopy
from collections import OrderedDict
from typing import Optional, Tuple, Sequence, Dict, Any, Callable, List, Union
from typing import Optional, Tuple, Dict, Any, Callable, List, Union

import numpy as np
import gym
from gym import logger, spaces
from gym.utils.seeding import create_seed, _int_list_from_bigint, hash_seed
from gym.utils.seeding import _int_list_from_bigint, hash_seed

import jiminy_py.core as jiminy
from jiminy_py.core import (EncoderSensor as encoder,
@@ -120,10 +120,13 @@ def __init__(self,

# Internal buffers for physics computations
self.rg = np.random.Generator(np.random.Philox())
self._seed: Optional[np.uint32] = None
self._seed: List[np.uint32] = []
self.log_path: Optional[str] = None
self.logfile_action_headers: Optional[FieldDictNested] = None

# Whether or not training mode is active (as opposed to evaluation mode)
self.is_training = True

# Whether or not play interactive mode is active
self._is_interactive = False

@@ -139,7 +142,7 @@ def __init__(self,
self._num_steps_beyond_done: Optional[int] = None

# Initialize the seed of the environment.
# Note that reseting the seed also reset robot internal state.
# Note that resetting the seed also resets the robot internal state.
self.seed()

# Set robot in neutral configuration
@@ -429,15 +432,10 @@ def _refresh_action_space(self) -> None:
command_limit[motor.joint_velocity_idx] = \
MOTOR_EFFORT_MAX

# Set the action space.
# Note that float32 is used instead of float64, because otherwise it
# would requires the neural network to perform float64 computations
# or cast the output for no really advantage since the action is
# directly forwarded to the motors, without intermediary computations.
# Set the action space
action_scale = command_limit[self.robot.motors_velocity_idx]
self.action_space = spaces.Box(low=-action_scale.astype(np.float32),
high=action_scale.astype(np.float32),
dtype=np.float32)
self.action_space = spaces.Box(
low=-action_scale, high=action_scale, dtype=np.float64)

def reset(self,
controller_hook: Optional[Callable[[], Optional[Tuple[
@@ -566,14 +564,9 @@ def reset(self,
# Fallback: Get generic fieldnames otherwise
self.logfile_action_headers = get_fieldnames(
self.action_space, "action")
is_success = register_variables(self.simulator.controller,
self.logfile_action_headers,
self._action)
if not is_success:
self.logfile_action_headers = None
logger.warn(
"Action must have dtype np.float64 to be registered to the "
"telemetry.")
register_variables(self.simulator.controller,
self.logfile_action_headers,
self._action)

# Sample the initial state and reset the low-level engine
qpos, qvel = self._sample_state()
@@ -626,7 +619,7 @@ def reset(self,

return obs

def seed(self, seed: Optional[int] = None) -> Sequence[np.uint32]:
def seed(self, seed: Optional[int] = None) -> List[np.uint32]:
"""Specify the seed of the environment.
.. warning::
@@ -643,16 +636,16 @@ def seed(self, seed: Optional[int] = None) -> Sequence[np.uint32]:
# into sequence of 4 bytes uint32 seeds. Backup only the first one.
# Note that hashing is used to get rid of possible correlation in the
# presence of concurrency.
seed_ints = _int_list_from_bigint(hash_seed(create_seed(seed)))
self._seed = np.uint32(seed_ints[0])
seed = hash_seed(seed)
self._seed = list(map(np.uint32, _int_list_from_bigint(seed)))

# Instantiate a new random number generator based on the provided seed
self.rg = np.random.Generator(np.random.Philox(seed_ints))
self.rg = np.random.Generator(np.random.Philox(self._seed))

# Reset the seed of Jiminy Engine
self.simulator.seed(self._seed)
self.simulator.seed(self._seed[0])

return [self._seed]
return self._seed

def close(self) -> None:
"""Terminate the Python Jiminy engine.
@@ -958,6 +951,24 @@ def _interact(key: Optional[str] = None) -> bool:
# Disable play interactive mode flag
self._is_interactive = False

def train(self) -> None:
"""Sets the environment in training mode.
.. note::
This mode is enabled by default.
"""
self.is_training = True

def eval(self) -> None:
"""Sets the environment in evaluation mode.
This only has an effect on certain environments. See the documentation of
each particular environment for details of its behavior in training and
evaluation modes, if it is affected. It can be used, for instance, to
activate clipping or filtering of the action specifically at evaluation time.
"""
self.is_training = False

# methods to override:
# ----------------------------

@@ -982,7 +993,6 @@ def _setup(self) -> None:
engine_options["stepper"]["iterMax"] = 0
engine_options["stepper"]["timeout"] = 0.0
engine_options["stepper"]["logInternalStepperSteps"] = False
engine_options["stepper"]["randomSeed"] = self._seed
self.simulator.engine.set_options(engine_options)

# Set robot in neutral configuration
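
For context, a standalone sketch of the reworked seeding path used by the environment's 'seed' method. It assumes the gym helpers 'hash_seed' and '_int_list_from_bigint' behave as used above; the function name 'make_generator' is illustrative.

import numpy as np
from gym.utils.seeding import hash_seed, _int_list_from_bigint

def make_generator(seed=None):
    # Hash the (possibly None) seed to break correlations between similar
    # seeds, then split the resulting big integer into uint32 words.
    seed_list = list(map(np.uint32, _int_list_from_bigint(hash_seed(seed))))
    # The whole sequence seeds the Philox bit generator, while only the first
    # word is forwarded to the Jiminy engine ('simulator.seed(seed_list[0])').
    rg = np.random.Generator(np.random.Philox(seed_list))
    return rg, seed_list

rg, seeds = make_generator(42)
print(seeds)              # full seed sequence, as now returned by 'seed'
print(rg.normal(size=3))  # reproducible for a given seed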
13 changes: 6 additions & 7 deletions python/gym_jiminy/common/gym_jiminy/common/utils/helpers.py
@@ -1,6 +1,6 @@
""" TODO: Write documentation.
"""
from typing import Union, Dict, List
from typing import Union, Dict, List, ValuesView

import numpy as np
import numba as nb
@@ -64,7 +64,8 @@ def get_fieldnames(space: spaces.Space,


def register_variables(controller: jiminy.AbstractController,
fields: FieldDictNested,
fields: Union[
ValuesView[FieldDictNested], FieldDictNested],
data: SpaceDictNested,
namespace: str = "") -> bool:
"""Register data from `Gym.Space` to the telemetry of a controller.
@@ -101,8 +102,8 @@ def register_variables(controller: jiminy.AbstractController,
# Default case: data is already a numpy array. Can be registered directly.
if isinstance(data, np.ndarray):
if np.issubsctype(data, np.float64):
assert isinstance(fields, list)
for i, field in enumerate(fields):
assert not isinstance(fields, dict)
if isinstance(fields[i], list):
fields[i] = [".".join(filter(None, (namespace, subfield)))
for subfield in field]
@@ -115,10 +116,8 @@
# Fallback to looping over fields and data iterators
is_success = True
if isinstance(fields, dict):
fields = list(fields.values())
if isinstance(data, dict):
data = list(data.values())
for subfields, value in zip(fields, data):
fields = fields.values()
for subfields, value in zip(fields, data.values()):
assert isinstance(subfields, (dict, list))
is_success = register_variables(
controller, subfields, value, namespace)
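
A pure-Python sketch of the traversal pattern now used by 'register_variables': nested dicts of fields and data are walked in parallel through '.values()' views instead of first being converted to lists. The 'flatten_fields' helper below is illustrative only; the real function registers the variables to the controller telemetry and returns a success flag.

import numpy as np

def flatten_fields(fields, data, namespace=""):
    # Leaf case: 'data' is an array and 'fields' the matching list of names.
    if isinstance(data, np.ndarray):
        return [".".join(filter(None, (namespace, field))) for field in fields]
    # Recursive case: walk the dict values of fields and data in parallel.
    if isinstance(fields, dict):
        fields = fields.values()
    return [name for subfields, value in zip(fields, data.values())
            for name in flatten_fields(subfields, value, namespace)]

fields = {"action": ["motor1", "motor2"]}
data = {"action": np.zeros(2)}
print(flatten_fields(fields, data, "controller"))
# ['controller.motor1', 'controller.motor2']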
5 changes: 3 additions & 2 deletions python/gym_jiminy/common/gym_jiminy/common/utils/spaces.py
@@ -129,9 +129,10 @@ def zeros(space: gym.Space,
if isinstance(space, spaces.Tuple):
return tuple(zeros(subspace, dtype=dtype) for subspace in space.spaces)
if isinstance(space, (spaces.Discrete, spaces.MultiDiscrete)):
return np.array(0) # Using np.array of 0 dim to be mutable
# Note that np.array of 0 dim is returned in order to be mutable
return np.array(0, dtype=dtype or np.int64)
if isinstance(space, spaces.MultiBinary):
return np.zeros(space.n, dtype=np.int8)
return np.zeros(space.n, dtype=dtype or np.int8)
raise NotImplementedError(
f"Space of type {type(space)} is not supported.")

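
A short sketch of the updated 'zeros' behavior for discrete spaces. The import path is inferred from the file location, and the optional 'dtype' argument is assumed to default to None.

import numpy as np
from gym import spaces
from gym_jiminy.common.utils.spaces import zeros

value = zeros(spaces.Discrete(3))   # 0-dim array(0), so it stays mutable in place
value[()] = 2                       # can be updated without reallocating
mask = zeros(spaces.MultiBinary(4), dtype=np.float64)  # dtype override now honored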
2 changes: 1 addition & 1 deletion python/gym_jiminy/envs/gym_jiminy/envs/acrobot.py
@@ -214,7 +214,7 @@ def compute_command(self,

# Compute the actual torque to apply
if not self.continuous:
action = self.AVAIL_CTRL[action]
action = self.AVAIL_CTRL[round(action[()])]
if ACTION_NOISE > 0.0:
action += sample(scale=ACTION_NOISE, rg=self.rg)

2 changes: 1 addition & 1 deletion python/gym_jiminy/envs/gym_jiminy/envs/cartpole.py
@@ -214,7 +214,7 @@ def compute_command(self,

# Compute the actual torque to apply
if not self.continuous:
action = self.AVAIL_CTRL[action]
action = self.AVAIL_CTRL[round(action[()])]

return action

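
A small sketch of why the discrete-action lookup now goes through 'round(action[()])': with the internal '_action' buffer forced to dtype float64, a discrete action arrives as a 0-dim float array, which must be turned into a plain integer before indexing the control table. The values below are illustrative.

import numpy as np

AVAIL_CTRL = [-1.0, 0.0, 1.0]   # illustrative discrete control table

action = np.array(2.0)          # 0-dim float64 buffer, as stored in '_action'
idx = round(action[()])         # extract the scalar, then round to an int
torque = AVAIL_CTRL[idx]        # -> 1.0; indexing with the raw array would fail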
67 changes: 44 additions & 23 deletions python/gym_jiminy/rllib/gym_jiminy/rllib/ppo.py
@@ -12,6 +12,7 @@
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.view_requirement import ViewRequirement
from ray.rllib.utils.typing import TensorType, TrainerConfigDict
from ray.rllib.utils.torch_ops import l2_loss
from ray.rllib.agents.ppo import DEFAULT_CONFIG, PPOTrainer
from ray.rllib.agents.ppo.ppo_torch_policy import (
ppo_surrogate_loss, kl_and_loss_stats, setup_mixins, PPOTorchPolicy)
@@ -23,7 +24,8 @@
"symmetric_policy_reg": 0.0,
"caps_temporal_reg": 0.0,
"caps_spatial_reg": 0.0,
"caps_global_reg": 0.0
"caps_global_reg": 0.0,
"l2_reg": 0.0
},
_allow_unknown_configs=True)

@@ -53,6 +55,7 @@ def ppo_init(policy: Policy,
policy._mean_temporal_caps_loss = 0.0
policy._mean_spatial_caps_loss = 0.0
policy._mean_global_caps_loss = 0.0
policy._l2_reg_loss = 0.0

# Convert to torch.Tensor observation bounds
policy._observation_space_low = \
@@ -110,7 +113,8 @@ def ppo_loss(policy: Policy,
# Append the training batches to the set
train_batches["prev"] = train_batch_copy

if policy.config["caps_spatial_reg"] > 0.0:
if policy.config["caps_spatial_reg"] > 0.0 or \
policy.config["caps_global_reg"] > 0.0:
# Shallow copy the original training batch
train_batch_copy = train_batch.copy(shallow=True)

@@ -211,15 +215,38 @@ def value_function(self, *args: Any, **kwargs: Any) -> torch.Tensor:
action_dist = dist_class(action_logits, model)
action_mean_true = action_dist.deterministic_sample()

# Compute the mean action corresponding to the previous observation
if policy.config["caps_temporal_reg"] > 0.0:
# Compute the mean action corresponding to the previous observation
action_logits_prev = logits["prev"]
if issubclass(dist_class, TorchDiagGaussian):
action_mean_prev, _ = torch.chunk(action_logits_prev, 2, dim=1)
else:
action_dist_prev = dist_class(action_logits_prev, model)
action_mean_prev = action_dist_prev.deterministic_sample()

# Compute the mean action corresponding to the noisy observation
if policy.config["caps_spatial_reg"] > 0.0 or \
policy.config["caps_global_reg"] > 0.0:
action_logits_noisy = logits["noisy"]
if issubclass(dist_class, TorchDiagGaussian):
action_mean_noisy, _ = torch.chunk(action_logits_noisy, 2, dim=1)
else:
action_dist_noisy = dist_class(action_logits_noisy, model)
action_mean_noisy = action_dist_noisy.deterministic_sample()

# Compute the mirrored mean action corresponding to the mirrored action
if policy.config["symmetric_policy_reg"] > 0.0:
action_logits_mirror = logits["mirrored"]
if issubclass(dist_class, TorchDiagGaussian):
action_mean_mirror, _ = torch.chunk(action_logits_mirror, 2, dim=1)
else:
action_dist_mirror = dist_class(action_logits_mirror, model)
action_mean_mirror = action_dist_mirror.deterministic_sample()
action_mirror_mat = policy.action_space.mirror_mat.to(device)
policy.action_space.mirror_mat = action_mirror_mat
action_mean_mirror = action_mean_mirror @ action_mirror_mat

if policy.config["caps_temporal_reg"] > 0.0:
# Minimize the difference between the successive action mean
policy._mean_temporal_caps_loss = torch.mean(
(action_mean_prev - action_mean_true) ** 2)
@@ -229,14 +256,6 @@ def value_function(self, *args: Any, **kwargs: Any) -> torch.Tensor:
policy._mean_temporal_caps_loss

if policy.config["caps_spatial_reg"] > 0.0:
# Compute the mean action corresponding to the noisy observation
action_logits_noisy = logits["noisy"]
if issubclass(dist_class, TorchDiagGaussian):
action_mean_noisy, _ = torch.chunk(action_logits_noisy, 2, dim=1)
else:
action_dist_noisy = dist_class(action_logits_noisy, model)
action_mean_noisy = action_dist_noisy.deterministic_sample()

# Minimize the difference between the original action mean and the
# one corresponding to the noisy observation.
policy._mean_spatial_caps_loss = torch.mean(
@@ -248,24 +267,13 @@ def value_function(self, *args: Any, **kwargs: Any) -> torch.Tensor:

if policy.config["caps_global_reg"] > 0.0:
# Minimize the magnitude of action mean
policy._mean_global_caps_loss = torch.mean(action_mean_true ** 2)
policy._mean_global_caps_loss = torch.mean(action_mean_noisy ** 2)

# Add global smoothness loss to total loss
total_loss += policy.config["caps_global_reg"] * \
policy._mean_global_caps_loss

if policy.config["symmetric_policy_reg"] > 0.0:
# Compute the mirrored mean action corresponding to the mirrored action
action_logits_mirror = logits["mirrored"]
if issubclass(dist_class, TorchDiagGaussian):
action_mean_mirror, _ = torch.chunk(action_logits_mirror, 2, dim=1)
else:
action_dist_mirror = dist_class(action_logits_mirror, model)
action_mean_mirror = action_dist_mirror.deterministic_sample()
action_mirror_mat = policy.action_space.mirror_mat.to(device)
policy.action_space.mirror_mat = action_mirror_mat
action_mean_mirror = action_mean_mirror @ action_mirror_mat

# Minimize the asymmetry of policy output
policy._mean_symmetric_policy_loss = torch.mean(
(action_mean_mirror - action_mean_true) ** 2)
@@ -274,6 +282,17 @@ def value_function(self, *args: Any, **kwargs: Any) -> torch.Tensor:
total_loss += policy.config["symmetric_policy_reg"] * \
policy._mean_symmetric_policy_loss

if policy.config["l2_reg"] > 0.0:
# Add actor l2-regularization loss
l2_reg_loss = 0.0
for name, params in model.state_dict().items():
if "bias" not in name:
l2_reg_loss += l2_loss(params)
policy._l2_reg_loss = l2_reg_loss

# Add l2-regularization loss to total loss
total_loss += policy.config["l2_reg"] * policy._l2_reg_loss

return total_loss


@@ -293,6 +312,8 @@ def ppo_stats(policy: Policy,
stats_dict["spatial_smoothness"] = policy._mean_spatial_caps_loss
if policy.config["caps_global_reg"] > 0.0:
stats_dict["global_smoothness"] = policy._mean_global_caps_loss
if policy.config["l2_reg"] > 0.0:
stats_dict["l2_reg"] = policy._l2_reg_loss

return stats_dict

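
A standalone sketch of the added actor L2 regularization term, written in plain PyTorch. Ray's 'l2_loss' helper is assumed to compute 0.5 * sum(x ** 2); the sketch iterates over 'named_parameters' so that gradients flow through the penalty.

import torch

def l2_regularization(model: torch.nn.Module) -> torch.Tensor:
    # Sum 0.5 * ||w||^2 over every non-bias parameter, mirroring the filter
    # on the parameter name applied in 'ppo_loss'.
    reg = torch.zeros(())
    for name, params in model.named_parameters():
        if "bias" not in name:
            reg = reg + 0.5 * torch.sum(params ** 2)
    return reg

# total_loss = surrogate_loss + config["l2_reg"] * l2_regularization(model)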