From 6243e43237e4c9e5eceacc413cbc44f0c2008193 Mon Sep 17 00:00:00 2001 From: Alexis DUBURCQ Date: Wed, 18 Aug 2021 22:14:18 +0200 Subject: [PATCH] [all] Fix reproducibility and performance comparison issues. (#399) * [core] Fix log buffer not refreshed after calling 'stop', making the last datapoint inaccessible. (#396) * [python/robot] Fix non-reproducible simulation results. * [jiminy_py/simulator] Fix running simulation not stopped before updating seed. * [python/simulator] Fix unexpected error if 'replay' is called without data. * [gym/common] Do not set the seed by default in '_setup' method. * [gym/common] Add train and eval modes to allow specific evaluation behaviors. * [gym/common] Rename '_is_training' to 'is_training'. Fix 'is_training' False by default. (#397) * [gym/common] 'seed' method now returns the actual sequence of seeds instead of only the first one. * [gym/common] Remove useless conversion of iterator to list. * [gym/common] Do not automatically cast the action space to float32. (#400) * [gym/common] Ensure internal '_action' buffer dtype float64. (#400) * [gym/toolbox] Add 'quat_to_yaw_cos_sin' math helper. * [gym/rllib] Add l2 regularization loss to PPO. * [gym/rllib] Use noisy samples instead of the true ones to compute global smoothness. * [gym/rllib] Make sure the validation is always the same to make it easier to benchmark performance. * [gym/rllib] Enable evaluating the policy on multiple trials and reporting statistics. Export data from best trial only. * [gym/rllib] Relax ray dependency version requirement. Co-authored-by: Alexis Duburcq --- CMakeLists.txt | 2 +- core/src/engine/EngineMultiRobot.cc | 3 + .../gym_jiminy/common/envs/env_generic.py | 64 ++++++++------ .../common/gym_jiminy/common/utils/helpers.py | 13 ++- .../common/gym_jiminy/common/utils/spaces.py | 5 +- .../envs/gym_jiminy/envs/acrobot.py | 2 +- .../envs/gym_jiminy/envs/cartpole.py | 2 +- .../gym_jiminy/rllib/gym_jiminy/rllib/ppo.py | 67 +++++++++----- .../rllib/gym_jiminy/rllib/utilities.py | 87 ++++++++++++++++--- python/gym_jiminy/rllib/setup.py | 3 +- .../gym_jiminy/toolbox/math/__init__.py | 6 +- .../gym_jiminy/toolbox/math/generic.py | 21 +++-- python/jiminy_py/src/jiminy_py/robot.py | 13 ++- python/jiminy_py/src/jiminy_py/simulator.py | 12 +-- 14 files changed, 212 insertions(+), 88 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 155849e99..97cc4dd51 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.10) # Set the build version -set(BUILD_VERSION 1.6.28) +set(BUILD_VERSION 1.6.29) # Set compatibility if(CMAKE_VERSION VERSION_GREATER "3.11.0") diff --git a/core/src/engine/EngineMultiRobot.cc b/core/src/engine/EngineMultiRobot.cc index 2e492b52a..96dc33cea 100644 --- a/core/src/engine/EngineMultiRobot.cc +++ b/core/src/engine/EngineMultiRobot.cc @@ -2162,6 +2162,9 @@ namespace jiminy // Log current buffer content as final point of the log data updateTelemetry(); + // Clear log data buffer one last time, now that the final point has been added + logData_ = nullptr; + /* Reset the telemetry. Note that calling ``stop` or `reset` does NOT clear the internal data buffer of telemetryRecorder_.
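Editor's note: as a quick end-to-end check of the headline claim (reproducibility), a reviewer could run something like the sketch below. It is not part of the patch; the environment id, episode length, and array-valued observations are assumptions, so adapt it to any registered gym_jiminy task.

    # Reproducibility check sketch: two rollouts driven by the same seed and
    # the same sampled actions should now produce identical trajectories.
    import gym
    import numpy as np
    import gym_jiminy.envs  # noqa: F401  (assumed to register the gym_jiminy tasks)

    def rollout(seed: int, n_steps: int = 200) -> np.ndarray:
        env = gym.make("gym_jiminy.envs:cartpole-v0")  # hypothetical env id
        env.seed(seed)
        env.action_space.seed(seed)  # make the sampled actions identical as well
        obs, frames = env.reset(), []
        for _ in range(n_steps):
            obs, _, done, _ = env.step(env.action_space.sample())
            # Flatten the observation so dict/tuple spaces are handled too
            frames.append(gym.spaces.flatten(env.observation_space, obs))
            if done:
                break
        env.close()
        return np.stack(frames)

    assert np.array_equal(rollout(42), rollout(42))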
diff --git a/python/gym_jiminy/common/gym_jiminy/common/envs/env_generic.py b/python/gym_jiminy/common/gym_jiminy/common/envs/env_generic.py index 29a8dd822..093e7a1ec 100644 --- a/python/gym_jiminy/common/gym_jiminy/common/envs/env_generic.py +++ b/python/gym_jiminy/common/gym_jiminy/common/envs/env_generic.py @@ -6,12 +6,12 @@ import tempfile from copy import deepcopy from collections import OrderedDict -from typing import Optional, Tuple, Sequence, Dict, Any, Callable, List, Union +from typing import Optional, Tuple, Dict, Any, Callable, List, Union import numpy as np import gym from gym import logger, spaces -from gym.utils.seeding import create_seed, _int_list_from_bigint, hash_seed +from gym.utils.seeding import _int_list_from_bigint, hash_seed import jiminy_py.core as jiminy from jiminy_py.core import (EncoderSensor as encoder, @@ -120,10 +120,13 @@ def __init__(self, # Internal buffers for physics computations self.rg = np.random.Generator(np.random.Philox()) - self._seed: Optional[np.uint32] = None + self._seed: List[np.uint32] = [] self.log_path: Optional[str] = None self.logfile_action_headers: Optional[FieldDictNested] = None + # Whether or not evaluation mode is active + self.is_training = True + # Whether or not play interactive mode is active self._is_interactive = False @@ -139,7 +142,7 @@ def __init__(self, self._num_steps_beyond_done: Optional[int] = None # Initialize the seed of the environment. - # Note that reseting the seed also reset robot internal state. + # Note that resetting the seed also reset robot internal state. self.seed() # Set robot in neutral configuration @@ -429,15 +432,10 @@ def _refresh_action_space(self) -> None: command_limit[motor.joint_velocity_idx] = \ MOTOR_EFFORT_MAX - # Set the action space. - # Note that float32 is used instead of float64, because otherwise it - # would requires the neural network to perform float64 computations - # or cast the output for no really advantage since the action is - # directly forwarded to the motors, without intermediary computations. + # Set the action space action_scale = command_limit[self.robot.motors_velocity_idx] - self.action_space = spaces.Box(low=-action_scale.astype(np.float32), - high=action_scale.astype(np.float32), - dtype=np.float32) + self.action_space = spaces.Box( + low=-action_scale, high=action_scale, dtype=np.float64) def reset(self, controller_hook: Optional[Callable[[], Optional[Tuple[ @@ -566,14 +564,9 @@ def reset(self, # Fallback: Get generic fieldnames otherwise self.logfile_action_headers = get_fieldnames( self.action_space, "action") - is_success = register_variables(self.simulator.controller, - self.logfile_action_headers, - self._action) - if not is_success: - self.logfile_action_headers = None - logger.warn( - "Action must have dtype np.float64 to be registered to the " - "telemetry.") + register_variables(self.simulator.controller, + self.logfile_action_headers, + self._action) # Sample the initial state and reset the low-level engine qpos, qvel = self._sample_state() @@ -626,7 +619,7 @@ def reset(self, return obs - def seed(self, seed: Optional[int] = None) -> Sequence[np.uint32]: + def seed(self, seed: Optional[int] = None) -> List[np.uint32]: """Specify the seed of the environment. .. warning:: @@ -643,16 +636,16 @@ def seed(self, seed: Optional[int] = None) -> Sequence[np.uint32]: # into sequence of 4 bytes uint32 seeds. Backup only the first one. # Note that hashing is used to get rid off possible correlation in the # presence of concurrency. 
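Editor's note: the rewritten `seed` body shown next expands a single integer seed into a sequence of 32-bit words. A standalone sketch of that expansion, using the same legacy `gym.utils.seeding` helpers that the patch relies on, looks like this:

    # Expand one integer seed into uncorrelated uint32 words: the full list
    # seeds numpy's Philox generator, the first word seeds the Jiminy engine.
    import numpy as np
    from gym.utils.seeding import hash_seed, _int_list_from_bigint

    seed_words = list(map(np.uint32, _int_list_from_bigint(hash_seed(42))))
    rg = np.random.Generator(np.random.Philox(seed_words))
    engine_seed = seed_words[0]
    print(seed_words, engine_seed, rg.random())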
- seed_ints = _int_list_from_bigint(hash_seed(create_seed(seed))) - self._seed = np.uint32(seed_ints[0]) + seed = hash_seed(seed) + self._seed = list(map(np.uint32, _int_list_from_bigint(seed))) # Instantiate a new random number generator based on the provided seed - self.rg = np.random.Generator(np.random.Philox(seed_ints)) + self.rg = np.random.Generator(np.random.Philox(self._seed)) # Reset the seed of Jiminy Engine - self.simulator.seed(self._seed) + self.simulator.seed(self._seed[0]) - return [self._seed] + return self._seed def close(self) -> None: """Terminate the Python Jiminy engine. @@ -958,6 +951,24 @@ def _interact(key: Optional[str] = None) -> bool: # Disable play interactive mode flag self._is_interactive = False + def train(self) -> None: + """Sets the environment in training mode. + + .. note:: + This mode is enabled by default. + """ + self.is_training = True + + def eval(self) -> None: + """Sets the environment in evaluation mode. + + This has any effect only on certain environment. See documentations of + particular environment for details of their behaviors in training and + evaluation modes, if they are affected. It can be used to activate + clipping or some filtering of the action specifical at evaluation time. + """ + self.is_training = False + # methods to override: # ---------------------------- @@ -982,7 +993,6 @@ def _setup(self) -> None: engine_options["stepper"]["iterMax"] = 0 engine_options["stepper"]["timeout"] = 0.0 engine_options["stepper"]["logInternalStepperSteps"] = False - engine_options["stepper"]["randomSeed"] = self._seed self.simulator.engine.set_options(engine_options) # Set robot in neutral configuration diff --git a/python/gym_jiminy/common/gym_jiminy/common/utils/helpers.py b/python/gym_jiminy/common/gym_jiminy/common/utils/helpers.py index 2f8f2d670..ef8ede40c 100644 --- a/python/gym_jiminy/common/gym_jiminy/common/utils/helpers.py +++ b/python/gym_jiminy/common/gym_jiminy/common/utils/helpers.py @@ -1,6 +1,6 @@ """ TODO: Write documentation. """ -from typing import Union, Dict, List +from typing import Union, Dict, List, ValuesView import numpy as np import numba as nb @@ -64,7 +64,8 @@ def get_fieldnames(space: spaces.Space, def register_variables(controller: jiminy.AbstractController, - fields: FieldDictNested, + fields: Union[ + ValuesView[FieldDictNested], FieldDictNested], data: SpaceDictNested, namespace: str = "") -> bool: """Register data from `Gym.Space` to the telemetry of a controller. @@ -101,8 +102,8 @@ def register_variables(controller: jiminy.AbstractController, # Default case: data is already a numpy array. Can be registered directly. 
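Editor's note: the new `train`/`eval` switch is meant to be toggled by evaluation code, as `gym_jiminy.rllib.utilities.evaluate` does later in this patch. A minimal, self-contained sketch of that usage pattern (the `policy` callable is a placeholder) might look like:

    from typing import Callable
    import gym

    def evaluate_once(env: gym.Env, policy: Callable, seed: int = 0) -> float:
        """Roll out one episode in evaluation mode, then restore training mode."""
        was_training = getattr(env, "is_training", True)
        if was_training:
            env.eval()                  # evaluation-specific behavior, if any
        env.seed(seed)
        obs, done, total_reward = env.reset(), False, 0.0
        while not done:
            obs, reward, done, _ = env.step(policy(obs))
            total_reward += reward
        if was_training:
            env.train()                 # restore the default training mode
        return total_reward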
if isinstance(data, np.ndarray): if np.issubsctype(data, np.float64): + assert isinstance(fields, list) for i, field in enumerate(fields): - assert not isinstance(fields, dict) if isinstance(fields[i], list): fields[i] = [".".join(filter(None, (namespace, subfield))) for subfield in field] @@ -115,10 +116,8 @@ def register_variables(controller: jiminy.AbstractController, # Fallback to looping over fields and data iterators is_success = True if isinstance(fields, dict): - fields = list(fields.values()) - if isinstance(data, dict): - data = list(data.values()) - for subfields, value in zip(fields, data): + fields = fields.values() + for subfields, value in zip(fields, data.values()): assert isinstance(subfields, (dict, list)) is_success = register_variables( controller, subfields, value, namespace) diff --git a/python/gym_jiminy/common/gym_jiminy/common/utils/spaces.py b/python/gym_jiminy/common/gym_jiminy/common/utils/spaces.py index 9bbaba624..3c8b6dece 100644 --- a/python/gym_jiminy/common/gym_jiminy/common/utils/spaces.py +++ b/python/gym_jiminy/common/gym_jiminy/common/utils/spaces.py @@ -129,9 +129,10 @@ def zeros(space: gym.Space, if isinstance(space, spaces.Tuple): return tuple(zeros(subspace, dtype=dtype) for subspace in space.spaces) if isinstance(space, (spaces.Discrete, spaces.MultiDiscrete)): - return np.array(0) # Using np.array of 0 dim to be mutable + # Note that np.array of 0 dim is returned in order to be mutable + return np.array(0, dtype=dtype or np.int64) if isinstance(space, spaces.MultiBinary): - return np.zeros(space.n, dtype=np.int8) + return np.zeros(space.n, dtype=dtype or np.int8) raise NotImplementedError( f"Space of type {type(space)} is not supported.") diff --git a/python/gym_jiminy/envs/gym_jiminy/envs/acrobot.py b/python/gym_jiminy/envs/gym_jiminy/envs/acrobot.py index a37b0cac7..b4148b936 100644 --- a/python/gym_jiminy/envs/gym_jiminy/envs/acrobot.py +++ b/python/gym_jiminy/envs/gym_jiminy/envs/acrobot.py @@ -214,7 +214,7 @@ def compute_command(self, # Compute the actual torque to apply if not self.continuous: - action = self.AVAIL_CTRL[action] + action = self.AVAIL_CTRL[round(action[()])] if ACTION_NOISE > 0.0: action += sample(scale=ACTION_NOISE, rg=self.rg) diff --git a/python/gym_jiminy/envs/gym_jiminy/envs/cartpole.py b/python/gym_jiminy/envs/gym_jiminy/envs/cartpole.py index e5a813a20..89095bcb0 100644 --- a/python/gym_jiminy/envs/gym_jiminy/envs/cartpole.py +++ b/python/gym_jiminy/envs/gym_jiminy/envs/cartpole.py @@ -214,7 +214,7 @@ def compute_command(self, # Compute the actual torque to apply if not self.continuous: - action = self.AVAIL_CTRL[action] + action = self.AVAIL_CTRL[round(action[()])] return action diff --git a/python/gym_jiminy/rllib/gym_jiminy/rllib/ppo.py b/python/gym_jiminy/rllib/gym_jiminy/rllib/ppo.py index f01faabee..923f3e361 100644 --- a/python/gym_jiminy/rllib/gym_jiminy/rllib/ppo.py +++ b/python/gym_jiminy/rllib/gym_jiminy/rllib/ppo.py @@ -12,6 +12,7 @@ from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.view_requirement import ViewRequirement from ray.rllib.utils.typing import TensorType, TrainerConfigDict +from ray.rllib.utils.torch_ops import l2_loss from ray.rllib.agents.ppo import DEFAULT_CONFIG, PPOTrainer from ray.rllib.agents.ppo.ppo_torch_policy import ( ppo_surrogate_loss, kl_and_loss_stats, setup_mixins, PPOTorchPolicy) @@ -23,7 +24,8 @@ "symmetric_policy_reg": 0.0, "caps_temporal_reg": 0.0, "caps_spatial_reg": 0.0, - "caps_global_reg": 0.0 + "caps_global_reg": 0.0, + "l2_reg": 0.0 }, 
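Editor's note on the acrobot/cartpole change above: with the action space no longer cast to float32 and `zeros` keeping a mutable 0-d array for `Discrete` spaces, the command callback may receive a 0-d float buffer, so it must be unwrapped and rounded before indexing the control set. A tiny numpy-only sketch (the control values are made up):

    import numpy as np

    AVAIL_CTRL = [-1.0, 0.0, 1.0]      # placeholder control set
    action = np.array(2.0)             # 0-d float buffer, as an RL framework may produce
    idx = round(action[()])            # unwrap to a Python scalar, then to an int index
    command = AVAIL_CTRL[idx]
    assert command == 1.0              # indexing with the raw float would raise TypeError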
_allow_unknown_configs=True) @@ -53,6 +55,7 @@ def ppo_init(policy: Policy, policy._mean_temporal_caps_loss = 0.0 policy._mean_spatial_caps_loss = 0.0 policy._mean_global_caps_loss = 0.0 + policy._l2_reg_loss = 0.0 # Convert to torch.Tensor observation bounds policy._observation_space_low = \ @@ -110,7 +113,8 @@ def ppo_loss(policy: Policy, # Append the training batches to the set train_batches["prev"] = train_batch_copy - if policy.config["caps_spatial_reg"] > 0.0: + if policy.config["caps_spatial_reg"] > 0.0 or \ + policy.config["caps_global_reg"] > 0.0: # Shallow copy the original training batch train_batch_copy = train_batch.copy(shallow=True) @@ -211,8 +215,8 @@ def value_function(self, *args: Any, **kwargs: Any) -> torch.Tensor: action_dist = dist_class(action_logits, model) action_mean_true = action_dist.deterministic_sample() + # Compute the mean action corresponding to the previous observation if policy.config["caps_temporal_reg"] > 0.0: - # Compute the mean action corresponding to the previous observation action_logits_prev = logits["prev"] if issubclass(dist_class, TorchDiagGaussian): action_mean_prev, _ = torch.chunk(action_logits_prev, 2, dim=1) @@ -220,6 +224,29 @@ def value_function(self, *args: Any, **kwargs: Any) -> torch.Tensor: action_dist_prev = dist_class(action_logits_prev, model) action_mean_prev = action_dist_prev.deterministic_sample() + # Compute the mean action corresponding to the noisy observation + if policy.config["caps_spatial_reg"] > 0.0 or \ + policy.config["caps_global_reg"] > 0.0: + action_logits_noisy = logits["noisy"] + if issubclass(dist_class, TorchDiagGaussian): + action_mean_noisy, _ = torch.chunk(action_logits_noisy, 2, dim=1) + else: + action_dist_noisy = dist_class(action_logits_noisy, model) + action_mean_noisy = action_dist_noisy.deterministic_sample() + + # Compute the mirrored mean action corresponding to the mirrored action + if policy.config["symmetric_policy_reg"] > 0.0: + action_logits_mirror = logits["mirrored"] + if issubclass(dist_class, TorchDiagGaussian): + action_mean_mirror, _ = torch.chunk(action_logits_mirror, 2, dim=1) + else: + action_dist_mirror = dist_class(action_logits_mirror, model) + action_mean_mirror = action_dist_mirror.deterministic_sample() + action_mirror_mat = policy.action_space.mirror_mat.to(device) + policy.action_space.mirror_mat = action_mirror_mat + action_mean_mirror = action_mean_mirror @ action_mirror_mat + + if policy.config["caps_temporal_reg"] > 0.0: # Minimize the difference between the successive action mean policy._mean_temporal_caps_loss = torch.mean( (action_mean_prev - action_mean_true) ** 2) @@ -229,14 +256,6 @@ def value_function(self, *args: Any, **kwargs: Any) -> torch.Tensor: policy._mean_temporal_caps_loss if policy.config["caps_spatial_reg"] > 0.0: - # Compute the mean action corresponding to the noisy observation - action_logits_noisy = logits["noisy"] - if issubclass(dist_class, TorchDiagGaussian): - action_mean_noisy, _ = torch.chunk(action_logits_noisy, 2, dim=1) - else: - action_dist_noisy = dist_class(action_logits_noisy, model) - action_mean_noisy = action_dist_noisy.deterministic_sample() - # Minimize the difference between the original action mean and the # one corresponding to the noisy observation. 
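Editor's note: the `caps_*` options appear to follow the CAPS action-smoothness regularization scheme. Stripped of the RLlib plumbing, the three penalties (temporal, spatial computed just below, and the global term that this patch switches to the noisy action mean) reduce to the following self-contained torch sketch with placeholder data and arbitrary coefficients:

    import torch

    action_mean_true = torch.randn(32, 6)    # pi(obs_t)
    action_mean_prev = torch.randn(32, 6)    # pi(obs_{t-1})
    action_mean_noisy = torch.randn(32, 6)   # pi(obs_t + noise)

    caps_temporal = torch.mean((action_mean_prev - action_mean_true) ** 2)
    caps_spatial = torch.mean((action_mean_noisy - action_mean_true) ** 2)
    caps_global = torch.mean(action_mean_noisy ** 2)

    surrogate_loss = torch.tensor(0.0)       # PPO surrogate loss would go here
    total_loss = surrogate_loss \
        + 0.01 * caps_temporal + 0.01 * caps_spatial + 0.001 * caps_global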
policy._mean_spatial_caps_loss = torch.mean( @@ -248,24 +267,13 @@ def value_function(self, *args: Any, **kwargs: Any) -> torch.Tensor: if policy.config["caps_global_reg"] > 0.0: # Minimize the magnitude of action mean - policy._mean_global_caps_loss = torch.mean(action_mean_true ** 2) + policy._mean_global_caps_loss = torch.mean(action_mean_noisy ** 2) # Add global smoothness loss to total loss total_loss += policy.config["caps_global_reg"] * \ policy._mean_global_caps_loss if policy.config["symmetric_policy_reg"] > 0.0: - # Compute the mirrored mean action corresponding to the mirrored action - action_logits_mirror = logits["mirrored"] - if issubclass(dist_class, TorchDiagGaussian): - action_mean_mirror, _ = torch.chunk(action_logits_mirror, 2, dim=1) - else: - action_dist_mirror = dist_class(action_logits_mirror, model) - action_mean_mirror = action_dist_mirror.deterministic_sample() - action_mirror_mat = policy.action_space.mirror_mat.to(device) - policy.action_space.mirror_mat = action_mirror_mat - action_mean_mirror = action_mean_mirror @ action_mirror_mat - # Minimize the assymetry of policy output policy._mean_symmetric_policy_loss = torch.mean( (action_mean_mirror - action_mean_true) ** 2) @@ -274,6 +282,17 @@ def value_function(self, *args: Any, **kwargs: Any) -> torch.Tensor: total_loss += policy.config["symmetric_policy_reg"] * \ policy._mean_symmetric_policy_loss + if policy.config["l2_reg"] > 0.0: + # Add actor l2-regularization loss + l2_reg_loss = 0.0 + for name, params in model.state_dict().items(): + if "bias" not in name: + l2_reg_loss += l2_loss(params) + policy._l2_reg_loss = l2_reg_loss + + # Add l2-regularization loss to total loss + total_loss += policy.config["l2_reg"] * policy._l2_reg_loss + return total_loss @@ -293,6 +312,8 @@ def ppo_stats(policy: Policy, stats_dict["spatial_smoothness"] = policy._mean_spatial_caps_loss if policy.config["caps_global_reg"] > 0.0: stats_dict["global_smoothness"] = policy._mean_global_caps_loss + if policy.config["l2_reg"] > 0.0: + stats_dict["l2_reg"] = policy._l2_reg_loss return stats_dict diff --git a/python/gym_jiminy/rllib/gym_jiminy/rllib/utilities.py b/python/gym_jiminy/rllib/gym_jiminy/rllib/utilities.py index 07f8c55e1..455a6b1e1 100644 --- a/python/gym_jiminy/rllib/gym_jiminy/rllib/utilities.py +++ b/python/gym_jiminy/rllib/gym_jiminy/rllib/utilities.py @@ -11,12 +11,14 @@ import logging import inspect import tracemalloc +from tempfile import mkstemp from datetime import datetime from collections import defaultdict from typing import Optional, Callable, Dict, Any, Tuple import gym import numpy as np +import plotext as plt from tensorboard.program import TensorBoard import ray @@ -32,6 +34,7 @@ from ray.rllib.agents.trainer import Trainer from ray.rllib.models.preprocessors import get_preprocessor +from jiminy_py.viewer import play_logs_files from gym_jiminy.common.utils import clip, SpaceDictNested try: @@ -43,9 +46,12 @@ except ModuleNotFoundError: pass + logger = logging.getLogger(__name__) +HISTOGRAM_BINS = 20 + PRINT_RESULT_FIELDS_FILTER = [ "training_iteration", "time_total_s", @@ -384,6 +390,7 @@ def build_policy_from_checkpoint( def train(train_agent: Trainer, max_timesteps: int = 0, max_iters: int = 0, + evaluation_num: int = 10, evaluation_period: int = 0, checkpoint_period: int = 0, record_video: bool = True, @@ -404,6 +411,10 @@ def train(train_agent: Trainer, Optional: Disabled by default. :param max_iters: Maximum number of training iterations. 0 to disable. Optional: Disabled by default. 
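Editor's note on the new `l2_reg` term added above in ppo.py: the patch uses `ray.rllib.utils.torch_ops.l2_loss`, whose exact scaling may differ from this sketch. A hand-rolled equivalent that shrinks only the weight matrices (biases excluded) and keeps gradients flowing through `named_parameters` could look like:

    import torch

    model = torch.nn.Sequential(torch.nn.Linear(8, 64), torch.nn.Tanh(),
                                torch.nn.Linear(64, 4))
    l2_reg_coeff = 1e-4  # placeholder value for the 'l2_reg' config option

    l2_reg_loss = sum(0.5 * param.pow(2).sum()
                      for name, param in model.named_parameters()
                      if "bias" not in name)
    total_loss = l2_reg_coeff * l2_reg_loss  # added on top of the PPO loss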
+ :param evaluation_num: How any evaluation to run. The log files of the best + and worst performing trials will be exported, and + some statistics will be reported if 'verbose' is + enabled. :param evaluation_period: Run one simulation (with exploration) every given number of training steps, and save the log file and a video of the result in log folder if @@ -413,7 +424,8 @@ def train(train_agent: Trainer, steps in log folder if requested. 0 to disable. Optional: Disabled by default. :param record_video: Whether or not to enable video recording during - evaluation. + evaluation. Video will be recorded for best and worst + trials. Optional: True by default. :param debug: Whether or not to monitor memory allocation for debugging memory leaks. @@ -501,15 +513,55 @@ def train(train_agent: Trainer, # Record video and log data of the result if evaluation_period > 0 and iter_num % evaluation_period == 0: - record_video_path = f"{train_agent.logdir}/iter_{iter_num}.mp4" - env, _ = test(train_agent, - explore=True, - enable_replay=record_video, - viewer_kwargs={ - "record_video_path": record_video_path, - "scene_name": f"iter_{iter_num}" - }) - env.write_log(f"{train_agent.logdir}/iter_{iter_num}.hdf5") + duration = [] + total_rewards = [] + log_files_tmp = [] + test_env = train_agent.env_creator( + train_agent.config["env_config"]) + seed = train_agent.config["seed"] or 0 + for i in range(evaluation_num): + # Evaluate the policy once + test(train_agent, + explore=True, + seed=seed+i, + test_env=test_env, + enable_stats=False, + enable_replay=False) + + # Export temporary log file + fd, log_path = mkstemp(prefix="log_", suffix=".hdf5") + os.close(fd) + test_env.write_log(log_path) + + # Monitor some statistics + duration.append(test_env.num_steps * test_env.step_dt) + total_rewards.append(test_env.total_reward) + log_files_tmp.append(log_path) + + # Backup log file of best trial + trial_best_idx = np.argmax(duration) + log_path = f"{train_agent.logdir}/iter_{iter_num}.hdf5" + shutil.move(log_files_tmp[trial_best_idx], log_path) + + # Record video of best trial if requested + if record_video: + video_path = f"{train_agent.logdir}/iter_{iter_num}.mp4" + play_logs_files(log_path, + record_video_path=video_path, + scene_name=f"iter_{iter_num}") + + # Ascii histogram if requested + if verbose: + plt.clp() + plt.subplots(1, 2) + for i, (title, data) in enumerate(zip( + ("Episode duration", "Total reward"), + (duration, total_rewards))): + plt.subplot(1, i) + plt.hist(data, HISTOGRAM_BINS) + plt.plotsize(50, 20) + plt.title(title) + plt.show() # Backup the policy if checkpoint_period > 0 and iter_num % checkpoint_period == 0: @@ -536,6 +588,7 @@ def train(train_agent: Trainer, def evaluate(env: gym.Env, policy: Policy, + seed: int, obs_filter_fn: Optional[ Callable[[np.ndarray], np.ndarray]] = None, n_frames_stack: int = 1, @@ -585,6 +638,14 @@ def evaluate(env: gym.Env, policy_forward = build_policy_wrapper( policy, obs_filter_fn, n_frames_stack, clip_action, explore) + # Make sure evaluation mode is enabled + is_training = env.is_training + if is_training: + env.eval() + + # Reset the seed of the environment + env.seed(seed) + # Initialize the simulation obs = env.reset() reward = None @@ -602,6 +663,10 @@ def evaluate(env: gym.Env, except KeyboardInterrupt: pass + # Restore training mode if it was enabled + if is_training: + env.train() + # Display some statistic if requested if enable_stats: print("env.num_steps:", env.num_steps) @@ -620,6 +685,7 @@ def evaluate(env: gym.Env, def test(test_agent: 
Trainer, explore: bool = True, + seed: Optional[int] = None, n_frames_stack: int = 1, enable_stats: bool = True, enable_replay: bool = True, @@ -672,6 +738,7 @@ def test(test_agent: Trainer, return evaluate(test_env, policy, + seed or test_agent.config["seed"] or 0, obs_filter_fn, n_frames_stack=n_frames_stack, clip_action=test_agent.config["clip_actions"], diff --git a/python/gym_jiminy/rllib/setup.py b/python/gym_jiminy/rllib/setup.py index 3e9ab7b63..191f560c8 100644 --- a/python/gym_jiminy/rllib/setup.py +++ b/python/gym_jiminy/rllib/setup.py @@ -29,7 +29,8 @@ packages=find_namespace_packages(), install_requires=[ f"gym_jiminy~={version}", - "ray[default,rllib]<=1.4.1" + "ray[default,rllib]<=1.5.1", + "plotext" ], zip_safe=False ) diff --git a/python/gym_jiminy/toolbox/gym_jiminy/toolbox/math/__init__.py b/python/gym_jiminy/toolbox/gym_jiminy/toolbox/math/__init__.py index 5d40550f6..86f87ca2c 100644 --- a/python/gym_jiminy/toolbox/gym_jiminy/toolbox/math/__init__.py +++ b/python/gym_jiminy/toolbox/gym_jiminy/toolbox/math/__init__.py @@ -2,7 +2,10 @@ from .spline import Spline from .qhull import ConvexHull, compute_distance_convex_to_point -from .generic import squared_norm_2, matrix_to_yaw, quat_to_yaw +from .generic import (squared_norm_2, + matrix_to_yaw, + quat_to_yaw_cos_sin, + quat_to_yaw) from .signal import integrate_zoh @@ -12,6 +15,7 @@ "compute_distance_convex_to_point", "squared_norm_2", "matrix_to_yaw", + "quat_to_yaw_cos_sin", "quat_to_yaw", "integrate_zoh" ] diff --git a/python/gym_jiminy/toolbox/gym_jiminy/toolbox/math/generic.py b/python/gym_jiminy/toolbox/gym_jiminy/toolbox/math/generic.py index 1f1cf3d99..095727b55 100644 --- a/python/gym_jiminy/toolbox/gym_jiminy/toolbox/math/generic.py +++ b/python/gym_jiminy/toolbox/gym_jiminy/toolbox/math/generic.py @@ -1,6 +1,7 @@ """ TODO: Write documentation. """ import math +from typing import Tuple import numba as nb import numpy as np @@ -16,17 +17,27 @@ def squared_norm_2(array: np.ndarray) -> float: @nb.jit(nopython=True, nogil=True) def matrix_to_yaw(rotation_matrix: np.ndarray) -> float: - """Compute the yaw from Yaw-Pitch-Roll Euler angles representation from a + """Compute the yaw from Yaw-Pitch-Roll Euler angles representation of a rotation matrix. """ return math.atan2(rotation_matrix[1, 0], rotation_matrix[0, 0]) +@nb.jit(nopython=True, nogil=True) +def quat_to_yaw_cos_sin(quaternion: np.ndarray) -> Tuple[float, float]: + """Compute the cosinus and sinus of the yaw from Yaw-Pitch-Roll Euler + angles representation of a quaternion. + """ + cos_yaw = - 1.0 + 2.0 * (quaternion[3] ** 2 + quaternion[0] ** 2) + sin_yaw = 2.0 * ( + quaternion[2] * quaternion[3] + quaternion[0] * quaternion[1]) + return cos_yaw, sin_yaw + + @nb.jit(nopython=True, nogil=True) def quat_to_yaw(quaternion: np.ndarray) -> float: - """Compute the yaw from Yaw-Pitch-Roll Euler angles representation from a + """Compute the yaw from Yaw-Pitch-Roll Euler angles representation of a quaternion. 
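Editor's note: the new `quat_to_yaw_cos_sin` helper above can be sanity-checked against a pure yaw rotation, assuming the (x, y, z, w) quaternion ordering used by Pinocchio/Jiminy (which the formula implies). The helper is restated inline below so the sketch runs without the compiled toolbox:

    import math
    import numpy as np

    def quat_to_yaw_cos_sin(q):
        # Same formula as the new helper, restated for a self-contained check
        cos_yaw = - 1.0 + 2.0 * (q[3] ** 2 + q[0] ** 2)
        sin_yaw = 2.0 * (q[2] * q[3] + q[0] * q[1])
        return cos_yaw, sin_yaw

    yaw = 0.7
    quat = np.array([0.0, 0.0, math.sin(yaw / 2), math.cos(yaw / 2)])  # (x, y, z, w)
    cos_yaw, sin_yaw = quat_to_yaw_cos_sin(quat)
    assert abs(cos_yaw - math.cos(yaw)) < 1e-12
    assert abs(sin_yaw - math.sin(yaw)) < 1e-12
    assert abs(math.atan2(sin_yaw, cos_yaw) - yaw) < 1e-12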
""" - return math.atan2( - 2.0 * (quaternion[2] * quaternion[3] + quaternion[0] * quaternion[1]), - - 1.0 + 2.0 * (quaternion[3] ** 2 + quaternion[0] ** 2)) + cos_yaw, sin_yaw = quat_to_yaw_cos_sin(quaternion) + return math.atan2(sin_yaw, cos_yaw) diff --git a/python/jiminy_py/src/jiminy_py/robot.py b/python/jiminy_py/src/jiminy_py/robot.py index 8514681f6..4dd872a05 100644 --- a/python/jiminy_py/src/jiminy_py/robot.py +++ b/python/jiminy_py/src/jiminy_py/robot.py @@ -166,7 +166,7 @@ def generate_hardware_description_file( root_link = next(iter(parent_links.difference(child_links))) else: root_link = None - leaf_links = list(child_links.difference(parent_links)) + leaf_links = sorted(list(child_links.difference(parent_links))) # Parse the gazebo plugins, if any. # Note that it is only useful to extract "advanced" hardware, not basic @@ -289,7 +289,7 @@ def generate_hardware_description_file( # Specify collision bodies and ground model in global config options hardware_info['Global']['collisionBodiesNames'] = \ - list(collision_bodies_names) + sorted(list(collision_bodies_names)) if gazebo_ground_stiffness is not None: hardware_info['Global']['groundStiffness'] = gazebo_ground_stiffness if gazebo_ground_damping is not None: @@ -745,9 +745,16 @@ def initialize(self, # Note that it must be done before adding the sensors because # Contact sensors requires contact points to be defined. # Mesh collisions is not numerically stable for now, so disabling it. + # Note: Be careful, the order of the contact points is important, it + # changes the computation of the external forces, which is an iterative + # algorithm for impulse model, resulting in different simulation + # results. The order of the element of the set depends of the `hash` + # method of python, whose seed is randomly generated when starting the + # interpreter for security reason. As a result, the set must be sorted + # manually to ensure consistent results. self.add_collision_bodies( collision_bodies_names, ignore_meshes=avoid_instable_collisions) - self.add_contact_points(list(set(contact_frames_names))) + self.add_contact_points(sorted(list(set(contact_frames_names)))) # Add the motors to the robot for motor_type, motors_descr in motors_info.items(): diff --git a/python/jiminy_py/src/jiminy_py/simulator.py b/python/jiminy_py/src/jiminy_py/simulator.py index a8603988d..75b485b8c 100644 --- a/python/jiminy_py/src/jiminy_py/simulator.py +++ b/python/jiminy_py/src/jiminy_py/simulator.py @@ -321,6 +321,9 @@ def seed(self, seed: int) -> None: """ assert isinstance(seed, np.uint32), "'seed' must have type np.uint32." + # Make sure no simulation is running before setting the seed + self.engine.stop() + # Set the seed through the engine instead of using # `jiminy.reset_random_generator` to keep track of the seed in options, # and thereby to log it in the telemetry as constant. @@ -577,7 +580,7 @@ def replay(self, # Extract trajectory data from pairs (robot, log) trajectories, update_hooks, extra_kwargs = [], [], {} for robot, log_data in zip(robots, logs_data): - if log_data is not None: + if log_data: traj, update_hook, _kwargs = \ extract_replay_data_from_log_data(robot, log_data) trajectories.append(traj) @@ -587,7 +590,7 @@ def replay(self, update_hooks += [None for _ in extra_trajectories] # Make sure there is something to replay - if not logs_data: + if not trajectories: raise RuntimeError( "Nothing to replay. 
Please run a simulation before calling " "`replay` method, or provided data manually.") @@ -599,10 +602,7 @@ def replay(self, **kwargs}) # Define sequence of viewer instances - viewers = [ - self.viewer, - *[None for _ in extra_logs_files], - *[None for _ in extra_trajectories]] + viewers = [self.viewer, *[None for _ in trajectories[:-1]]] # Replay the trajectories self._viewers = play_trajectories(
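Editor's note on the robot.py change above (sorting collision bodies and contact frames): iterating a Python set of strings is only stable within one interpreter process, because string hashing is randomized per process unless PYTHONHASHSEED is fixed, and the contact solver is sensitive to the order in which points are added. Sorting restores a run-to-run deterministic order. A minimal illustration with made-up frame names:

    contact_frames_names = ["LeftSole", "RightSole", "LeftToe", "RightToe"]
    unordered = set(contact_frames_names)            # iteration order may change between runs
    deterministic = sorted(set(contact_frames_names))
    print(deterministic)  # always ['LeftSole', 'LeftToe', 'RightSole', 'RightToe']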