[all] Fix reproducibility and performance comparison issues. (#399)
* [core] Fix log buffer not being refreshed after calling 'stop', making the last datapoint inaccessible. (#396)
* [python/robot] Fix non-reproducible simulation results.
* [jiminy_py/simulator] Fix running simulation not being stopped before updating the seed.
* [python/simulator] Fix unexpected error if 'replay' is called without data.
* [gym/common] Do not set the seed by default in '_setup' method.
* [gym/common] Add train and eval modes to allow evaluation-specific behaviors (see the usage sketch after this list).
* [gym/common] Rename '_is_training' to 'is_training'. Fix 'is_training' being False by default. (#397)
* [gym/common] 'seed' method now returns the actual sequence of seeds instead of only the first one.
* [gym/common] Remove useless conversion of iterator to list.
* [gym/common] Do not automatically cast the action space to float32. (#400)
* [gym/common] Ensure the internal '_action' buffer has dtype float64. (#400)
* [gym/toolbox] Add 'quat_to_yaw_cos_sin' math helper.
* [gym/rllib] Add L2 regularization loss to PPO.
* [gym/rllib] Use noisy samples instead of the true ones to compute global smoothness.
* [gym/rllib] Make sure the validation is always the same, so that performance is easier to benchmark.
* [gym/rllib] Enable evaluating the policy over multiple trials and reporting statistics. Export data from the best trial only.
* [gym/rllib] Relax ray dependency version requirement.
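
For reference, a minimal usage sketch of the reworked seeding and the new train/eval API. The environment class is an illustrative choice; only 'seed', 'train', 'eval' and 'is_training' come from this changeset.

from gym_jiminy.envs import CartPoleJiminyEnv  # illustrative environment

env = CartPoleJiminyEnv()

# 'seed' now returns the full list of uint32 seeds actually used, not just
# the first one, so a run can be reproduced exactly from the returned value.
seeds = env.seed(42)

# Training mode is enabled by default; switch to evaluation mode to trigger
# evaluation-specific behaviors such as action clipping or filtering.
env.eval()
obs = env.reset()
assert not env.is_training
env.train()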

Co-authored-by: Alexis Duburcq <alexis.duburcq@wandercraft.eu>
duburcqa and Alexis Duburcq authored Aug 18, 2021
1 parent 28449b9 commit 6243e43
Showing 14 changed files with 212 additions and 88 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -2,7 +2,7 @@
cmake_minimum_required(VERSION 3.10)

# Set the build version
set(BUILD_VERSION 1.6.28)
set(BUILD_VERSION 1.6.29)

# Set compatibility
if(CMAKE_VERSION VERSION_GREATER "3.11.0")
3 changes: 3 additions & 0 deletions core/src/engine/EngineMultiRobot.cc
@@ -2162,6 +2162,9 @@ namespace jiminy
// Log current buffer content as final point of the log data
updateTelemetry();

// Clear log data buffer one last time, now that the final point has been added
logData_ = nullptr;

/* Reset the telemetry.
Note that calling `stop` or `reset` does NOT clear
the internal data buffer of telemetryRecorder_.
64 changes: 37 additions & 27 deletions python/gym_jiminy/common/gym_jiminy/common/envs/env_generic.py
@@ -6,12 +6,12 @@
import tempfile
from copy import deepcopy
from collections import OrderedDict
from typing import Optional, Tuple, Sequence, Dict, Any, Callable, List, Union
from typing import Optional, Tuple, Dict, Any, Callable, List, Union

import numpy as np
import gym
from gym import logger, spaces
from gym.utils.seeding import create_seed, _int_list_from_bigint, hash_seed
from gym.utils.seeding import _int_list_from_bigint, hash_seed

import jiminy_py.core as jiminy
from jiminy_py.core import (EncoderSensor as encoder,
@@ -120,10 +120,13 @@ def __init__(self,

# Internal buffers for physics computations
self.rg = np.random.Generator(np.random.Philox())
self._seed: Optional[np.uint32] = None
self._seed: List[np.uint32] = []
self.log_path: Optional[str] = None
self.logfile_action_headers: Optional[FieldDictNested] = None

# Whether or not training mode is active (as opposed to evaluation mode)
self.is_training = True

# Whether or not play interactive mode is active
self._is_interactive = False

@@ -139,7 +142,7 @@ def __init__(self,
self._num_steps_beyond_done: Optional[int] = None

# Initialize the seed of the environment.
# Note that reseting the seed also reset robot internal state.
# Note that resetting the seed also resets the robot internal state.
self.seed()

# Set robot in neutral configuration
@@ -429,15 +432,10 @@ def _refresh_action_space(self) -> None:
command_limit[motor.joint_velocity_idx] = \
MOTOR_EFFORT_MAX

# Set the action space.
# Note that float32 is used instead of float64, because otherwise it
# would requires the neural network to perform float64 computations
# or cast the output for no really advantage since the action is
# directly forwarded to the motors, without intermediary computations.
# Set the action space
action_scale = command_limit[self.robot.motors_velocity_idx]
self.action_space = spaces.Box(low=-action_scale.astype(np.float32),
high=action_scale.astype(np.float32),
dtype=np.float32)
self.action_space = spaces.Box(
low=-action_scale, high=action_scale, dtype=np.float64)

def reset(self,
controller_hook: Optional[Callable[[], Optional[Tuple[
@@ -566,14 +564,9 @@ def reset(self,
# Fallback: Get generic fieldnames otherwise
self.logfile_action_headers = get_fieldnames(
self.action_space, "action")
is_success = register_variables(self.simulator.controller,
self.logfile_action_headers,
self._action)
if not is_success:
self.logfile_action_headers = None
logger.warn(
"Action must have dtype np.float64 to be registered to the "
"telemetry.")
register_variables(self.simulator.controller,
self.logfile_action_headers,
self._action)

# Sample the initial state and reset the low-level engine
qpos, qvel = self._sample_state()
@@ -626,7 +619,7 @@ def reset(self,

return obs

def seed(self, seed: Optional[int] = None) -> Sequence[np.uint32]:
def seed(self, seed: Optional[int] = None) -> List[np.uint32]:
"""Specify the seed of the environment.
.. warning::
@@ -643,16 +636,16 @@ def seed(self, seed: Optional[int] = None) -> Sequence[np.uint32]:
# into sequence of 4 bytes uint32 seeds. Backup only the first one.
# Note that hashing is used to get rid of possible correlation in the
# presence of concurrency.
seed_ints = _int_list_from_bigint(hash_seed(create_seed(seed)))
self._seed = np.uint32(seed_ints[0])
seed = hash_seed(seed)
self._seed = list(map(np.uint32, _int_list_from_bigint(seed)))

# Instantiate a new random number generator based on the provided seed
self.rg = np.random.Generator(np.random.Philox(seed_ints))
self.rg = np.random.Generator(np.random.Philox(self._seed))

# Reset the seed of Jiminy Engine
self.simulator.seed(self._seed)
self.simulator.seed(self._seed[0])

return [self._seed]
return self._seed

def close(self) -> None:
"""Terminate the Python Jiminy engine.
@@ -958,6 +951,24 @@ def _interact(key: Optional[str] = None) -> bool:
# Disable play interactive mode flag
self._is_interactive = False

def train(self) -> None:
"""Sets the environment in training mode.
.. note::
This mode is enabled by default.
"""
self.is_training = True

def eval(self) -> None:
"""Sets the environment in evaluation mode.
This only has an effect on certain environments. See the documentation of
each particular environment for details of its behavior in training and
evaluation modes, if it is affected. It can be used, for instance, to
activate clipping or filtering of the action specifically at evaluation time.
"""
self.is_training = False

# methods to override:
# ----------------------------

@@ -982,7 +993,6 @@ def _setup(self) -> None:
engine_options["stepper"]["iterMax"] = 0
engine_options["stepper"]["timeout"] = 0.0
engine_options["stepper"]["logInternalStepperSteps"] = False
engine_options["stepper"]["randomSeed"] = self._seed
self.simulator.engine.set_options(engine_options)

# Set robot in neutral configuration
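
For context, a standalone sketch of the reworked seeding path used by the environment's 'seed' method. It assumes the gym helpers 'hash_seed' and '_int_list_from_bigint' behave as used above; the function name 'make_generator' is illustrative.

import numpy as np
from gym.utils.seeding import hash_seed, _int_list_from_bigint

def make_generator(seed=None):
    # Hash the (possibly None) seed to break correlations between similar
    # seeds, then split the resulting big integer into uint32 words.
    seed_list = list(map(np.uint32, _int_list_from_bigint(hash_seed(seed))))
    # The whole sequence seeds the Philox bit generator, while only the first
    # word is forwarded to the Jiminy engine ('simulator.seed(seed_list[0])').
    rg = np.random.Generator(np.random.Philox(seed_list))
    return rg, seed_list

rg, seeds = make_generator(42)
print(seeds)              # full seed sequence, as now returned by 'seed'
print(rg.normal(size=3))  # reproducible for a given seed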
13 changes: 6 additions & 7 deletions python/gym_jiminy/common/gym_jiminy/common/utils/helpers.py
@@ -1,6 +1,6 @@
""" TODO: Write documentation.
"""
from typing import Union, Dict, List
from typing import Union, Dict, List, ValuesView

import numpy as np
import numba as nb
@@ -64,7 +64,8 @@ def get_fieldnames(space: spaces.Space,


def register_variables(controller: jiminy.AbstractController,
fields: FieldDictNested,
fields: Union[
ValuesView[FieldDictNested], FieldDictNested],
data: SpaceDictNested,
namespace: str = "") -> bool:
"""Register data from `Gym.Space` to the telemetry of a controller.
@@ -101,8 +102,8 @@ def register_variables(controller: jiminy.AbstractController,
# Default case: data is already a numpy array. Can be registered directly.
if isinstance(data, np.ndarray):
if np.issubsctype(data, np.float64):
assert isinstance(fields, list)
for i, field in enumerate(fields):
assert not isinstance(fields, dict)
if isinstance(fields[i], list):
fields[i] = [".".join(filter(None, (namespace, subfield)))
for subfield in field]
@@ -115,10 +116,8 @@
# Fallback to looping over fields and data iterators
is_success = True
if isinstance(fields, dict):
fields = list(fields.values())
if isinstance(data, dict):
data = list(data.values())
for subfields, value in zip(fields, data):
fields = fields.values()
for subfields, value in zip(fields, data.values()):
assert isinstance(subfields, (dict, list))
is_success = register_variables(
controller, subfields, value, namespace)
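
A pure-Python sketch of the traversal pattern now used by 'register_variables': nested dicts of fields and data are walked in parallel through '.values()' views instead of first being converted to lists. The 'flatten_fields' helper below is illustrative only; the real function registers the variables to the controller telemetry and returns a success flag.

import numpy as np

def flatten_fields(fields, data, namespace=""):
    # Leaf case: 'data' is an array and 'fields' the matching list of names.
    if isinstance(data, np.ndarray):
        return [".".join(filter(None, (namespace, field))) for field in fields]
    # Recursive case: walk the dict values of fields and data in parallel.
    if isinstance(fields, dict):
        fields = fields.values()
    return [name for subfields, value in zip(fields, data.values())
            for name in flatten_fields(subfields, value, namespace)]

fields = {"action": ["motor1", "motor2"]}
data = {"action": np.zeros(2)}
print(flatten_fields(fields, data, "controller"))
# ['controller.motor1', 'controller.motor2']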
5 changes: 3 additions & 2 deletions python/gym_jiminy/common/gym_jiminy/common/utils/spaces.py
@@ -129,9 +129,10 @@ def zeros(space: gym.Space,
if isinstance(space, spaces.Tuple):
return tuple(zeros(subspace, dtype=dtype) for subspace in space.spaces)
if isinstance(space, (spaces.Discrete, spaces.MultiDiscrete)):
return np.array(0) # Using np.array of 0 dim to be mutable
# Note that np.array of 0 dim is returned in order to be mutable
return np.array(0, dtype=dtype or np.int64)
if isinstance(space, spaces.MultiBinary):
return np.zeros(space.n, dtype=np.int8)
return np.zeros(space.n, dtype=dtype or np.int8)
raise NotImplementedError(
f"Space of type {type(space)} is not supported.")

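
A short sketch of the updated 'zeros' behavior for discrete spaces. The import path is inferred from the file location, and the optional 'dtype' argument is assumed to default to None.

import numpy as np
from gym import spaces
from gym_jiminy.common.utils.spaces import zeros

value = zeros(spaces.Discrete(3))   # 0-dim array(0), so it stays mutable in place
value[()] = 2                       # can be updated without reallocating
mask = zeros(spaces.MultiBinary(4), dtype=np.float64)  # dtype override now honored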
2 changes: 1 addition & 1 deletion python/gym_jiminy/envs/gym_jiminy/envs/acrobot.py
@@ -214,7 +214,7 @@ def compute_command(self,

# Compute the actual torque to apply
if not self.continuous:
action = self.AVAIL_CTRL[action]
action = self.AVAIL_CTRL[round(action[()])]
if ACTION_NOISE > 0.0:
action += sample(scale=ACTION_NOISE, rg=self.rg)

2 changes: 1 addition & 1 deletion python/gym_jiminy/envs/gym_jiminy/envs/cartpole.py
@@ -214,7 +214,7 @@ def compute_command(self,

# Compute the actual torque to apply
if not self.continuous:
action = self.AVAIL_CTRL[action]
action = self.AVAIL_CTRL[round(action[()])]

return action

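
A small sketch of why the discrete-action lookup now goes through 'round(action[()])': with the internal '_action' buffer forced to dtype float64, a discrete action arrives as a 0-dim float array, which must be turned into a plain integer before indexing the control table. The values below are illustrative.

import numpy as np

AVAIL_CTRL = [-1.0, 0.0, 1.0]   # illustrative discrete control table

action = np.array(2.0)          # 0-dim float64 buffer, as stored in '_action'
idx = round(action[()])         # extract the scalar, then round to an int
torque = AVAIL_CTRL[idx]        # -> 1.0; indexing with the raw array would fail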
67 changes: 44 additions & 23 deletions python/gym_jiminy/rllib/gym_jiminy/rllib/ppo.py
@@ -12,6 +12,7 @@
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.view_requirement import ViewRequirement
from ray.rllib.utils.typing import TensorType, TrainerConfigDict
from ray.rllib.utils.torch_ops import l2_loss
from ray.rllib.agents.ppo import DEFAULT_CONFIG, PPOTrainer
from ray.rllib.agents.ppo.ppo_torch_policy import (
ppo_surrogate_loss, kl_and_loss_stats, setup_mixins, PPOTorchPolicy)
@@ -23,7 +24,8 @@
"symmetric_policy_reg": 0.0,
"caps_temporal_reg": 0.0,
"caps_spatial_reg": 0.0,
"caps_global_reg": 0.0
"caps_global_reg": 0.0,
"l2_reg": 0.0
},
_allow_unknown_configs=True)

@@ -53,6 +55,7 @@ def ppo_init(policy: Policy,
policy._mean_temporal_caps_loss = 0.0
policy._mean_spatial_caps_loss = 0.0
policy._mean_global_caps_loss = 0.0
policy._l2_reg_loss = 0.0

# Convert to torch.Tensor observation bounds
policy._observation_space_low = \
@@ -110,7 +113,8 @@ def ppo_loss(policy: Policy,
# Append the training batches to the set
train_batches["prev"] = train_batch_copy

if policy.config["caps_spatial_reg"] > 0.0:
if policy.config["caps_spatial_reg"] > 0.0 or \
policy.config["caps_global_reg"] > 0.0:
# Shallow copy the original training batch
train_batch_copy = train_batch.copy(shallow=True)

@@ -211,15 +215,38 @@ def value_function(self, *args: Any, **kwargs: Any) -> torch.Tensor:
action_dist = dist_class(action_logits, model)
action_mean_true = action_dist.deterministic_sample()

# Compute the mean action corresponding to the previous observation
if policy.config["caps_temporal_reg"] > 0.0:
# Compute the mean action corresponding to the previous observation
action_logits_prev = logits["prev"]
if issubclass(dist_class, TorchDiagGaussian):
action_mean_prev, _ = torch.chunk(action_logits_prev, 2, dim=1)
else:
action_dist_prev = dist_class(action_logits_prev, model)
action_mean_prev = action_dist_prev.deterministic_sample()

# Compute the mean action corresponding to the noisy observation
if policy.config["caps_spatial_reg"] > 0.0 or \
policy.config["caps_global_reg"] > 0.0:
action_logits_noisy = logits["noisy"]
if issubclass(dist_class, TorchDiagGaussian):
action_mean_noisy, _ = torch.chunk(action_logits_noisy, 2, dim=1)
else:
action_dist_noisy = dist_class(action_logits_noisy, model)
action_mean_noisy = action_dist_noisy.deterministic_sample()

# Compute the mirrored mean action corresponding to the mirrored action
if policy.config["symmetric_policy_reg"] > 0.0:
action_logits_mirror = logits["mirrored"]
if issubclass(dist_class, TorchDiagGaussian):
action_mean_mirror, _ = torch.chunk(action_logits_mirror, 2, dim=1)
else:
action_dist_mirror = dist_class(action_logits_mirror, model)
action_mean_mirror = action_dist_mirror.deterministic_sample()
action_mirror_mat = policy.action_space.mirror_mat.to(device)
policy.action_space.mirror_mat = action_mirror_mat
action_mean_mirror = action_mean_mirror @ action_mirror_mat

if policy.config["caps_temporal_reg"] > 0.0:
# Minimize the difference between the successive action mean
policy._mean_temporal_caps_loss = torch.mean(
(action_mean_prev - action_mean_true) ** 2)
@@ -229,14 +256,6 @@ def value_function(self, *args: Any, **kwargs: Any) -> torch.Tensor:
policy._mean_temporal_caps_loss

if policy.config["caps_spatial_reg"] > 0.0:
# Compute the mean action corresponding to the noisy observation
action_logits_noisy = logits["noisy"]
if issubclass(dist_class, TorchDiagGaussian):
action_mean_noisy, _ = torch.chunk(action_logits_noisy, 2, dim=1)
else:
action_dist_noisy = dist_class(action_logits_noisy, model)
action_mean_noisy = action_dist_noisy.deterministic_sample()

# Minimize the difference between the original action mean and the
# one corresponding to the noisy observation.
policy._mean_spatial_caps_loss = torch.mean(
@@ -248,24 +267,13 @@ def value_function(self, *args: Any, **kwargs: Any) -> torch.Tensor:

if policy.config["caps_global_reg"] > 0.0:
# Minimize the magnitude of action mean
policy._mean_global_caps_loss = torch.mean(action_mean_true ** 2)
policy._mean_global_caps_loss = torch.mean(action_mean_noisy ** 2)

# Add global smoothness loss to total loss
total_loss += policy.config["caps_global_reg"] * \
policy._mean_global_caps_loss

if policy.config["symmetric_policy_reg"] > 0.0:
# Compute the mirrored mean action corresponding to the mirrored action
action_logits_mirror = logits["mirrored"]
if issubclass(dist_class, TorchDiagGaussian):
action_mean_mirror, _ = torch.chunk(action_logits_mirror, 2, dim=1)
else:
action_dist_mirror = dist_class(action_logits_mirror, model)
action_mean_mirror = action_dist_mirror.deterministic_sample()
action_mirror_mat = policy.action_space.mirror_mat.to(device)
policy.action_space.mirror_mat = action_mirror_mat
action_mean_mirror = action_mean_mirror @ action_mirror_mat

# Minimize the asymmetry of policy output
policy._mean_symmetric_policy_loss = torch.mean(
(action_mean_mirror - action_mean_true) ** 2)
@@ -274,6 +282,17 @@ def value_function(self, *args: Any, **kwargs: Any) -> torch.Tensor:
total_loss += policy.config["symmetric_policy_reg"] * \
policy._mean_symmetric_policy_loss

if policy.config["l2_reg"] > 0.0:
# Add actor l2-regularization loss
l2_reg_loss = 0.0
for name, params in model.state_dict().items():
if "bias" not in name:
l2_reg_loss += l2_loss(params)
policy._l2_reg_loss = l2_reg_loss

# Add l2-regularization loss to total loss
total_loss += policy.config["l2_reg"] * policy._l2_reg_loss

return total_loss


@@ -293,6 +312,8 @@ def ppo_stats(policy: Policy,
stats_dict["spatial_smoothness"] = policy._mean_spatial_caps_loss
if policy.config["caps_global_reg"] > 0.0:
stats_dict["global_smoothness"] = policy._mean_global_caps_loss
if policy.config["l2_reg"] > 0.0:
stats_dict["l2_reg"] = policy._l2_reg_loss

return stats_dict

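
A standalone sketch of the added actor L2 regularization term, written in plain PyTorch. Ray's 'l2_loss' helper is assumed to compute 0.5 * sum(x ** 2); the sketch iterates over 'named_parameters' so that gradients flow through the penalty.

import torch

def l2_regularization(model: torch.nn.Module) -> torch.Tensor:
    # Sum 0.5 * ||w||^2 over every non-bias parameter, mirroring the filter
    # on the parameter name applied in 'ppo_loss'.
    reg = torch.zeros(())
    for name, params in model.named_parameters():
        if "bias" not in name:
            reg = reg + 0.5 * torch.sum(params ** 2)
    return reg

# total_loss = surrogate_loss + config["l2_reg"] * l2_regularization(model)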