diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
index a44a03fe3..6a5691be7 100644
--- a/docs/misc/changelog.rst
+++ b/docs/misc/changelog.rst
@@ -10,10 +10,12 @@ Pre-Release 0.4.0a0 (WIP)
 Breaking Changes:
 ^^^^^^^^^^^^^^^^^
 - Removed CEMRL
+- Models saved with previous versions cannot be loaded (because of the pre-processing)

 New Features:
 ^^^^^^^^^^^^^
 - Add support for Discrete observation spaces
+- Add saving/loading for policy weights, so the policy can be used without the model

 Bug Fixes:
 ^^^^^^^^^^
@@ -26,6 +28,8 @@ Others:
 ^^^^^^^
 - Refactor handling of observation and action spaces
 - Refactored features extraction to have proper preprocessing
+- Refactored action distributions
+

 Documentation:
 ^^^^^^^^^^^^^^
diff --git a/tests/test_distributions.py b/tests/test_distributions.py
index eb9fbaf07..4a1294c47 100644
--- a/tests/test_distributions.py
+++ b/tests/test_distributions.py
@@ -38,7 +38,8 @@ def test_squashed_gaussian(model_class):
     gaussian_mean = th.rand(N_SAMPLES, N_ACTIONS)
     dist = SquashedDiagGaussianDistribution(N_ACTIONS)
     _, log_std = dist.proba_distribution_net(N_FEATURES)
-    actions, _ = dist.proba_distribution(gaussian_mean, log_std)
+    dist = dist.proba_distribution(gaussian_mean, log_std)
+    actions = dist.get_action()
     assert th.max(th.abs(actions)) <= 1.0

 def test_sde_distribution():
@@ -51,7 +52,8 @@ def test_sde_distribution():
     _, log_std = dist.proba_distribution_net(N_FEATURES)
     dist.sample_weights(log_std, batch_size=N_SAMPLES)

-    actions, _ = dist.proba_distribution(deterministic_actions, log_std, state)
+    dist = dist.proba_distribution(deterministic_actions, log_std, state)
+    actions = dist.get_action()
     assert th.allclose(actions.mean(), dist.distribution.mean.mean(), rtol=1e-3)
     assert th.allclose(actions.std(), dist.distribution.scale.mean(), rtol=1e-3)
@@ -71,11 +73,12 @@ def test_entropy(dist):
     _, log_std = dist.proba_distribution_net(N_FEATURES, log_std_init=th.log(th.tensor(0.2)))

     if isinstance(dist, DiagGaussianDistribution):
-        actions, dist = dist.proba_distribution(deterministic_actions, log_std)
+        dist = dist.proba_distribution(deterministic_actions, log_std)
     else:
         dist.sample_weights(log_std, batch_size=N_SAMPLES)
-        actions, dist = dist.proba_distribution(deterministic_actions, log_std, state)
+        dist = dist.proba_distribution(deterministic_actions, log_std, state)
+    actions = dist.get_action()
     entropy = dist.entropy()
     log_prob = dist.log_prob(actions)
     assert th.allclose(entropy.mean(), -log_prob.mean(), rtol=5e-3)
@@ -88,8 +91,9 @@ def test_categorical():
     set_random_seed(1)
     state = th.rand(N_SAMPLES, N_FEATURES)
     action_logits = th.rand(N_SAMPLES, N_ACTIONS)
-    actions, dist = dist.proba_distribution(action_logits)
+    dist = dist.proba_distribution(action_logits)
+    actions = dist.get_action()
     entropy = dist.entropy()
     log_prob = dist.log_prob(actions)
     assert th.allclose(entropy.mean(), -log_prob.mean(), rtol=1e-4)
diff --git a/tests/test_identity.py b/tests/test_identity.py
index 87dc6acbd..b717c0a56 100644
--- a/tests/test_identity.py
+++ b/tests/test_identity.py
@@ -20,7 +20,7 @@ def test_continuous(model_class):
     env = IdentityEnvBox(eps=0.5)

     n_steps = {
-        A2C: 3000,
+        A2C: 3500,
         PPO: 3000,
         SAC: 700,
         TD3: 500
diff --git a/tests/test_save_load.py b/tests/test_save_load.py
index 076ec924f..64fe2481b 100644
--- a/tests/test_save_load.py
+++ b/tests/test_save_load.py
@@ -16,7 +16,7 @@
     SAC,
 ]

-
+#
 @pytest.mark.parametrize("model_class", MODEL_LIST)
 def test_save_load(model_class):
     """
@@ -160,3 +160,61 @@ def 
test_save_load_replay_buffer(model_class): # clear file from os os.remove(replay_path) + + +@pytest.mark.parametrize("model_class", MODEL_LIST) +def test_save_load_policy(model_class): + """ + Test saving and loading policy only. + + :param model_class: (BaseRLModel) A RL model + """ + env = DummyVecEnv([lambda: IdentityEnvBox(10)]) + + # create model + model = model_class('MlpPolicy', env, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True) + model.learn(total_timesteps=500, eval_freq=250) + + env.reset() + observations = np.array([env.step(env.action_space.sample())[0] for _ in range(10)]) + observations = observations.reshape(10, -1) + + policy = model.policy + + # Get dictionary of current parameters + params = deepcopy(policy.state_dict()) + + # Modify all parameters to be random values + random_params = dict((param_name, th.rand_like(param)) for param_name, param in params.items()) + + # Update model parameters with the new random values + policy.load_state_dict(random_params) + + new_params = policy.state_dict() + # Check that all params are different now + for k in params: + assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected." + + params = new_params + + # get selected actions + selected_actions, _ = policy.predict(observations, deterministic=True) + + # Save and load policy + policy.save("./logs/policy_weights.pkl") + # del policy + policy.load("./logs/policy_weights.pkl") + + # check if params are still the same after load + new_params = policy.state_dict() + + # Check that all params are the same as before save load procedure now + for key in params: + assert th.allclose(params[key], new_params[key]), "Policy parameters not the same after save and load." + + # check if model still selects the same actions + new_selected_actions, _ = policy.predict(observations, deterministic=True) + assert np.allclose(selected_actions, new_selected_actions, 1e-4) + + # clear file from os + os.remove("./logs/policy_weights.pkl") diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index c171a4cc7..3c36d49cc 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -158,27 +158,6 @@ def _get_eval_env(self, eval_env: Optional[GymEnv]) -> Optional[GymEnv]: assert eval_env.num_envs == 1 return eval_env - def scale_action(self, action: np.ndarray) -> np.ndarray: - """ - Rescale the action from [low, high] to [-1, 1] - (no need for symmetric action space) - - :param action: (np.ndarray) Action to scale - :return: (np.ndarray) Scaled action - """ - low, high = self.action_space.low, self.action_space.high - return 2.0 * ((action - low) / (high - low)) - 1.0 - - def unscale_action(self, scaled_action: np.ndarray) -> np.ndarray: - """ - Rescale the action from [-1, 1] to [low, high] - (no need for symmetric action space) - - :param scaled_action: Action to un-scale - """ - low, high = self.action_space.low, self.action_space.high - return low + (0.5 * (scaled_action + 1.0) * (high - low)) - def _setup_lr_schedule(self) -> None: """Transform to callable if needed.""" self.lr_schedule = get_schedule_fn(self.learning_rate) @@ -318,57 +297,6 @@ def learn(self, total_timesteps: int, """ raise NotImplementedError() - @staticmethod - def _is_vectorized_observation(observation: np.ndarray, observation_space: gym.spaces.Space) -> bool: - """ - For every observation type, detects and validates the shape, - then returns whether or not the observation is vectorized. 
- - :param observation: (np.ndarray) the input observation to validate - :param observation_space: (gym.spaces) the observation space - :return: (bool) whether the given observation is vectorized or not - """ - if isinstance(observation_space, gym.spaces.Box): - if observation.shape == observation_space.shape: - return False - elif observation.shape[1:] == observation_space.shape: - return True - else: - raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) + - "Box environment, please use {} ".format(observation_space.shape) + - "or (n_env, {}) for the observation shape." - .format(", ".join(map(str, observation_space.shape)))) - elif isinstance(observation_space, gym.spaces.Discrete): - if observation.shape == (): # A numpy array of a number, has shape empty tuple '()' - return False - elif len(observation.shape) == 1: - return True - else: - raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) + - "Discrete environment, please use (1,) or (n_env, 1) for the observation shape.") - # TODO: add support for MultiDiscrete and MultiBinary observation spaces - # elif isinstance(observation_space, gym.spaces.MultiDiscrete): - # if observation.shape == (len(observation_space.nvec),): - # return False - # elif len(observation.shape) == 2 and observation.shape[1] == len(observation_space.nvec): - # return True - # else: - # raise ValueError("Error: Unexpected observation shape {} for MultiDiscrete ".format(observation.shape) + - # "environment, please use ({},) or ".format(len(observation_space.nvec)) + - # "(n_env, {}) for the observation shape.".format(len(observation_space.nvec))) - # elif isinstance(observation_space, gym.spaces.MultiBinary): - # if observation.shape == (observation_space.n,): - # return False - # elif len(observation.shape) == 2 and observation.shape[1] == observation_space.n: - # return True - # else: - # raise ValueError("Error: Unexpected observation shape {} for MultiBinary ".format(observation.shape) + - # "environment, please use ({},) or ".format(observation_space.n) + - # "(n_env, {}) for the observation shape.".format(observation_space.n)) - else: - raise ValueError("Error: Cannot determine if the observation is vectorized with the space type {}." 
- .format(observation_space)) - def predict(self, observation: np.ndarray, state: Optional[np.ndarray] = None, mask: Optional[np.ndarray] = None, @@ -383,36 +311,7 @@ def predict(self, observation: np.ndarray, :return: (Tuple[np.ndarray, Optional[np.ndarray]]) the model's action and the next state (used in recurrent policies) """ - # TODO: move this block to BasePolicy - # if state is None: - # state = self.initial_state - # if mask is None: - # mask = [False for _ in range(self.n_envs)] - observation = np.array(observation) - vectorized_env = self._is_vectorized_observation(observation, self.observation_space) - - observation = observation.reshape((-1,) + self.observation_space.shape) - observation = th.as_tensor(observation).to(self.device) - with th.no_grad(): - actions = self.policy.predict(observation, deterministic=deterministic) - # Convert to numpy - actions = actions.cpu().numpy() - - # Rescale to proper domain when using squashing - if isinstance(self.action_space, gym.spaces.Box) and self.policy.squash_output: - actions = self.unscale_action(actions) - - clipped_actions = actions - # Clip the actions to avoid out of bound error when using gaussian distribution - if isinstance(self.action_space, gym.spaces.Box) and not self.policy.squash_output: - clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) - - if not vectorized_env: - if state is not None: - raise ValueError("Error: The environment must be vectorized when using recurrent policies.") - clipped_actions = clipped_actions[0] - - return clipped_actions, state + return self.policy.predict(observation, state, mask, deterministic) @classmethod def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs): @@ -484,10 +383,7 @@ def _load_from_file(load_path: str, load_data: bool = True) -> (Tuple[Optional[D raise ValueError(f"Error: the file {load_path} could not be found") # set device to cpu if cuda is not available - if th.cuda.is_available(): - device = th.device('cuda') - else: - device = th.device('cpu') + device = th.device('cuda') if th.cuda.is_available() else th.device('cpu') # Open the zip archive and load data try: @@ -534,20 +430,6 @@ def _load_from_file(load_path: str, load_data: bool = True) -> (Tuple[Optional[D # load the parameters with the right `map_location` params[os.path.splitext(file_path)[0]] = th.load(file_content, map_location=device) - # for backward compatibility - if params.get('params') is not None: - params_copy = {} - for name in params: - if name == 'params': - params_copy['policy'] = params[name] - elif name == 'opt': - params_copy['policy.optimizer'] = params[name] - # Special case for SAC - elif name == 'ent_coef_optimizer': - params_copy[name] = params[name] - else: - params_copy[name + '.optimizer'] = params[name] - params = params_copy except zipfile.BadZipFile: # load_path wasn't a zip file raise ValueError(f"Error: the file {load_path} wasn't a zip-file") @@ -925,7 +807,7 @@ def collect_rollouts(self, unscaled_action, _ = self.predict(obs, deterministic=False) # Rescale the action from [low, high] to [-1, 1] - scaled_action = self.scale_action(unscaled_action) + scaled_action = self.policy.scale_action(unscaled_action) if self.use_sde: # When using SDE, the action can be out of bounds @@ -941,7 +823,7 @@ def collect_rollouts(self, clipped_action = np.clip(clipped_action + action_noise(), -1, 1) # Rescale and perform action - new_obs, reward, done, infos = env.step(self.unscale_action(clipped_action)) + new_obs, reward, done, infos = 
env.step(self.policy.unscale_action(clipped_action))

             # Only stop training if return value is False, not when it is None.
             if callback.on_step() is False:
diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py
index baf54aaa3..62d14b099 100644
--- a/torchy_baselines/common/distributions.py
+++ b/torchy_baselines/common/distributions.py
@@ -33,12 +33,52 @@ def entropy(self) -> Optional[th.Tensor]:

     def sample(self) -> th.Tensor:
         """
-        returns a sample from the probabilty distribution
+        Returns a sample from the probability distribution

         :return: (th.Tensor) the stochastic action
         """
         raise NotImplementedError

+    def mode(self) -> th.Tensor:
+        """
+        Returns the most likely action (deterministic output)
+        from the probability distribution
+
+        :return: (th.Tensor) the deterministic action
+        """
+        raise NotImplementedError
+
+    def get_action(self, deterministic: bool = False) -> th.Tensor:
+        """
+        Return an action according to the probability distribution.
+
+        :param deterministic: (bool)
+        :return: (th.Tensor)
+        """
+        if deterministic:
+            return self.mode()
+        else:
+            return self.sample()
+
+    def action_from_params(self, *args, **kwargs) -> th.Tensor:
+        """
+        Returns a sample from the probability distribution
+        given its parameters.
+
+        :return: (th.Tensor) the action
+        """
+        raise NotImplementedError
+
+    def log_prob_from_params(self, *args, **kwargs) -> Tuple[th.Tensor, th.Tensor]:
+        """
+        Returns a sample and the associated log probability
+        from the probability distribution
+        given its parameters.
+
+        :return: (Tuple[th.Tensor, th.Tensor]) action and log prob
+        """
+        raise NotImplementedError
+


 def sum_independent_dims(tensor: th.Tensor) -> th.Tensor:
     """
@@ -88,23 +128,17 @@ def proba_distribution_net(self, latent_dim: int,
         return mean_actions, log_std

     def proba_distribution(self, mean_actions: th.Tensor,
-                           log_std: th.Tensor,
-                           deterministic: bool = False) -> Tuple[th.Tensor, 'DiagGaussianDistribution']:
+                           log_std: th.Tensor) -> 'DiagGaussianDistribution':
         """
-        Create and sample for the distribution given its parameters (mean, std)
+        Create the distribution given its parameters (mean, std)

         :param mean_actions: (th.Tensor)
         :param log_std: (th.Tensor)
-        :param deterministic: (bool)
-        :return: (th.Tensor)
+        :return: (DiagGaussianDistribution)
         """
         action_std = th.ones_like(mean_actions) * log_std.exp()
         self.distribution = Normal(mean_actions, action_std)
-        if deterministic:
-            action = self.mode()
-        else:
-            action = self.sample()
-        return action, self
+        return self

     def mode(self) -> th.Tensor:
         return self.distribution.mean
@@ -115,7 +149,15 @@ def sample(self) -> th.Tensor:
     def entropy(self) -> th.Tensor:
         return sum_independent_dims(self.distribution.entropy())

-    def log_prob_from_params(self, mean_actions: th.Tensor, log_std: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
+    def action_from_params(self, mean_actions: th.Tensor,
+                           log_std: th.Tensor,
+                           deterministic: bool = False) -> th.Tensor:
+        # Update the proba distribution
+        self.proba_distribution(mean_actions, log_std)
+        return self.get_action(deterministic=deterministic)
+
+    def log_prob_from_params(self, mean_actions: th.Tensor,
+                             log_std: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
         """
         Compute the log probabilty of taking an action
         given the distribution parameters.
@@ -124,7 +166,7 @@ def log_prob_from_params(self, mean_actions: th.Tensor, log_std: th.Tensor) -> T :param log_std: (th.Tensor) :return: (Tuple[th.Tensor, th.Tensor]) """ - action, _ = self.proba_distribution(mean_actions, log_std) + action = self.action_from_params(mean_actions, log_std) log_prob = self.log_prob(action) return action, log_prob @@ -156,10 +198,10 @@ def __init__(self, action_dim: int, epsilon: float = 1e-6): self.epsilon = epsilon self.gaussian_action = None - def proba_distribution(self, mean_actions, log_std, deterministic=False): - action, _ = super(SquashedDiagGaussianDistribution, self).proba_distribution(mean_actions, log_std, - deterministic) - return action, self + def proba_distribution(self, mean_actions: th.Tensor, + log_std: th.Tensor) -> 'SquashedDiagGaussianDistribution': + super(SquashedDiagGaussianDistribution, self).proba_distribution(mean_actions, log_std) + return self def mode(self) -> th.Tensor: self.gaussian_action = self.distribution.mean @@ -175,12 +217,14 @@ def sample(self) -> th.Tensor: self.gaussian_action = self.distribution.rsample() return th.tanh(self.gaussian_action) - def log_prob_from_params(self, mean_actions, log_std) -> Tuple[th.Tensor, th.Tensor]: - action, _ = self.proba_distribution(mean_actions, log_std) + def log_prob_from_params(self, mean_actions: th.Tensor, + log_std: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: + action = self.action_from_params(mean_actions, log_std) log_prob = self.log_prob(action, self.gaussian_action) return action, log_prob - def log_prob(self, action: th.Tensor, gaussian_action: Optional[th.Tensor] = None) -> th.Tensor: + def log_prob(self, action: th.Tensor, + gaussian_action: Optional[th.Tensor] = None) -> th.Tensor: # Inverse tanh # Naive implementation (not stable): 0.5 * torch.log((1 + x) / (1 - x)) # We use numpy to avoid numerical instability @@ -220,14 +264,9 @@ def proba_distribution_net(self, latent_dim: int) -> nn.Module: action_logits = nn.Linear(latent_dim, self.action_dim) return action_logits - def proba_distribution(self, action_logits: th.Tensor, - deterministic: bool = False) -> Tuple[th.Tensor, 'CategoricalDistribution']: + def proba_distribution(self, action_logits: th.Tensor) -> 'CategoricalDistribution': self.distribution = Categorical(logits=action_logits) - if deterministic: - action = self.mode() - else: - action = self.sample() - return action, self + return self def mode(self) -> th.Tensor: return th.argmax(self.distribution.probs, dim=1) @@ -238,8 +277,14 @@ def sample(self) -> th.Tensor: def entropy(self) -> th.Tensor: return self.distribution.entropy() + def action_from_params(self, action_logits: th.Tensor, + deterministic: bool = False) -> th.Tensor: + # Update the proba distribution + self.proba_distribution(action_logits) + return self.get_action(deterministic=deterministic) + def log_prob_from_params(self, action_logits: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: - action, _ = self.proba_distribution(action_logits) + action = self.action_from_params(action_logits) log_prob = self.log_prob(action) return action, log_prob @@ -283,6 +328,7 @@ def __init__(self, action_dim: int, self.weights_dist = None self.exploration_mat = None self.exploration_matrices = None + self._latent_sde = None self.use_expln = use_expln self.full_std = full_std self.epsilon = epsilon @@ -358,27 +404,26 @@ def proba_distribution_net(self, latent_dim: int, log_std_init: float = -2.0, def proba_distribution(self, mean_actions: th.Tensor, log_std: th.Tensor, - latent_sde: th.Tensor, - deterministic: 
bool = False) -> Tuple[th.Tensor, 'StateDependentNoiseDistribution']:
+                           latent_sde: th.Tensor) -> 'StateDependentNoiseDistribution':
         """
         Create and sample for the distribution given its parameters (mean, std)

         :param mean_actions: (th.Tensor)
         :param log_std: (th.Tensor)
         :param latent_sde: (th.Tensor)
-        :param deterministic: (bool)
-        :return: (Tuple[th.Tensor, Distribution])
+        :return: (StateDependentNoiseDistribution)
         """
         # Stop gradient if we don't want to influence the features
-        latent_sde = latent_sde if self.learn_features else latent_sde.detach()
-        variance = th.mm(latent_sde ** 2, self.get_std(log_std) ** 2)
+        self._latent_sde = latent_sde if self.learn_features else latent_sde.detach()
+        variance = th.mm(self._latent_sde ** 2, self.get_std(log_std) ** 2)
         self.distribution = Normal(mean_actions, th.sqrt(variance + self.epsilon))
+        return self
+
+    def get_action(self, deterministic: bool = False) -> th.Tensor:
         if deterministic:
-            action = self.mode()
+            return self.mode()
         else:
-            action = self.sample(latent_sde)
-        return action, self
+            return self.sample(self._latent_sde)

     def mode(self) -> th.Tensor:
         action = self.distribution.mean
@@ -412,10 +457,18 @@ def entropy(self) -> Optional[th.Tensor]:
             return None
         return sum_independent_dims(self.distribution.entropy())

+    def action_from_params(self, mean_actions: th.Tensor,
+                           log_std: th.Tensor,
+                           latent_sde: th.Tensor,
+                           deterministic: bool = False) -> th.Tensor:
+        # Update the proba distribution
+        self.proba_distribution(mean_actions, log_std, latent_sde)
+        return self.get_action(deterministic=deterministic)
+
     def log_prob_from_params(self, mean_actions: th.Tensor,
                              log_std: th.Tensor,
                              latent_sde: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
-        action, _ = self.proba_distribution(mean_actions, log_std, latent_sde)
+        action = self.action_from_params(mean_actions, log_std, latent_sde)
         log_prob = self.log_prob(action)
         return action, log_prob

diff --git a/torchy_baselines/common/policies.py b/torchy_baselines/common/policies.py
index 85f0b18f5..d6677d6c3 100644
--- a/torchy_baselines/common/policies.py
+++ b/torchy_baselines/common/policies.py
@@ -63,7 +63,7 @@ def init_weights(module: nn.Module, gain: float = 1):
     def forward(self, *_args, **kwargs):
         raise NotImplementedError()

-    def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
+    def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
         """
         Get the action according to the policy for a given observation.

@@ -73,21 +73,145 @@ def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Ten
         """
         raise NotImplementedError()

+    def predict(self, observation: np.ndarray,
+                state: Optional[np.ndarray] = None,
+                mask: Optional[np.ndarray] = None,
+                deterministic: bool = False) -> Tuple[np.ndarray, Optional[np.ndarray]]:
+        """
+        Get the policy action and state from an observation (and optional state).
+
+        :param observation: (np.ndarray) the input observation
+        :param state: (Optional[np.ndarray]) The last states (can be None, used in recurrent policies)
+        :param mask: (Optional[np.ndarray]) The last masks (can be None, used in recurrent policies)
+        :param deterministic: (bool) Whether or not to return deterministic actions.
+ :return: (Tuple[np.ndarray, Optional[np.ndarray]]) the model's action and the next state + (used in recurrent policies) + """ + # if state is None: + # state = self.initial_state + # if mask is None: + # mask = [False for _ in range(self.n_envs)] + observation = np.array(observation) + vectorized_env = self._is_vectorized_observation(observation, self.observation_space) + + observation = observation.reshape((-1,) + self.observation_space.shape) + observation = th.as_tensor(observation).to(self.device) + with th.no_grad(): + actions = self._predict(observation, deterministic=deterministic) + # Convert to numpy + actions = actions.cpu().numpy() + + # Rescale to proper domain when using squashing + if isinstance(self.action_space, gym.spaces.Box) and self.squash_output: + actions = self.unscale_action(actions) + + clipped_actions = actions + # Clip the actions to avoid out of bound error when using gaussian distribution + if isinstance(self.action_space, gym.spaces.Box) and not self.squash_output: + clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) + + if not vectorized_env: + if state is not None: + raise ValueError("Error: The environment must be vectorized when using recurrent policies.") + clipped_actions = clipped_actions[0] + + return clipped_actions, state + + def scale_action(self, action: np.ndarray) -> np.ndarray: + """ + Rescale the action from [low, high] to [-1, 1] + (no need for symmetric action space) + + :param action: (np.ndarray) Action to scale + :return: (np.ndarray) Scaled action + """ + low, high = self.action_space.low, self.action_space.high + return 2.0 * ((action - low) / (high - low)) - 1.0 + + def unscale_action(self, scaled_action: np.ndarray) -> np.ndarray: + """ + Rescale the action from [-1, 1] to [low, high] + (no need for symmetric action space) + + :param scaled_action: Action to un-scale + """ + low, high = self.action_space.low, self.action_space.high + return low + (0.5 * (scaled_action + 1.0) * (high - low)) + + @staticmethod + def _is_vectorized_observation(observation: np.ndarray, observation_space: gym.spaces.Space) -> bool: + """ + For every observation type, detects and validates the shape, + then returns whether or not the observation is vectorized. + + :param observation: (np.ndarray) the input observation to validate + :param observation_space: (gym.spaces) the observation space + :return: (bool) whether the given observation is vectorized or not + """ + if isinstance(observation_space, gym.spaces.Box): + if observation.shape == observation_space.shape: + return False + elif observation.shape[1:] == observation_space.shape: + return True + else: + raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) + + "Box environment, please use {} ".format(observation_space.shape) + + "or (n_env, {}) for the observation shape." 
+ .format(", ".join(map(str, observation_space.shape)))) + elif isinstance(observation_space, gym.spaces.Discrete): + if observation.shape == (): # A numpy array of a number, has shape empty tuple '()' + return False + elif len(observation.shape) == 1: + return True + else: + raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) + + "Discrete environment, please use (1,) or (n_env, 1) for the observation shape.") + # TODO: add support for MultiDiscrete and MultiBinary observation spaces + # elif isinstance(observation_space, gym.spaces.MultiDiscrete): + # if observation.shape == (len(observation_space.nvec),): + # return False + # elif len(observation.shape) == 2 and observation.shape[1] == len(observation_space.nvec): + # return True + # else: + # raise ValueError("Error: Unexpected observation shape {} for MultiDiscrete ".format(observation.shape) + + # "environment, please use ({},) or ".format(len(observation_space.nvec)) + + # "(n_env, {}) for the observation shape.".format(len(observation_space.nvec))) + # elif isinstance(observation_space, gym.spaces.MultiBinary): + # if observation.shape == (observation_space.n,): + # return False + # elif len(observation.shape) == 2 and observation.shape[1] == observation_space.n: + # return True + # else: + # raise ValueError("Error: Unexpected observation shape {} for MultiBinary ".format(observation.shape) + + # "environment, please use ({},) or ".format(observation_space.n) + + # "(n_env, {}) for the observation shape.".format(observation_space.n)) + else: + raise ValueError("Error: Cannot determine if the observation is vectorized with the space type {}." + .format(observation_space)) + + def save(self, path: str) -> None: """ - Save model to a given location. + Save policy weights to a given location. + NOTE: we don't save policy parameters :param path: (str) """ + previous_device = self.device + # Convert to cpu before saving + self = self.to('cpu') th.save(self.state_dict(), path) + self = self.to(previous_device) def load(self, path: str) -> None: """ - Load saved model from path. + Load policy weights from path. 
+        NOTE: only the policy weights (state_dict) are loaded, not the optimizer state or hyperparameters.

         :param path: (str)
         """
         self.load_state_dict(th.load(path))
+        self = self.to(self.device)

     def load_from_vector(self, vector: np.ndarray):
         """
diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py
index 216790ccb..1589247db 100644
--- a/torchy_baselines/ppo/policies.py
+++ b/torchy_baselines/ppo/policies.py
@@ -156,9 +156,9 @@ def forward(self, obs: th.Tensor,
         latent_pi, latent_vf, latent_sde = self._get_latent(obs)
         # Evaluate the values for the given observations
         value = self.value_net(latent_vf)
-        action, action_distribution = self._get_action_dist_from_latent(latent_pi, latent_sde=latent_sde,
-                                                                        deterministic=deterministic)
-        log_prob = action_distribution.log_prob(action)
+        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde=latent_sde)
+        action = distribution.get_action(deterministic=deterministic)
+        log_prob = distribution.log_prob(action)
         return action, value, log_prob

     def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
@@ -180,33 +180,29 @@ def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
         return latent_pi, latent_vf, latent_sde

     def _get_action_dist_from_latent(self, latent_pi: th.Tensor,
-                                     latent_sde: Optional[th.Tensor] = None,
-                                     deterministic: bool = False) -> Tuple[th.Tensor, Distribution]:
+                                     latent_sde: Optional[th.Tensor] = None) -> Distribution:
         """
-        Retrieve action and associated action distribution
-        given the latent codes.
+        Retrieve action distribution given the latent codes.

         :param latent_pi: (th.Tensor) Latent code for the actor
         :param latent_sde: (Optional[th.Tensor]) Latent code for the SDE exploration function
-        :param deterministic: (bool) Whether to sample or use deterministic actions
-        :return: (Tuple[th.Tensor, Distribution]) Action and action distribution
+        :return: (Distribution) Action distribution
         """
         mean_actions = self.action_net(latent_pi)

         if isinstance(self.action_dist, DiagGaussianDistribution):
-            return self.action_dist.proba_distribution(mean_actions, self.log_std, deterministic=deterministic)
+            return self.action_dist.proba_distribution(mean_actions, self.log_std)
         elif isinstance(self.action_dist, CategoricalDistribution):
             # Here mean_actions are the logits before the softmax
-            return self.action_dist.proba_distribution(mean_actions, deterministic=deterministic)
+            return self.action_dist.proba_distribution(action_logits=mean_actions)
         elif isinstance(self.action_dist, StateDependentNoiseDistribution):
-            return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde,
-                                                       deterministic=deterministic)
+            return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde)
         else:
             raise ValueError('Invalid action distribution')

-    def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
+    def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
         """
         Get the action according to the policy for a given observation.
@@ -215,27 +211,25 @@ def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Ten :return: (th.Tensor) Taken action according to the policy """ latent_pi, _, latent_sde = self._get_latent(observation) - action, _ = self._get_action_dist_from_latent(latent_pi, latent_sde, deterministic=deterministic) - return action + distribution = self._get_action_dist_from_latent(latent_pi, latent_sde) + return distribution.get_action(deterministic=deterministic) def evaluate_actions(self, obs: th.Tensor, - actions: th.Tensor, - deterministic: bool = False) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: + actions: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: """ Evaluate actions according to the current policy, given the observations. :param obs: (th.Tensor) :param actions: (th.Tensor) - :param deterministic: (bool) :return: (th.Tensor, th.Tensor, th.Tensor) estimated value, log likelihood of taking those actions and entropy of the action distribution. """ latent_pi, latent_vf, latent_sde = self._get_latent(obs) - _, action_distribution = self._get_action_dist_from_latent(latent_pi, latent_sde, deterministic=deterministic) - log_prob = action_distribution.log_prob(actions) + distribution = self._get_action_dist_from_latent(latent_pi, latent_sde) + log_prob = distribution.log_prob(actions) values = self.value_net(latent_vf) - return values, log_prob, action_distribution.entropy() + return values, log_prob, distribution.entropy() MlpPolicy = PPOPolicy diff --git a/torchy_baselines/sac/policies.py b/torchy_baselines/sac/policies.py index db0f1a9aa..f561c6e6b 100644 --- a/torchy_baselines/sac/policies.py +++ b/torchy_baselines/sac/policies.py @@ -108,14 +108,11 @@ def reset_noise(self, batch_size: int = 1) -> None: 'reset_noise() is only available when using SDE' self.action_dist.sample_weights(self.log_std, batch_size=batch_size) - def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: + def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: features = self.extract_features(obs) latent_pi = self.latent_pi(features) latent_sde = self.sde_features_extractor(features) if self.sde_features_extractor is not None else latent_pi - return latent_pi, latent_sde - def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: - latent_pi, latent_sde = self._get_latent(obs) mean_actions = self.mu(latent_pi) if self.use_sde: @@ -130,9 +127,8 @@ def forward(self, obs: th.Tensor, deterministic: bool = False) -> th.Tensor: mean_actions, log_std, latent_sde = self.get_action_dist_params(obs) kwargs = dict(latent_sde=latent_sde) if self.use_sde else {} # Note: the action is squashed - action, _ = self.action_dist.proba_distribution(mean_actions, log_std, - deterministic=deterministic, **kwargs) - return action + return self.action_dist.action_from_params(mean_actions, log_std, + deterministic=deterministic, **kwargs) def action_log_prob(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: mean_actions, log_std, latent_sde = self.get_action_dist_params(obs) @@ -268,7 +264,7 @@ def make_critic(self) -> Critic: def forward(self, obs: th.Tensor) -> th.Tensor: return self.predict(obs, deterministic=False) - def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: + def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: return self.actor(observation, deterministic) diff --git a/torchy_baselines/td3/policies.py b/torchy_baselines/td3/policies.py index 
8b24832c5..c10e4f2fc 100644 --- a/torchy_baselines/td3/policies.py +++ b/torchy_baselines/td3/policies.py @@ -104,11 +104,6 @@ def get_std(self) -> th.Tensor: """ return self.action_dist.get_std(self.log_std) - def _get_action_dist_from_latent(self, latent_pi: th.Tensor, - latent_sde: th.Tensor) -> Tuple[th.Tensor, Distribution]: - mean_actions = self.mu(latent_pi) - return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde) - def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: features = self.extract_features(obs) latent_pi = self.latent_pi(features) @@ -126,9 +121,9 @@ def evaluate_actions(self, obs: th.Tensor, action: th.Tensor) -> Tuple[th.Tensor and entropy of the action distribution. """ latent_pi, latent_sde = self._get_latent(obs) - _, distribution = self._get_action_dist_from_latent(latent_pi, latent_sde) + mean_actions = self.mu(latent_pi) + distribution = self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde) log_prob = distribution.log_prob(action) - # value = self.value_net(latent_vf) return log_prob, distribution.entropy() def reset_noise(self) -> None: @@ -150,8 +145,6 @@ def forward(self, obs: th.Tensor, deterministic: bool = True) -> th.Tensor: # -> set squash_output=True in the action_dist? # NOTE: the clipping is done in the rollout for now return self.mu(latent_pi) + noise - # action, _ = self._get_action_dist_from_latent(latent_pi) - # return action else: features = self.extract_features(obs) return self.mu(features) @@ -338,9 +331,9 @@ def make_critic(self) -> Critic: return Critic(**self.net_args).to(self.device) def forward(self, observation: th.Tensor, deterministic: bool = False): - return self.predict(observation, deterministic=deterministic) + return self._predict(observation, deterministic=deterministic) - def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: + def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: return self.actor(observation, deterministic=deterministic) diff --git a/torchy_baselines/version.txt b/torchy_baselines/version.txt index db79cdc82..892c7ae8c 100644 --- a/torchy_baselines/version.txt +++ b/torchy_baselines/version.txt @@ -1 +1 @@ -0.4.0a2 +0.4.0a3
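
Usage note for reviewers: the sketch below shows the two APIs this patch introduces, policy-only save/load/predict and the two-step distribution API. It is a minimal illustration, not part of the patch: the choice of `TD3`, `Pendulum-v0` and the file name are assumptions, while `model.policy`, `policy.save()`, `policy.load()`, `policy.predict()`, `proba_distribution()` and `get_action()` are taken from the diff above.

```python
# Illustrative sketch only, not part of the patch.
# Assumptions: TD3 with an MlpPolicy on Pendulum-v0 and the file name are arbitrary choices.
import gym
import torch as th

from torchy_baselines import TD3
from torchy_baselines.common.distributions import DiagGaussianDistribution

# --- Policy-only saving/loading ---
env = gym.make('Pendulum-v0')
model = TD3('MlpPolicy', env)
model.learn(total_timesteps=1000)

model.policy.save('./policy_weights.pkl')   # saves only the policy state_dict

# Re-create a model with the same architecture, then load the weights into its policy
new_model = TD3('MlpPolicy', gym.make('Pendulum-v0'))
new_model.policy.load('./policy_weights.pkl')

obs = env.reset()
# predict() now lives on the policy, so no model object is needed at inference time
action, _ = new_model.policy.predict(obs, deterministic=True)

# --- Two-step distribution API ---
# proba_distribution() now only parametrizes the distribution and returns it;
# sampling and log-prob are explicit follow-up calls.
dist = DiagGaussianDistribution(2)
mean_actions, log_std = th.zeros(8, 2), th.zeros(2)
dist = dist.proba_distribution(mean_actions, log_std)
actions = dist.get_action(deterministic=False)
log_prob = dist.log_prob(actions)
```

Note that `policy.load()` is an in-place instance method: it only fills an existing policy's `state_dict`, so the target policy must already be built with the same architecture.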