Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] RB MultiStep transform #2008

Merged
merged 10 commits into from
Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/source/reference/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -823,3 +823,11 @@ Utils
consolidate_spec
check_no_exclusive_keys
contains_lazy_spec

.. currentmodule:: torchrl.envs.transforms.rb_transforms

.. autosummary::
:toctree: generated/
:template: rl_template.rst

MultiStepTransform
44 changes: 22 additions & 22 deletions test/test_cost.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,7 +547,7 @@ def test_dqn_state_dict(self, delay_value, device, action_spec_type):
loss_fn2 = DQNLoss(actor, loss_function="l2", delay_value=delay_value)
loss_fn2.load_state_dict(sd)

@pytest.mark.parametrize("n", range(4))
@pytest.mark.parametrize("n", range(1, 4))
@pytest.mark.parametrize("delay_value", (False, True))
@pytest.mark.parametrize("device", get_default_devices())
@pytest.mark.parametrize("action_spec_type", ("one_hot", "categorical"))
Expand Down Expand Up @@ -579,7 +579,7 @@ def test_dqn_batcher(self, n, delay_value, device, action_spec_type, gamma=0.9):

with torch.no_grad():
loss = loss_fn(td)
if n == 0:
if n == 1:
assert_allclose_td(td, ms_td.select(*td.keys(True, True)))
_loss = sum(
[item for name, item in loss.items() if name.startswith("loss")]
Expand Down Expand Up @@ -1125,7 +1125,7 @@ def test_qmixer_state_dict(self, delay_value, device, action_spec_type):
loss_fn2 = QMixerLoss(actor, mixer, loss_function="l2", delay_value=delay_value)
loss_fn2.load_state_dict(sd)

@pytest.mark.parametrize("n", range(4))
@pytest.mark.parametrize("n", range(1, 4))
@pytest.mark.parametrize("delay_value", (False, True))
@pytest.mark.parametrize("device", get_default_devices())
@pytest.mark.parametrize("action_spec_type", ("one_hot", "categorical"))
Expand Down Expand Up @@ -1158,7 +1158,7 @@ def test_qmix_batcher(self, n, delay_value, device, action_spec_type, gamma=0.9)

with torch.no_grad():
loss = loss_fn(td)
if n == 0:
if n == 1:
assert_allclose_td(td, ms_td.select(*td.keys(True, True)))
_loss = sum(
[item for name, item in loss.items() if name.startswith("loss")]
Expand Down Expand Up @@ -1801,7 +1801,7 @@ def test_ddpg_separate_losses(
raise NotImplementedError(k)
loss_fn.zero_grad()

@pytest.mark.parametrize("n", list(range(4)))
@pytest.mark.parametrize("n", range(1, 4))
@pytest.mark.parametrize("device", get_default_devices())
@pytest.mark.parametrize("delay_actor,delay_value", [(False, False), (True, True)])
def test_ddpg_batcher(self, n, delay_actor, delay_value, device, gamma=0.9):
Expand Down Expand Up @@ -1832,7 +1832,7 @@ def test_ddpg_batcher(self, n, delay_actor, delay_value, device, gamma=0.9):

with torch.no_grad():
loss = loss_fn(td)
if n == 0:
if n == 1:
assert_allclose_td(td, ms_td.select(*list(td.keys(True, True))))
_loss = sum(
[item for name, item in loss.items() if name.startswith("loss_")]
Expand Down Expand Up @@ -2433,7 +2433,7 @@ def test_td3_separate_losses(
loss_fn.zero_grad()

@pytest.mark.skipif(not _has_functorch, reason="functorch not installed")
@pytest.mark.parametrize("n", list(range(4)))
@pytest.mark.parametrize("n", range(1, 4))
@pytest.mark.parametrize("device", get_default_devices())
@pytest.mark.parametrize("delay_actor,delay_qvalue", [(False, False), (True, True)])
@pytest.mark.parametrize("policy_noise", [0.1, 1.0])
Expand Down Expand Up @@ -2479,7 +2479,7 @@ def test_td3_batcher(
np.random.seed(0)
loss = loss_fn(td)

if n == 0:
if n == 1:
assert_allclose_td(td, ms_td.select(*list(td.keys(True, True))))
_loss = sum(
[item for name, item in loss.items() if name.startswith("loss_")]
Expand Down Expand Up @@ -3228,7 +3228,7 @@ def test_sac_separate_losses(
raise NotImplementedError(k)
loss_fn.zero_grad()

@pytest.mark.parametrize("n", list(range(4)))
@pytest.mark.parametrize("n", range(1, 4))
@pytest.mark.parametrize("delay_value", (True, False))
@pytest.mark.parametrize("delay_actor", (True, False))
@pytest.mark.parametrize("delay_qvalue", (True, False))
Expand Down Expand Up @@ -3292,7 +3292,7 @@ def test_sac_batcher(
torch.manual_seed(0) # log-prob is computed with a random action
np.random.seed(0)
loss = loss_fn(td)
if n == 0:
if n == 1:
assert_allclose_td(td, ms_td.select(*list(td.keys(True, True))))
_loss = sum(
[item for name, item in loss.items() if name.startswith("loss_")]
Expand Down Expand Up @@ -3927,7 +3927,7 @@ def test_discrete_sac_state_dict(
)
loss_fn2.load_state_dict(sd)

@pytest.mark.parametrize("n", list(range(4)))
@pytest.mark.parametrize("n", range(1, 4))
@pytest.mark.parametrize("delay_qvalue", (True, False))
@pytest.mark.parametrize("num_qvalue", [2])
@pytest.mark.parametrize("device", get_default_devices())
Expand Down Expand Up @@ -3983,7 +3983,7 @@ def test_discrete_sac_batcher(
torch.manual_seed(0) # log-prob is computed with a random action
np.random.seed(0)
loss = loss_fn(td)
if n == 0:
if n == 1:
assert_allclose_td(td, ms_td.select(*list(td.keys(True, True))))
_loss = sum(
[item for name, item in loss.items() if name.startswith("loss_")]
Expand Down Expand Up @@ -4871,7 +4871,7 @@ def test_redq_batched(self, delay_qvalue, num_qvalue, device, td_est):
# TODO: find a way to compare the losses: problem is that we sample actions either sequentially or in batch,
# so setting seed has little impact

@pytest.mark.parametrize("n", list(range(4)))
@pytest.mark.parametrize("n", range(1, 4))
@pytest.mark.parametrize("delay_qvalue", (True, False))
@pytest.mark.parametrize("num_qvalue", [1, 2, 4, 8])
@pytest.mark.parametrize("device", get_default_devices())
Expand Down Expand Up @@ -4914,7 +4914,7 @@ def test_redq_batcher(self, n, delay_qvalue, num_qvalue, device, gamma=0.9):
torch.manual_seed(0) # log-prob is computed with a random action
np.random.seed(0)
loss = loss_fn(td)
if n == 0:
if n == 1:
assert_allclose_td(td, ms_td.select(*list(td.keys(True, True))))
_loss = sum(
[item for name, item in loss.items() if name.startswith("loss_")]
Expand Down Expand Up @@ -5482,7 +5482,7 @@ def test_cql_state_dict(
)
loss_fn2.load_state_dict(sd)

@pytest.mark.parametrize("n", list(range(4)))
@pytest.mark.parametrize("n", range(1, 4))
@pytest.mark.parametrize("delay_actor", (True, False))
@pytest.mark.parametrize("delay_qvalue", (True, False))
@pytest.mark.parametrize("max_q_backup", [True, False])
Expand Down Expand Up @@ -5537,7 +5537,7 @@ def test_cql_batcher(
torch.manual_seed(0) # log-prob is computed with a random action
np.random.seed(0)
loss = loss_fn(td)
if n == 0:
if n == 1:
assert_allclose_td(td, ms_td.select(*list(td.keys(True, True))))
_loss = sum(
[item for name, item in loss.items() if name.startswith("loss_")]
Expand Down Expand Up @@ -5843,7 +5843,7 @@ def test_dcql_state_dict(self, delay_value, device, action_spec_type):
loss_fn2 = DiscreteCQLLoss(actor, loss_function="l2", delay_value=delay_value)
loss_fn2.load_state_dict(sd)

@pytest.mark.parametrize("n", range(4))
@pytest.mark.parametrize("n", range(1, 4))
@pytest.mark.parametrize("delay_value", (False, True))
@pytest.mark.parametrize("device", get_default_devices())
@pytest.mark.parametrize("action_spec_type", ("one_hot", "categorical"))
Expand Down Expand Up @@ -5874,7 +5874,7 @@ def test_dcql_batcher(self, n, delay_value, device, action_spec_type, gamma=0.9)

with torch.no_grad():
loss = loss_fn(td)
if n == 0:
if n == 1:
assert_allclose_td(td, ms_td.select(*td.keys(True, True)))
_loss = sum([item for key, item in loss.items() if key.startswith("loss_")])
_loss_ms = sum(
Expand Down Expand Up @@ -9356,7 +9356,7 @@ def test_iql_separate_losses(self, separate_losses):
raise NotImplementedError(k)
loss_fn.zero_grad()

@pytest.mark.parametrize("n", list(range(4)))
@pytest.mark.parametrize("n", range(1, 4))
@pytest.mark.parametrize("num_qvalue", [1, 2, 4, 8])
@pytest.mark.parametrize("temperature", [0.0, 0.1, 1.0, 10.0])
@pytest.mark.parametrize("expectile", [0.1, 0.5, 1.0])
Expand Down Expand Up @@ -9407,7 +9407,7 @@ def test_iql_batcher(
torch.manual_seed(0) # log-prob is computed with a random action
np.random.seed(0)
loss = loss_fn(td)
if n == 0:
if n == 1:
assert_allclose_td(td, ms_td.select(*list(td.keys(True, True))))
_loss = sum(
[item for name, item in loss.items() if name.startswith("loss_")]
Expand Down Expand Up @@ -10168,7 +10168,7 @@ def test_discrete_iql_separate_losses(self, separate_losses):
raise NotImplementedError(k)
loss_fn.zero_grad()

@pytest.mark.parametrize("n", list(range(4)))
@pytest.mark.parametrize("n", range(1, 4))
@pytest.mark.parametrize("num_qvalue", [1, 2, 4, 8])
@pytest.mark.parametrize("temperature", [0.0, 0.1, 1.0, 10.0])
@pytest.mark.parametrize("expectile", [0.1, 0.5])
Expand Down Expand Up @@ -10219,7 +10219,7 @@ def test_discrete_iql_batcher(
torch.manual_seed(0) # log-prob is computed with a random action
np.random.seed(0)
loss = loss_fn(td)
if n == 0:
if n == 1:
assert_allclose_td(td, ms_td.select(*list(td.keys(True, True))))
_loss = sum(
[item for name, item in loss.items() if name.startswith("loss_")]
Expand Down
47 changes: 13 additions & 34 deletions test/test_postprocs.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from torchrl.data.postprocs.postprocs import MultiStep


@pytest.mark.parametrize("n", range(13))
@pytest.mark.parametrize("n", range(1, 14))
@pytest.mark.parametrize("device", get_default_devices())
@pytest.mark.parametrize("key", ["observation", "pixels", "observation_whatever"])
def test_multistep(n, key, device, T=11):
Expand Down Expand Up @@ -58,7 +58,7 @@ def test_multistep(n, key, device, T=11):

assert ms_tensordict.get("done").max() == 1

if n == 0:
if n == 1:
assert_allclose_td(
tensordict, ms_tensordict.select(*list(tensordict.keys(True, True)))
)
Expand All @@ -76,20 +76,18 @@ def test_multistep(n, key, device, T=11):
)

# check that next obs is properly replaced, or that it is terminated
next_obs = ms_tensordict.get(key)[:, (1 + ms.n_steps) :]
true_next_obs = ms_tensordict.get(("next", key))[:, : -(1 + ms.n_steps)]
next_obs = ms_tensordict.get(key)[:, (ms.n_steps) :]
true_next_obs = ms_tensordict.get(("next", key))[:, : -(ms.n_steps)]
terminated = ~ms_tensordict.get("nonterminal")
assert (
(next_obs == true_next_obs).all(-1) | terminated[:, (1 + ms.n_steps) :]
).all()
assert ((next_obs == true_next_obs).all(-1) | terminated[:, (ms.n_steps) :]).all()

# test gamma computation
torch.testing.assert_close(
ms_tensordict.get("gamma"), ms.gamma ** ms_tensordict.get("steps_to_next_obs")
)

# test reward
if n > 0:
if n > 1:
assert (
ms_tensordict.get(("next", "reward"))
!= ms_tensordict.get(("next", "original_reward"))
Expand All @@ -105,36 +103,17 @@ def test_multistep(n, key, device, T=11):
@pytest.mark.parametrize(
"batch_size",
[
[
4,
],
[4],
[],
[
1,
],
[1],
[2, 3],
],
)
@pytest.mark.parametrize(
"T",
[
10,
1,
2,
],
)
@pytest.mark.parametrize(
"obs_dim",
[
[
1,
],
[],
],
)
@pytest.mark.parametrize("T", [10, 1, 2])
@pytest.mark.parametrize("obs_dim", [[1], []])
@pytest.mark.parametrize("unsq_reward", [True, False])
@pytest.mark.parametrize("last_done", [True, False])
@pytest.mark.parametrize("n_steps", [3, 1, 0])
@pytest.mark.parametrize("n_steps", [4, 2, 1])
def test_mutistep_cattrajs(
batch_size, T, obs_dim, unsq_reward, last_done, device, n_steps
):
Expand Down Expand Up @@ -166,7 +145,7 @@ def test_mutistep_cattrajs(
)
ms = MultiStep(0.98, n_steps)
tdm = ms(td)
if n_steps == 0:
if n_steps == 1:
# n_steps = 1 has no effect (single-step return leaves the data unchanged)
for k in td["next"].keys():
assert (tdm["next", k] == td["next", k]).all()
Expand All @@ -179,7 +158,7 @@ def test_mutistep_cattrajs(
if unsq_reward:
done = done.squeeze(-1)
for t in range(T):
idx = t + n_steps
idx = t + n_steps - 1
while (done[..., t:idx].any() and idx > t) or idx > done.shape[-1] - 1:
idx = idx - 1
next_obs.append(obs[..., idx])
Expand Down
Loading
Loading