diff --git a/torchrl/envs/batched_envs.py b/torchrl/envs/batched_envs.py
index 0851ed15fd4..ac0a136c7f9 100644
--- a/torchrl/envs/batched_envs.py
+++ b/torchrl/envs/batched_envs.py
@@ -732,6 +732,32 @@ class ParallelEnv(_BatchedEnv):
     """
 
     __doc__ += _BatchedEnv.__doc__
+    __doc__ += """
+
+    .. note::
+        The choice of the devices where ParallelEnv is executed can
+        drastically influence its performance. The rules of thumb are:
+
+          - If the base environment (backend, e.g., Gym) is executed on CPU, the
+            sub-environments should be executed on CPU and the data should be
+            passed via shared physical memory.
+          - If the base environment is (or can be) executed on CUDA, the
+            sub-environments should be placed on CUDA too.
+          - If a CUDA device is available and the policy is to be executed on CUDA,
+            the ParallelEnv device should be set to CUDA.
+
+        Therefore, supposing a CUDA device is available, we have the following scenarios:
+
+        >>> # The sub-envs are executed on CPU, but the policy is on GPU
+        >>> env = ParallelEnv(N, MyEnv(..., device="cpu"), device="cuda")
+        >>> # The sub-envs are executed on CUDA
+        >>> env = ParallelEnv(N, MyEnv(..., device="cuda"), device="cuda")
+        >>> # This will create the exact same environment as the previous one
+        >>> env = ParallelEnv(N, MyEnv(..., device="cuda"))
+        >>> # If no CUDA device is available
+        >>> env = ParallelEnv(N, MyEnv(..., device="cpu"))
+
+    """
 
     def _start_workers(self) -> None:
         from torchrl.envs.env_creator import EnvCreator
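
Below is a minimal usage sketch (not part of the diff) of the rule of thumb the new note describes: with a CPU-bound backend such as Gym, the sub-environments stay on CPU and the ParallelEnv device is set to CUDA only when a GPU is available for the policy. The worker count and the Pendulum-v1 task are illustrative choices; GymEnv is torchrl's Gym wrapper.

    import torch
    from torchrl.envs import ParallelEnv
    from torchrl.envs.libs.gym import GymEnv

    num_workers = 4  # illustrative worker count

    # Gym steps on CPU, so each sub-env runs on CPU; the batched output is
    # delivered on CUDA (where the policy lives) only if a GPU is present.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    env = ParallelEnv(
        num_workers,
        lambda: GymEnv("Pendulum-v1", device="cpu"),
        device=device,
    )
    rollout = env.rollout(3)  # resulting tensordict lands on `device`
    env.close()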