From ab39e4b8244be2bcfadcecd09649a68508b3667d Mon Sep 17 00:00:00 2001
From: DESKTOP-42S1K65
Date: Mon, 20 Mar 2023 10:53:57 -0400
Subject: [PATCH 01/13] Set the state device dependent on the Accelerator on multigpu

---
 src/accelerate/accelerator.py   | 11 ++++++++++-
 src/accelerate/checkpointing.py | 13 +++++++++++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index c2423855228..79f9e06e47f 100644
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -2385,8 +2385,17 @@ def load_state(self, input_dir: str, **load_model_func_kwargs):
         for hook in self._load_model_state_pre_hook.values():
             hook(models, input_dir)
 
+        optimizer_device = "cpu" if self.num_processes < 2 else self.device
+
         load_accelerator_state(
-            input_dir, models, optimizers, schedulers, self.state.process_index, self.scaler, **load_model_func_kwargs
+            input_dir,
+            models,
+            optimizers,
+            schedulers,
+            self.state.process_index,
+            self.scaler,
+            optimizer_device,
+            **load_model_func_kwargs,
         )
         custom_checkpoints = [f for f in os.listdir(input_dir) if "custom_checkpoint" in f]
         if len(custom_checkpoints) != len(self._custom_objects):
diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
index 7215035f9d1..02cac9f2020 100644
--- a/src/accelerate/checkpointing.py
+++ b/src/accelerate/checkpointing.py
@@ -110,7 +110,14 @@ def save_accelerator_state(
 
 
 def load_accelerator_state(
-    input_dir, models, optimizers, schedulers, process_index, scaler=None, **load_model_func_kwargs
+    input_dir,
+    models,
+    optimizers,
+    schedulers,
+    process_index,
+    scaler=None,
+    optimizer_map_location="cpu",
+    **load_model_func_kwargs,
 ):
     """
     Loads states of the models, optimizers, scaler, and RNG generators from a given directory.
@@ -128,6 +135,8 @@ def load_accelerator_state(
             The current process index in the Accelerator state
         scaler (`torch.cuda.amp.GradScaler`, *optional*):
             An optional *GradScaler* instance to load
+        optimizer_map_location (`torch.device`, *optional*):
+            What device to load the optimizer state onto. By default uses "cpu".
         load_model_func_kwargs (`dict`, *optional*):
             Additional arguments that can be passed to the model's `load_state_dict` method.
     """
@@ -142,7 +151,7 @@ def load_accelerator_state(
     for i, opt in enumerate(optimizers):
         optimizer_name = f"{OPTIMIZER_NAME}.bin" if i == 0 else f"{OPTIMIZER_NAME}_{i}.bin"
         input_optimizer_file = os.path.join(input_dir, optimizer_name)
-        optimizers[i].load_state_dict(torch.load(input_optimizer_file, map_location="cpu"))
+        optimizers[i].load_state_dict(torch.load(input_optimizer_file, map_location=optimizer_map_location))
     logger.info("All optimizer states loaded successfully")
 
     # Scheduler states

From e862ccd04382c143d0c32045806de02780c00e80 Mon Sep 17 00:00:00 2001
From: DESKTOP-42S1K65
Date: Mon, 20 Mar 2023 10:57:42 -0400
Subject: [PATCH 02/13] Use map location

---
 src/accelerate/accelerator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index 79f9e06e47f..ffc80c1155e 100644
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -2385,7 +2385,7 @@ def load_state(self, input_dir: str, **load_model_func_kwargs):
         for hook in self._load_model_state_pre_hook.values():
             hook(models, input_dir)
 
-        optimizer_device = "cpu" if self.num_processes < 2 else self.device
+        optimizer_map_location = "cpu" if self.num_processes < 2 else self.device
 
         load_accelerator_state(
             input_dir,
@@ -2394,7 +2394,7 @@ def load_state(self, input_dir: str, **load_model_func_kwargs):
             schedulers,
             self.state.process_index,
             self.scaler,
-            optimizer_device,
+            optimizer_map_location,
             **load_model_func_kwargs,
         )
         custom_checkpoints = [f for f in os.listdir(input_dir) if "custom_checkpoint" in f]

From 43990678888361eeca43a7fa50d17ec87c11dddc Mon Sep 17 00:00:00 2001
From: DESKTOP-42S1K65
Date: Mon, 20 Mar 2023 11:39:37 -0400
Subject: [PATCH 03/13] Use on_device and PartialState

---
 src/accelerate/accelerator.py   |  2 +-
 src/accelerate/checkpointing.py | 15 ++++++++++++---
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index ffc80c1155e..4a81cdfcde3 100644
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -2385,7 +2385,7 @@ def load_state(self, input_dir: str, **load_model_func_kwargs):
         for hook in self._load_model_state_pre_hook.values():
             hook(models, input_dir)
 
-        optimizer_map_location = "cpu" if self.num_processes < 2 else self.device
+        optimizer_map_location = "on_device" if self.num_processes > 1 else "cpu"
 
         load_accelerator_state(
             input_dir,
diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
index 02cac9f2020..c4027e7d6c5 100644
--- a/src/accelerate/checkpointing.py
+++ b/src/accelerate/checkpointing.py
@@ -21,6 +21,7 @@
 import torch
 from torch.cuda.amp import GradScaler
 
+from .state import PartialState
 from .utils import (
     MODEL_NAME,
     OPTIMIZER_NAME,
@@ -116,7 +117,7 @@ def load_accelerator_state(
     schedulers,
     process_index,
     scaler=None,
-    optimizer_map_location="cpu",
+    optimizer_map_location=None,
     **load_model_func_kwargs,
 ):
     """
@@ -135,8 +136,8 @@ def load_accelerator_state(
             The current process index in the Accelerator state
         scaler (`torch.cuda.amp.GradScaler`, *optional*):
             An optional *GradScaler* instance to load
-        optimizer_map_location (`torch.device`, *optional*):
-            What device to load the optimizer state onto. By default uses "cpu".
+        optimizer_map_location (`str`, *optional*):
+            What device to load the optimizer state onto. Should be one of either "cpu" or "on_device".
         load_model_func_kwargs (`dict`, *optional*):
             Additional arguments that can be passed to the model's `load_state_dict` method.
     """
@@ -148,6 +149,14 @@ def load_accelerator_state(
     logger.info("All model weights loaded successfully")
 
     # Optimizer states
+    if optimizer_map_location not in [None, "cpu", "on_device"]:
+        raise TypeError(
+            "Unsupported optimizer map location passed, please choose one of `None`, `cpu`, or `on_device`"
+        )
+    if optimizer_map_location is None:
+        optimizer_map_location = "cpu"
+    elif optimizer_map_location == "on_device":
+        optimizer_map_location = PartialState().device
     for i, opt in enumerate(optimizers):
         optimizer_name = f"{OPTIMIZER_NAME}.bin" if i == 0 else f"{OPTIMIZER_NAME}_{i}.bin"
         input_optimizer_file = os.path.join(input_dir, optimizer_name)

From 480e9e3016801c8f352ba7214d35bfbc38aad302 Mon Sep 17 00:00:00 2001
From: DESKTOP-42S1K65
Date: Mon, 20 Mar 2023 11:44:36 -0400
Subject: [PATCH 04/13] Move import

---
 src/accelerate/checkpointing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
index c4027e7d6c5..52a28cfb2c9 100644
--- a/src/accelerate/checkpointing.py
+++ b/src/accelerate/checkpointing.py
@@ -21,7 +21,6 @@
 import torch
 from torch.cuda.amp import GradScaler
 
-from .state import PartialState
 from .utils import (
     MODEL_NAME,
     OPTIMIZER_NAME,
@@ -38,6 +37,7 @@
 import torch_xla.core.xla_model as xm
 
 from .logging import get_logger
+from .state import PartialState
 
 
 logger = get_logger(__name__)

From 1e6f5122d232baa7f5a506824e76d22b388bc65a Mon Sep 17 00:00:00 2001
From: DESKTOP-42S1K65
Date: Tue, 21 Mar 2023 08:41:14 -0400
Subject: [PATCH 05/13] Check for multi-gpu

---
 src/accelerate/checkpointing.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
index 52a28cfb2c9..3618e31c923 100644
--- a/src/accelerate/checkpointing.py
+++ b/src/accelerate/checkpointing.py
@@ -27,6 +27,7 @@
     RNG_STATE_NAME,
     SCALER_NAME,
     SCHEDULER_NAME,
+    DistributedType,
     get_pretty_name,
     is_tpu_available,
     save,
@@ -153,10 +154,14 @@ def load_accelerator_state(
         raise TypeError(
             "Unsupported optimizer map location passed, please choose one of `None`, `cpu`, or `on_device`"
         )
+    current_device = PartialState().device
     if optimizer_map_location is None:
-        optimizer_map_location = "cpu"
+        if PartialState().distributed_type == DistributedType.MULTI_GPU:
+            optimizer_map_location = current_device
+        else:
+            optimizer_map_location = "cpu"
     elif optimizer_map_location == "on_device":
-        optimizer_map_location = PartialState().device
+        optimizer_map_location = current_device
     for i, opt in enumerate(optimizers):
         optimizer_name = f"{OPTIMIZER_NAME}.bin" if i == 0 else f"{OPTIMIZER_NAME}_{i}.bin"
         input_optimizer_file = os.path.join(input_dir, optimizer_name)

From f0e40ff44805b960d42d6c1539fefc5674d4c2a7 Mon Sep 17 00:00:00 2001
From: DESKTOP-42S1K65
Date: Tue, 21 Mar 2023 08:52:06 -0400
Subject: [PATCH 06/13] Change logic in accelerator as well

---
 src/accelerate/accelerator.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index 4a81cdfcde3..f3df0f6870c 100644
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -2385,7 +2385,10 @@ def load_state(self, input_dir: str, **load_model_func_kwargs):
         for hook in self._load_model_state_pre_hook.values():
             hook(models, input_dir)
 
-        optimizer_map_location = "on_device" if self.num_processes > 1 else "cpu"
+        if self.num_processes > 1 and self.distributed_type == DistributedType.MULTI_GPU:
+            optimizer_map_location = "on_device"
+        else:
+            optimizer_map_location = "cpu"
 
         load_accelerator_state(

From a4b81e0b21dc9fb15527fb62aa8c45fbdeb4f0fb Mon Sep 17 00:00:00 2001
From: DESKTOP-42S1K65
Date: Tue, 21 Mar 2023 08:57:30 -0400
Subject: [PATCH 07/13] Pop from load_model_func_kwargs

---
 src/accelerate/accelerator.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index f3df0f6870c..d8607f5254a 100644
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -2385,10 +2385,12 @@ def load_state(self, input_dir: str, **load_model_func_kwargs):
         for hook in self._load_model_state_pre_hook.values():
             hook(models, input_dir)
 
-        if self.num_processes > 1 and self.distributed_type == DistributedType.MULTI_GPU:
-            optimizer_map_location = "on_device"
-        else:
-            optimizer_map_location = "cpu"
+        optimizer_map_location = load_model_func_kwargs.pop("optimizer_map_location", None)
+        if optimizer_map_location is None:
+            if self.num_processes > 1 and self.distributed_type == DistributedType.MULTI_GPU:
+                optimizer_map_location = "on_device"
+            else:
+                optimizer_map_location = "cpu"
 
         load_accelerator_state(

From 86de8958890810deca49f8056d4a3d6da72380af Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 6 Apr 2023 12:25:35 -0400
Subject: [PATCH 08/13] Working

---
 src/accelerate/accelerator.py     | 13 ++++----
 src/accelerate/checkpointing.py   | 37 +++++++++++----------
 tests/test_state_checkpointing.py | 54 +++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 23 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index d8607f5254a..bf6014299b5 100644
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -2322,7 +2322,8 @@ def load_state(self, input_dir: str, **load_model_func_kwargs):
                 The name of the folder all relevant weights and states were saved in.
             load_model_func_kwargs (`dict`, *optional*):
                 Additional keyword arguments for loading model which can be passed to the underlying load function,
-                such as optional arguments for DeepSpeed's `load_checkpoint` function.
+                such as optional arguments for DeepSpeed's `load_checkpoint` function or a `map_location` to load the
+                model and optimizer on.
 
         Example:
 
@@ -2385,12 +2386,12 @@ def load_state(self, input_dir: str, **load_model_func_kwargs):
         for hook in self._load_model_state_pre_hook.values():
             hook(models, input_dir)
 
-        optimizer_map_location = load_model_func_kwargs.pop("optimizer_map_location", None)
-        if optimizer_map_location is None:
+        map_location = load_model_func_kwargs.pop("map_location", None)
+        if map_location is None:
             if self.num_processes > 1 and self.distributed_type == DistributedType.MULTI_GPU:
-                optimizer_map_location = "on_device"
+                map_location = "on_device"
             else:
-                optimizer_map_location = "cpu"
+                map_location = "cpu"
 
         load_accelerator_state(
@@ -2399,7 +2400,7 @@ def load_state(self, input_dir: str, **load_model_func_kwargs):
             schedulers,
             self.state.process_index,
             self.scaler,
-            optimizer_map_location,
+            map_location,
             **load_model_func_kwargs,
         )
         custom_checkpoints = [f for f in os.listdir(input_dir) if "custom_checkpoint" in f]
diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
index 3618e31c923..056194f404d 100644
--- a/src/accelerate/checkpointing.py
+++ b/src/accelerate/checkpointing.py
@@ -32,7 +32,7 @@
     is_tpu_available,
     save,
 )
-
+from .optimizer import move_to_device
 
 if is_tpu_available(check_device=False):
     import torch_xla.core.xla_model as xm
@@ -73,6 +73,7 @@ def save_accelerator_state(
     for i, state in enumerate(model_states):
         weights_name = f"{MODEL_NAME}.bin" if i == 0 else f"{MODEL_NAME}_{i}.bin"
         output_model_file = os.path.join(output_dir, weights_name)
+        state = move_to_device(state, "cpu")
         save(state, output_model_file)
         logger.info(f"Model weights saved in {output_model_file}")
     # Optimizer states
@@ -118,7 +119,7 @@ def load_accelerator_state(
     schedulers,
     process_index,
     scaler=None,
-    optimizer_map_location=None,
+    map_location=None,
     **load_model_func_kwargs,
 ):
     """
@@ -137,35 +138,37 @@ def load_accelerator_state(
             The current process index in the Accelerator state
         scaler (`torch.cuda.amp.GradScaler`, *optional*):
             An optional *GradScaler* instance to load
-        optimizer_map_location (`str`, *optional*):
+        map_location (`str`, *optional*):
             What device to load the optimizer state onto. Should be one of either "cpu" or "on_device".
         load_model_func_kwargs (`dict`, *optional*):
             Additional arguments that can be passed to the model's `load_state_dict` method.
     """
+    if map_location not in [None, "cpu", "on_device"]:
+        raise TypeError(
+            "Unsupported optimizer map location passed, please choose one of `None`, `cpu`, or `on_device`"
+        )
+    current_device = PartialState().device
+    if map_location is None:
+        if PartialState().distributed_type == DistributedType.MULTI_GPU:
+            map_location = current_device
+        else:
+            map_location = "cpu"
+    elif map_location == "on_device":
+        map_location = current_device
     # Model states
     for i, model in enumerate(models):
         weights_name = f"{MODEL_NAME}.bin" if i == 0 else f"{MODEL_NAME}_{i}.bin"
         input_model_file = os.path.join(input_dir, weights_name)
-        models[i].load_state_dict(torch.load(input_model_file, map_location="cpu"), **load_model_func_kwargs)
+        models[i].to(map_location)
+        models[i].load_state_dict(torch.load(input_model_file, map_location=map_location), **load_model_func_kwargs)
     logger.info("All model weights loaded successfully")
 
     # Optimizer states
-    if optimizer_map_location not in [None, "cpu", "on_device"]:
-        raise TypeError(
-            "Unsupported optimizer map location passed, please choose one of `None`, `cpu`, or `on_device`"
-        )
-    current_device = PartialState().device
-    if optimizer_map_location is None:
-        if PartialState().distributed_type == DistributedType.MULTI_GPU:
-            optimizer_map_location = current_device
-        else:
-            optimizer_map_location = "cpu"
-    elif optimizer_map_location == "on_device":
-        optimizer_map_location = current_device
     for i, opt in enumerate(optimizers):
         optimizer_name = f"{OPTIMIZER_NAME}.bin" if i == 0 else f"{OPTIMIZER_NAME}_{i}.bin"
         input_optimizer_file = os.path.join(input_dir, optimizer_name)
-        optimizers[i].load_state_dict(torch.load(input_optimizer_file, map_location=optimizer_map_location))
+        optimizer_state = torch.load(input_optimizer_file)
+        optimizers[i].load_state_dict(optimizer_state)
     logger.info("All optimizer states loaded successfully")
 
     # Scheduler states
diff --git a/tests/test_state_checkpointing.py b/tests/test_state_checkpointing.py
index cc1a5a4266e..38611c701a4 100644
--- a/tests/test_state_checkpointing.py
+++ b/tests/test_state_checkpointing.py
@@ -23,6 +23,7 @@
 from torch.utils.data import DataLoader, TensorDataset
 
 from accelerate import Accelerator
+from accelerate.test_utils import require_cuda
 from accelerate.utils import ProjectConfiguration, set_seed
 
 
@@ -248,3 +249,56 @@ def test_checkpoint_deletion(self):
             self.assertTrue(not os.path.exists(os.path.join(tmpdir, "checkpoints", "checkpoint_0")))
             self.assertTrue(os.path.exists(os.path.join(tmpdir, "checkpoints", "checkpoint_9")))
             self.assertTrue(os.path.exists(os.path.join(tmpdir, "checkpoints", "checkpoint_10")))
+
+    @require_cuda
+    def test_map_location(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            model = DummyModel()
+            optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
+            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
+            train_dataloader, valid_dataloader = dummy_dataloaders()
+            project_config = ProjectConfiguration(automatic_checkpoint_naming=True)
+            # Train baseline
+            accelerator = Accelerator(project_dir=tmpdir, project_config=project_config)
+            model, optimizer, train_dataloader, valid_dataloader, scheduler = accelerator.prepare(
+                model, optimizer, train_dataloader, valid_dataloader, scheduler
+            )
+            model, optimizer = accelerator.prepare(model, optimizer)
+            train(3, model, train_dataloader, optimizer, accelerator, scheduler)
+            # Check that the initial optimizer is loaded on the GPU
+            for group in optimizer.param_groups:
+                param_device = group["params"][0].device
+                break
+            self.assertEqual(param_device.type, accelerator.device.type)
+            model = model.cpu()
+            accelerator.save_state()
+
+            # Check CPU state
+            accelerator.load_state(os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="cpu")
+            for group in optimizer.param_groups:
+                param_device = group["params"][0].device
+                break
+            self.assertEqual(
+                param_device.type,
+                torch.device("cpu").type,
+                f"Loaded optimizer states did not match, expected to be loaded on the CPU but got {param_device}",
+            )
+
+            # Check device state
+            accelerator.load_state(
+                os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="on_device"
+            )
+            for group in optimizer.param_groups:
+                param_device = group["params"][0].device
+                break
+            self.assertEqual(
+                param_device.type,
+                accelerator.device.type,
+                f"Loaded optimizer states did not match, expected to be loaded on {accelerator.device} but got {param_device}",
+            )
+
+            # Check error
+            with self.assertRaises(TypeError, msg="Unsupported optimizer map location passed"):
+                accelerator.load_state(
+                    os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="invalid"
+                )

From f92b48cae8c7bdb9997f25053045a76776bb5049 Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 6 Apr 2023 12:41:09 -0400
Subject: [PATCH 09/13] Add tests

---
 src/accelerate/checkpointing.py   | 15 +++++----------
 tests/test_state_checkpointing.py |  8 ++------
 2 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
index 056194f404d..3d4d43fa815 100644
--- a/src/accelerate/checkpointing.py
+++ b/src/accelerate/checkpointing.py
@@ -27,12 +27,11 @@
     RNG_STATE_NAME,
     SCALER_NAME,
     SCHEDULER_NAME,
-    DistributedType,
     get_pretty_name,
     is_tpu_available,
     save,
 )
-from .optimizer import move_to_device
+
 
 if is_tpu_available(check_device=False):
     import torch_xla.core.xla_model as xm
@@ -73,7 +72,6 @@ def save_accelerator_state(
     for i, state in enumerate(model_states):
         weights_name = f"{MODEL_NAME}.bin" if i == 0 else f"{MODEL_NAME}_{i}.bin"
         output_model_file = os.path.join(output_dir, weights_name)
-        state = move_to_device(state, "cpu")
         save(state, output_model_file)
         logger.info(f"Model weights saved in {output_model_file}")
     # Optimizer states
@@ -147,19 +145,16 @@ def load_accelerator_state(
         raise TypeError(
             "Unsupported optimizer map location passed, please choose one of `None`, `cpu`, or `on_device`"
         )
-    current_device = PartialState().device
     if map_location is None:
-        if PartialState().distributed_type == DistributedType.MULTI_GPU:
-            map_location = current_device
-        else:
-            map_location = "cpu"
+        map_location = "cpu"
     elif map_location == "on_device":
-        map_location = current_device
+        map_location = PartialState().device
     # Model states
     for i, model in enumerate(models):
         weights_name = f"{MODEL_NAME}.bin" if i == 0 else f"{MODEL_NAME}_{i}.bin"
         input_model_file = os.path.join(input_dir, weights_name)
-        models[i].to(map_location)
+        if map_location != "cpu":
+            models[i].to(map_location)
         models[i].load_state_dict(torch.load(input_model_file, map_location=map_location), **load_model_func_kwargs)
     logger.info("All model weights loaded successfully")
     # Optimizer states
diff --git a/tests/test_state_checkpointing.py b/tests/test_state_checkpointing.py
index 38611c701a4..2a78caedf7d 100644
--- a/tests/test_state_checkpointing.py
+++ b/tests/test_state_checkpointing.py
@@ -285,9 +285,7 @@ def test_map_location(self):
             )
 
             # Check device state
-            accelerator.load_state(
-                os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="on_device"
-            )
+            accelerator.load_state(os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="on_device")
             for group in optimizer.param_groups:
                 param_device = group["params"][0].device
                 break
@@ -299,6 +297,4 @@ def test_map_location(self):
 
             # Check error
             with self.assertRaises(TypeError, msg="Unsupported optimizer map location passed"):
-                accelerator.load_state(
-                    os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="invalid"
-                )
+                accelerator.load_state(os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="invalid")

From 75466f60766c8ff5fc72b68febbce21946547e26 Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 6 Apr 2023 12:59:55 -0400
Subject: [PATCH 10/13] Working, needed to change model device

---
 src/accelerate/checkpointing.py   | 4 +---
 tests/test_state_checkpointing.py | 1 +
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
index 3d4d43fa815..0ff2defcaa2 100644
--- a/src/accelerate/checkpointing.py
+++ b/src/accelerate/checkpointing.py
@@ -143,7 +143,7 @@ def load_accelerator_state(
     """
     if map_location not in [None, "cpu", "on_device"]:
         raise TypeError(
-            "Unsupported optimizer map location passed, please choose one of `None`, `cpu`, or `on_device`"
+            "Unsupported optimizer map location passed, please choose one of `None`, `'cpu'`, or `'on_device'`"
         )
     if map_location is None:
         map_location = "cpu"
     elif map_location == "on_device":
         map_location = PartialState().device
     # Model states
@@ -153,8 +153,6 @@ def load_accelerator_state(
     for i, model in enumerate(models):
         weights_name = f"{MODEL_NAME}.bin" if i == 0 else f"{MODEL_NAME}_{i}.bin"
         input_model_file = os.path.join(input_dir, weights_name)
-        if map_location != "cpu":
-            models[i].to(map_location)
         models[i].load_state_dict(torch.load(input_model_file, map_location=map_location), **load_model_func_kwargs)
     logger.info("All model weights loaded successfully")
 
diff --git a/tests/test_state_checkpointing.py b/tests/test_state_checkpointing.py
index 2a78caedf7d..7796150bb9d 100644
--- a/tests/test_state_checkpointing.py
+++ b/tests/test_state_checkpointing.py
@@ -285,6 +285,7 @@ def test_map_location(self):
             )
 
             # Check device state
+            model.to(accelerator.device)
             accelerator.load_state(os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="on_device")
             for group in optimizer.param_groups:
                 param_device = group["params"][0].device
                 break

From 662ad594603edb551e91a1da7b01ae61000e4c3a Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 6 Apr 2023 17:42:14 +0000
Subject: [PATCH 11/13] move to main to be run multicuda

---
 tests/test_state_checkpointing.py | 105 ++++++++++++++++--------------
 1 file changed, 56 insertions(+), 49 deletions(-)

diff --git a/tests/test_state_checkpointing.py b/tests/test_state_checkpointing.py
index 7796150bb9d..d251ff9a5a9 100644
--- a/tests/test_state_checkpointing.py
+++ b/tests/test_state_checkpointing.py
@@ -15,15 +15,16 @@
 import logging
 import os
 import random
+import shutil
 import tempfile
 import unittest
 
+import pytest
 import torch
 from torch import nn
 from torch.utils.data import DataLoader, TensorDataset
 
 from accelerate import Accelerator
-from accelerate.test_utils import require_cuda
 from accelerate.utils import ProjectConfiguration, set_seed
 
 
@@ -250,52 +251,58 @@ def test_checkpoint_deletion(self):
             self.assertTrue(os.path.exists(os.path.join(tmpdir, "checkpoints", "checkpoint_9")))
             self.assertTrue(os.path.exists(os.path.join(tmpdir, "checkpoints", "checkpoint_10")))
 
-    @require_cuda
-    def test_map_location(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            model = DummyModel()
-            optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
-            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
-            train_dataloader, valid_dataloader = dummy_dataloaders()
-            project_config = ProjectConfiguration(automatic_checkpoint_naming=True)
-            # Train baseline
-            accelerator = Accelerator(project_dir=tmpdir, project_config=project_config)
-            model, optimizer, train_dataloader, valid_dataloader, scheduler = accelerator.prepare(
-                model, optimizer, train_dataloader, valid_dataloader, scheduler
-            )
-            model, optimizer = accelerator.prepare(model, optimizer)
-            train(3, model, train_dataloader, optimizer, accelerator, scheduler)
-            # Check that the initial optimizer is loaded on the GPU
-            for group in optimizer.param_groups:
-                param_device = group["params"][0].device
-                break
-            self.assertEqual(param_device.type, accelerator.device.type)
-            model = model.cpu()
-            accelerator.save_state()
-
-            # Check CPU state
-            accelerator.load_state(os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="cpu")
-            for group in optimizer.param_groups:
-                param_device = group["params"][0].device
-                break
-            self.assertEqual(
-                param_device.type,
-                torch.device("cpu").type,
-                f"Loaded optimizer states did not match, expected to be loaded on the CPU but got {param_device}",
-            )
-
-            # Check device state
-            model.to(accelerator.device)
-            accelerator.load_state(os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="on_device")
-            for group in optimizer.param_groups:
-                param_device = group["params"][0].device
-                break
-            self.assertEqual(
-                param_device.type,
-                accelerator.device.type,
-                f"Loaded optimizer states did not match, expected to be loaded on {accelerator.device} but got {param_device}",
-            )
-
-            # Check error
-            with self.assertRaises(TypeError, msg="Unsupported optimizer map location passed"):
-                accelerator.load_state(os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="invalid")
+
+if __name__ == "__main__":
+    savedir = "/tmp/accelerate/state_checkpointing"
+    model = DummyModel()
+    optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
+    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
+    train_dataloader, valid_dataloader = dummy_dataloaders()
+    project_config = ProjectConfiguration(automatic_checkpoint_naming=True)
+    # Train baseline
+    accelerator = Accelerator(project_dir=savedir, project_config=project_config, mixed_precision="no")
+    if accelerator.process_index == 0:
+        if os.path.exists(savedir):
+            shutil.rmtree(savedir)
+        os.makedirs(savedir)
+    model, optimizer, train_dataloader, valid_dataloader, scheduler = accelerator.prepare(
+        model, optimizer, train_dataloader, valid_dataloader, scheduler
+    )
+    model, optimizer = accelerator.prepare(model, optimizer)
+    train(3, model, train_dataloader, optimizer, accelerator, scheduler)
+    # Check that the initial optimizer is loaded on the GPU
+    for group in optimizer.param_groups:
+        param_device = group["params"][0].device
+        break
+    assert param_device.type == accelerator.device.type
+    model = model.cpu()
+    accelerator.wait_for_everyone()
+    accelerator.save_state()
+    accelerator.wait_for_everyone()
+
+    # Check CPU state
+    accelerator.load_state(os.path.join(savedir, "checkpoints", "checkpoint_0"), map_location="cpu")
+    for group in optimizer.param_groups:
+        param_device = group["params"][0].device
+        break
+    assert (
+        param_device.type == torch.device("cpu").type
+    ), f"Loaded optimizer states did not match, expected to be loaded on the CPU but got {param_device}"
+
+    # Check device state
+    model.to(accelerator.device)
+    accelerator.load_state(os.path.join(savedir, "checkpoints", "checkpoint_0"), map_location="on_device")
+    for group in optimizer.param_groups:
+        param_device = group["params"][0].device
+        break
+    assert (
+        param_device.type == accelerator.device.type
+    ), f"Loaded optimizer states did not match, expected to be loaded on {accelerator.device} but got {param_device}"
+
+    # Check error
+    with pytest.raises(TypeError, match="Unsupported optimizer map location passed"):
+        accelerator.load_state(os.path.join(savedir, "checkpoints", "checkpoint_0"), map_location="invalid")
+    accelerator.wait_for_everyone()
+    if accelerator.process_index == 0:
+        shutil.rmtree(savedir)
+    accelerator.wait_for_everyone()

From 24e95c9b917668b26cdd1eb700d1e4946db35150 Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 6 Apr 2023 17:50:55 +0000
Subject: [PATCH 12/13] Call test

---
 tests/test_state_checkpointing.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tests/test_state_checkpointing.py b/tests/test_state_checkpointing.py
index d251ff9a5a9..e91888fa091 100644
--- a/tests/test_state_checkpointing.py
+++ b/tests/test_state_checkpointing.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import inspect
 import logging
 import os
 import random
@@ -25,7 +26,8 @@
 from torch.utils.data import DataLoader, TensorDataset
 
 from accelerate import Accelerator
-from accelerate.utils import ProjectConfiguration, set_seed
+from accelerate.test_utils import execute_subprocess_async
+from accelerate.utils import ProjectConfiguration, get_launch_prefix, set_seed
 
 
 logger = logging.getLogger(__name__)
@@ -251,6 +253,11 @@ def test_checkpoint_deletion(self):
             self.assertTrue(os.path.exists(os.path.join(tmpdir, "checkpoints", "checkpoint_9")))
             self.assertTrue(os.path.exists(os.path.join(tmpdir, "checkpoints", "checkpoint_10")))
 
+    def test_map_location(self):
+        cmd = get_launch_prefix()
+        cmd += [f"--nproc_per_node={torch.cuda.device_count()}", inspect.getfile(self.__class__)]
+        execute_subprocess_async(cmd, env=os.environ.copy())
+
 
 if __name__ == "__main__":
     savedir = "/tmp/accelerate/state_checkpointing"
     model = DummyModel()
     optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
     scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
     train_dataloader, valid_dataloader = dummy_dataloaders()
     project_config = ProjectConfiguration(automatic_checkpoint_naming=True)
     # Train baseline

From d021379fbf001dc3888893aaf618f3e311d34d28 Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 6 Apr 2023 17:52:35 +0000
Subject: [PATCH 13/13] Only CUDA

---
 tests/test_state_checkpointing.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_state_checkpointing.py b/tests/test_state_checkpointing.py
index e91888fa091..d79b52b4bb2 100644
--- a/tests/test_state_checkpointing.py
+++ b/tests/test_state_checkpointing.py
@@ -26,7 +26,7 @@
 from torch.utils.data import DataLoader, TensorDataset
 
 from accelerate import Accelerator
-from accelerate.test_utils import execute_subprocess_async
+from accelerate.test_utils import execute_subprocess_async, require_cuda
 from accelerate.utils import ProjectConfiguration, get_launch_prefix, set_seed
 
 
@@ -253,6 +253,7 @@ def test_checkpoint_deletion(self):
             self.assertTrue(os.path.exists(os.path.join(tmpdir, "checkpoints", "checkpoint_9")))
             self.assertTrue(os.path.exists(os.path.join(tmpdir, "checkpoints", "checkpoint_10")))
 
+    @require_cuda
     def test_map_location(self):
         cmd = get_launch_prefix()
         cmd += [f"--nproc_per_node={torch.cuda.device_count()}", inspect.getfile(self.__class__)]
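
Usage note (not part of the patches above): the sketch below shows how the map_location keyword introduced by this series is exercised from user code, mirroring the calls in the test file. The tiny placeholder model, learning rate, and project directory are illustrative assumptions, not values taken from the patches; it assumes the series is applied through PATCH 13.

    import os

    import torch
    from accelerate import Accelerator
    from accelerate.utils import ProjectConfiguration

    # Illustrative project directory; with automatic checkpoint naming,
    # save_state() writes to <project_dir>/checkpoints/checkpoint_0.
    project_dir = "/tmp/accelerate/map_location_demo"
    accelerator = Accelerator(
        project_dir=project_dir,
        project_config=ProjectConfiguration(automatic_checkpoint_naming=True),
    )

    # Placeholder model/optimizer standing in for a real training setup.
    model = torch.nn.Linear(4, 4)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    model, optimizer = accelerator.prepare(model, optimizer)

    accelerator.save_state()
    checkpoint = os.path.join(project_dir, "checkpoints", "checkpoint_0")

    # Keep the restored optimizer state on the CPU.
    accelerator.load_state(checkpoint, map_location="cpu")

    # Restore the optimizer state onto this process's device (multi-GPU case).
    accelerator.load_state(checkpoint, map_location="on_device")

    # Omitting map_location keeps the old behavior on a single process ("cpu")
    # and uses the process device under multi-GPU; any other value raises TypeError.
    accelerator.load_state(checkpoint)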