From ab39e4b8244be2bcfadcecd09649a68508b3667d Mon Sep 17 00:00:00 2001
From: DESKTOP-42S1K65
Date: Mon, 20 Mar 2023 10:53:57 -0400
Subject: [PATCH 01/13] Set the state device dependent on the Accelerator on multigpu

---
 src/accelerate/accelerator.py   | 11 ++++++++++-
 src/accelerate/checkpointing.py | 13 +++++++++++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index c2423855228..79f9e06e47f 100644
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -2385,8 +2385,17 @@ def load_state(self, input_dir: str, **load_model_func_kwargs):
         for hook in self._load_model_state_pre_hook.values():
             hook(models, input_dir)
 
+        optimizer_device = "cpu" if self.num_processes < 2 else self.device
+
         load_accelerator_state(
-            input_dir, models, optimizers, schedulers, self.state.process_index, self.scaler, **load_model_func_kwargs
+            input_dir,
+            models,
+            optimizers,
+            schedulers,
+            self.state.process_index,
+            self.scaler,
+            optimizer_device,
+            **load_model_func_kwargs,
         )
         custom_checkpoints = [f for f in os.listdir(input_dir) if "custom_checkpoint" in f]
         if len(custom_checkpoints) != len(self._custom_objects):
diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
index 7215035f9d1..02cac9f2020 100644
--- a/src/accelerate/checkpointing.py
+++ b/src/accelerate/checkpointing.py
@@ -110,7 +110,14 @@ def save_accelerator_state(
 
 
 def load_accelerator_state(
-    input_dir, models, optimizers, schedulers, process_index, scaler=None, **load_model_func_kwargs
+    input_dir,
+    models,
+    optimizers,
+    schedulers,
+    process_index,
+    scaler=None,
+    optimizer_map_location="cpu",
+    **load_model_func_kwargs,
 ):
     """
     Loads states of the models, optimizers, scaler, and RNG generators from a given directory.
@@ -128,6 +135,8 @@ def load_accelerator_state(
             The current process index in the Accelerator state
         scaler (`torch.cuda.amp.GradScaler`, *optional*):
             An optional *GradScaler* instance to load
+        optimizer_map_location (`torch.device`, *optional*):
+            What device to load the optimizer state onto. By default uses "cpu".
         load_model_func_kwargs (`dict`, *optional*):
             Additional arguments that can be passed to the model's `load_state_dict` method.
     """
@@ -142,7 +151,7 @@ def load_accelerator_state(
     for i, opt in enumerate(optimizers):
         optimizer_name = f"{OPTIMIZER_NAME}.bin" if i == 0 else f"{OPTIMIZER_NAME}_{i}.bin"
         input_optimizer_file = os.path.join(input_dir, optimizer_name)
-        optimizers[i].load_state_dict(torch.load(input_optimizer_file, map_location="cpu"))
+        optimizers[i].load_state_dict(torch.load(input_optimizer_file, map_location=optimizer_map_location))
     logger.info("All optimizer states loaded successfully")
 
     # Scheduler states

From e862ccd04382c143d0c32045806de02780c00e80 Mon Sep 17 00:00:00 2001
From: DESKTOP-42S1K65
Date: Mon, 20 Mar 2023 10:57:42 -0400
Subject: [PATCH 02/13] Use map location

---
 src/accelerate/accelerator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index 79f9e06e47f..ffc80c1155e 100644
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -2385,7 +2385,7 @@ def load_state(self, input_dir: str, **load_model_func_kwargs):
         for hook in self._load_model_state_pre_hook.values():
             hook(models, input_dir)
 
-        optimizer_device = "cpu" if self.num_processes < 2 else self.device
+        optimizer_map_location = "cpu" if self.num_processes < 2 else self.device
 
         load_accelerator_state(
             input_dir,
@@ -2394,7 +2394,7 @@ def load_state(self, input_dir: str, **load_model_func_kwargs):
             schedulers,
             self.state.process_index,
             self.scaler,
-            optimizer_device,
+            optimizer_map_location,
             **load_model_func_kwargs,
         )
         custom_checkpoints = [f for f in os.listdir(input_dir) if "custom_checkpoint" in f]

From 43990678888361eeca43a7fa50d17ec87c11dddc Mon Sep 17 00:00:00 2001
From: DESKTOP-42S1K65
Date: Mon, 20 Mar 2023 11:39:37 -0400
Subject: [PATCH 03/13] Use on_device and PartialState

---
 src/accelerate/accelerator.py   |  2 +-
 src/accelerate/checkpointing.py | 15 ++++++++++++---
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index ffc80c1155e..4a81cdfcde3 100644
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -2385,7 +2385,7 @@ def load_state(self, input_dir: str, **load_model_func_kwargs):
         for hook in self._load_model_state_pre_hook.values():
             hook(models, input_dir)
 
-        optimizer_map_location = "cpu" if self.num_processes < 2 else self.device
+        optimizer_map_location = "on_device" if self.num_processes > 1 else "cpu"
 
         load_accelerator_state(
             input_dir,
diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
index 02cac9f2020..c4027e7d6c5 100644
--- a/src/accelerate/checkpointing.py
+++ b/src/accelerate/checkpointing.py
@@ -21,6 +21,7 @@
 import torch
 from torch.cuda.amp import GradScaler
 
+from .state import PartialState
 from .utils import (
     MODEL_NAME,
     OPTIMIZER_NAME,
@@ -116,7 +117,7 @@ def load_accelerator_state(
     schedulers,
     process_index,
     scaler=None,
-    optimizer_map_location="cpu",
+    optimizer_map_location=None,
     **load_model_func_kwargs,
 ):
     """
@@ -135,8 +136,8 @@ def load_accelerator_state(
             The current process index in the Accelerator state
         scaler (`torch.cuda.amp.GradScaler`, *optional*):
             An optional *GradScaler* instance to load
-        optimizer_map_location (`torch.device`, *optional*):
-            What device to load the optimizer state onto. By default uses "cpu".
+        optimizer_map_location (`str`, *optional*):
+            What device to load the optimizer state onto. Should be one of either "cpu" or "on_device".
         load_model_func_kwargs (`dict`, *optional*):
             Additional arguments that can be passed to the model's `load_state_dict` method.
     """
@@ -148,6 +149,14 @@ def load_accelerator_state(
     logger.info("All model weights loaded successfully")
 
     # Optimizer states
+    if optimizer_map_location not in [None, "cpu", "on_device"]:
+        raise TypeError(
+            "Unsupported optimizer map location passed, please choose one of `None`, `cpu`, or `on_device`"
+        )
+    if optimizer_map_location is None:
+        optimizer_map_location = "cpu"
+    elif optimizer_map_location == "on_device":
+        optimizer_map_location = PartialState().device
     for i, opt in enumerate(optimizers):
         optimizer_name = f"{OPTIMIZER_NAME}.bin" if i == 0 else f"{OPTIMIZER_NAME}_{i}.bin"
         input_optimizer_file = os.path.join(input_dir, optimizer_name)

From 480e9e3016801c8f352ba7214d35bfbc38aad302 Mon Sep 17 00:00:00 2001
From: DESKTOP-42S1K65
Date: Mon, 20 Mar 2023 11:44:36 -0400
Subject: [PATCH 04/13] Move import

---
 src/accelerate/checkpointing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
index c4027e7d6c5..52a28cfb2c9 100644
--- a/src/accelerate/checkpointing.py
+++ b/src/accelerate/checkpointing.py
@@ -21,7 +21,6 @@
 import torch
 from torch.cuda.amp import GradScaler
 
-from .state import PartialState
 from .utils import (
     MODEL_NAME,
     OPTIMIZER_NAME,
@@ -38,6 +37,7 @@
 import torch_xla.core.xla_model as xm
 
 from .logging import get_logger
+from .state import PartialState
 
 
 logger = get_logger(__name__)

From 1e6f5122d232baa7f5a506824e76d22b388bc65a Mon Sep 17 00:00:00 2001
From: DESKTOP-42S1K65
Date: Tue, 21 Mar 2023 08:41:14 -0400
Subject: [PATCH 05/13] Check for multi-gpu

---
 src/accelerate/checkpointing.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
index 52a28cfb2c9..3618e31c923 100644
--- a/src/accelerate/checkpointing.py
+++ b/src/accelerate/checkpointing.py
@@ -27,6 +27,7 @@
     RNG_STATE_NAME,
     SCALER_NAME,
     SCHEDULER_NAME,
+    DistributedType,
     get_pretty_name,
     is_tpu_available,
     save,
@@ -153,10 +154,14 @@ def load_accelerator_state(
         raise TypeError(
             "Unsupported optimizer map location passed, please choose one of `None`, `cpu`, or `on_device`"
         )
+    current_device = PartialState().device
     if optimizer_map_location is None:
-        optimizer_map_location = "cpu"
+        if PartialState().distributed_type == DistributedType.MULTI_GPU:
+            optimizer_map_location = current_device
+        else:
+            optimizer_map_location = "cpu"
     elif optimizer_map_location == "on_device":
-        optimizer_map_location = PartialState().device
+        optimizer_map_location = current_device
     for i, opt in enumerate(optimizers):
         optimizer_name = f"{OPTIMIZER_NAME}.bin" if i == 0 else f"{OPTIMIZER_NAME}_{i}.bin"
         input_optimizer_file = os.path.join(input_dir, optimizer_name)

From f0e40ff44805b960d42d6c1539fefc5674d4c2a7 Mon Sep 17 00:00:00 2001
From: DESKTOP-42S1K65
Date: Tue, 21 Mar 2023 08:52:06 -0400
Subject: [PATCH 06/13] Change logic in accelerator as well

---
 src/accelerate/accelerator.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index 4a81cdfcde3..f3df0f6870c 100644
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -2385,7 +2385,10 @@ def load_state(self, input_dir: str, **load_model_func_kwargs):
         for hook in self._load_model_state_pre_hook.values():
             hook(models, input_dir)
 
-        optimizer_map_location = "on_device" if self.num_processes > 1 else "cpu"
+        if self.num_processes > 1 and self.distributed_type == DistributedType.MULTI_GPU:
+            optimizer_map_location = "on_device"
+        else:
+            optimizer_map_location = "cpu"
 
         load_accelerator_state(

From a4b81e0b21dc9fb15527fb62aa8c45fbdeb4f0fb Mon Sep 17 00:00:00 2001
From: DESKTOP-42S1K65
Date: Tue, 21 Mar 2023 08:57:30 -0400
Subject: [PATCH 07/13] Pop from load_model_func_kwargs

---
 src/accelerate/accelerator.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index f3df0f6870c..d8607f5254a 100644
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -2385,10 +2385,12 @@ def load_state(self, input_dir: str, **load_model_func_kwargs):
         for hook in self._load_model_state_pre_hook.values():
             hook(models, input_dir)
 
-        if self.num_processes > 1 and self.distributed_type == DistributedType.MULTI_GPU:
-            optimizer_map_location = "on_device"
-        else:
-            optimizer_map_location = "cpu"
+        optimizer_map_location = load_model_func_kwargs.pop("optimizer_map_location", None)
+        if optimizer_map_location is None:
+            if self.num_processes > 1 and self.distributed_type == DistributedType.MULTI_GPU:
+                optimizer_map_location = "on_device"
+            else:
+                optimizer_map_location = "cpu"
 
         load_accelerator_state(

From 86de8958890810deca49f8056d4a3d6da72380af Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 6 Apr 2023 12:25:35 -0400
Subject: [PATCH 08/13] Working

---
 src/accelerate/accelerator.py     | 13 ++++----
 src/accelerate/checkpointing.py   | 37 +++++++++++----------
 tests/test_state_checkpointing.py | 54 +++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 23 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index d8607f5254a..bf6014299b5 100644
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -2322,7 +2322,8 @@ def load_state(self, input_dir: str, **load_model_func_kwargs):
                 The name of the folder all relevant weights and states were saved in.
             load_model_func_kwargs (`dict`, *optional*):
                 Additional keyword arguments for loading model which can be passed to the underlying load function,
-                such as optional arguments for DeepSpeed's `load_checkpoint` function.
+                such as optional arguments for DeepSpeed's `load_checkpoint` function or a `map_location` to load the
+                model and optimizer on.
 
         Example:
 
@@ -2385,12 +2386,12 @@ def load_state(self, input_dir: str, **load_model_func_kwargs):
         for hook in self._load_model_state_pre_hook.values():
             hook(models, input_dir)
 
-        optimizer_map_location = load_model_func_kwargs.pop("optimizer_map_location", None)
-        if optimizer_map_location is None:
+        map_location = load_model_func_kwargs.pop("map_location", None)
+        if map_location is None:
             if self.num_processes > 1 and self.distributed_type == DistributedType.MULTI_GPU:
-                optimizer_map_location = "on_device"
+                map_location = "on_device"
             else:
-                optimizer_map_location = "cpu"
+                map_location = "cpu"
 
         load_accelerator_state(
@@ -2399,7 +2400,7 @@ def load_state(self, input_dir: str, **load_model_func_kwargs):
             schedulers,
             self.state.process_index,
             self.scaler,
-            optimizer_map_location,
+            map_location,
             **load_model_func_kwargs,
         )
         custom_checkpoints = [f for f in os.listdir(input_dir) if "custom_checkpoint" in f]
diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
index 3618e31c923..056194f404d 100644
--- a/src/accelerate/checkpointing.py
+++ b/src/accelerate/checkpointing.py
@@ -32,7 +32,7 @@
     is_tpu_available,
     save,
 )
-
+from .optimizer import move_to_device
 
 if is_tpu_available(check_device=False):
     import torch_xla.core.xla_model as xm
@@ -73,6 +73,7 @@ def save_accelerator_state(
     for i, state in enumerate(model_states):
         weights_name = f"{MODEL_NAME}.bin" if i == 0 else f"{MODEL_NAME}_{i}.bin"
         output_model_file = os.path.join(output_dir, weights_name)
+        state = move_to_device(state, "cpu")
         save(state, output_model_file)
         logger.info(f"Model weights saved in {output_model_file}")
     # Optimizer states
@@ -118,7 +119,7 @@ def load_accelerator_state(
     schedulers,
     process_index,
     scaler=None,
-    optimizer_map_location=None,
+    map_location=None,
     **load_model_func_kwargs,
 ):
     """
@@ -137,35 +138,37 @@ def load_accelerator_state(
             The current process index in the Accelerator state
         scaler (`torch.cuda.amp.GradScaler`, *optional*):
             An optional *GradScaler* instance to load
-        optimizer_map_location (`str`, *optional*):
+        map_location (`str`, *optional*):
             What device to load the optimizer state onto. Should be one of either "cpu" or "on_device".
         load_model_func_kwargs (`dict`, *optional*):
             Additional arguments that can be passed to the model's `load_state_dict` method.
     """
+    if map_location not in [None, "cpu", "on_device"]:
+        raise TypeError(
+            "Unsupported optimizer map location passed, please choose one of `None`, `cpu`, or `on_device`"
+        )
+    current_device = PartialState().device
+    if map_location is None:
+        if PartialState().distributed_type == DistributedType.MULTI_GPU:
+            map_location = current_device
+        else:
+            map_location = "cpu"
+    elif map_location == "on_device":
+        map_location = current_device
     # Model states
     for i, model in enumerate(models):
         weights_name = f"{MODEL_NAME}.bin" if i == 0 else f"{MODEL_NAME}_{i}.bin"
         input_model_file = os.path.join(input_dir, weights_name)
-        models[i].load_state_dict(torch.load(input_model_file, map_location="cpu"), **load_model_func_kwargs)
+        models[i].to(map_location)
+        models[i].load_state_dict(torch.load(input_model_file, map_location=map_location), **load_model_func_kwargs)
     logger.info("All model weights loaded successfully")
 
     # Optimizer states
-    if optimizer_map_location not in [None, "cpu", "on_device"]:
-        raise TypeError(
-            "Unsupported optimizer map location passed, please choose one of `None`, `cpu`, or `on_device`"
-        )
-    current_device = PartialState().device
-    if optimizer_map_location is None:
-        if PartialState().distributed_type == DistributedType.MULTI_GPU:
-            optimizer_map_location = current_device
-        else:
-            optimizer_map_location = "cpu"
-    elif optimizer_map_location == "on_device":
-        optimizer_map_location = current_device
     for i, opt in enumerate(optimizers):
         optimizer_name = f"{OPTIMIZER_NAME}.bin" if i == 0 else f"{OPTIMIZER_NAME}_{i}.bin"
         input_optimizer_file = os.path.join(input_dir, optimizer_name)
-        optimizers[i].load_state_dict(torch.load(input_optimizer_file, map_location=optimizer_map_location))
+        optimizer_state = torch.load(input_optimizer_file)
+        optimizers[i].load_state_dict(optimizer_state)
     logger.info("All optimizer states loaded successfully")
 
     # Scheduler states
diff --git a/tests/test_state_checkpointing.py b/tests/test_state_checkpointing.py
index cc1a5a4266e..38611c701a4 100644
--- a/tests/test_state_checkpointing.py
+++ b/tests/test_state_checkpointing.py
@@ -23,6 +23,7 @@
 from torch.utils.data import DataLoader, TensorDataset
 
 from accelerate import Accelerator
+from accelerate.test_utils import require_cuda
 from accelerate.utils import ProjectConfiguration, set_seed
 
 
@@ -248,3 +249,56 @@ def test_checkpoint_deletion(self):
             self.assertTrue(not os.path.exists(os.path.join(tmpdir, "checkpoints", "checkpoint_0")))
             self.assertTrue(os.path.exists(os.path.join(tmpdir, "checkpoints", "checkpoint_9")))
             self.assertTrue(os.path.exists(os.path.join(tmpdir, "checkpoints", "checkpoint_10")))
+
+    @require_cuda
+    def test_map_location(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            model = DummyModel()
+            optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
+            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
+            train_dataloader, valid_dataloader = dummy_dataloaders()
+            project_config = ProjectConfiguration(automatic_checkpoint_naming=True)
+            # Train baseline
+            accelerator = Accelerator(project_dir=tmpdir, project_config=project_config)
+            model, optimizer, train_dataloader, valid_dataloader, scheduler = accelerator.prepare(
+                model, optimizer, train_dataloader, valid_dataloader, scheduler
+            )
+            model, optimizer = accelerator.prepare(model, optimizer)
+            train(3, model, train_dataloader, optimizer, accelerator, scheduler)
+            # Check that the initial optimizer is loaded on the GPU
+            for group in optimizer.param_groups:
+                param_device = group["params"][0].device
+                break
+            self.assertEqual(param_device.type, accelerator.device.type)
+            model = model.cpu()
+            accelerator.save_state()
+
+            # Check CPU state
+            accelerator.load_state(os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="cpu")
+            for group in optimizer.param_groups:
+                param_device = group["params"][0].device
+                break
+            self.assertEqual(
+                param_device.type,
+                torch.device("cpu").type,
+                f"Loaded optimizer states did not match, expected to be loaded on the CPU but got {param_device}",
+            )
+
+            # Check device state
+            accelerator.load_state(
+                os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="on_device"
+            )
+            for group in optimizer.param_groups:
+                param_device = group["params"][0].device
+                break
+            self.assertEqual(
+                param_device.type,
+                accelerator.device.type,
+                f"Loaded optimizer states did not match, expected to be loaded on {accelerator.device} but got {param_device}",
+            )
+
+            # Check error
+            with self.assertRaises(TypeError, msg="Unsupported optimizer map location passed"):
+                accelerator.load_state(
+                    os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="invalid"
+                )

From f92b48cae8c7bdb9997f25053045a76776bb5049 Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 6 Apr 2023 12:41:09 -0400
Subject: [PATCH 09/13] Add tests

---
 src/accelerate/checkpointing.py   | 15 +++++----------
 tests/test_state_checkpointing.py |  8 ++------
 2 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
index 056194f404d..3d4d43fa815 100644
--- a/src/accelerate/checkpointing.py
+++ b/src/accelerate/checkpointing.py
@@ -27,12 +27,11 @@
     RNG_STATE_NAME,
     SCALER_NAME,
     SCHEDULER_NAME,
-    DistributedType,
     get_pretty_name,
     is_tpu_available,
     save,
 )
-from .optimizer import move_to_device
+
 
 if is_tpu_available(check_device=False):
     import torch_xla.core.xla_model as xm
@@ -73,7 +72,6 @@ def save_accelerator_state(
     for i, state in enumerate(model_states):
         weights_name = f"{MODEL_NAME}.bin" if i == 0 else f"{MODEL_NAME}_{i}.bin"
         output_model_file = os.path.join(output_dir, weights_name)
-        state = move_to_device(state, "cpu")
         save(state, output_model_file)
         logger.info(f"Model weights saved in {output_model_file}")
     # Optimizer states
@@ -147,19 +145,16 @@ def load_accelerator_state(
         raise TypeError(
             "Unsupported optimizer map location passed, please choose one of `None`, `cpu`, or `on_device`"
         )
-    current_device = PartialState().device
     if map_location is None:
-        if PartialState().distributed_type == DistributedType.MULTI_GPU:
-            map_location = current_device
-        else:
-            map_location = "cpu"
+        map_location = "cpu"
     elif map_location == "on_device":
-        map_location = current_device
+        map_location = PartialState().device
     # Model states
     for i, model in enumerate(models):
         weights_name = f"{MODEL_NAME}.bin" if i == 0 else f"{MODEL_NAME}_{i}.bin"
         input_model_file = os.path.join(input_dir, weights_name)
-        models[i].to(map_location)
+        if map_location != "cpu":
+            models[i].to(map_location)
         models[i].load_state_dict(torch.load(input_model_file, map_location=map_location), **load_model_func_kwargs)
     logger.info("All model weights loaded successfully")
     # Optimizer states
diff --git a/tests/test_state_checkpointing.py b/tests/test_state_checkpointing.py
index 38611c701a4..2a78caedf7d 100644
--- a/tests/test_state_checkpointing.py
+++ b/tests/test_state_checkpointing.py
@@ -285,9 +285,7 @@ def test_map_location(self):
             )
 
             # Check device state
-            accelerator.load_state(
-                os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="on_device"
-            )
+            accelerator.load_state(os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="on_device")
             for group in optimizer.param_groups:
                 param_device = group["params"][0].device
                 break
@@ -299,6 +297,4 @@ def test_map_location(self):
 
             # Check error
             with self.assertRaises(TypeError, msg="Unsupported optimizer map location passed"):
-                accelerator.load_state(
-                    os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="invalid"
-                )
+                accelerator.load_state(os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="invalid")

From 75466f60766c8ff5fc72b68febbce21946547e26 Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 6 Apr 2023 12:59:55 -0400
Subject: [PATCH 10/13] Working, needed to change model device

---
 src/accelerate/checkpointing.py   | 4 +---
 tests/test_state_checkpointing.py | 1 +
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
index 3d4d43fa815..0ff2defcaa2 100644
--- a/src/accelerate/checkpointing.py
+++ b/src/accelerate/checkpointing.py
@@ -143,7 +143,7 @@ def load_accelerator_state(
     """
     if map_location not in [None, "cpu", "on_device"]:
         raise TypeError(
-            "Unsupported optimizer map location passed, please choose one of `None`, `cpu`, or `on_device`"
+            "Unsupported optimizer map location passed, please choose one of `None`, `'cpu'`, or `'on_device'`"
         )
     if map_location is None:
         map_location = "cpu"
     elif map_location == "on_device":
         map_location = PartialState().device
     # Model states
@@ -153,8 +153,6 @@ def load_accelerator_state(
     for i, model in enumerate(models):
         weights_name = f"{MODEL_NAME}.bin" if i == 0 else f"{MODEL_NAME}_{i}.bin"
         input_model_file = os.path.join(input_dir, weights_name)
-        if map_location != "cpu":
-            models[i].to(map_location)
         models[i].load_state_dict(torch.load(input_model_file, map_location=map_location), **load_model_func_kwargs)
     logger.info("All model weights loaded successfully")
 
diff --git a/tests/test_state_checkpointing.py b/tests/test_state_checkpointing.py
index 2a78caedf7d..7796150bb9d 100644
--- a/tests/test_state_checkpointing.py
+++ b/tests/test_state_checkpointing.py
@@ -285,6 +285,7 @@ def test_map_location(self):
             )
 
             # Check device state
+            model.to(accelerator.device)
             accelerator.load_state(os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="on_device")
             for group in optimizer.param_groups:
                 param_device = group["params"][0].device
                 break

From 662ad594603edb551e91a1da7b01ae61000e4c3a Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 6 Apr 2023 17:42:14 +0000
Subject: [PATCH 11/13] move to main to be run multicuda

---
 tests/test_state_checkpointing.py | 105 ++++++++++++++++--------------
 1 file changed, 56 insertions(+), 49 deletions(-)

diff --git a/tests/test_state_checkpointing.py b/tests/test_state_checkpointing.py
index 7796150bb9d..d251ff9a5a9 100644
--- a/tests/test_state_checkpointing.py
+++ b/tests/test_state_checkpointing.py
@@ -15,15 +15,16 @@
 import logging
 import os
 import random
+import shutil
 import tempfile
 import unittest
 
+import pytest
 import torch
 from torch import nn
 from torch.utils.data import DataLoader, TensorDataset
 
 from accelerate import Accelerator
-from accelerate.test_utils import require_cuda
 from accelerate.utils import ProjectConfiguration, set_seed
 
 
@@ -250,52 +251,58 @@ def test_checkpoint_deletion(self):
             self.assertTrue(os.path.exists(os.path.join(tmpdir, "checkpoints", "checkpoint_9")))
             self.assertTrue(os.path.exists(os.path.join(tmpdir, "checkpoints", "checkpoint_10")))
 
-    @require_cuda
-    def test_map_location(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            model = DummyModel()
-            optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
-            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
-            train_dataloader, valid_dataloader = dummy_dataloaders()
-            project_config = ProjectConfiguration(automatic_checkpoint_naming=True)
-            # Train baseline
-            accelerator = Accelerator(project_dir=tmpdir, project_config=project_config)
-            model, optimizer, train_dataloader, valid_dataloader, scheduler = accelerator.prepare(
-                model, optimizer, train_dataloader, valid_dataloader, scheduler
-            )
-            model, optimizer = accelerator.prepare(model, optimizer)
-            train(3, model, train_dataloader, optimizer, accelerator, scheduler)
-            # Check that the initial optimizer is loaded on the GPU
-            for group in optimizer.param_groups:
-                param_device = group["params"][0].device
-                break
-            self.assertEqual(param_device.type, accelerator.device.type)
-            model = model.cpu()
-            accelerator.save_state()
-
-            # Check CPU state
-            accelerator.load_state(os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="cpu")
-            for group in optimizer.param_groups:
-                param_device = group["params"][0].device
-                break
-            self.assertEqual(
-                param_device.type,
-                torch.device("cpu").type,
-                f"Loaded optimizer states did not match, expected to be loaded on the CPU but got {param_device}",
-            )
-
-            # Check device state
-            model.to(accelerator.device)
-            accelerator.load_state(os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="on_device")
-            for group in optimizer.param_groups:
-                param_device = group["params"][0].device
-                break
-            self.assertEqual(
-                param_device.type,
-                accelerator.device.type,
-                f"Loaded optimizer states did not match, expected to be loaded on {accelerator.device} but got {param_device}",
-            )
-
-            # Check error
-            with self.assertRaises(TypeError, msg="Unsupported optimizer map location passed"):
-                accelerator.load_state(os.path.join(tmpdir, "checkpoints", "checkpoint_0"), map_location="invalid")
+
+if __name__ == "__main__":
+    savedir = "/tmp/accelerate/state_checkpointing"
+    model = DummyModel()
+    optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
+    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
+    train_dataloader, valid_dataloader = dummy_dataloaders()
+    project_config = ProjectConfiguration(automatic_checkpoint_naming=True)
+    # Train baseline
+    accelerator = Accelerator(project_dir=savedir, project_config=project_config, mixed_precision="no")
+    if accelerator.process_index == 0:
+        if os.path.exists(savedir):
+            shutil.rmtree(savedir)
+        os.makedirs(savedir)
+    model, optimizer, train_dataloader, valid_dataloader, scheduler = accelerator.prepare(
+        model, optimizer, train_dataloader, valid_dataloader, scheduler
+    )
+    model, optimizer = accelerator.prepare(model, optimizer)
+    train(3, model, train_dataloader, optimizer, accelerator, scheduler)
+    # Check that the initial optimizer is loaded on the GPU
+    for group in optimizer.param_groups:
+        param_device = group["params"][0].device
+        break
+    assert param_device.type == accelerator.device.type
+    model = model.cpu()
+    accelerator.wait_for_everyone()
+    accelerator.save_state()
+    accelerator.wait_for_everyone()
+
+    # Check CPU state
+    accelerator.load_state(os.path.join(savedir, "checkpoints", "checkpoint_0"), map_location="cpu")
+    for group in optimizer.param_groups:
+        param_device = group["params"][0].device
+        break
+    assert (
+        param_device.type == torch.device("cpu").type
+    ), f"Loaded optimizer states did not match, expected to be loaded on the CPU but got {param_device}"
+
+    # Check device state
+    model.to(accelerator.device)
+    accelerator.load_state(os.path.join(savedir, "checkpoints", "checkpoint_0"), map_location="on_device")
+    for group in optimizer.param_groups:
+        param_device = group["params"][0].device
+        break
+    assert (
+        param_device.type == accelerator.device.type
+    ), f"Loaded optimizer states did not match, expected to be loaded on {accelerator.device} but got {param_device}"
+
+    # Check error
+    with pytest.raises(TypeError, match="Unsupported optimizer map location passed"):
+        accelerator.load_state(os.path.join(savedir, "checkpoints", "checkpoint_0"), map_location="invalid")
+    accelerator.wait_for_everyone()
+    if accelerator.process_index == 0:
+        shutil.rmtree(savedir)
+    accelerator.wait_for_everyone()

From 24e95c9b917668b26cdd1eb700d1e4946db35150 Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 6 Apr 2023 17:50:55 +0000
Subject: [PATCH 12/13] Call test

---
 tests/test_state_checkpointing.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tests/test_state_checkpointing.py b/tests/test_state_checkpointing.py
index d251ff9a5a9..e91888fa091 100644
--- a/tests/test_state_checkpointing.py
+++ b/tests/test_state_checkpointing.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import inspect
 import logging
 import os
 import random
@@ -25,7 +26,8 @@
 from torch.utils.data import DataLoader, TensorDataset
 
 from accelerate import Accelerator
-from accelerate.utils import ProjectConfiguration, set_seed
+from accelerate.test_utils import execute_subprocess_async
+from accelerate.utils import ProjectConfiguration, get_launch_prefix, set_seed
 
 
 logger = logging.getLogger(__name__)
@@ -251,6 +253,11 @@ def test_checkpoint_deletion(self):
             self.assertTrue(os.path.exists(os.path.join(tmpdir, "checkpoints", "checkpoint_9")))
             self.assertTrue(os.path.exists(os.path.join(tmpdir, "checkpoints", "checkpoint_10")))
 
+    def test_map_location(self):
+        cmd = get_launch_prefix()
+        cmd += [f"--nproc_per_node={torch.cuda.device_count()}", inspect.getfile(self.__class__)]
+        execute_subprocess_async(cmd, env=os.environ.copy())
+
 
 if __name__ == "__main__":
     savedir = "/tmp/accelerate/state_checkpointing"
     model = DummyModel()
     optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
     scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
     train_dataloader, valid_dataloader = dummy_dataloaders()
     project_config = ProjectConfiguration(automatic_checkpoint_naming=True)
     # Train baseline

From d021379fbf001dc3888893aaf618f3e311d34d28 Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 6 Apr 2023 17:52:35 +0000
Subject: [PATCH 13/13] Only CUDA

---
 tests/test_state_checkpointing.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_state_checkpointing.py b/tests/test_state_checkpointing.py
index e91888fa091..d79b52b4bb2 100644
--- a/tests/test_state_checkpointing.py
+++ b/tests/test_state_checkpointing.py
@@ -26,7 +26,7 @@
 from torch.utils.data import DataLoader, TensorDataset
 
 from accelerate import Accelerator
-from accelerate.test_utils import execute_subprocess_async
+from accelerate.test_utils import execute_subprocess_async, require_cuda
 from accelerate.utils import ProjectConfiguration, get_launch_prefix, set_seed
 
 
@@ -253,6 +253,7 @@ def test_checkpoint_deletion(self):
             self.assertTrue(os.path.exists(os.path.join(tmpdir, "checkpoints", "checkpoint_9")))
             self.assertTrue(os.path.exists(os.path.join(tmpdir, "checkpoints", "checkpoint_10")))
 
+    @require_cuda
     def test_map_location(self):
         cmd = get_launch_prefix()
         cmd += [f"--nproc_per_node={torch.cuda.device_count()}", inspect.getfile(self.__class__)]
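
Usage note (not part of the patches above): the sketch below shows how the map_location keyword introduced by this series is exercised from user code, mirroring the calls in the test file. The tiny placeholder model, learning rate, and project directory are illustrative assumptions, not values taken from the patches; it assumes the series is applied through PATCH 13.

    import os

    import torch
    from accelerate import Accelerator
    from accelerate.utils import ProjectConfiguration

    # Illustrative project directory; with automatic checkpoint naming,
    # save_state() writes to <project_dir>/checkpoints/checkpoint_0.
    project_dir = "/tmp/accelerate/map_location_demo"
    accelerator = Accelerator(
        project_dir=project_dir,
        project_config=ProjectConfiguration(automatic_checkpoint_naming=True),
    )

    # Placeholder model/optimizer standing in for a real training setup.
    model = torch.nn.Linear(4, 4)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    model, optimizer = accelerator.prepare(model, optimizer)

    accelerator.save_state()
    checkpoint = os.path.join(project_dir, "checkpoints", "checkpoint_0")

    # Keep the restored optimizer state on the CPU.
    accelerator.load_state(checkpoint, map_location="cpu")

    # Restore the optimizer state onto this process's device (multi-GPU case).
    accelerator.load_state(checkpoint, map_location="on_device")

    # Omitting map_location keeps the old behavior on a single process ("cpu")
    # and uses the process device under multi-GPU; any other value raises TypeError.
    accelerator.load_state(checkpoint)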