From 5572f885de072d186d9edad1b588b7e28c29b653 Mon Sep 17 00:00:00 2001
From: Monius
Date: Sun, 29 Sep 2024 08:26:55 +0800
Subject: [PATCH 1/8] fix deprecated FutureWarning for pytorch 2.4+

---
 benchmarks/fp8/ms_amp/ddp.py             | 2 +-
 benchmarks/fp8/ms_amp/non_distributed.py | 2 +-
 src/accelerate/accelerator.py            | 4 ++--
 src/accelerate/checkpointing.py          | 4 ++--
 src/accelerate/utils/dataclasses.py      | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/benchmarks/fp8/ms_amp/ddp.py b/benchmarks/fp8/ms_amp/ddp.py
index 7250e981bdf..12c14a88631 100644
--- a/benchmarks/fp8/ms_amp/ddp.py
+++ b/benchmarks/fp8/ms_amp/ddp.py
@@ -35,7 +35,7 @@ def train_baseline(opt_level="O2"):
     set_seed(42)
 
-    scaler = torch.cuda.amp.GradScaler()
+    scaler = torch.amp.GradScaler('cuda')
     model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(MODEL_NAME)
     accelerator = Accelerator()
     device = accelerator.device
diff --git a/benchmarks/fp8/ms_amp/non_distributed.py b/benchmarks/fp8/ms_amp/non_distributed.py
index 1bbc20f1406..b916f3fbc15 100644
--- a/benchmarks/fp8/ms_amp/non_distributed.py
+++ b/benchmarks/fp8/ms_amp/non_distributed.py
@@ -41,7 +41,7 @@ def train_baseline(opt_level="O2"):
     base_model_results = evaluate_model(model, eval_dataloader, METRIC)
     model.train()
-    scaler = torch.cuda.amp.GradScaler()
+    scaler = torch.amp.GradScaler('cuda')
 
     for batch in train_dataloader:
         batch = batch.to("cuda")
diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index a45ab05b729..faea1ed7d95 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -497,7 +497,7 @@ def __init__(
             elif is_xpu_available():
                 self.scaler = torch.amp.GradScaler("xpu", **kwargs)
             else:
-                self.scaler = torch.cuda.amp.GradScaler(**kwargs)
+                self.scaler = torch.amp.GradScaler("cuda", **kwargs)
 
         elif self.state.mixed_precision == "bf16" and self.distributed_type not in (
             DistributedType.DEEPSPEED,
@@ -521,7 +521,7 @@ def __init__(
                 )
             elif self.distributed_type != DistributedType.DEEPSPEED:
                 # MS-AMP requires `GradScaler` even with bf16 autocast w/ single GPU or DDP:
-                self.scaler = torch.cuda.amp.GradScaler()
+                self.scaler = torch.amp.GradScaler('cuda')
 
         # Start of internal step tracking
         self.step = 0
diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
index 04fe527e21c..82d0d837e26 100644
--- a/src/accelerate/checkpointing.py
+++ b/src/accelerate/checkpointing.py
@@ -86,7 +86,7 @@ def save_accelerator_state(
             The current process index in the Accelerator state
         step (`int`):
             The current step in the internal step tracker
-        scaler (`torch.cuda.amp.GradScaler`, *optional*):
+        scaler (`torch.amp.GradScaler`, *optional*):
             An optional gradient scaler instance to save
         save_on_each_node (`bool`, *optional*):
             Whether to save on every node, or only the main node.
@@ -186,7 +186,7 @@ def load_accelerator_state(
             A list of learning rate schedulers
         process_index (`int`):
             The current process index in the Accelerator state
-        scaler (`torch.cuda.amp.GradScaler`, *optional*):
+        scaler (`torch.amp.GradScaler`, *optional*):
             An optional *GradScaler* instance to load
         map_location (`str`, *optional*):
             What device to load the optimizer state onto. Should be one of either "cpu" or "on_device".
diff --git a/src/accelerate/utils/dataclasses.py b/src/accelerate/utils/dataclasses.py
index 85e746dfe6f..9a2d801978f 100644
--- a/src/accelerate/utils/dataclasses.py
+++ b/src/accelerate/utils/dataclasses.py
@@ -209,7 +209,7 @@ def register_comm_hook(self, model):
 class GradScalerKwargs(KwargsHandler):
     """
     Use this object in your [`Accelerator`] to customize the behavior of mixed precision, specifically how the
-    `torch.cuda.amp.GradScaler` used is created. Please refer to the documentation of this
+    `torch.amp.GradScaler` used is created. Please refer to the documentation of this
     [scaler](https://pytorch.org/docs/stable/amp.html?highlight=gradscaler) for more information on each argument.
 
From e583d2ee378c0442bea99eb4cb50aa77ff8202ee Mon Sep 17 00:00:00 2001
From: Monius
Date: Tue, 1 Oct 2024 09:00:02 +0800
Subject: [PATCH 2/8] perform `make style` and `make quality`

---
 benchmarks/fp8/ms_amp/ddp.py             | 2 +-
 benchmarks/fp8/ms_amp/non_distributed.py | 2 +-
 src/accelerate/accelerator.py            | 2 +-
 src/accelerate/utils/memory.py           | 5 ++---
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/benchmarks/fp8/ms_amp/ddp.py b/benchmarks/fp8/ms_amp/ddp.py
index 12c14a88631..69c0561f15d 100644
--- a/benchmarks/fp8/ms_amp/ddp.py
+++ b/benchmarks/fp8/ms_amp/ddp.py
@@ -35,7 +35,7 @@ def train_baseline(opt_level="O2"):
     set_seed(42)
 
-    scaler = torch.amp.GradScaler('cuda')
+    scaler = torch.amp.GradScaler("cuda")
     model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(MODEL_NAME)
     accelerator = Accelerator()
     device = accelerator.device
diff --git a/benchmarks/fp8/ms_amp/non_distributed.py b/benchmarks/fp8/ms_amp/non_distributed.py
index b916f3fbc15..d37fe0770a7 100644
--- a/benchmarks/fp8/ms_amp/non_distributed.py
+++ b/benchmarks/fp8/ms_amp/non_distributed.py
@@ -41,7 +41,7 @@ def train_baseline(opt_level="O2"):
     base_model_results = evaluate_model(model, eval_dataloader, METRIC)
     model.train()
-    scaler = torch.amp.GradScaler('cuda')
+    scaler = torch.amp.GradScaler("cuda")
 
     for batch in train_dataloader:
         batch = batch.to("cuda")
diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index faea1ed7d95..f975b4fd6b9 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -521,7 +521,7 @@ def __init__(
                 )
             elif self.distributed_type != DistributedType.DEEPSPEED:
                 # MS-AMP requires `GradScaler` even with bf16 autocast w/ single GPU or DDP:
-                self.scaler = torch.amp.GradScaler('cuda')
+                self.scaler = torch.amp.GradScaler("cuda")
 
         # Start of internal step tracking
         self.step = 0
diff --git a/src/accelerate/utils/memory.py b/src/accelerate/utils/memory.py
index 2fdde754417..1398fc7fcbe 100644
--- a/src/accelerate/utils/memory.py
+++ b/src/accelerate/utils/memory.py
@@ -127,11 +127,10 @@ def find_executable_batch_size(function: callable = None, starting_batch_size: i
 
 
     >>> @find_executable_batch_size(starting_batch_size=128)
-    ... def train(batch_size, model, optimizer):
-    ...     ...
+    ... def train(batch_size, model, optimizer): ...
 
 
-    >>> train(model, optimizer)
+    ... train(model, optimizer)
     ```
     """
     if function is None:
From c9e6667da82e3525e2d81b29f47145f96d39e744 Mon Sep 17 00:00:00 2001
From: Monius
Date: Tue, 1 Oct 2024 23:07:26 +0800
Subject: [PATCH 3/8] try to fix `Quality Check` on `actions/workflows/quality.yml`

---
 src/accelerate/utils/memory.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/accelerate/utils/memory.py b/src/accelerate/utils/memory.py
index 1398fc7fcbe..53e6d5b3e5f 100644
--- a/src/accelerate/utils/memory.py
+++ b/src/accelerate/utils/memory.py
@@ -127,10 +127,10 @@ def find_executable_batch_size(function: callable = None, starting_batch_size: i
 
 
     >>> @find_executable_batch_size(starting_batch_size=128)
-    ... def train(batch_size, model, optimizer): ...
+    ... def train(batch_size, model, optimizer):
+    ...     ...
 
-
-    ... train(model, optimizer)
+    >>> train(model, optimizer)
     ```
     """
     if function is None:
From 75d87c7eac1eb3056650e0c4ed1f1e7340f935dc Mon Sep 17 00:00:00 2001
From: Monius
Date: Wed, 2 Oct 2024 00:37:15 +0800
Subject: [PATCH 4/8] undo changes for `src/accelerate/utils/memory.py`

---
 src/accelerate/utils/memory.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/accelerate/utils/memory.py b/src/accelerate/utils/memory.py
index 53e6d5b3e5f..2fdde754417 100644
--- a/src/accelerate/utils/memory.py
+++ b/src/accelerate/utils/memory.py
@@ -130,6 +130,7 @@ def find_executable_batch_size(function: callable = None, starting_batch_size: i
     ... def train(batch_size, model, optimizer):
     ...     ...
 
+
     >>> train(model, optimizer)
     ```
     """
From 5990f7f0ebdfbdfb11d899ed45a3d9de0ba07ac1 Mon Sep 17 00:00:00 2001
From: Monius
Date: Wed, 2 Oct 2024 19:43:48 +0800
Subject: [PATCH 5/8] adapt scaler for pytorch.__version__

---
 benchmarks/fp8/ms_amp/ddp.py             |  7 ++++++-
 benchmarks/fp8/ms_amp/non_distributed.py |  6 +++++-
 src/accelerate/accelerator.py            | 11 +++++++++--
 src/accelerate/checkpointing.py          |  4 ++--
 src/accelerate/utils/dataclasses.py      |  5 +++--
 5 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/benchmarks/fp8/ms_amp/ddp.py b/benchmarks/fp8/ms_amp/ddp.py
index 69c0561f15d..7a2e5bfe3b7 100644
--- a/benchmarks/fp8/ms_amp/ddp.py
+++ b/benchmarks/fp8/ms_amp/ddp.py
@@ -22,6 +22,7 @@
 import msamp
 import torch
 from fp8_utils import evaluate_model, get_training_utilities
+from packaging import version
 from torch.nn.parallel import DistributedDataParallel as DDP
 
 from accelerate import Accelerator
@@ -35,7 +36,11 @@ def train_baseline(opt_level="O2"):
     set_seed(42)
 
-    scaler = torch.amp.GradScaler("cuda")
+    if version.parse(torch.__version__) > version.parse("2.3"):
+        scaler = torch.amp.GradScaler("cuda")
+    else:
+        scaler = torch.cuda.amp.GradScaler()
+
     model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(MODEL_NAME)
     accelerator = Accelerator()
     device = accelerator.device
diff --git a/benchmarks/fp8/ms_amp/non_distributed.py b/benchmarks/fp8/ms_amp/non_distributed.py
index d37fe0770a7..5fc659117ca 100644
--- a/benchmarks/fp8/ms_amp/non_distributed.py
+++ b/benchmarks/fp8/ms_amp/non_distributed.py
@@ -22,6 +22,7 @@
 import msamp
 import torch
 from fp8_utils import evaluate_model, get_training_utilities
+from packaging import version
 
 from accelerate import Accelerator
 from accelerate.state import AcceleratorState
@@ -41,7 +42,10 @@ def train_baseline(opt_level="O2"):
     base_model_results = evaluate_model(model, eval_dataloader, METRIC)
     model.train()
-    scaler = torch.amp.GradScaler("cuda")
+    if version.parse(torch.__version__) > version.parse("2.3"):
+        scaler = torch.amp.GradScaler("cuda")
+    else:
+        scaler = torch.cuda.amp.GradScaler()
 
     for batch in train_dataloader:
         batch = batch.to("cuda")
diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index bdf7005b726..6d00f8d8197 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -32,6 +32,7 @@
 import torch
 import torch.utils.hooks as hooks
 from huggingface_hub import split_torch_state_dict_into_shards
+from packaging import version
 
 from .checkpointing import load_accelerator_state, load_custom_state, save_accelerator_state, save_custom_state
 from .data_loader import DataLoaderDispatcher, prepare_data_loader, skip_first_batches
@@ -498,7 +499,10 @@ def __init__(
             elif is_xpu_available():
                 self.scaler = torch.amp.GradScaler("xpu", **kwargs)
             else:
-                self.scaler = torch.amp.GradScaler("cuda", **kwargs)
+                if version.parse(torch.__version__) > version.parse("2.3"):
+                    self.scaler = torch.amp.GradScaler("cuda", **kwargs)
+                else:
+                    self.scaler = torch.cuda.amp.GradScaler(**kwargs)
 
         elif self.state.mixed_precision == "bf16" and self.distributed_type not in (
             DistributedType.DEEPSPEED,
@@ -522,7 +526,10 @@ def __init__(
                 )
             elif self.distributed_type != DistributedType.DEEPSPEED:
                 # MS-AMP requires `GradScaler` even with bf16 autocast w/ single GPU or DDP:
-                self.scaler = torch.amp.GradScaler("cuda")
+                if version.parse(torch.__version__) > version.parse("2.3"):
+                    self.scaler = torch.amp.GradScaler("cuda")
+                else:
+                    self.scaler = torch.cuda.amp.GradScaler()
 
         # Start of internal step tracking
         self.step = 0
diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
index 82d0d837e26..54901e82fb9 100644
--- a/src/accelerate/checkpointing.py
+++ b/src/accelerate/checkpointing.py
@@ -86,8 +86,8 @@ def save_accelerator_state(
             The current process index in the Accelerator state
         step (`int`):
             The current step in the internal step tracker
-        scaler (`torch.amp.GradScaler`, *optional*):
-            An optional gradient scaler instance to save
+        scaler (`torch.amp.GradScaler`, *optional*) for pytorch>2.3:
+            An optional gradient scaler instance to save; for lower version, check `torch.cuda.amp.GradScaler`
         save_on_each_node (`bool`, *optional*):
             Whether to save on every node, or only the main node.
         safe_serialization (`bool`, *optional*, defaults to `True`):
diff --git a/src/accelerate/utils/dataclasses.py b/src/accelerate/utils/dataclasses.py
index 9a2d801978f..ad2486d4832 100644
--- a/src/accelerate/utils/dataclasses.py
+++ b/src/accelerate/utils/dataclasses.py
@@ -209,8 +209,9 @@ def register_comm_hook(self, model):
 class GradScalerKwargs(KwargsHandler):
     """
     Use this object in your [`Accelerator`] to customize the behavior of mixed precision, specifically how the
-    `torch.amp.GradScaler` used is created. Please refer to the documentation of this
-    [scaler](https://pytorch.org/docs/stable/amp.html?highlight=gradscaler) for more information on each argument.
+    `torch.amp.GradScaler` used is created for pytoch>2.3 or `torch.cuda.amp.GradScaler` for lower version. Please
+    refer to the documentation of this [scaler](https://pytorch.org/docs/stable/amp.html?highlight=gradscaler) for more
+    information on each argument.
From 347bd279a4789101645f1682b555a6b75c36c73b Mon Sep 17 00:00:00 2001
From: Monius
Date: Wed, 2 Oct 2024 19:48:15 +0800
Subject: [PATCH 6/8] fix scaler warning for npu device, depends on pytorch 2.4 version check

---
 benchmarks/fp8/ms_amp/ddp.py  | 1 -
 src/accelerate/accelerator.py | 5 ++++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/benchmarks/fp8/ms_amp/ddp.py b/benchmarks/fp8/ms_amp/ddp.py
index 7a2e5bfe3b7..25d9fc0a7bf 100644
--- a/benchmarks/fp8/ms_amp/ddp.py
+++ b/benchmarks/fp8/ms_amp/ddp.py
@@ -40,7 +40,6 @@ def train_baseline(opt_level="O2"):
         scaler = torch.amp.GradScaler("cuda")
     else:
         scaler = torch.cuda.amp.GradScaler()
-
     model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(MODEL_NAME)
     accelerator = Accelerator()
     device = accelerator.device
diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index 6d00f8d8197..a4c0bbbbdd0 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -495,7 +495,10 @@ def __init__(
             elif is_musa_available():
                 self.scalar = torch.musa.amp.GradScaler(**kwargs)
             elif is_npu_available():
-                self.scaler = torch.npu.amp.GradScaler(**kwargs)
+                if version.parse(torch.__version__) > version.parse("2.3"):
+                    self.scaler = torch.amp.GradScaler("npu", **kwargs)
+                else:
+                    self.scaler = torch.npu.amp.GradScaler(**kwargs)
             elif is_xpu_available():
                 self.scaler = torch.amp.GradScaler("xpu", **kwargs)
             else:
From 6ac85868777f5ccf704530a132f413c7dc41a375 Mon Sep 17 00:00:00 2001
From: Monius
Date: Wed, 2 Oct 2024 22:32:16 +0800
Subject: [PATCH 7/8] fallback to default npu scaler

---
 src/accelerate/accelerator.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index a4c0bbbbdd0..6d00f8d8197 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -495,10 +495,7 @@ def __init__(
             elif is_musa_available():
                 self.scalar = torch.musa.amp.GradScaler(**kwargs)
             elif is_npu_available():
-                if version.parse(torch.__version__) > version.parse("2.3"):
-                    self.scaler = torch.amp.GradScaler("npu", **kwargs)
-                else:
-                    self.scaler = torch.npu.amp.GradScaler(**kwargs)
+                self.scaler = torch.npu.amp.GradScaler(**kwargs)
             elif is_xpu_available():
                 self.scaler = torch.amp.GradScaler("xpu", **kwargs)
             else:
From 7b4c949cd7cc77517b42b217178ad9a726a50926 Mon Sep 17 00:00:00 2001
From: Monius
Date: Wed, 2 Oct 2024 22:36:06 +0800
Subject: [PATCH 8/8] fallback to default `GradScaler` doc

---
 src/accelerate/checkpointing.py     | 4 ++--
 src/accelerate/utils/dataclasses.py | 5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
index 54901e82fb9..259ff80c293 100644
--- a/src/accelerate/checkpointing.py
+++ b/src/accelerate/checkpointing.py
@@ -86,8 +86,8 @@ def save_accelerator_state(
             The current process index in the Accelerator state
         step (`int`):
             The current step in the internal step tracker
-        scaler (`torch.amp.GradScaler`, *optional*) for pytorch>2.3:
-            An optional gradient scaler instance to save; for lower version, check `torch.cuda.amp.GradScaler`
+        scaler (`torch.amp.GradScaler`, *optional*):
+            An optional gradient scaler instance to save;
         save_on_each_node (`bool`, *optional*):
             Whether to save on every node, or only the main node.
         safe_serialization (`bool`, *optional*, defaults to `True`):
diff --git a/src/accelerate/utils/dataclasses.py b/src/accelerate/utils/dataclasses.py
index ad2486d4832..85e746dfe6f 100644
--- a/src/accelerate/utils/dataclasses.py
+++ b/src/accelerate/utils/dataclasses.py
@@ -209,9 +209,8 @@ def register_comm_hook(self, model):
 class GradScalerKwargs(KwargsHandler):
     """
     Use this object in your [`Accelerator`] to customize the behavior of mixed precision, specifically how the
-    `torch.amp.GradScaler` used is created for pytoch>2.3 or `torch.cuda.amp.GradScaler` for lower version. Please
-    refer to the documentation of this [scaler](https://pytorch.org/docs/stable/amp.html?highlight=gradscaler) for more
-    information on each argument.
+    `torch.cuda.amp.GradScaler` used is created. Please refer to the documentation of this
+    [scaler](https://pytorch.org/docs/stable/amp.html?highlight=gradscaler) for more information on each argument.
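Usage note for the docstring restored in PATCH 8/8: `GradScalerKwargs` is how callers tune whichever `GradScaler` the `Accelerator` ends up building (`torch.amp` or `torch.cuda.amp`, per the version gate above). A short sketch along the lines of the accelerate documentation; the argument values are illustrative only:

```python
# GradScalerKwargs fields (init_scale, growth_factor, backoff_factor,
# growth_interval, enabled) are forwarded to the GradScaler that the
# Accelerator constructs for fp16 mixed precision.
from accelerate import Accelerator
from accelerate.utils import GradScalerKwargs

scaler_kwargs = GradScalerKwargs(init_scale=1024.0, backoff_factor=0.25, growth_interval=1000)
accelerator = Accelerator(mixed_precision="fp16", kwargs_handlers=[scaler_kwargs])
```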