diff --git a/CHANGELOG.md b/CHANGELOG.md index be7585ab1fc24..3c32b93cc0dec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed +- Changed deprecated `enable_pl_optimizer=True` ([#5244](https://github.com/PyTorchLightning/pytorch-lightning/pull/5244)) + ### Deprecated diff --git a/README.md b/README.md index cd9eb7cf02fc2..73286edc2c53b 100644 --- a/README.md +++ b/README.md @@ -225,7 +225,8 @@ with tempfile.NamedTemporaryFile(suffix='.onnx', delete=False) as tmpfile: ```python class LitAutoEncoder(pl.LightningModule): def training_step(self, batch, batch_idx, opt_idx): - (opt_a, opt_b) = self.optimizers() + # access your optimizers with use_pl_optimizer=False. Default is True + (opt_a, opt_b) = self.optimizers(use_pl_optimizer=True) loss_a = ... self.manual_backward(loss_a, opt_a) diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index fae343d921035..5d688a8b374ff 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -186,7 +186,8 @@ def train_dataloader(self): class SeedTrainLoaderManualModel(SeedTrainLoaderModel): def training_step(self, batch, batch_idx, optimizer_idx): # manual - (opt_a, opt_b) = self.optimizers() + # access your optimizers with use_pl_optimizer=False. Default is True + (opt_a, opt_b) = self.optimizers(use_pl_optimizer=True) loss_1 = self.step(batch) self.manual_backward(loss_1, opt_a) diff --git a/docs/source/new-project.rst b/docs/source/new-project.rst index 4c9c16e9faa0d..def273f7a8257 100644 --- a/docs/source/new-project.rst +++ b/docs/source/new-project.rst @@ -268,7 +268,8 @@ Now you own the train loop! .. code-block:: python def training_step(self, batch, batch_idx, opt_idx): - (opt_a, opt_b, opt_c) = self.optimizers() + # access your optimizers with use_pl_optimizer=False. Default is True + (opt_a, opt_b, opt_c) = self.optimizers(use_pl_optimizer=True) loss_a = self.generator(batch[0]) diff --git a/docs/source/optimizers.rst b/docs/source/optimizers.rst index 5e96b5da0da8c..588bdefb367e3 100644 --- a/docs/source/optimizers.rst +++ b/docs/source/optimizers.rst @@ -28,8 +28,15 @@ to manually manage the optimization process. To do so, do the following: .. code-block:: python def training_step(self, batch, batch_idx, optimizer_idx): - # ignore optimizer_idx - (opt_g, opt_d) = self.optimizers() + + # 1. ignore optimizer_idx + # 2. `use_pl_optimizer=True` means `opt_g` and `opt_d` will be of type `LightningOptimizer` + # `LightningOptimizer` simply wraps your optimizer and behaves the same way! + # When calling `optimizer.step`, `LightningOptimizer` will handle TPU, AMP, accumulate_grad_batches, etc. for you. + + # access your unwrapped optimizers with `use_pl_optimizer=False`, or via `optimizer.optimizer` when using use_pl_optimizer=True + # use_pl_optimizer=True is the default + (opt_g, opt_d) = self.optimizers(use_pl_optimizer=True) # do anything you want loss_a = ... @@ -242,19 +249,29 @@ Here we add a learning-rate warm up # update params optimizer.step(closure=closure) -The default ``optimizer_step`` is relying on the internal ``LightningOptimizer`` to properly perform a step. +.. note:: The default ``optimizer_step`` relies on the internal ``LightningOptimizer`` to properly perform a step. It handles TPUs, AMP, accumulate_grad_batches, zero_grad, and much more. .. 
testcode:: - from pytorch_lightning.core.optimizer import LightningOptimizer + # function hook in LightningModule + def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + optimizer.step(closure=closure) + +.. note:: To access the optimizer wrapped by ``LightningOptimizer``, do as follows. + +.. testcode:: # function hook in LightningModule def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - if not isinstance(optimizer, LightningOptimizer): - # wraps into LightingOptimizer only for running step - optimizer = LightningOptimizer.to_lightning_optimizer(optimizer, self.trainer) + + # `optimizer` is a ``LightningOptimizer`` wrapping your optimizer. + # To access the wrapped optimizer, do as follows: + optimizer = optimizer.optimizer + + # run step. However, it won't work on TPU, AMP, etc... optimizer.step(closure=closure) + ---------- Using the closure functions for optimization diff --git a/docs/source/trainer.rst b/docs/source/trainer.rst index 8d42541a3fbb4..d461c30a20a6f 100644 --- a/docs/source/trainer.rst +++ b/docs/source/trainer.rst @@ -335,7 +335,8 @@ optimizer behavior Example:: def training_step(self, batch, batch_idx): - opt = self.optimizers() + # access your optimizers with use_pl_optimizer=False. Default is True + opt = self.optimizers(use_pl_optimizer=True) loss = ... self.manual_backward(loss, opt) @@ -350,7 +351,8 @@ In the multi-optimizer case, ignore the optimizer_idx flag and use the optimizer Example:: def training_step(self, batch, batch_idx, optimizer_idx): - (opt_a, opt_b) = self.optimizers() + # access your optimizers with use_pl_optimizer=False. Default is True + (opt_a, opt_b) = self.optimizers(use_pl_optimizer=True) gen_loss = ... self.manual_backward(gen_loss, opt_a) diff --git a/pytorch_lightning/accelerators/cpu_accelerator.py b/pytorch_lightning/accelerators/cpu_accelerator.py index 25302cabbc70f..e034b209bf34c 100644 --- a/pytorch_lightning/accelerators/cpu_accelerator.py +++ b/pytorch_lightning/accelerators/cpu_accelerator.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
-from typing import Any, Optional, Union, Callable +from typing import Any, Callable, Optional, Union import torch @@ -48,8 +48,6 @@ def setup(self, model): # allow for lr schedulers as well self.setup_optimizers(model) - self.trainer.convert_to_lightning_optimizers() - self.trainer.model = model def train(self): diff --git a/pytorch_lightning/accelerators/ddp2_accelerator.py b/pytorch_lightning/accelerators/ddp2_accelerator.py index 46d944a35cb62..68af3f579a6e8 100644 --- a/pytorch_lightning/accelerators/ddp2_accelerator.py +++ b/pytorch_lightning/accelerators/ddp2_accelerator.py @@ -192,8 +192,6 @@ def ddp_train(self, process_idx, mp_queue, model): # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - # device ids change depending on the DDP setup device_ids = self.get_device_ids() diff --git a/pytorch_lightning/accelerators/ddp_accelerator.py b/pytorch_lightning/accelerators/ddp_accelerator.py index 1f1f1f42f52ff..f0d9f2171bf48 100644 --- a/pytorch_lightning/accelerators/ddp_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_accelerator.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License import os +from os.path import abspath import subprocess import sys -from os.path import abspath from time import sleep from typing import Any, List, Optional, Union @@ -291,8 +291,6 @@ def ddp_train(self, process_idx, model): # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - # device ids change depending on the DDP setup device_ids = self.get_device_ids() diff --git a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py index cc178dc14b49d..e7ef38c8df3b4 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py @@ -152,8 +152,6 @@ def ddp_train(self, process_idx, mp_queue, model): # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - # DDP spawn already spawned off each process... 
no need to do anything device_ids = self.get_device_ids() diff --git a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py index c2915b9d570bb..c25e082ee348d 100644 --- a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py @@ -15,8 +15,8 @@ from typing import Any, List, Optional, Union import torch -import torch.distributed as torch_distrib import torch.distributed as dist +import torch.distributed as torch_distrib from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log @@ -183,8 +183,6 @@ def ddp_train(self, process_idx, model): # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - # device ids change depending on the DDP setup device_ids = self.get_device_ids() diff --git a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py index f35b42342d88a..23783fada72f1 100644 --- a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py @@ -167,8 +167,6 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - # device ids change depending on the DDP setup device_ids = self.get_device_ids() diff --git a/pytorch_lightning/accelerators/dp_accelerator.py b/pytorch_lightning/accelerators/dp_accelerator.py index 834a920b505d9..fc01c4686f04f 100644 --- a/pytorch_lightning/accelerators/dp_accelerator.py +++ b/pytorch_lightning/accelerators/dp_accelerator.py @@ -65,8 +65,6 @@ def setup(self, model): if self.trainer.amp_backend: model = self.__init_half_precision(model) - self.trainer.convert_to_lightning_optimizers() - self.trainer.model = model def __init_torch_data_parallel(self, model): diff --git a/pytorch_lightning/accelerators/gpu_accelerator.py b/pytorch_lightning/accelerators/gpu_accelerator.py index 1310777e0d890..49f21e9e34816 100644 --- a/pytorch_lightning/accelerators/gpu_accelerator.py +++ b/pytorch_lightning/accelerators/gpu_accelerator.py @@ -54,8 +54,6 @@ def setup(self, model): # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - self.trainer.model = model def train(self): diff --git a/pytorch_lightning/accelerators/horovod_accelerator.py b/pytorch_lightning/accelerators/horovod_accelerator.py index 5895025673b9a..2013d75df7b1e 100644 --- a/pytorch_lightning/accelerators/horovod_accelerator.py +++ b/pytorch_lightning/accelerators/horovod_accelerator.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from contextlib import ExitStack -from typing import Any, Optional, Union, Callable +from typing import Any, Callable, Optional, Union import torch from torch.optim.lr_scheduler import _LRScheduler @@ -20,7 +20,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.utilities import HOROVOD_AVAILABLE, AMPType +from pytorch_lightning.utilities import AMPType, HOROVOD_AVAILABLE from pytorch_lightning.utilities.distributed import rank_zero_only if HOROVOD_AVAILABLE: @@ -91,8 +91,6 @@ def _filter_named_parameters(model, optimizer): # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - # Update logger rank info from Horovod to avoid race conditions from different ranks # creating directories / writing files in the same locations. self.trainer.global_rank = hvd.rank() diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py index 9d1eec5594d82..7dcfaae401ca7 100644 --- a/pytorch_lightning/accelerators/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/tpu_accelerator.py @@ -26,11 +26,11 @@ from pytorch_lightning.core import LightningModule from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.utilities import ( - TPU_AVAILABLE, move_data_to_device, rank_zero_info, rank_zero_only, rank_zero_warn, + TPU_AVAILABLE, ) from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -230,8 +230,6 @@ def __setup_tpu_training(self, model: LightningModule, trainer): f' global rank: {trainer.tpu_global_core_rank}' f' with XLA_USE_BF16={os.environ.get("XLA_USE_BF16")}') - self.trainer.convert_to_lightning_optimizers() - def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): # do backward pass if self.trainer.train_loop.automatic_optimization: diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 421fc5e5cf2ac..bd6784cc3b4bb 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -112,8 +112,11 @@ def __init__(self, *args, **kwargs): self._current_hook_fx_name = None self._current_dataloader_idx = None - def optimizers(self): - opts = self.trainer.optimizers + def optimizers(self, use_pl_optimizer: bool = True) -> Union[Optimizer, List[Optimizer], List[LightningOptimizer]]: + if use_pl_optimizer: + opts = list(self.trainer.lightning_optimizers.values()) + else: + opts = self.trainer.optimizers # single optimizer if isinstance(opts, list) and len(opts) == 1 and isinstance(opts[0], Optimizer): diff --git a/pytorch_lightning/core/optimizer.py b/pytorch_lightning/core/optimizer.py index f0b361de6133e..ed5e9490983b0 100644 --- a/pytorch_lightning/core/optimizer.py +++ b/pytorch_lightning/core/optimizer.py @@ -17,7 +17,7 @@ from torch.optim.optimizer import Optimizer -from pytorch_lightning.utilities import TPU_AVAILABLE +from pytorch_lightning.utilities import AMPType, TPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException if TPU_AVAILABLE: @@ -62,6 +62,10 @@ def __init__(self, self._accumulate_grad_batches = accumulate_grad_batches self._optimizer_idx = None + @property + def optimizer(self): + return self._optimizer + @property def defaults(self): return self._optimizer.defaults @@ -102,11 
+106,13 @@ def _on_trainer_init(self, trainer): break @classmethod - def to_lightning_optimizer(cls, optimizer, trainer): - if isinstance(optimizer, LightningOptimizer): - return optimizer - optimizer = cls(optimizer) - optimizer._on_trainer_init(trainer) + def _to_lightning_optimizer(cls, optimizer, trainer, opt_idx): + # apex overrides .step function and need to be wrapped on each step + if trainer.amp_backend == AMPType.APEX: + optimizer = cls(optimizer) + optimizer._on_trainer_init(trainer) + else: + optimizer = trainer.lightning_optimizers[opt_idx] return optimizer def _accumulated_batches_reached(self): @@ -148,7 +154,7 @@ def __optimizer_step(self, *args, closure: Optional[Callable] = None, profiler_n **kwargs ) - trainer.train_loop.on_before_zero_grad(self) + trainer.train_loop.on_before_zero_grad(optimizer) model.optimizer_zero_grad( trainer.current_epoch, diff --git a/pytorch_lightning/plugins/ddp_sequential_plugin.py b/pytorch_lightning/plugins/ddp_sequential_plugin.py index 4d2835c518b2d..069b1754fbce0 100644 --- a/pytorch_lightning/plugins/ddp_sequential_plugin.py +++ b/pytorch_lightning/plugins/ddp_sequential_plugin.py @@ -15,8 +15,8 @@ from typing import Any, List, Optional import torch -import torch.distributed as torch_distrib from torch import nn +import torch.distributed as torch_distrib from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log @@ -27,8 +27,8 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException if FAIRSCALE_PIPE_AVAILABLE: - import fairscale.nn.model_parallel as mpu from fairscale.nn import PipeRPCWrapper + import fairscale.nn.model_parallel as mpu from fairscale.nn.pipe import balance as pipe_balance from fairscale.nn.pipe import rpc as rpc_pipe from fairscale.nn.pipe.pipeline import PipelineStyle @@ -380,7 +380,6 @@ def register_optimizers(ctx, model): model.trainer.optimizers = optimizers model.trainer.lr_schedulers = lr_schedulers model.trainer.optimizer_frequencies = optimizer_frequencies - model.trainer.convert_to_lightning_optimizers() def run_optimizer(ctx, model): diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index 3d64fe91388b8..9df1ba3262afa 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -54,7 +54,7 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): # unscale gradient to allow analyze within `on_after_backward` if not self.trainer.train_loop.should_accumulate() and automatic_optimization: if isinstance(optimizer, LightningOptimizer): - self.trainer.scaler.unscale_(optimizer._optimizer) + self.trainer.scaler.unscale_(optimizer.optimizer) else: self.trainer.scaler.unscale_(optimizer) diff --git a/pytorch_lightning/plugins/sharded_plugin.py b/pytorch_lightning/plugins/sharded_plugin.py index b87a2c2a389ef..d989b6237ad72 100644 --- a/pytorch_lightning/plugins/sharded_plugin.py +++ b/pytorch_lightning/plugins/sharded_plugin.py @@ -63,7 +63,7 @@ def _reinit_with_fairscale_oss(self, trainer): optimizers = trainer.optimizers for x, optimizer in enumerate(optimizers): if is_lightning_optimizer(optimizer): - optimizer = optimizer._optimizer + optimizer = optimizer.optimizer if not isinstance(optimizer, OSS): optim_class = type(optimizer) zero_optimizer = OSS( @@ -73,7 +73,6 @@ def _reinit_with_fairscale_oss(self, trainer): ) optimizers[x] = zero_optimizer del optimizer - trainer.convert_to_lightning_optimizers() def get_model_from_plugin( self, diff --git 
a/pytorch_lightning/trainer/configuration_validator.py b/pytorch_lightning/trainer/configuration_validator.py index 21d6af043df02..20992255ba29e 100644 --- a/pytorch_lightning/trainer/configuration_validator.py +++ b/pytorch_lightning/trainer/configuration_validator.py @@ -73,17 +73,7 @@ def __verify_train_loop_configuration(self, model): trainer.overriden_optimizer_step = is_overridden('optimizer_step', model) trainer.overriden_optimizer_zero_grad = is_overridden('optimizer_zero_grad', model) - - enable_pl_optimizer = trainer._enable_pl_optimizer automatic_optimization = trainer.train_loop.automatic_optimization - if trainer.overriden_optimizer_step and not enable_pl_optimizer and automatic_optimization: - rank_zero_warn( - "When overriding `LightningModule` optimizer_step with" - " `Trainer(..., enable_pl_optimizer=False, ...)`," - " we won't be calling `.zero_grad` we can't assume when you call your `optimizer.step()`." - " For Lightning to take care of it, please use `Trainer(enable_pl_optimizer=True)`." - ) - going_to_accumulate_grad_batches = trainer.accumulation_scheduler.going_to_accumulate_grad_batches() has_overriden_optimization_functions = trainer.overriden_optimizer_step or trainer.overriden_optimizer_zero_grad @@ -94,13 +84,6 @@ def __verify_train_loop_configuration(self, model): ' It ensures optimizer_step or optimizer_zero_grad are called on every batch.' ) - if (enable_pl_optimizer) and trainer.overriden_optimizer_zero_grad and not automatic_optimization: - raise MisconfigurationException( - 'When overriding `LightningModule` optimizer_zero_grad' - ' and preserving model property `automatic_optimization` as True with' - ' `Trainer(enable_pl_optimizer=True, ...) is not supported' - ) - def __verify_eval_loop_configuration(self, model, eval_loop_name): step_name = f'{eval_loop_name}_step' diff --git a/pytorch_lightning/trainer/connectors/optimizer_connector.py b/pytorch_lightning/trainer/connectors/optimizer_connector.py index 8c352c8e5ffeb..8b23203e42bc3 100644 --- a/pytorch_lightning/trainer/connectors/optimizer_connector.py +++ b/pytorch_lightning/trainer/connectors/optimizer_connector.py @@ -20,7 +20,11 @@ def __init__(self, trainer): self.trainer = trainer def on_trainer_init(self, enable_pl_optimizer): - self.trainer._enable_pl_optimizer = enable_pl_optimizer + if enable_pl_optimizer is not None: + rank_zero_warn( + "Trainer argument `enable_pl_optimizer` is deprecated in v1.1.3. 
It will be removed in v1.3.0", + DeprecationWarning + ) self.trainer.lr_schedulers = [] self.trainer.optimizers = [] self.trainer.optimizer_frequencies = [] diff --git a/pytorch_lightning/trainer/connectors/precision_connector.py b/pytorch_lightning/trainer/connectors/precision_connector.py index 37d5315e5d11b..822c3ef634fdc 100644 --- a/pytorch_lightning/trainer/connectors/precision_connector.py +++ b/pytorch_lightning/trainer/connectors/precision_connector.py @@ -15,7 +15,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.plugins.apex import ApexPlugin from pytorch_lightning.plugins.native_amp import NativeAMPPlugin -from pytorch_lightning.utilities import APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, AMPType, rank_zero_warn +from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, rank_zero_warn class PrecisionConnector: @@ -67,7 +67,6 @@ def _setup_amp_backend(self, amp_type: str): self.trainer.amp_backend = AMPType.APEX self.backend = ApexPlugin(self.trainer) log.warn("LightningOptimizer doesn't support Apex") - self.trainer._enable_pl_optimizer = False if not self.trainer.amp_backend: raise ModuleNotFoundError( diff --git a/pytorch_lightning/trainer/optimizers.py b/pytorch_lightning/trainer/optimizers.py index 974ee898ff00b..a8cb1e279984f 100644 --- a/pytorch_lightning/trainer/optimizers.py +++ b/pytorch_lightning/trainer/optimizers.py @@ -13,6 +13,7 @@ # limitations under the License. from abc import ABC +from collections import OrderedDict from typing import List, Optional, Tuple import torch @@ -88,8 +89,10 @@ def _convert_to_lightning_optimizer(trainer, optimizer): optimizer._on_trainer_init(trainer) return optimizer - if self._enable_pl_optimizer: - self.optimizers = [_convert_to_lightning_optimizer(self, opt) for opt in self.optimizers] + self._lightning_optimizers = { + opt_idx: _convert_to_lightning_optimizer(self, opt) + for opt_idx, opt in enumerate(self.optimizers) + } def configure_schedulers(self, schedulers: list, monitor: Optional[str] = None): # Convert each scheduler into dict structure with relevant information diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 614c863fa7256..3fa2af79e5530 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -11,10 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import inspect -import os from abc import ABC from argparse import ArgumentParser, Namespace +import inspect +import os from typing import cast, List, Optional, Type, TypeVar, Union from pytorch_lightning.accelerators.accelerator import Accelerator @@ -59,6 +59,7 @@ class TrainerProperties(ABC): model_connector: ModelConnector checkpoint_connector: CheckpointConnector callbacks: List[Callback] + _lightning_optimizers = None @property def log_dir(self): @@ -258,16 +259,17 @@ def save_checkpoint(self, filepath, weights_only: bool = False): def get_model(self): return self.model_connector.get_model() + @property + def lightning_optimizers(self): + if self._lightning_optimizers is None: + self.convert_to_lightning_optimizers() + return self._lightning_optimizers + def __getstate__(self): - # unwrap optimizer - self.optimizers = [opt._optimizer if is_lightning_optimizer(opt) else opt for opt in self.optimizers] + # remove lightning_optimizers + self._lightning_optimizers = None return self.__dict__ - def __setstate__(self, d): - self.__dict__ = d - # wrap optimizers in enable_pl_optimzer is True - self.convert_to_lightning_optimizers() - @property def require_distributed_sampler(self): if self.accelerator_backend is not None: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 06717c6333829..2c1867a21552d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -134,7 +134,7 @@ def __init__( distributed_backend: Optional[str] = None, automatic_optimization: Optional[bool] = None, move_metrics_to_cpu: bool = False, - enable_pl_optimizer: bool = False, + enable_pl_optimizer: bool = None, # todo: remove in v1.3 ): r""" Customize every aspect of training via flags @@ -283,7 +283,8 @@ def __init__( enable_pl_optimizer: If True, each optimizer will be wrapped by `pytorch_lightning.core.optimizer.LightningOptimizer`. It allows Lightning to - handle AMP, TPU, accumulated_gradients, etc.. + handle AMP, TPU, accumulated_gradients, etc. + .. warning:: Currently deprecated and it will be removed in v1.3 """ super().__init__() self._device_type = DeviceType.CPU diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 6ae0fc9af8fbd..3c8a8d45d0411 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -499,7 +499,7 @@ def optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_ ' To request, please file a Github issue in PyTorch and tag @mcarilli') # wraps into LightingOptimizer only for running step - optimizer = LightningOptimizer.to_lightning_optimizer(optimizer, self.trainer) + optimizer = LightningOptimizer._to_lightning_optimizer(optimizer, self.trainer, opt_idx) # model hook model_ref.optimizer_step( diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index 070bb4e9f6989..53debcebeb7cd 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from unittest import mock -from unittest.mock import ANY, MagicMock, call +from unittest.mock import ANY, call, MagicMock from pytorch_lightning import Trainer from tests.base import BoringModel @@ -33,8 +33,6 @@ def test_trainer_callback_system(torch_save): limit_train_batches=3, limit_test_batches=2, progress_bar_refresh_rate=0, - # todo: enabled since internally we wrap the model for optimizer step, this should be fixed - enable_pl_optimizer=True ) # no call yet diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 8d4a859a88784..3de26ef1a6fb6 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -636,8 +636,7 @@ def validation_epoch_end(self, outputs): @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_checkpoint_repeated_strategy(enable_pl_optimizer, tmpdir): +def test_checkpoint_repeated_strategy(tmpdir): """ This test validates that the checkpoint can be called when provided to callbacks list """ @@ -657,7 +656,6 @@ def validation_step(self, batch, batch_idx): limit_val_batches=2, limit_test_batches=2, callbacks=[checkpoint_callback], - enable_pl_optimizer=enable_pl_optimizer, weights_summary=None, progress_bar_refresh_rate=0, ) @@ -674,7 +672,6 @@ def validation_step(self, batch, batch_idx): limit_val_batches=2, limit_test_batches=2, resume_from_checkpoint=checkpoint_callback.best_model_path, - enable_pl_optimizer=enable_pl_optimizer, weights_summary=None, progress_bar_refresh_rate=0, ) @@ -685,8 +682,7 @@ def validation_step(self, batch, batch_idx): @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_checkpoint_repeated_strategy_extended(enable_pl_optimizer, tmpdir): +def test_checkpoint_repeated_strategy_extended(tmpdir): """ This test validates checkpoint can be called several times without increasing internally its global step if nothing run. 
@@ -731,7 +727,6 @@ def assert_checkpoint_log_dir(idx): limit_train_batches=limit_train_batches, limit_val_batches=3, limit_test_batches=4, - enable_pl_optimizer=enable_pl_optimizer, callbacks=[checkpoint_cb], ) trainer = pl.Trainer(**trainer_config) diff --git a/tests/checkpointing/test_torch_saving.py b/tests/checkpointing/test_torch_saving.py index 493aa0dabe126..b322cfe5a7fd3 100644 --- a/tests/checkpointing/test_torch_saving.py +++ b/tests/checkpointing/test_torch_saving.py @@ -22,15 +22,13 @@ from tests.base import BoringModel -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_model_torch_save(tmpdir, enable_pl_optimizer): +def test_model_torch_save(tmpdir): """Test to ensure torch save does not fail for model and trainer.""" model = BoringModel() num_epochs = 1 trainer = Trainer( default_root_dir=tmpdir, max_epochs=num_epochs, - enable_pl_optimizer=enable_pl_optimizer, ) temp_path = os.path.join(tmpdir, 'temp.pt') trainer.fit(model) @@ -39,8 +37,6 @@ def test_model_torch_save(tmpdir, enable_pl_optimizer): torch.save(trainer.model, temp_path) torch.save(trainer, temp_path) trainer = torch.load(temp_path) - is_lightning_optimizer = isinstance(trainer.optimizers[0], LightningOptimizer) - assert is_lightning_optimizer if enable_pl_optimizer else not is_lightning_optimizer @pytest.mark.skipif(platform.system() == "Windows", diff --git a/tests/core/test_lightning_module.py b/tests/core/test_lightning_module.py index 01319365d9051..64b68245ba66e 100644 --- a/tests/core/test_lightning_module.py +++ b/tests/core/test_lightning_module.py @@ -45,8 +45,7 @@ def optimizer_step(self, *_, **__): assert "It ensures optimizer_step or optimizer_zero_grad are called on every batch" in str(e) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_automatic_optimization_num_calls(enable_pl_optimizer, tmpdir): +def test_automatic_optimization_num_calls(tmpdir): with patch("torch.optim.SGD.step") as sgd_step, \ patch("torch.optim.SGD.zero_grad") as sgd_zero_grad, \ @@ -90,7 +89,6 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, default_root_dir=tmpdir, limit_train_batches=8, accumulate_grad_batches=1, - enable_pl_optimizer=enable_pl_optimizer ) trainer.fit(model) @@ -101,8 +99,7 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, assert adam_zero_grad.call_count == 2 -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_params_groups_and_state_are_accessible(enable_pl_optimizer, tmpdir): +def test_params_groups_and_state_are_accessible(tmpdir): with patch("torch.optim.SGD.step") as sgd_step, \ patch("torch.optim.SGD.zero_grad") as sgd_zero_grad, \ @@ -139,7 +136,6 @@ def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, clos default_root_dir=tmpdir, limit_train_batches=8, accumulate_grad_batches=1, - enable_pl_optimizer=enable_pl_optimizer ) trainer.fit(model) diff --git a/tests/core/test_lightning_optimizer.py b/tests/core/test_lightning_optimizer.py index 530f20f86a3db..171cf00ad4f66 100644 --- a/tests/core/test_lightning_optimizer.py +++ b/tests/core/test_lightning_optimizer.py @@ -47,13 +47,12 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) groups = "{'dampening': 0, 'initial_lr': 0.1, 'lr': 0.01, 'momentum': 0, 'nesterov': False, 'weight_decay': 0}" expected = f"LightningSGD(groups=[{groups}])" - assert trainer.optimizers[0].__repr__() == expected + assert 
trainer._lightning_optimizers[0].__repr__() == expected def test_lightning_optimizer_from_user(tmpdir): @@ -75,13 +74,12 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) groups = "{'amsgrad': False, 'betas': (0.9, 0.999), 'eps': 1e-08, 'initial_lr': 0.1, 'lr': 0.01, 'weight_decay': 0}" expected = f"LightningAdam(groups=[{groups}])" - assert trainer.optimizers[0].__repr__() == expected + assert trainer._lightning_optimizers[0].__repr__() == expected @patch("torch.optim.Adam.step", autospec=True) @@ -129,7 +127,6 @@ def automatic_optimization(self) -> bool: limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -183,7 +180,6 @@ def automatic_optimization(self) -> bool: max_epochs=1, weights_summary=None, accumulate_grad_batches=2, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -263,7 +259,6 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -316,7 +311,6 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -376,7 +370,6 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -429,7 +422,6 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) diff --git a/tests/deprecated_api/test_remove_1-3.py b/tests/deprecated_api/test_remove_1-3.py index 7ec69796b1e46..4a5bed4de9b55 100644 --- a/tests/deprecated_api/test_remove_1-3.py +++ b/tests/deprecated_api/test_remove_1-3.py @@ -135,3 +135,8 @@ def test_trainer_cli_profiler_remove_in_v1_3_0(cli_args, expected_parsed_arg, ex assert getattr(args, "profiler") == expected_parsed_arg trainer = Trainer.from_argparse_args(args) assert isinstance(trainer.profiler, expected_profiler) + + +def test_trainer_enable_pl_optimizer(tmpdir): + with pytest.deprecated_call(match='will be removed in v1.3'): + Trainer(enable_pl_optimizer=True) diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 269a2069e4266..60da3ba55eba4 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -17,14 +17,14 @@ import pytest import torch -import tests.base.develop_pipelines as tpipes -import tests.base.develop_utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.loggers import WandbLogger from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import APEX_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate +import tests.base.develop_pipelines as tpipes +import tests.base.develop_utils as tutils @pytest.mark.skip(reason='dp + amp not supported currently') # TODO @@ -145,8 +145,7 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir): assert trainer.slurm_connector.resolve_root_node_address('abc[23-24, 45-40, 40]') == 'abc23' -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_cpu_model_with_amp(enable_pl_optimizer, tmpdir): +def test_cpu_model_with_amp(tmpdir): """Make sure model trains on CPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -155,7 +154,6 @@ def test_cpu_model_with_amp(enable_pl_optimizer, tmpdir): limit_train_batches=0.4, limit_val_batches=0.4, precision=16, - 
enable_pl_optimizer=enable_pl_optimizer, ) model = EvalModelTemplate() diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 2848ab2e74f3c..8fea2ab941418 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -11,23 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from distutils.version import LooseVersion import os import platform -from distutils.version import LooseVersion import pytest import torch -import tests.base.develop_pipelines as tpipes -import tests.base.develop_utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from pytorch_lightning.core.step_result import TrainResult from tests.base import EvalModelTemplate +import tests.base.develop_pipelines as tpipes +import tests.base.develop_utils as tutils -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): +def test_cpu_slurm_save_load(tmpdir): """Verify model save/load/checkpoint on CPU.""" hparams = EvalModelTemplate.get_default_hparams() model = EvalModelTemplate(**hparams) @@ -44,7 +43,6 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): limit_train_batches=0.2, limit_val_batches=0.2, callbacks=[ModelCheckpoint(dirpath=tmpdir)], - enable_pl_optimizer=enable_pl_optimizer, ) result = trainer.fit(model) real_global_step = trainer.global_step @@ -81,7 +79,6 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): max_epochs=1, logger=logger, callbacks=[ModelCheckpoint(dirpath=tmpdir)], - enable_pl_optimizer=enable_pl_optimizer, ) model = EvalModelTemplate(**hparams) @@ -101,8 +98,7 @@ def assert_pred_same(): trainer.fit(model) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_early_stopping_cpu_model(enable_pl_optimizer, tmpdir): +def test_early_stopping_cpu_model(tmpdir): """Test each of the trainer options.""" stopping = EarlyStopping(monitor='early_stop_on', min_delta=0.1) trainer_options = dict( @@ -114,7 +110,6 @@ def test_early_stopping_cpu_model(enable_pl_optimizer, tmpdir): track_grad_norm=2, limit_train_batches=0.1, limit_val_batches=0.1, - enable_pl_optimizer=enable_pl_optimizer, ) model = EvalModelTemplate() @@ -130,8 +125,7 @@ def test_early_stopping_cpu_model(enable_pl_optimizer, tmpdir): @pytest.mark.skipif((platform.system() == "Darwin" and LooseVersion(torch.__version__) < LooseVersion("1.3.0")), reason="Distributed training is not supported on MacOS before Torch 1.3.0") -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_multi_cpu_model_ddp(enable_pl_optimizer, tmpdir): +def test_multi_cpu_model_ddp(tmpdir): """Make sure DDP works.""" tutils.set_random_master_port() @@ -144,7 +138,6 @@ def test_multi_cpu_model_ddp(enable_pl_optimizer, tmpdir): gpus=None, num_processes=2, accelerator='ddp_cpu', - enable_pl_optimizer=enable_pl_optimizer, ) model = EvalModelTemplate() @@ -284,8 +277,7 @@ def test_cpu_model(tmpdir): tpipes.run_model_test(trainer_options, model, on_gpu=False) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_all_features_cpu_model(enable_pl_optimizer, tmpdir): +def test_all_features_cpu_model(tmpdir): """Test each of the trainer options.""" trainer_options = dict( default_root_dir=tmpdir, @@ -297,7 +289,6 @@ def test_all_features_cpu_model(enable_pl_optimizer, tmpdir): max_epochs=1, limit_train_batches=0.4, 
limit_val_batches=0.4, - enable_pl_optimizer=enable_pl_optimizer, ) model = EvalModelTemplate() diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index f47c13021edde..a047bfde6f7a2 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -20,18 +20,18 @@ import numpy as np import pytest -import torch from sklearn.metrics import accuracy_score +import torch -import tests.base.develop_pipelines as tpipes -import tests.base.develop_utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator from pytorch_lightning.core.step_result import EvalResult, Result, TrainResult from pytorch_lightning.metrics.classification.accuracy import Accuracy -from pytorch_lightning.utilities import APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, HOROVOD_AVAILABLE, _module_available +from pytorch_lightning.utilities import _module_available, APEX_AVAILABLE, HOROVOD_AVAILABLE, NATIVE_AMP_AVAILABLE from tests.base import EvalModelTemplate from tests.base.boring_model import BoringModel +import tests.base.develop_pipelines as tpipes +import tests.base.develop_utils as tutils from tests.base.models import BasicGAN if HOROVOD_AVAILABLE: @@ -69,8 +69,7 @@ def _run_horovod(trainer_options, on_gpu=False): @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_horovod_cpu(enable_pl_optimizer, tmpdir): +def test_horovod_cpu(tmpdir): """Test Horovod running multi-process on CPU.""" trainer_options = dict( default_root_dir=str(tmpdir), @@ -82,14 +81,12 @@ def test_horovod_cpu(enable_pl_optimizer, tmpdir): limit_val_batches=0.2, accelerator='horovod', deterministic=True, - enable_pl_optimizer=enable_pl_optimizer, ) _run_horovod(trainer_options) @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_horovod_cpu_implicit(enable_pl_optimizer, tmpdir): +def test_horovod_cpu_implicit(tmpdir): """Test Horovod without specifying a backend, inferring from env set by `horovodrun`.""" trainer_options = dict( default_root_dir=str(tmpdir), @@ -100,7 +97,6 @@ def test_horovod_cpu_implicit(enable_pl_optimizer, tmpdir): limit_train_batches=0.4, limit_val_batches=0.2, deterministic=True, - enable_pl_optimizer=enable_pl_optimizer, ) _run_horovod(trainer_options) @@ -206,8 +202,7 @@ def validation_step(self, batch, *args, **kwargs): @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_horovod_multi_optimizer(enable_pl_optimizer, tmpdir): +def test_horovod_multi_optimizer(tmpdir): model = BasicGAN(**EvalModelTemplate.get_default_hparams()) # fit model @@ -219,7 +214,6 @@ def test_horovod_multi_optimizer(enable_pl_optimizer, tmpdir): limit_val_batches=0.2, deterministic=True, accelerator='horovod', - enable_pl_optimizer=enable_pl_optimizer, ) result = trainer.fit(model) assert result == 1, 'model failed to complete' @@ -241,8 +235,7 @@ def get_optimizer_params(optimizer): @pytest.mark.skipif(not HOROVOD_AVAILABLE, reason="Horovod is unavailable") @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_result_reduce_horovod(enable_pl_optimizer, tmpdir): +def 
test_result_reduce_horovod(tmpdir): """Make sure result logging works with Horovod. This test mirrors tests/core/test_results.py::_ddp_test_fn @@ -282,7 +275,6 @@ def training_epoch_end(self, outputs) -> None: max_epochs=1, log_every_n_steps=1, weights_summary=None, - enable_pl_optimizer=enable_pl_optimizer, ) trainer.fit(model) diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index f7773f63aa8c2..ded9deb0d0a45 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -11,11 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from copy import deepcopy import glob import logging as log import os import pickle -from copy import deepcopy import cloudpickle import pytest @@ -23,10 +23,10 @@ from torch.nn import functional as F from torch.utils.data import DataLoader +from pytorch_lightning import Callback, LightningModule, seed_everything, Trainer +from pytorch_lightning.callbacks import ModelCheckpoint import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils -from pytorch_lightning import Callback, LightningModule, Trainer, seed_everything -from pytorch_lightning.callbacks import ModelCheckpoint from tests.base import BoringModel, EvalModelTemplate, GenericEvalModelTemplate, TrialMNIST @@ -52,8 +52,7 @@ def on_train_end(self, trainer, pl_module): self._check_properties(trainer, pl_module) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_model_properties_resume_from_checkpoint(enable_pl_optimizer, tmpdir): +def test_model_properties_resume_from_checkpoint(tmpdir): """ Test that properties like `current_epoch` and `global_step` in model and trainer are always the same. """ model = EvalModelTemplate() @@ -62,7 +61,6 @@ def test_model_properties_resume_from_checkpoint(enable_pl_optimizer, tmpdir): default_root_dir=tmpdir, max_epochs=1, logger=False, - enable_pl_optimizer=enable_pl_optimizer, callbacks=[checkpoint_callback, ModelTrainerPropertyParity()], # this performs the assertions ) trainer = Trainer(**trainer_args) @@ -99,8 +97,7 @@ def on_train_start(self, trainer, pl_module): self.callbacks = deepcopy(trainer.callbacks) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_callbacks_state_resume_from_checkpoint(enable_pl_optimizer, tmpdir): +def test_callbacks_state_resume_from_checkpoint(tmpdir): """ Test that resuming from a checkpoint restores callbacks that persist state. """ model = EvalModelTemplate() callback_capture = CaptureCallbacksBeforeTraining() @@ -111,7 +108,6 @@ def get_trainer_args(): default_root_dir=tmpdir, max_steps=1, logger=False, - enable_pl_optimizer=enable_pl_optimizer, callbacks=[ checkpoint, callback_capture, @@ -138,11 +134,10 @@ def get_trainer_args(): assert before.best_model_score == after.best_model_score -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_callbacks_references_resume_from_checkpoint(enable_pl_optimizer, tmpdir): +def test_callbacks_references_resume_from_checkpoint(tmpdir): """ Test that resuming from a checkpoint sets references as expected. 
""" model = EvalModelTemplate() - args = {'default_root_dir': tmpdir, 'max_steps': 1, 'logger': False, "enable_pl_optimizer": enable_pl_optimizer} + args = {'default_root_dir': tmpdir, 'max_steps': 1, 'logger': False} # initial training checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 33d14e852b285..f0d7c6d96914e 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -21,7 +21,7 @@ import torch.distributed as torch_distrib import torch.nn.functional as F -from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.utilities import APEX_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base.boring_model import BoringModel @@ -454,7 +454,6 @@ def test_manual_optimization_and_return_tensor(tmpdir): amp_backend='native', accelerator="ddp_spawn", gpus=2, - enable_pl_optimizer=True ) trainer.fit(model) @@ -573,7 +572,6 @@ def automatic_optimization(self) -> bool: amp_backend='native', accumulate_grad_batches=4, gpus=1, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -648,7 +646,6 @@ def automatic_optimization(self) -> bool: precision=16, amp_backend='native', gpus=1, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -730,7 +727,6 @@ def automatic_optimization(self) -> bool: limit_val_batches=2, max_epochs=1, log_every_n_steps=1, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -795,7 +791,6 @@ def automatic_optimization(self) -> bool: max_epochs=1, log_every_n_steps=1, accumulate_grad_batches=2, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -851,7 +846,6 @@ def automatic_optimization(self) -> bool: max_epochs=1, log_every_n_steps=1, accumulate_grad_batches=2, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -929,7 +923,6 @@ def automatic_optimization(self) -> bool: max_epochs=1, log_every_n_steps=1, accumulate_grad_batches=2, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -1037,7 +1030,6 @@ def automatic_optimization(self) -> bool: max_epochs=1, log_every_n_steps=1, accumulate_grad_batches=2, - enable_pl_optimizer=True, gpus=2, accelerator="ddp", ) @@ -1048,35 +1040,3 @@ def automatic_optimization(self) -> bool: expected_calls = [call(closure=ANY, optim='adam')] * 2 mock_adam_step.assert_has_calls(expected_calls) - - -def test_step_with_misconfiguraiton_error_when_overriding_optimizer_zero_grad(tmpdir): - """ - Tests that `optimizer_zero_grad` in manual_optimization triggers a MisconfigurationException - """ - try: - class TestModel(BoringModel): - - def optimizer_zero_grad(self, *_): - pass - - @property - def automatic_optimization(self) -> bool: - return False - - model = TestModel() - model.val_dataloader = None - model.training_epoch_end = None - - limit_train_batches = 8 - trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=limit_train_batches, - limit_val_batches=2, - max_epochs=1, - log_every_n_steps=1, - accumulate_grad_batches=2, - enable_pl_optimizer=True, - ) - except MisconfigurationException as e: - assert "`Trainer(enable_pl_optimizer=True, ...) 
is not supported" in str(e) diff --git a/tests/trainer/optimization/test_parity_automatic_optimization.py b/tests/trainer/optimization/test_parity_automatic_optimization.py index 4a1d6c384cd52..4f5cc855a3164 100644 --- a/tests/trainer/optimization/test_parity_automatic_optimization.py +++ b/tests/trainer/optimization/test_parity_automatic_optimization.py @@ -21,7 +21,9 @@ import torch from torch.optim import Optimizer +import pytorch_lightning as pl from pytorch_lightning import seed_everything, Trainer +from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.optimizer import LightningOptimizer from tests.base.boring_model import BoringModel @@ -35,7 +37,13 @@ class BaseParityAutomaticOptimizationModel(BoringModel): - def __init__(self, optimizer_cls, optimizer_is_mocked=False, accumulate_grad_batches=None): + def __init__( + self, + optimizer_cls, + optimizer_is_mocked=False, + accumulate_grad_batches=None, + lr=0.1 + ): super().__init__() self.optimizer_cls = optimizer_cls self.losses = [] @@ -44,6 +52,7 @@ def __init__(self, optimizer_cls, optimizer_is_mocked=False, accumulate_grad_bat self.optimizer_is_mocked = optimizer_is_mocked self.grad_checked = False self.accumulate_grad_batches = accumulate_grad_batches + self.lr = lr def on_before_zero_grad(self, optimizer): self.on_before_zero_grad_count += 1 @@ -51,7 +60,7 @@ def on_before_zero_grad(self, optimizer): self.grads.append(self.layer.weight.grad.clone()) def configure_optimizers(self): - optimizer = self.optimizer_cls(self.layer.parameters(), lr=0.1) + optimizer = self.optimizer_cls(self.layer.parameters(), lr=self.lr) assert isinstance(optimizer, Optimizer) return optimizer @@ -86,7 +95,7 @@ def optimizer_step( Override the optimizer step to define manual optimizer steps, as we use LightningOptimizer wrapper as standard """ # Get the unwrapped optimizer - optimizer = optimizer._optimizer + optimizer = optimizer.optimizer assert not isinstance(optimizer, LightningOptimizer) optimizer_closure() @@ -136,7 +145,7 @@ def optimizer_step( Override the optimizer step to define manual optimizer steps, as we use LightningOptimizer wrapper as standard """ # Get the unwrapped optimizer - optimizer = optimizer._optimizer + optimizer = optimizer.optimizer assert not isinstance(optimizer, LightningOptimizer) optimizer_closure() @@ -211,10 +220,8 @@ def test_lightning_optimizer_and_no_lightning_optimizer_equality_check_optim_cal with patch("torch.optim.SGD.step") as mock_sgd_step, \ patch("torch.optim.Adam.step") as mock_adam_step, \ - patch("torch.optim.AdamW.step") as mock_adamw_step, \ patch("torch.optim.SGD.zero_grad") as mock_sgd_zero_grad, \ - patch("torch.optim.Adam.zero_grad") as mock_adam_zero_grad, \ - patch("torch.optim.AdamW.zero_grad") as mock_adamw_zero_grad: + patch("torch.optim.Adam.zero_grad") as mock_adam_zero_grad: max_epochs = 2 limit_train_batches = 10 @@ -238,8 +245,62 @@ def test_lightning_optimizer_and_no_lightning_optimizer_equality_check_optim_cal assert mock_sgd_zero_grad.call_count == (expected_num_batches // accumulate_grad_batches) assert mock_sgd_step.call_count == mock_adam_step.call_count assert mock_sgd_step.call_count == mock_adam_step.call_count - assert mock_sgd_zero_grad.call_count == mock_adam_zero_grad.call_count - assert mock_sgd_zero_grad.call_count == mock_adamw_zero_grad.call_count + + +def train_with_restore(tmpdir, model_cls, restore_from=None): + # init model + if restore_from is not None: + seed_everything(42) + model = model_cls(torch.optim.Adam, 
accumulate_grad_batches=1, lr=10e-1) + ckpt_saver = ModelCheckpoint(dirpath=f"{tmpdir}/mckpt", save_last=True, save_weights_only=False) + # Initialize a trainer + trainer = pl.Trainer( + default_root_dir=tmpdir, + max_epochs=(1 + bool(restore_from)), + limit_train_batches=8, + callbacks=([ckpt_saver] if restore_from is None else []), + checkpoint_callback=(not restore_from), + resume_from_checkpoint=restore_from, + num_sanity_val_steps=0, + ) + + # Train the model + trainer.fit(model) + return ckpt_saver.best_model_path, model + + +def test_parity_checkpointing(tmpdir): + """ + This test asserts that reloading a checkpoint and fine-tuning gives the same result + with / without LightningOptimizer + """ + + # Initial train run of the model. + seed_everything(0) + ckpt_path, first_epoch_pl_optimizer_model = train_with_restore( + tmpdir, + model_cls=BaseParityAutomaticOptimizationModel, + restore_from=None) + + assert "last" in ckpt_path + _, second_epoch_pl_optimizer_model = train_with_restore( + tmpdir, + model_cls=BaseParityAutomaticOptimizationModel, + restore_from=ckpt_path) + + seed_everything(0) + ckpt_path, first_epoch_pure_pytorch_optimizer_model = train_with_restore( + tmpdir, + model_cls=AutomaticOptimizationPurePytorchOptimizerModel, + restore_from=None) + + _, second_epoch_pure_pytorch_optimizer_model = train_with_restore( + tmpdir, + model_cls=AutomaticOptimizationPurePytorchOptimizerModel, + restore_from=ckpt_path) + + assert first_epoch_pl_optimizer_model.losses == first_epoch_pure_pytorch_optimizer_model.losses + assert second_epoch_pl_optimizer_model.losses == second_epoch_pure_pytorch_optimizer_model.losses def run_lightning_optimizer_equality( @@ -261,22 +322,12 @@ def run_lightning_optimizer_equality( torch.optim.SGD, expected_num_batches=expected_num_batches, optimizer_is_mocked=optimizer_is_mocked, - enable_pl_optimizer=True, - **trainer_kwargs, - ) - - no_pl_optimizer_initial_model_weights, no_pl_optimizer_model = train_specific_optimizer_model( - lightning_model_cls, - torch.optim.Adam if optimizer_is_mocked else torch.optim.SGD, - expected_num_batches=expected_num_batches, - optimizer_is_mocked=optimizer_is_mocked, - enable_pl_optimizer=False, # Disable pl optimizer - **trainer_kwargs, ) pure_pytorch_optimizer_initial_model_weights, pure_pytorch_optimizer_model = train_specific_optimizer_model( vanilla_model_cls, - torch.optim.AdamW if optimizer_is_mocked else torch.optim.SGD, + torch.optim.Adam if optimizer_is_mocked else torch.optim.SGD, expected_num_batches=expected_num_batches, optimizer_is_mocked=optimizer_is_mocked, replace_optimizer_step_with_pure_pytorch=True, @@ -288,8 +339,6 @@ def run_lightning_optimizer_equality( assert_model_equality( pl_optimizer_initial_model_weights=pl_optimizer_initial_model_weights, pl_optimizer_model=pl_optimizer_model, - no_pl_optimizer_initial_model_weights=no_pl_optimizer_initial_model_weights, - no_pl_optimizer_model=no_pl_optimizer_model, pure_pytorch_optimizer_initial_model_weights=pure_pytorch_optimizer_initial_model_weights, pure_pytorch_optimizer_model=pure_pytorch_optimizer_model, expected_num_batches=expected_num_batches, @@ -300,35 +349,24 @@ def assert_model_equality( pl_optimizer_initial_model_weights, pl_optimizer_model, - no_pl_optimizer_initial_model_weights, - no_pl_optimizer_model, pure_pytorch_optimizer_initial_model_weights, pure_pytorch_optimizer_model, expected_num_batches, precision, ): - assert torch.equal(pl_optimizer_initial_model_weights, 
no_pl_optimizer_initial_model_weights) assert torch.equal(pl_optimizer_initial_model_weights, pure_pytorch_optimizer_initial_model_weights) assert len(pl_optimizer_model.losses) == expected_num_batches assert pure_pytorch_optimizer_model.grad_checked - assert pure_pytorch_optimizer_model.losses == no_pl_optimizer_model.losses - assert not torch.isnan(torch.FloatTensor(no_pl_optimizer_model.losses)).any() - - assert torch.equal(torch.FloatTensor(no_pl_optimizer_model.losses), torch.FloatTensor(pl_optimizer_model.losses)) - assert no_pl_optimizer_model.on_before_zero_grad_count == pl_optimizer_model.on_before_zero_grad_count + assert not torch.isnan(torch.FloatTensor(pl_optimizer_model.losses)).any() - for pytorch_grad, no_pl_optim_grad, pl_optim_grad in zip(pure_pytorch_optimizer_model.grads, - no_pl_optimizer_model.grads, - pl_optimizer_model.grads): - assert torch.equal(no_pl_optim_grad, pl_optim_grad), 'Grad parameters are different' - assert torch.equal(pytorch_grad, no_pl_optim_grad), 'Grad parameters are different' + for pytorch_grad, pl_optim_grad in zip(pure_pytorch_optimizer_model.grads, + pl_optimizer_model.grads): + assert torch.equal(pytorch_grad, pl_optim_grad), 'Grad parameters are different' - for pytorch_weight, no_pl_optim_weight, pl_optim_weight in zip(pure_pytorch_optimizer_model.parameters(), - no_pl_optimizer_model.parameters(), - pl_optimizer_model.parameters()): - assert torch.equal(no_pl_optim_weight, pl_optim_weight), 'Model parameters are different' - assert torch.equal(pytorch_weight, no_pl_optim_weight), 'Model parameters are different' + for pytorch_weight, pl_optim_weight in zip(pure_pytorch_optimizer_model.parameters(), + pl_optimizer_model.parameters()): + assert torch.equal(pytorch_weight, pl_optim_weight), 'Model parameters are different' # train function @@ -336,7 +374,6 @@ def train_specific_optimizer_model( model_cls, optimizer_cls, expected_num_batches, - enable_pl_optimizer=False, optimizer_is_mocked=False, replace_optimizer_step_with_pure_pytorch=False, **trainer_kwargs, @@ -362,7 +399,6 @@ def train_specific_optimizer_model( model.training_epoch_end = None trainer = Trainer( - enable_pl_optimizer=enable_pl_optimizer, **trainer_kwargs ) trainer.fit(model) diff --git a/tests/trainer/optimization/test_parity_manual_optimization.py b/tests/trainer/optimization/test_parity_manual_optimization.py index 5d110b2fbdca7..08e4e9908f592 100644 --- a/tests/trainer/optimization/test_parity_manual_optimization.py +++ b/tests/trainer/optimization/test_parity_manual_optimization.py @@ -76,7 +76,7 @@ def training_step(self, batch, batch_idx): class ManualOptimizationPurePytorchOptimizerModel(BaseParityManualOptimizationModel): def training_step(self, batch, batch_idx): - optimizer = self.optimizers() + optimizer = self.optimizers(use_pl_optimizer=False) output = self.layer(batch) loss = self.loss(batch, output) self.losses.append(loss.detach().item()) @@ -104,7 +104,7 @@ def __init__(self, *args, **kwargs): self.scaler = torch.cuda.amp.GradScaler() def training_step(self, batch, batch_idx): - optimizer = self.optimizers() + optimizer = self.optimizers(use_pl_optimizer=False) with torch.cuda.amp.autocast(): output = self.layer(batch) loss = self.loss(batch, output) @@ -178,10 +178,8 @@ def test_lightning_optimizer_and_no_lightning_optimizer_equality_check_optim_cal with patch("torch.optim.SGD.step") as mock_sgd_step, \ patch("torch.optim.Adam.step") as mock_adam_step, \ - patch("torch.optim.AdamW.step") as mock_adamw_step, \ patch("torch.optim.SGD.zero_grad") as 
mock_sgd_zero_grad, \ - patch("torch.optim.Adam.zero_grad") as mock_adam_zero_grad, \ - patch("torch.optim.AdamW.zero_grad") as mock_adamw_zero_grad: + patch("torch.optim.Adam.zero_grad") as mock_adam_zero_grad: max_epochs = 2 limit_train_batches = 10 @@ -206,6 +204,4 @@ def test_lightning_optimizer_and_no_lightning_optimizer_equality_check_optim_cal assert mock_sgd_step.call_count == (expected_num_batches // accumulate_grad_batches) assert mock_sgd_zero_grad.call_count == (expected_num_batches // accumulate_grad_batches) assert mock_sgd_step.call_count == mock_adam_step.call_count - assert mock_sgd_step.call_count == mock_adam_step.call_count assert mock_sgd_zero_grad.call_count == mock_adam_zero_grad.call_count - assert mock_sgd_zero_grad.call_count == mock_adamw_zero_grad.call_count diff --git a/tests/trainer/test_optimizers.py b/tests/trainer/test_optimizers.py index e9a422dfb4711..2d2ebd6a6a2dd 100644 --- a/tests/trainer/test_optimizers.py +++ b/tests/trainer/test_optimizers.py @@ -180,9 +180,8 @@ def test_reducelronplateau_scheduling(tmpdir): ), 'lr scheduler was not correctly converted to dict' -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_optimizer_return_options(enable_pl_optimizer): - trainer = Trainer(enable_pl_optimizer=enable_pl_optimizer) +def test_optimizer_return_options(): + trainer = Trainer() model = EvalModelTemplate() # single optimizer diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 9e5ceccf9b646..8b66e7141957e 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -11,21 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from argparse import Namespace +from copy import deepcopy import math import os +from pathlib import Path import pickle import sys -from argparse import Namespace -from copy import deepcopy -from pathlib import Path from unittest.mock import ANY, call, patch import cloudpickle +from omegaconf import OmegaConf import pytest import torch -from omegaconf import OmegaConf -import tests.base.develop_utils as tutils from pytorch_lightning import Callback, LightningModule, Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from pytorch_lightning.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml, save_hparams_to_tags_csv @@ -37,6 +36,7 @@ from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import BoringModel, EvalModelTemplate +import tests.base.develop_utils as tutils @pytest.mark.parametrize("url_ckpt", [True, False]) @@ -496,16 +496,13 @@ def test_model_checkpoint_only_weights(tmpdir): def test_model_freeze_unfreeze(): - model = EvalModelTemplate() - model.freeze() model.unfreeze() -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) @pytest.mark.parametrize("url_ckpt", [True, False]) -def test_resume_from_checkpoint_epoch_restored(monkeypatch, tmpdir, tmpdir_server, url_ckpt, enable_pl_optimizer): +def test_resume_from_checkpoint_epoch_restored(monkeypatch, tmpdir, tmpdir_server, url_ckpt): """Verify resuming from checkpoint runs the right number of epochs""" # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir monkeypatch.setenv("TORCH_HOME", tmpdir) @@ -533,7 +530,6 @@ def on_load_checkpoint(self, _): callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor='early_stop_on', save_top_k=-1)], default_root_dir=tmpdir, val_check_interval=1.0, - enable_pl_optimizer=enable_pl_optimizer, progress_bar_refresh_rate=0, logger=False, weights_summary=None,
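
For context, the core user-facing change in this diff is the new `use_pl_optimizer` argument on `LightningModule.optimizers()` together with the `optimizer` property on `LightningOptimizer`. Below is a minimal sketch (not part of the diff) of how that signature can be used in a manual-optimization `training_step`; the model class, layer sizes, and data shapes are hypothetical illustrations, not code from this PR.

```python
import torch
import pytorch_lightning as pl


class SketchManualModel(pl.LightningModule):
    """Hypothetical module illustrating `optimizers(use_pl_optimizer=...)`."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    @property
    def automatic_optimization(self) -> bool:
        # opt into manual optimization, as the tests in this diff do
        return False

    def training_step(self, batch, batch_idx):
        # default (use_pl_optimizer=True): returns a LightningOptimizer wrapper,
        # whose `step` takes care of AMP, TPU, accumulate_grad_batches, etc.
        opt = self.optimizers(use_pl_optimizer=True)

        # the raw torch.optim optimizer stays reachable through the new
        # `optimizer` property, or via `self.optimizers(use_pl_optimizer=False)`
        raw_opt = opt.optimizer

        loss = self.layer(batch).sum()
        self.manual_backward(loss, opt)
        opt.step()

        # zero gradients on the underlying torch optimizer
        raw_opt.zero_grad()
        return loss

    def configure_optimizers(self):
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)
```

With this in place, `Trainer(enable_pl_optimizer=...)` is no longer needed: the wrappers are created lazily through the new `trainer.lightning_optimizers` property, and passing the flag only emits the deprecation warning added above.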