diff --git a/.github/workflows/probot-auto-cc.yml b/.github/workflows/probot-auto-cc.yml
index 0595c4eee65f7f..a6d75ad175eec3 100644
--- a/.github/workflows/probot-auto-cc.yml
+++ b/.github/workflows/probot-auto-cc.yml
@@ -2,16 +2,14 @@ name: Probot
 on:
   issues:
-    types:
-      - labeled
+    types: [labeled]
   pull_request:
-    types:
-      - labeled
+    types: [labeled, ready_for_review]
 
 jobs:
   auto-cc:
-    if: ${{ github.repository_owner == 'PyTorchLightning' }}
     runs-on: ubuntu-latest
+    if: github.event_name == 'issues' || github.event.pull_request.draft == false
     steps:
       - uses: carmocca/probot@v1
         env:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 52a5e15b217565..9c8b4714fa4635 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -31,10 +31,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - The `monitor` argument in the `EarlyStopping` callback is no longer optional ([#10328](https://github.com/PyTorchLightning/pytorch-lightning/pull/10328))
 
 
-- Moved `precision_plugin` into `Training_type_plugin` and updated reference ([#10570](https://github.com/PyTorchLightning/pytorch-lightning/pull/10570))
+- Do not fail if batch size could not be inferred for logging when using DeepSpeed ([#10438](https://github.com/PyTorchLightning/pytorch-lightning/issues/10438))
 
 
--
+- Raise `MisconfigurationException` when `enable_progress_bar=False` and a progress bar instance has been passed in the callback list ([#10520](https://github.com/PyTorchLightning/pytorch-lightning/issues/10520))
+
+
+- Moved `precision_plugin` into `Training_type_plugin` and updated reference ([#10570](https://github.com/PyTorchLightning/pytorch-lightning/pull/10570))
 
 -
@@ -127,9 +130,15 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Removed deprecated `Trainer.train_loop` property in favor of `Trainer.fit_loop` ([#10482](https://github.com/PyTorchLightning/pytorch-lightning/pull/10482))
 
+- Removed deprecated `disable_validation` property from Trainer ([#10450](https://github.com/PyTorchLightning/pytorch-lightning/pull/10450))
+
+
 - Removed deprecated `CheckpointConnector.hpc_load` property in favor of `CheckpointConnector.restore` ([#10525](https://github.com/PyTorchLightning/pytorch-lightning/pull/10525))
 
+- Removed deprecated `reload_dataloaders_every_epoch` from `Trainer` in favour of `reload_dataloaders_every_n_epochs` ([#10481](https://github.com/PyTorchLightning/pytorch-lightning/pull/10481))
+
+
 - Removed `precision_plugin` from `Accelerator` in favor of `precision_plugin` in `training_type_plugin` ([#10570](https://github.com/PyTorchLightning/pytorch-lightning/pull/10570))
 
@@ -141,15 +150,26 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed `CombinedLoader` and `max_size_cycle` didn't receive a `DistributedSampler` ([#10374](https://github.com/PyTorchLightning/pytorch-lightning/issues/10374)) +- Fixed scripting causing false positive deprecation warnings ([#10470](https://github.com/PyTorchLightning/pytorch-lightning/pull/10470), [#10555](https://github.com/PyTorchLightning/pytorch-lightning/pull/10555)) + + +- Fixed `isinstance` not working with `init_meta_context`, materialized model not being moved to the device ([#10493](https://github.com/PyTorchLightning/metrics/pull/10493)) + + - Fixed an issue that prevented the Trainer to shutdown workers when execution is interrupted due to failure([#10463](https://github.com/PyTorchLightning/pytorch-lightning/issues/10463)) - Squeeze the early stopping monitor to remove empty tensor dimensions ([#10461](https://github.com/PyTorchLightning/pytorch-lightning/issues/10461)) -- +- Fixed sampler replacement logic with `overfit_batches` to only replace the sample when `SequentialSampler` is not used ([#10486](https://github.com/PyTorchLightning/pytorch-lightning/issues/10486)) +- Fixed propagation of device and dtype information to submodules of LightningLite when they inherit from `DeviceDtypeModuleMixin` ([#10559](https://github.com/PyTorchLightning/pytorch-lightning/issues/10559)) + + +- + ## [1.5.1] - 2021-11-09 ### Fixed diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html index 26b33f3ed95e00..2363e862fbcf1c 100644 --- a/docs/source/_templates/layout.html +++ b/docs/source/_templates/layout.html @@ -4,7 +4,7 @@ {% block footer %} {{ super() }} {% endblock %} diff --git a/pl_examples/loop_examples/kfold.py b/pl_examples/loop_examples/kfold.py index bd14d42eb796fb..ed4db6faa50118 100644 --- a/pl_examples/loop_examples/kfold.py +++ b/pl_examples/loop_examples/kfold.py @@ -205,7 +205,7 @@ def on_run_end(self) -> None: voting_model = EnsembleVotingModel(type(self.trainer.lightning_module), checkpoint_paths) voting_model.trainer = self.trainer # This requires to connect the new model and move it the right device. - self.trainer.accelerator.connect(voting_model) + self.trainer.training_type_plugin.connect(voting_model) self.trainer.training_type_plugin.model_to_device() self.trainer.test_loop.run() diff --git a/pl_examples/loop_examples/yielding_training_step.py b/pl_examples/loop_examples/yielding_training_step.py index 4d870f002e247c..3e3082e3f9fac6 100644 --- a/pl_examples/loop_examples/yielding_training_step.py +++ b/pl_examples/loop_examples/yielding_training_step.py @@ -86,7 +86,7 @@ def _training_step(self, generator): # Here, instead of calling `lightning_module.training_step()` # we call next() on the generator! 
training_step_output = next(generator) - self.trainer.accelerator.post_training_step() + self.trainer.training_type_plugin.post_training_step() training_step_output = self.trainer.call_hook("training_step_end", training_step_output) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index b6f064d7d9802d..dc3ce5f0f4063b 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -115,6 +115,8 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self._param_requires_grad_state = {} self._metric_attributes: Optional[Dict[int, str]] = None self._should_prevent_trainer_and_dataloaders_deepcopy: bool = False + # TODO: remove after the 1.6 release + self._running_torchscript = False self._register_sharded_tensor_state_dict_hooks_if_available() @@ -1893,6 +1895,8 @@ def to_torchscript( """ mode = self.training + self._running_torchscript = True + if method == "script": torchscript_module = torch.jit.script(self.eval(), **kwargs) elif method == "trace": @@ -1918,6 +1922,8 @@ def to_torchscript( with fs.open(file_path, "wb") as f: torch.jit.save(torchscript_module, f) + self._running_torchscript = False + return torchscript_module @property @@ -1927,11 +1933,12 @@ def model_size(self) -> float: Note: This property will not return correct value for Deepspeed (stage 3) and fully-sharded training. """ - rank_zero_deprecation( - "The `LightningModule.model_size` property was deprecated in v1.5 and will be removed in v1.7." - " Please use the `pytorch_lightning.utilities.memory.get_model_size_mb`.", - stacklevel=5, - ) + if not self._running_torchscript: # remove with the deprecation removal + rank_zero_deprecation( + "The `LightningModule.model_size` property was deprecated in v1.5 and will be removed in v1.7." + " Please use the `pytorch_lightning.utilities.memory.get_model_size_mb`.", + stacklevel=5, + ) return get_model_size_mb(self) def add_to_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: diff --git a/pytorch_lightning/core/mixins/device_dtype_mixin.py b/pytorch_lightning/core/mixins/device_dtype_mixin.py index e02790edddd1e5..e8b122989cd9c5 100644 --- a/pytorch_lightning/core/mixins/device_dtype_mixin.py +++ b/pytorch_lightning/core/mixins/device_dtype_mixin.py @@ -17,6 +17,8 @@ import torch from torch.nn import Module +import pytorch_lightning as pl + class DeviceDtypeModuleMixin(Module): __jit_unused_properties__ = ["device", "dtype"] @@ -177,7 +179,9 @@ def __update_properties( self, device: Optional[torch.device] = None, dtype: Optional[Union[str, torch.dtype]] = None ) -> None: def apply_fn(module: Union["DeviceDtypeModuleMixin", Module]) -> None: - if not isinstance(module, DeviceDtypeModuleMixin): + # TODO: Find why `isinstance(module, DeviceDtypeModuleMixin)` doesn't + # work when using `init_meta_context`. 
+ if not isinstance(module, (DeviceDtypeModuleMixin, pl.LightningModule)): return if device is not None: module._device = device diff --git a/pytorch_lightning/lite/wrappers.py b/pytorch_lightning/lite/wrappers.py index 615f4610552045..ff95e89d1d2cf6 100644 --- a/pytorch_lightning/lite/wrappers.py +++ b/pytorch_lightning/lite/wrappers.py @@ -24,6 +24,7 @@ from torch.utils.data import DataLoader from pytorch_lightning.accelerators import Accelerator +from pytorch_lightning.core.mixins import DeviceDtypeModuleMixin from pytorch_lightning.plugins import PrecisionPlugin from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device @@ -64,7 +65,7 @@ def step(self, closure: Optional[Callable] = None) -> None: ) -class _LiteModule(nn.Module): +class _LiteModule(DeviceDtypeModuleMixin): def __init__(self, module: nn.Module, precision_plugin: PrecisionPlugin) -> None: """The LiteModule is a thin wrapper around the :class:`torch.nn.Module` and handles precision / autocast automatically for the forward pass. diff --git a/pytorch_lightning/loggers/tensorboard.py b/pytorch_lightning/loggers/tensorboard.py index f26fc75ac58dbb..1ceadb8658a3d4 100644 --- a/pytorch_lightning/loggers/tensorboard.py +++ b/pytorch_lightning/loggers/tensorboard.py @@ -240,7 +240,9 @@ def log_graph(self, model: "pl.LightningModule", input_array=None): if input_array is not None: input_array = model._apply_batch_transfer_handler(input_array) + model._running_torchscript = True self.experiment.add_graph(model, input_array) + model._running_torchscript = False else: rank_zero_warn( "Could not log computational graph since the" diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 2fc1e17d2f1198..01959bdcee212c 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -620,11 +620,6 @@ def _format_batch_size_and_grad_accum_config(self): ) self.config["gradient_accumulation_steps"] = self.lightning_module.trainer.accumulate_grad_batches if "train_micro_batch_size_per_gpu" not in self.config: - rank_zero_warn( - "Inferring the batch size for internal deepspeed logging from the `train_dataloader()`. " - "If you require skipping this, please pass " - "`Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`" - ) batch_size = self._auto_select_batch_size() self.config["train_micro_batch_size_per_gpu"] = batch_size if "gradient_clipping" not in self.config: @@ -636,9 +631,19 @@ def _auto_select_batch_size(self): batch_size = 1 train_dl_source = self.lightning_module.trainer._data_connector._train_dataloader_source if train_dl_source.is_defined(): - train_dataloader = train_dl_source.dataloader() - if hasattr(train_dataloader, "batch_sampler"): - batch_size = train_dataloader.batch_sampler.batch_size + try: + train_dataloader = train_dl_source.dataloader() + if hasattr(train_dataloader, "batch_sampler"): + batch_size = train_dataloader.batch_sampler.batch_size + # broad exception on purpose as `source.dataloader()` will fail if the dataloader requires `setup` + # to have been called before + except Exception: + if self.global_rank == 0: + deepspeed.utils.logging.logger.warning( + "Tried to infer the batch size for internal deepspeed logging from the `train_dataloader()`. 
" + "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the " + "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`." + ) return batch_size def _format_precision_config(self): diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 160656d43b7c03..26e2f381e63002 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -238,21 +238,25 @@ def to_tensor(x): args = apply_to_collection(args, dtype=(int, float), function=to_tensor) return args - def training_step(self, *args, **kwargs): + def _step(self, stage: RunningStage, *args: Any, **kwargs: Any): args = self._prepare_input(args) - return self.poptorch_models[RunningStage.TRAINING](*args, **kwargs) + poptorch_model = self.poptorch_models[stage] + self.lightning_module._running_torchscript = True + out = poptorch_model(*args, **kwargs) + self.lightning_module._running_torchscript = False + return out + + def training_step(self, *args, **kwargs): + return self._step(RunningStage.TRAINING, *args, **kwargs) def validation_step(self, *args, **kwargs): - args = self._prepare_input(args) - return self.poptorch_models[RunningStage.VALIDATING](*args, **kwargs) + return self._step(RunningStage.VALIDATING, *args, **kwargs) def test_step(self, *args, **kwargs): - args = self._prepare_input(args) - return self.poptorch_models[RunningStage.TESTING](*args, **kwargs) + return self._step(RunningStage.TESTING, *args, **kwargs) def predict_step(self, *args, **kwargs): - args = self._prepare_input(args) - return self.poptorch_models[RunningStage.PREDICTING](*args, **kwargs) + return self._step(RunningStage.PREDICTING, *args, **kwargs) def teardown(self) -> None: # undo dataloader patching diff --git a/pytorch_lightning/trainer/connectors/callback_connector.py b/pytorch_lightning/trainer/connectors/callback_connector.py index 4d41734ed90e6d..6a54e973ffcf3d 100644 --- a/pytorch_lightning/trainer/connectors/callback_connector.py +++ b/pytorch_lightning/trainer/connectors/callback_connector.py @@ -94,12 +94,9 @@ def on_trainer_init( " bar pass `enable_progress_bar = False` to the Trainer." 
) - if enable_progress_bar: - self.trainer._progress_bar_callback = self.configure_progress_bar( - progress_bar_refresh_rate, process_position - ) - else: - self.trainer._progress_bar_callback = None + self.trainer._progress_bar_callback = self.configure_progress_bar( + progress_bar_refresh_rate, process_position, enable_progress_bar + ) # configure the ModelSummary callback self._configure_model_summary_callback(enable_model_summary, weights_summary) @@ -215,7 +212,9 @@ def _configure_swa_callbacks(self): if not existing_swa: self.trainer.callbacks = [StochasticWeightAveraging()] + self.trainer.callbacks - def configure_progress_bar(self, refresh_rate=None, process_position=0): + def configure_progress_bar( + self, refresh_rate: Optional[int] = None, process_position: int = 0, enable_progress_bar: bool = True + ) -> Optional[ProgressBarBase]: if os.getenv("COLAB_GPU") and refresh_rate is None: # smaller refresh rate on colab causes crashes, choose a higher value refresh_rate = 20 @@ -229,7 +228,12 @@ def configure_progress_bar(self, refresh_rate=None, process_position=0): ) if len(progress_bars) == 1: progress_bar_callback = progress_bars[0] - elif refresh_rate > 0: + if not enable_progress_bar: + raise MisconfigurationException( + "Trainer was configured with `enable_progress_bar=False`" + f" but found `{progress_bar_callback.__class__.__name__}` in callbacks list." + ) + elif refresh_rate > 0 and enable_progress_bar: progress_bar_callback = TQDMProgressBar(refresh_rate=refresh_rate, process_position=process_position) self.trainer.callbacks.append(progress_bar_callback) else: diff --git a/pytorch_lightning/trainer/connectors/data_connector.py b/pytorch_lightning/trainer/connectors/data_connector.py index 90c398087578de..de81060ba1f805 100644 --- a/pytorch_lightning/trainer/connectors/data_connector.py +++ b/pytorch_lightning/trainer/connectors/data_connector.py @@ -64,7 +64,6 @@ def on_trainer_init( self, check_val_every_n_epoch: int, reload_dataloaders_every_n_epochs: int, - reload_dataloaders_every_epoch: bool, prepare_data_per_node: Optional[bool] = None, ) -> None: self.trainer.datamodule = None @@ -83,13 +82,6 @@ def on_trainer_init( self.trainer.check_val_every_n_epoch = check_val_every_n_epoch - if reload_dataloaders_every_epoch: - reload_dataloaders_every_n_epochs = int(reload_dataloaders_every_epoch) - rank_zero_deprecation( - "`reload_dataloaders_every_epoch` is deprecated in v1.4 and will be removed in v1.6." - " Please use `reload_dataloaders_every_n_epochs` in Trainer." - ) - if not isinstance(reload_dataloaders_every_n_epochs, int) or (reload_dataloaders_every_n_epochs < 0): raise MisconfigurationException( f"`reload_dataloaders_every_n_epochs` should be an int >= 0, got {reload_dataloaders_every_n_epochs}." 
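Usage sketch (not part of this patch; it mirrors the new assertion added to `tests/callbacks/test_tqdm_progress_bar.py` further down): with the `configure_progress_bar` change above, explicitly passing a progress bar callback while the progress bar is disabled now fails fast instead of being silently ignored.

import pytest

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import TQDMProgressBar
from pytorch_lightning.utilities.exceptions import MisconfigurationException

# A progress bar callback is passed explicitly although `enable_progress_bar=False`:
# the callback connector now raises instead of keeping or silently dropping it.
with pytest.raises(MisconfigurationException, match="enable_progress_bar=False"):
    Trainer(callbacks=TQDMProgressBar(), enable_progress_bar=False)
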
diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 931f6a92958ee4..bdc051091b50c7 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -438,8 +438,7 @@ def _reset_eval_dataloader( for loader_i in range(len(dataloaders)): loader = dataloaders[loader_i] - if hasattr(loader, "sampler") and isinstance(loader.sampler, RandomSampler): - + if hasattr(loader, "sampler") and not isinstance(loader.sampler, SequentialSampler): # when overfitting, the dataloader should not have sampler if self.overfit_batches > 0 and mode.evaluating: rank_zero_warn( @@ -591,16 +590,17 @@ def _add_sampler_metadata_collate(dataloader: DataLoader) -> None: @staticmethod def _resolve_overfit_batches(dataloader: Collection[DataLoader]) -> Collection[DataLoader]: - has_random_sampler = False + all_have_sequential_sampler = True - def resolve_had_random_sampler(dataloader: DataLoader): - nonlocal has_random_sampler - if not has_random_sampler: - has_random_sampler = isinstance(dataloader.sampler, RandomSampler) + def resolve_has_no_sequential_sampler(dataloader: DataLoader): + nonlocal all_have_sequential_sampler + all_have_sequential_sampler = all_have_sequential_sampler & isinstance( + dataloader.sampler, SequentialSampler + ) - apply_to_collection(dataloader, DataLoader, resolve_had_random_sampler) + apply_to_collection(dataloader, DataLoader, resolve_has_no_sequential_sampler) - if has_random_sampler: + if not all_have_sequential_sampler: rank_zero_warn( "You requested to overfit but enabled training dataloader shuffling." " We are turning off the training dataloader shuffling for you." diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index bf28642e07cd11..f81ce0396e5bb9 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -84,7 +84,7 @@ from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.exceptions import ExitGracefullyException, MisconfigurationException from pytorch_lightning.utilities.imports import _fault_tolerant_training -from pytorch_lightning.utilities.meta import materialize_module +from pytorch_lightning.utilities.meta import is_on_meta_device, materialize_module from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.seed import reset_seed from pytorch_lightning.utilities.types import ( @@ -162,7 +162,6 @@ def __init__( benchmark: bool = False, deterministic: bool = False, reload_dataloaders_every_n_epochs: int = 0, - reload_dataloaders_every_epoch: bool = False, auto_lr_find: Union[bool, str] = False, replace_sampler_ddp: bool = True, detect_anomaly: bool = False, @@ -341,12 +340,6 @@ def __init__( reload_dataloaders_every_n_epochs: Set to a non-negative integer to reload dataloaders every n epochs. - reload_dataloaders_every_epoch: Set to True to reload dataloaders every epoch. - - .. deprecated:: v1.4 - ``reload_dataloaders_every_epoch`` has been deprecated in v1.4 and will be removed in v1.6. - Please use ``reload_dataloaders_every_n_epochs``. - replace_sampler_ddp: Explicitly enables or disables sampler replacement. If not specified this will toggled automatically when DDP is used. By default it will add ``shuffle=True`` for train sampler and ``shuffle=False`` for val/test sampler. 
If you want to customize it, @@ -515,7 +508,6 @@ def __init__( self._data_connector.on_trainer_init( check_val_every_n_epoch, reload_dataloaders_every_n_epochs, - reload_dataloaders_every_epoch, prepare_data_per_node, ) @@ -1406,10 +1398,21 @@ def _call_setup_hook(self) -> None: def _call_configure_sharded_model(self) -> None: with self.accelerator.model_sharded_context(): - materialize_module(self.lightning_module) + self._handle_meta_model() self.call_hook("configure_sharded_model") self.call_hook("on_configure_sharded_model") + def _handle_meta_model(self) -> None: + if not is_on_meta_device(self.lightning_module): + return + + if isinstance(self.training_type_plugin, DDPSpawnPlugin): + raise MisconfigurationException("LightningModule on meta device isn't supported with spawn.") + + materialize_module(self.lightning_module) + # the trainer reference is lost during materialization + self.lightning_module.trainer = proxy(self) + def _call_teardown_hook(self) -> None: fn = self.state.fn._setup_fn @@ -1783,15 +1786,6 @@ def _should_reload_dl_epoch(self) -> bool: n_epochs = self.reload_dataloaders_every_n_epochs return n_epochs and (not self.current_epoch % n_epochs) - @property - def disable_validation(self) -> bool: - """Check if validation is disabled during training.""" - rank_zero_deprecation( - "`trainer.disable_validation` is deprecated in v1.4 and will be removed in v1.6." - " Use `not trainer.enable_validation` instead." - ) - return not self.enable_validation - @property def enable_validation(self) -> bool: """Check if we should run validation during training.""" diff --git a/pytorch_lightning/utilities/meta.py b/pytorch_lightning/utilities/meta.py index 60e6cc791b7aee..6d3c1d6b5f11bf 100644 --- a/pytorch_lightning/utilities/meta.py +++ b/pytorch_lightning/utilities/meta.py @@ -18,13 +18,14 @@ from functools import partial from itertools import chain from types import ModuleType -from typing import Callable, Dict, Generator, Iterator, List, Optional, Set, Type +from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Set, Type import torch from torch import nn, Tensor from torch.nn import Module from torch.nn.modules.container import ModuleDict, ModuleList, Sequential +import pytorch_lightning as pl from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_10 @@ -191,7 +192,6 @@ def materialize_module(root_module: nn.Module) -> nn.Module: # cache subclasses to optimize the search when resetting the meta device later on. __STORAGE_META__ = {} - __CREATED_MODULES__ = set() @@ -237,45 +237,52 @@ def _set_meta_device() -> None: for subclass in get_all_subclasses(torch.nn.modules.module.Module): - if isinstance(subclass, (Sequential, ModuleList, ModuleDict)): + if subclass in (Sequential, ModuleList, ModuleDict, pl.LightningModule): continue # if a subclass has already been stored, we should use the cache if str(subclass) in __STORAGE_META__: - # reset the class import package to its rightfull state. + # reset the class import package to its rightful state. 
mods, subclass, meta_class = __STORAGE_META__[subclass] for mod in mods: setattr(mod, subclass.__name__, meta_class) continue + class _IsinstanceMetaclass(type(subclass)): + def __instancecheck__(self, instance: Any) -> bool: + """Overrides the ``isinstance`` check on ``_MaterializerModule`` objects.""" + return isinstance(instance, self.__bases__[0]) + # Create a class subclassing current `subclass` overriding its new method. # this will enable use to use `torch.distributed.nn.utils.init_meta` to create a `meta` # version of the current subclass module - class _MetaClass(subclass): + class _MaterializerModule(subclass, metaclass=_IsinstanceMetaclass): @classmethod @contextmanager - def instantiation_context(cls, materialize: bool): + def instantiation_context(cls): _unset_meta_device(from_created=True) yield _set_meta_device_populated(from_created=True) @classmethod def materialize(cls, materialize_fn: Callable): - with cls.instantiation_context(materialize=True): + with cls.instantiation_context(): obj = materialize_fn() return obj @staticmethod def add_subclasses(subclass): - """This is used to unrol the instantion tree while creating the modules.""" - __CREATED_MODULES__.add(subclass) + """This is used to unroll the instantiation tree while creating the modules.""" + # Don't store the LightningModule as skipped from the Meta process. + if subclass != pl.LightningModule: + __CREATED_MODULES__.add(subclass) if subclass.__bases__[0] != torch.nn.modules.module.Module: - _MetaClass.add_subclasses(subclass.__bases__[0]) + _MaterializerModule.add_subclasses(subclass.__bases__[0]) def __new__(cls, *args, **kwargs): subclass = cls.__bases__[0] cls.add_subclasses(subclass) - with cls.instantiation_context(materialize=False): + with cls.instantiation_context(): obj = init_meta(subclass, *args, **kwargs) obj.materialize = partial(cls.materialize, materialize_fn=obj.materialize) @@ -294,9 +301,8 @@ def search(mod: ModuleType) -> List[ModuleType]: # nn.Module class can be imported at different level and they all need to be mocked. 
# Example: torch.nn.Linear is actually torch.nn.modules.linear.Linear # Therefore, torch.nn.Linear, torch.nn.modules.Linear, torch.nn.modules.linear.Linear - # needs to be replaced by the torch.nn.linear.modules.Linear _MetaClass - out = [] - out.append(search(mod)) + # needs to be replaced by the torch.nn.linear.modules.Linear _MaterializerModule + out = [search(mod)] for name in submodules[1:]: mod = getattr(mod, name) out.append(search(mod)) @@ -305,11 +311,11 @@ def search(mod: ModuleType) -> List[ModuleType]: mods = [mod for mod in chain(*out) if mod] # store the modules search so it doesn't have to be performed again for this class - __STORAGE_META__[subclass] = (mods, subclass, _MetaClass) + __STORAGE_META__[subclass] = (mods, subclass, _MaterializerModule) # replace all subclass by its meta form for mod in mods: - setattr(mod, subclass.__name__, _MetaClass) + setattr(mod, subclass.__name__, _MaterializerModule) @contextmanager @@ -321,3 +327,11 @@ def init_meta_context() -> Generator: _set_meta_device() yield _unset_meta_device() + + +def is_on_meta_device(module: nn.Module) -> bool: + try: + param = next(module.parameters()) + return param.device.type == "meta" + except StopIteration: + return False diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index e70d862b048e0a..d005c487573302 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -86,7 +86,6 @@ def test_accelerator_choice_ddp_spawn(cuda_available_mock, device_count_mock): assert isinstance(trainer.training_type_plugin.cluster_environment, LightningEnvironment) -@RunIf(min_gpus=2) @mock.patch.dict( os.environ, { @@ -98,8 +97,10 @@ def test_accelerator_choice_ddp_spawn(cuda_available_mock, device_count_mock): "SLURM_LOCALID": "1", }, ) +@mock.patch("torch.cuda.set_device") +@mock.patch("torch.cuda.device_count", return_value=2) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp_slurm(setup_distributed_mock): +def test_accelerator_choice_ddp_slurm(set_device_mock, device_count_mock, setup_distributed_mock): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer._accelerator_connector._is_slurm_managing_tasks @@ -111,13 +112,13 @@ def on_fit_start(self, trainer, pl_module): raise SystemExit() model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=2, callbacks=[CB()]) + with pytest.deprecated_call(match=r"accelerator='ddp'\)` has been deprecated in v1.5"): + trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=2, callbacks=[CB()]) with pytest.raises(SystemExit): trainer.fit(model) -@RunIf(min_gpus=2) @mock.patch.dict( os.environ, { @@ -129,9 +130,10 @@ def on_fit_start(self, trainer, pl_module): "SLURM_LOCALID": "1", }, ) +@mock.patch("torch.cuda.set_device") @mock.patch("torch.cuda.device_count", return_value=2) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp2_slurm(device_count_mock, setup_distributed_mock): +def test_accelerator_choice_ddp2_slurm(set_device_mock, device_count_mock, setup_distributed_mock): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer._accelerator_connector._is_slurm_managing_tasks @@ -143,13 +145,15 @@ def on_fit_start(self, trainer, pl_module): raise SystemExit() model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp2", gpus=2, 
callbacks=[CB()]) + with pytest.deprecated_call(match=r"accelerator='ddp2'\)` has been deprecated in v1.5"): + trainer = Trainer(fast_dev_run=True, accelerator="ddp2", gpus=2, callbacks=[CB()]) with pytest.raises(SystemExit): trainer.fit(model) + set_device_mock.assert_called_once() + -@RunIf(min_gpus=1) @mock.patch.dict( os.environ, { @@ -161,9 +165,10 @@ def on_fit_start(self, trainer, pl_module): "GROUP_RANK": "0", }, ) -@mock.patch("torch.cuda.device_count", return_value=2) +@mock.patch("torch.cuda.set_device") +@mock.patch("torch.cuda.device_count", return_value=1) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp_te(device_count_mock, setup_distributed_mock): +def test_accelerator_choice_ddp_te(set_device_mock, device_count_mock, setup_distributed_mock): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert isinstance(trainer.accelerator, GPUAccelerator) @@ -174,13 +179,15 @@ def on_fit_start(self, trainer, pl_module): raise SystemExit() model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=2, callbacks=[CB()]) + with pytest.deprecated_call(match=r"accelerator='ddp'\)` has been deprecated in v1.5"): + trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=2, callbacks=[CB()]) with pytest.raises(SystemExit): trainer.fit(model) + set_device_mock.assert_called_once() + -@RunIf(min_gpus=1) @mock.patch.dict( os.environ, { @@ -192,9 +199,10 @@ def on_fit_start(self, trainer, pl_module): "GROUP_RANK": "0", }, ) -@mock.patch("torch.cuda.device_count", return_value=2) +@mock.patch("torch.cuda.set_device") +@mock.patch("torch.cuda.device_count", return_value=1) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp2_te(device_count_mock, setup_distributed_mock): +def test_accelerator_choice_ddp2_te(set_device_mock, device_count_mock, setup_distributed_mock): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert isinstance(trainer.accelerator, GPUAccelerator) @@ -205,11 +213,14 @@ def on_fit_start(self, trainer, pl_module): raise SystemExit() model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp2", gpus=2, callbacks=[CB()]) + with pytest.deprecated_call(match=r"accelerator='ddp2'\)` has been deprecated in v1.5"): + trainer = Trainer(fast_dev_run=True, accelerator="ddp2", gpus=2, callbacks=[CB()]) with pytest.raises(SystemExit): trainer.fit(model) + set_device_mock.assert_called_once() + @mock.patch.dict( os.environ, {"WORLD_SIZE": "2", "LOCAL_WORLD_SIZE": "2", "RANK": "1", "LOCAL_RANK": "1", "GROUP_RANK": "0"} @@ -233,7 +244,6 @@ def on_fit_start(self, trainer, pl_module): trainer.fit(model) -@RunIf(min_gpus=1) @mock.patch.dict( os.environ, { @@ -245,9 +255,10 @@ def on_fit_start(self, trainer, pl_module): "RANK": "1", }, ) +@mock.patch("torch.cuda.set_device") @mock.patch("torch.cuda.device_count", return_value=1) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp_kubeflow(device_count_mock, setup_distributed_mock): +def test_accelerator_choice_ddp_kubeflow(set_device_mock, device_count_mock, setup_distributed_mock): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert isinstance(trainer.accelerator, GPUAccelerator) @@ -258,11 +269,14 @@ def on_fit_start(self, trainer, pl_module): raise SystemExit() model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=1, 
callbacks=[CB()]) + with pytest.deprecated_call(match=r"accelerator='ddp'\)` has been deprecated in v1.5"): + trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=1, callbacks=[CB()]) with pytest.raises(SystemExit): trainer.fit(model) + set_device_mock.assert_called_once() + @mock.patch.dict( os.environ, @@ -323,29 +337,28 @@ def on_fit_start(self, trainer, pl_module): trainer.fit(model) -@RunIf(special=True) -def test_accelerator_choice_ddp_cpu_and_plugin(tmpdir): +@RunIf(skip_windows=True, special=True) +def test_accelerator_choice_ddp_cpu_and_strategy(tmpdir): """Test that accelerator="ddp_cpu" can work together with an instance of DDPPlugin.""" - _test_accelerator_choice_ddp_cpu_and_plugin(tmpdir, ddp_plugin_class=DDPPlugin) + _test_accelerator_choice_ddp_cpu_and_strategy(tmpdir, ddp_strategy_class=DDPPlugin) -@RunIf(special=True) -def test_accelerator_choice_ddp_cpu_and_plugin_spawn(tmpdir): +@RunIf(skip_windows=True) +def test_accelerator_choice_ddp_cpu_and_strategy_spawn(tmpdir): """Test that accelerator="ddp_cpu" can work together with an instance of DDPPSpawnPlugin.""" - _test_accelerator_choice_ddp_cpu_and_plugin(tmpdir, ddp_plugin_class=DDPSpawnPlugin) - + _test_accelerator_choice_ddp_cpu_and_strategy(tmpdir, ddp_strategy_class=DDPSpawnPlugin) -def _test_accelerator_choice_ddp_cpu_and_plugin(tmpdir, ddp_plugin_class): +def _test_accelerator_choice_ddp_cpu_and_strategy(tmpdir, ddp_strategy_class): model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, - plugins=[ddp_plugin_class(find_unused_parameters=True)], + strategy=ddp_strategy_class(find_unused_parameters=True), fast_dev_run=True, accelerator="ddp_cpu", num_processes=2, ) - assert isinstance(trainer.training_type_plugin, ddp_plugin_class) + assert isinstance(trainer.training_type_plugin, ddp_strategy_class) assert isinstance(trainer.accelerator, CPUAccelerator) assert trainer.training_type_plugin.num_processes == 2 assert trainer.training_type_plugin.parallel_devices == [torch.device("cpu")] * 2 @@ -793,7 +806,6 @@ def on_fit_start(self, trainer, pl_module): trainer.fit(model) -@RunIf(min_gpus=2) @mock.patch.dict( os.environ, { @@ -805,10 +817,11 @@ def on_fit_start(self, trainer, pl_module): "SLURM_LOCALID": "1", }, ) +@mock.patch("torch.cuda.set_device") @mock.patch("torch.cuda.device_count", return_value=2) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) @pytest.mark.parametrize("strategy", ["ddp2", DDP2Plugin()]) -def test_strategy_choice_ddp2_slurm(device_count_mock, setup_distributed_mock, strategy): +def test_strategy_choice_ddp2_slurm(set_device_mock, device_count_mock, setup_distributed_mock, strategy): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer._accelerator_connector._is_slurm_managing_tasks @@ -825,8 +838,9 @@ def on_fit_start(self, trainer, pl_module): with pytest.raises(SystemExit): trainer.fit(model) + set_device_mock.assert_called_once() + -@RunIf(min_gpus=1) @mock.patch.dict( os.environ, { @@ -838,9 +852,10 @@ def on_fit_start(self, trainer, pl_module): "GROUP_RANK": "0", }, ) +@mock.patch("torch.cuda.set_device") @mock.patch("torch.cuda.device_count", return_value=2) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_strategy_choice_ddp_te(device_count_mock, setup_distributed_mock): +def test_strategy_choice_ddp_te(set_device_mock, device_count_mock, setup_distributed_mock): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert isinstance(trainer.accelerator, 
GPUAccelerator) @@ -856,8 +871,9 @@ def on_fit_start(self, trainer, pl_module): with pytest.raises(SystemExit): trainer.fit(model) + set_device_mock.assert_called_once() + -@RunIf(min_gpus=1) @mock.patch.dict( os.environ, { @@ -869,9 +885,10 @@ def on_fit_start(self, trainer, pl_module): "GROUP_RANK": "0", }, ) +@mock.patch("torch.cuda.set_device") @mock.patch("torch.cuda.device_count", return_value=2) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_strategy_choice_ddp2_te(device_count_mock, setup_distributed_mock): +def test_strategy_choice_ddp2_te(set_device_mock, device_count_mock, setup_distributed_mock): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert isinstance(trainer.accelerator, GPUAccelerator) @@ -887,6 +904,8 @@ def on_fit_start(self, trainer, pl_module): with pytest.raises(SystemExit): trainer.fit(model) + set_device_mock.assert_called_once() + @mock.patch.dict( os.environ, {"WORLD_SIZE": "2", "LOCAL_WORLD_SIZE": "2", "RANK": "1", "LOCAL_RANK": "1", "GROUP_RANK": "0"} @@ -910,7 +929,6 @@ def on_fit_start(self, trainer, pl_module): trainer.fit(model) -@RunIf(min_gpus=1) @mock.patch.dict( os.environ, { @@ -922,9 +940,10 @@ def on_fit_start(self, trainer, pl_module): "RANK": "1", }, ) +@mock.patch("torch.cuda.set_device") @mock.patch("torch.cuda.device_count", return_value=1) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_strategy_choice_ddp_kubeflow(device_count_mock, setup_distributed_mock): +def test_strategy_choice_ddp_kubeflow(set_device_mock, device_count_mock, setup_distributed_mock): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert isinstance(trainer.accelerator, GPUAccelerator) @@ -940,6 +959,8 @@ def on_fit_start(self, trainer, pl_module): with pytest.raises(SystemExit): trainer.fit(model) + set_device_mock.assert_called_once() + @mock.patch.dict( os.environ, diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py index da200cc336504d..ffc8ee2ae0846f 100644 --- a/tests/callbacks/test_early_stopping.py +++ b/tests/callbacks/test_early_stopping.py @@ -381,7 +381,7 @@ def on_train_end(self) -> None: _ES_CHECK = dict(check_on_train_epoch_end=True) _ES_CHECK_P3 = dict(patience=3, check_on_train_epoch_end=True) -_NO_WIN = dict(marks=RunIf(skip_windows=True)) +_SPAWN_MARK = dict(marks=RunIf(skip_windows=True, skip_49370=True)) @pytest.mark.parametrize( @@ -389,8 +389,8 @@ def on_train_end(self) -> None: [ ([EarlyStopping("abc"), EarlyStopping("cba", patience=3)], 3, False, None, 1), ([EarlyStopping("cba", patience=3), EarlyStopping("abc")], 3, False, None, 1), - pytest.param([EarlyStopping("abc"), EarlyStopping("cba", patience=3)], 3, False, "ddp_spawn", 2, **_NO_WIN), - pytest.param([EarlyStopping("cba", patience=3), EarlyStopping("abc")], 3, False, "ddp_spawn", 2, **_NO_WIN), + pytest.param([EarlyStopping("abc"), EarlyStopping("cba", patience=3)], 3, False, "ddp_spawn", 2, **_SPAWN_MARK), + pytest.param([EarlyStopping("cba", patience=3), EarlyStopping("abc")], 3, False, "ddp_spawn", 2, **_SPAWN_MARK), ([EarlyStopping("abc", **_ES_CHECK), EarlyStopping("cba", **_ES_CHECK_P3)], 3, True, None, 1), ([EarlyStopping("cba", **_ES_CHECK_P3), EarlyStopping("abc", **_ES_CHECK)], 3, True, None, 1), pytest.param( @@ -399,7 +399,7 @@ def on_train_end(self) -> None: True, "ddp_spawn", 2, - **_NO_WIN, + **_SPAWN_MARK, ), pytest.param( [EarlyStopping("cba", **_ES_CHECK_P3), EarlyStopping("abc", **_ES_CHECK)], @@ -407,7 
+407,7 @@ def on_train_end(self) -> None: True, "ddp_spawn", 2, - **_NO_WIN, + **_SPAWN_MARK, ), ], ) diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index 1c1f84b5b95a0e..c813ed2b02e28b 100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -187,7 +187,7 @@ def test_pruning_callback_ddp_spawn(tmpdir): train_with_pruning_callback(tmpdir, use_global_unstructured=True, strategy="ddp_spawn", gpus=2) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_pruning_callback_ddp_cpu(tmpdir): train_with_pruning_callback(tmpdir, parameters_to_prune=True, strategy="ddp_spawn", num_processes=2) diff --git a/tests/callbacks/test_stochastic_weight_avg.py b/tests/callbacks/test_stochastic_weight_avg.py index 8bed31bc48ac80..c7186e819ea944 100644 --- a/tests/callbacks/test_stochastic_weight_avg.py +++ b/tests/callbacks/test_stochastic_weight_avg.py @@ -148,7 +148,7 @@ def test_swa_callback_ddp_spawn(tmpdir): train_with_swa(tmpdir, strategy="ddp_spawn", gpus=2) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_swa_callback_ddp_cpu(tmpdir): train_with_swa(tmpdir, strategy="ddp_spawn", num_processes=2) diff --git a/tests/callbacks/test_tqdm_progress_bar.py b/tests/callbacks/test_tqdm_progress_bar.py index 99fe02ce21a112..a8371591759d7b 100644 --- a/tests/callbacks/test_tqdm_progress_bar.py +++ b/tests/callbacks/test_tqdm_progress_bar.py @@ -14,7 +14,7 @@ import os import pickle import sys -from typing import Optional, Union +from typing import Union from unittest import mock from unittest.mock import ANY, call, Mock @@ -32,65 +32,54 @@ @pytest.mark.parametrize( - "callbacks,refresh_rate", + "kwargs", [ - ([], None), - ([], 1), - ([], 2), - ([TQDMProgressBar(refresh_rate=1)], 0), - ([TQDMProgressBar(refresh_rate=2)], 0), - ([TQDMProgressBar(refresh_rate=2)], 1), + # won't print but is still set + {"callbacks": TQDMProgressBar(refresh_rate=0)}, + {"callbacks": TQDMProgressBar()}, + {"progress_bar_refresh_rate": 1}, ], ) -def test_tqdm_progress_bar_on(tmpdir, callbacks: list, refresh_rate: Optional[int]): +def test_tqdm_progress_bar_on(tmpdir, kwargs): """Test different ways the progress bar can be turned on.""" - - trainer = Trainer( - default_root_dir=tmpdir, - callbacks=callbacks, - progress_bar_refresh_rate=refresh_rate, - max_epochs=1, - overfit_batches=5, - ) + if "progress_bar_refresh_rate" in kwargs: + with pytest.deprecated_call(match=r"progress_bar_refresh_rate=.*` is deprecated"): + trainer = Trainer(default_root_dir=tmpdir, **kwargs) + else: + trainer = Trainer(default_root_dir=tmpdir, **kwargs) progress_bars = [c for c in trainer.callbacks if isinstance(c, ProgressBarBase)] - # Trainer supports only a single progress bar callback at the moment assert len(progress_bars) == 1 assert progress_bars[0] is trainer.progress_bar_callback -@pytest.mark.parametrize( - "callbacks,refresh_rate,enable_progress_bar", - [([], 0, True), ([], False, True), ([ModelCheckpoint(dirpath="../trainer")], 0, True), ([], 1, False)], -) -def test_tqdm_progress_bar_off(tmpdir, callbacks: list, refresh_rate: Union[bool, int], enable_progress_bar: bool): +@pytest.mark.parametrize("kwargs", [{"enable_progress_bar": False}, {"progress_bar_refresh_rate": 0}]) +def test_tqdm_progress_bar_off(tmpdir, kwargs): """Test different ways the progress bar can be turned off.""" - - trainer = Trainer( - default_root_dir=tmpdir, - callbacks=callbacks, - progress_bar_refresh_rate=refresh_rate, - 
enable_progress_bar=enable_progress_bar, - ) - - progress_bars = [c for c in trainer.callbacks if isinstance(c, TQDMProgressBar)] - assert 0 == len(progress_bars) - assert not trainer.progress_bar_callback + if "progress_bar_refresh_rate" in kwargs: + pytest.deprecated_call(match=r"progress_bar_refresh_rate=.*` is deprecated").__enter__() + trainer = Trainer(default_root_dir=tmpdir, **kwargs) + progress_bars = [c for c in trainer.callbacks if isinstance(c, ProgressBarBase)] + assert not len(progress_bars) def test_tqdm_progress_bar_misconfiguration(): """Test that Trainer doesn't accept multiple progress bars.""" + # Trainer supports only a single progress bar callback at the moment callbacks = [TQDMProgressBar(), TQDMProgressBar(), ModelCheckpoint(dirpath="../trainer")] with pytest.raises(MisconfigurationException, match=r"^You added multiple progress bar callbacks"): Trainer(callbacks=callbacks) + with pytest.raises(MisconfigurationException, match=r"enable_progress_bar=False` but found `TQDMProgressBar"): + Trainer(callbacks=TQDMProgressBar(), enable_progress_bar=False) + def test_tqdm_progress_bar_totals(tmpdir): """Test that the progress finishes with the correct total steps processed.""" model = BoringModel() - trainer = Trainer(default_root_dir=tmpdir, progress_bar_refresh_rate=1, max_epochs=1) + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) bar = trainer.progress_bar_callback assert float("inf") == bar.total_train_batches assert 0 == bar.total_val_batches @@ -209,14 +198,15 @@ def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, datal self.test_batches_seen += 1 progress_bar = CurrentProgressBar(refresh_rate=refresh_rate) - trainer = Trainer( - default_root_dir=tmpdir, - callbacks=[progress_bar], - progress_bar_refresh_rate=101, # should not matter if custom callback provided - limit_train_batches=1.0, - num_sanity_val_steps=2, - max_epochs=3, - ) + with pytest.deprecated_call(match=r"progress_bar_refresh_rate=101\)` is deprecated"): + trainer = Trainer( + default_root_dir=tmpdir, + callbacks=[progress_bar], + progress_bar_refresh_rate=101, # should not matter if custom callback provided + limit_train_batches=1.0, + num_sanity_val_steps=2, + max_epochs=3, + ) assert trainer.progress_bar_callback.refresh_rate == refresh_rate trainer.fit(model) @@ -276,9 +266,6 @@ def test_tqdm_progress_bar_default_value(tmpdir): trainer = Trainer(default_root_dir=tmpdir) assert trainer.progress_bar_callback.refresh_rate == 1 - trainer = Trainer(default_root_dir=tmpdir, progress_bar_refresh_rate=None) - assert trainer.progress_bar_callback.refresh_rate == 1 - @mock.patch.dict(os.environ, {"COLAB_GPU": "1"}) def test_tqdm_progress_bar_value_on_colab(tmpdir): @@ -286,10 +273,14 @@ def test_tqdm_progress_bar_value_on_colab(tmpdir): trainer = Trainer(default_root_dir=tmpdir) assert trainer.progress_bar_callback.refresh_rate == 20 - trainer = Trainer(default_root_dir=tmpdir, progress_bar_refresh_rate=None) - assert trainer.progress_bar_callback.refresh_rate == 20 + trainer = Trainer(default_root_dir=tmpdir, callbacks=TQDMProgressBar()) + assert trainer.progress_bar_callback.refresh_rate == 1 # FIXME: should be 20 + + trainer = Trainer(default_root_dir=tmpdir, callbacks=TQDMProgressBar(refresh_rate=19)) + assert trainer.progress_bar_callback.refresh_rate == 19 - trainer = Trainer(default_root_dir=tmpdir, progress_bar_refresh_rate=19) + with pytest.deprecated_call(match=r"progress_bar_refresh_rate=19\)` is deprecated"): + trainer = Trainer(default_root_dir=tmpdir, 
progress_bar_refresh_rate=19) assert trainer.progress_bar_callback.refresh_rate == 19 diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 518d67cf251f5a..04255d51ad0699 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -385,7 +385,7 @@ def on_train_end(self, trainer, pl_module): assert torch.save.call_count == 0 -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_model_checkpoint_no_extraneous_invocations(tmpdir): """Test to ensure that the model callback saves the checkpoints only once in distributed mode.""" model = LogInTwoMethods() diff --git a/tests/checkpointing/test_torch_saving.py b/tests/checkpointing/test_torch_saving.py index 8b0f0e457bff97..f9634a9dadb2ab 100644 --- a/tests/checkpointing/test_torch_saving.py +++ b/tests/checkpointing/test_torch_saving.py @@ -34,7 +34,7 @@ def test_model_torch_save(tmpdir): trainer = torch.load(temp_path) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_model_torch_save_ddp_cpu(tmpdir): """Test to ensure torch save does not fail for model and trainer using cpu ddp.""" model = BoringModel() diff --git a/tests/deprecated_api/test_remove_1-6.py b/tests/deprecated_api/test_remove_1-6.py deleted file mode 100644 index efb288a623d6af..00000000000000 --- a/tests/deprecated_api/test_remove_1-6.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Test deprecated functionality which will be removed in v1.6.0.""" -from unittest.mock import call, Mock - -import pytest - -from pytorch_lightning import Trainer -from tests.helpers import BoringModel - - -def test_v1_6_0_reload_dataloaders_every_epoch(tmpdir): - model = BoringModel() - - tracker = Mock() - model.train_dataloader = Mock(wraps=model.train_dataloader) - model.val_dataloader = Mock(wraps=model.val_dataloader) - model.test_dataloader = Mock(wraps=model.test_dataloader) - - tracker.attach_mock(model.train_dataloader, "train_dataloader") - tracker.attach_mock(model.val_dataloader, "val_dataloader") - tracker.attach_mock(model.test_dataloader, "test_dataloader") - - with pytest.deprecated_call(match="`reload_dataloaders_every_epoch` is deprecated in v1.4 and will be removed"): - trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=0.3, - limit_val_batches=0.3, - reload_dataloaders_every_epoch=True, - max_epochs=3, - ) - trainer.fit(model) - trainer.test() - - expected_sequence = ( - [call.val_dataloader()] + [call.train_dataloader(), call.val_dataloader()] * 3 + [call.test_dataloader()] - ) - assert tracker.mock_calls == expected_sequence - - -def test_v1_6_0_deprecated_disable_validation(): - trainer = Trainer() - with pytest.deprecated_call(match="disable_validation` is deprecated in v1.4"): - _ = trainer.disable_validation diff --git a/tests/deprecated_api/test_remove_1-7.py b/tests/deprecated_api/test_remove_1-7.py index 4da10fb0b666a0..09a8df66a02ccc 100644 --- a/tests/deprecated_api/test_remove_1-7.py +++ b/tests/deprecated_api/test_remove_1-7.py @@ -245,7 +245,7 @@ def get_from_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: return super().get_from_queue(queue) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_v1_7_0_deprecate_add_get_queue(tmpdir): model = BoringCallbackDDPSpawnModel() trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, num_processes=2, strategy="ddp_spawn") diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index 490e023662f79c..e53d3811f6b345 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -70,6 +70,7 @@ def __new__( fairscale_fully_sharded: bool = False, deepspeed: bool = False, rich: bool = False, + skip_49370: bool = False, **kwargs, ): """ @@ -91,6 +92,7 @@ def __new__( fairscale_fully_sharded: if `fairscale` fully sharded module is required to run the test deepspeed: if `deepspeed` module is required to run the test rich: if `rich` module is required to run the test + skip_49370: Skip the test as it's impacted by https://github.com/pytorch/pytorch/issues/49370. 
kwargs: native pytest.mark.skipif keyword arguments """ conditions = [] @@ -165,6 +167,15 @@ def __new__( conditions.append(not _RICH_AVAILABLE) reasons.append("Rich") + if skip_49370: + # strategy=ddp_spawn, accelerator=cpu, python>=3.9, torch<1.8 does not work + py_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}" + ge_3_9 = Version(py_version) >= Version("3.9") + torch_version = get_distribution("torch").version + old_torch = Version(torch_version) < Version("1.8") + conditions.append(ge_3_9 and old_torch) + reasons.append("Impacted by https://github.com/pytorch/pytorch/issues/49370") + reasons = [rs for cond, rs in zip(conditions, reasons) if cond] return pytest.mark.skipif( *args, condition=any(conditions), reason=f"Requires: [{' + '.join(reasons)}]", **kwargs diff --git a/tests/lite/test_wrappers.py b/tests/lite/test_wrappers.py index 4993a10c8dbc23..c271d3b3163edb 100644 --- a/tests/lite/test_wrappers.py +++ b/tests/lite/test_wrappers.py @@ -17,6 +17,7 @@ import torch from torch.utils.data.dataloader import DataLoader +from pytorch_lightning.core.mixins import DeviceDtypeModuleMixin from pytorch_lightning.lite import LightningLite from pytorch_lightning.lite.wrappers import _LiteDataLoader, _LiteModule, _LiteOptimizer from tests.helpers.runif import RunIf @@ -65,6 +66,27 @@ def check_autocast(forward_input): assert out.dtype == input_type or out.dtype == torch.get_default_dtype() +@pytest.mark.parametrize( + "device", [torch.device("cpu"), pytest.param(torch.device("cuda", 0), marks=RunIf(min_gpus=1))] +) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) +def test_lite_module_device_dtype_propagation(device, dtype): + """Test that the LiteModule propagates device and dtype properties to its submodules (e.g. 
torchmetrics).""" + + class DeviceModule(DeviceDtypeModuleMixin): + pass + + device_module = DeviceModule() + lite_module = _LiteModule(device_module, Mock()) + lite_module.to(device) + assert device_module.device == device + assert lite_module.device == device + + lite_module.to(dtype) + assert device_module.dtype == dtype + assert lite_module.dtype == dtype + + def test_lite_dataloader_iterator(): """Test that the iteration over a LiteDataLoader wraps the iterator of the underlying dataloader (no automatic device placement).""" diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py index 271ffce811fe54..370b24431b088b 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -321,8 +321,8 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): assert pl_module.logger.experiment.something(foo="bar") is None +@RunIf(skip_windows=True, skip_49370=True) @pytest.mark.parametrize("logger_class", [CometLogger, CSVLogger, MLFlowLogger, TensorBoardLogger, TestTubeLogger]) -@RunIf(skip_windows=True) def test_logger_created_on_rank_zero_only(tmpdir, monkeypatch, logger_class): """Test that loggers get replaced by dummy loggers on global rank > 0.""" _patch_comet_atexit(monkeypatch) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 63a2211934ece8..6bd7db1aeff8d8 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -791,7 +791,7 @@ def val_dataloader(self): max_epochs=1, val_check_interval=val_check_interval, num_sanity_val_steps=0, - progress_bar_refresh_rate=0, + enable_progress_bar=False, ) trainer.fit(model) @@ -829,7 +829,7 @@ def val_dataloader(self): max_epochs=1, val_check_interval=val_check_interval, num_sanity_val_steps=0, - progress_bar_refresh_rate=0, + enable_progress_bar=False, ) with pytest.raises(CustomException): # will stop during validation @@ -880,7 +880,7 @@ def val_dataloader(self): max_epochs=1, val_check_interval=val_check_interval, num_sanity_val_steps=0, - progress_bar_refresh_rate=0, + enable_progress_bar=False, ) trainer.fit(model, ckpt_path=ckpt_path) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 2fb537b1d2861d..c110f3a83d815d 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -122,7 +122,7 @@ def validation_step(self, *args, **kwargs): model.unfreeze() -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_multi_cpu_model_ddp(tmpdir): """Make sure DDP works.""" tutils.set_random_main_port() diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 6b34553ff313be..b55e8344ef146f 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -866,7 +866,7 @@ def call(hook, fn, *args, **kwargs): limit_predict_batches=batches, enable_progress_bar=False, enable_model_summary=False, - reload_dataloaders_every_epoch=True, + reload_dataloaders_every_n_epochs=True, ) called = [] diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index abf5a347574245..59a22cf1656d18 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -66,7 +66,7 @@ def _run_horovod(trainer_options, on_gpu=False): assert exit_code == 0 -@RunIf(skip_windows=True, horovod=True) +@RunIf(skip_windows=True, horovod=True, skip_49370=True) def test_horovod_cpu(tmpdir): """Test Horovod running multi-process on CPU.""" trainer_options = dict( @@ -82,7 +82,7 @@ def test_horovod_cpu(tmpdir): _run_horovod(trainer_options) -@RunIf(skip_windows=True, horovod=True) +@RunIf(skip_windows=True, 
horovod=True, skip_49370=True) def test_horovod_cpu_clip_grad_by_value(tmpdir): """Test Horovod running multi-process on CPU.""" trainer_options = dict( @@ -99,7 +99,7 @@ def test_horovod_cpu_clip_grad_by_value(tmpdir): _run_horovod(trainer_options) -@RunIf(skip_windows=True, horovod=True) +@RunIf(skip_windows=True, horovod=True, skip_49370=True) def test_horovod_cpu_implicit(tmpdir): """Test Horovod without specifying a backend, inferring from env set by `horovodrun`.""" trainer_options = dict( diff --git a/tests/plugins/test_ddp_spawn_plugin.py b/tests/plugins/test_ddp_spawn_plugin.py index c389cf9290c783..c5e5f7ccda7485 100644 --- a/tests/plugins/test_ddp_spawn_plugin.py +++ b/tests/plugins/test_ddp_spawn_plugin.py @@ -46,7 +46,7 @@ def get_from_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: return super().get_from_queue(queue) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_ddp_cpu(): """Tests if device is set correctly when training for DDPSpawnPlugin.""" trainer = Trainer(num_processes=2, fast_dev_run=True) @@ -91,7 +91,7 @@ def get_from_queue(self, trainer: Trainer, queue: torch.multiprocessing.SimpleQu return super().get_from_queue(trainer, queue) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_ddp_spawn_add_get_queue(tmpdir): """Tests add_to_queue/get_from_queue with DDPSpawnPlugin.""" @@ -128,7 +128,7 @@ def on_predict_start(self) -> None: assert isinstance(self.trainer.model, LightningModule) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_ddp_spawn_configure_ddp(tmpdir): """Tests with ddp spawn plugin.""" trainer = Trainer(default_root_dir=tmpdir, num_processes=2, strategy="ddp_spawn", fast_dev_run=True) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 8804348c8f9155..836c0f685195fe 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -1,5 +1,6 @@ import contextlib import json +import logging import os from typing import Any, Dict, Optional from unittest import mock @@ -872,24 +873,9 @@ def training_step(self, batch, batch_idx): trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) -def test_deepspeed_warn_train_dataloader_called(tmpdir): - """Test DeepSpeed warns when it calls ``lightning_module.train_dataloader`` internally for logging batch - size.""" - model = BoringModel() - trainer = Trainer( - default_root_dir=tmpdir, - strategy=DeepSpeedPlugin(), - gpus=1, - fast_dev_run=True, - ) - with pytest.warns(UserWarning, match="Inferring the batch size for internal deepspeed logging"): - trainer.fit(model) - - @RunIf(min_gpus=1, deepspeed=True, special=True) def test_deepspeed_setup_train_dataloader(tmpdir): - """Test DeepSpeed works when setup is required to call, and the user passes the batch size manually.""" + """Test DeepSpeed works when setup is required to call in the DataModule.""" class TestSetupIsCalledDataModule(LightningDataModule): def __init__(self): @@ -914,13 +900,14 @@ def test_dataloader(self): model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, - strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=32), + strategy=DeepSpeedPlugin(logging_level=logging.INFO), gpus=1, fast_dev_run=True, ) dm = TestSetupIsCalledDataModule() - trainer.fit(model, datamodule=dm) - trainer.test(model, datamodule=dm) + with mock.patch("deepspeed.utils.logging.logger.warning", autospec=True) as mock_object: + trainer.fit(model, datamodule=dm) + assert 
any("Tried to infer the batch size" in str(arg) for arg in mock_object.call_args_list) @mock.patch("torch.optim.lr_scheduler.StepLR.step", autospec=True) diff --git a/tests/profiler/test_profiler.py b/tests/profiler/test_profiler.py index 37756fcc623517..5b8c3939c7b485 100644 --- a/tests/profiler/test_profiler.py +++ b/tests/profiler/test_profiler.py @@ -161,7 +161,7 @@ def test_simple_profiler_with_nonexisting_dirpath(tmpdir): assert nonexisting_tmpdir.join("fit-profiler.txt").exists() -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_simple_profiler_distributed_files(tmpdir): """Ensure the proper files are saved in distributed.""" profiler = SimpleProfiler(dirpath=tmpdir, filename="profiler") @@ -226,6 +226,7 @@ def test_advanced_profiler_iterable_durations(advanced_profiler, action: str, ex np.testing.assert_allclose(recored_total_duration, expected_total_duration, rtol=0.2) +@pytest.mark.flaky(reruns=3) def test_advanced_profiler_overhead(advanced_profiler, n_iter=5): """ensure that the profiler doesn't introduce too much overhead during training.""" for _ in range(n_iter): diff --git a/tests/trainer/connectors/test_callback_connector.py b/tests/trainer/connectors/test_callback_connector.py index 2cb68aa2e95bdc..e3c353c3eb063a 100644 --- a/tests/trainer/connectors/test_callback_connector.py +++ b/tests/trainer/connectors/test_callback_connector.py @@ -22,6 +22,7 @@ LearningRateMonitor, ModelCheckpoint, ModelSummary, + ProgressBarBase, TQDMProgressBar, ) from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector @@ -143,10 +144,11 @@ def test_attach_model_callbacks(): def _attach_callbacks(trainer_callbacks, model_callbacks): model = LightningModule() model.configure_callbacks = lambda: model_callbacks + has_progress_bar = any(isinstance(cb, ProgressBarBase) for cb in trainer_callbacks + model_callbacks) trainer = Trainer( enable_checkpointing=False, - enable_progress_bar=False, - enable_model_summary=None, + enable_progress_bar=has_progress_bar, + enable_model_summary=False, callbacks=trainer_callbacks, ) trainer.model = model diff --git a/tests/trainer/flags/test_overfit_batches.py b/tests/trainer/flags/test_overfit_batches.py index 76c8b37405b47e..3860d85ec9836d 100644 --- a/tests/trainer/flags/test_overfit_batches.py +++ b/tests/trainer/flags/test_overfit_batches.py @@ -13,13 +13,16 @@ # limitations under the License. 
import pytest import torch +from torch.utils.data.sampler import Sampler, SequentialSampler from pytorch_lightning import Trainer from tests.helpers.boring_model import BoringModel, RandomDataset def test_overfit_multiple_val_loaders(tmpdir): - """Tests that only training_step can be used.""" + """Tests that overfit batches works with multiple val dataloaders.""" + val_dl_count = 2 + overfit_batches = 3 class TestModel(BoringModel): def validation_step(self, batch, batch_idx, dataloader_idx): @@ -31,25 +34,65 @@ def validation_epoch_end(self, outputs) -> None: pass def val_dataloader(self): - dl1 = torch.utils.data.DataLoader(RandomDataset(32, 64)) - dl2 = torch.utils.data.DataLoader(RandomDataset(32, 64)) - return [dl1, dl2] + dls = [torch.utils.data.DataLoader(RandomDataset(32, 64)) for _ in range(val_dl_count)] + return dls model = TestModel() trainer = Trainer( - default_root_dir=tmpdir, max_epochs=2, overfit_batches=1, log_every_n_steps=1, enable_model_summary=False + default_root_dir=tmpdir, + max_epochs=2, + overfit_batches=overfit_batches, + log_every_n_steps=1, + enable_model_summary=False, ) trainer.fit(model) + assert trainer.num_training_batches == overfit_batches + assert len(trainer.num_val_batches) == val_dl_count + assert all(nbatches == overfit_batches for nbatches in trainer.num_val_batches) -@pytest.mark.parametrize("overfit", [1, 2, 0.1, 0.25, 1.0]) -def test_overfit_basic(tmpdir, overfit): - """Tests that only training_step can be used.""" +@pytest.mark.parametrize("overfit_batches", [1, 2, 0.1, 0.25, 1.0]) +def test_overfit_basic(tmpdir, overfit_batches): + """Tests that only training_step can be used when overfitting.""" model = BoringModel() + model.validation_step = None + total_train_samples = len(BoringModel().train_dataloader()) - trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, overfit_batches=overfit, enable_model_summary=False) - + trainer = Trainer( + default_root_dir=tmpdir, max_epochs=1, overfit_batches=overfit_batches, enable_model_summary=False + ) trainer.fit(model) + + assert trainer.num_val_batches == [] + assert trainer.num_training_batches == int( + overfit_batches * (1 if isinstance(overfit_batches, int) else total_train_samples) + ) + + +def test_overfit_batches_raises_warning_in_case_of_sequential_sampler(tmpdir): + class NonSequentialSampler(Sampler): + def __init__(self, data_source): + self.data_source = data_source + + def __iter__(self): + return iter(range(len(self.data_source))) + + def __len__(self): + return len(self.data_source) + + class TestModel(BoringModel): + def train_dataloader(self): + dataset = RandomDataset(32, 64) + sampler = NonSequentialSampler(dataset) + return torch.utils.data.DataLoader(dataset, sampler=sampler) + + model = TestModel() + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, overfit_batches=2) + + with pytest.warns(UserWarning, match="requested to overfit but enabled training dataloader shuffling"): + trainer.fit(model) + + assert isinstance(trainer.train_dataloader.loaders.sampler, SequentialSampler) diff --git a/tests/trainer/logging_/test_distributed_logging.py b/tests/trainer/logging_/test_distributed_logging.py index 487b7f38e4e197..d4ba4f242294ac 100644 --- a/tests/trainer/logging_/test_distributed_logging.py +++ b/tests/trainer/logging_/test_distributed_logging.py @@ -59,7 +59,7 @@ def on_train_end(self): assert self.log_name.format(rank=self.local_rank) in self.logger.logs, "Expected rank to be logged" -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def 
test_all_rank_logging_ddp_cpu(tmpdir): """Check that all ranks can be logged from.""" model = TestModel() diff --git a/tests/trainer/logging_/test_train_loop_logging.py b/tests/trainer/logging_/test_train_loop_logging.py index 5b775b9968d993..22a1a2c90d7562 100644 --- a/tests/trainer/logging_/test_train_loop_logging.py +++ b/tests/trainer/logging_/test_train_loop_logging.py @@ -395,7 +395,7 @@ def validation_step(self, batch, batch_idx): return super().validation_step(batch, batch_idx) -@pytest.mark.parametrize("devices", [1, pytest.param(2, marks=RunIf(skip_windows=True))]) +@pytest.mark.parametrize("devices", [1, pytest.param(2, marks=RunIf(skip_windows=True, skip_49370=True))]) def test_logging_sync_dist_true(tmpdir, devices): """Tests to ensure that the sync_dist flag works (should just return the original value)""" fake_result = 1 diff --git a/tests/trainer/properties/test_get_model.py b/tests/trainer/properties/test_get_model.py index 6e405739e83fe9..ed81b90a2d1423 100644 --- a/tests/trainer/properties/test_get_model.py +++ b/tests/trainer/properties/test_get_model.py @@ -37,7 +37,7 @@ def test_get_model(tmpdir): trainer.fit(model) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_get_model_ddp_cpu(tmpdir): """Tests that `trainer.lightning_module` extracts the model correctly when using ddp on cpu.""" diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py index 4f3a482e37ac47..edf40ac61d5ddb 100644 --- a/tests/trainer/test_data_loading.py +++ b/tests/trainer/test_data_loading.py @@ -133,7 +133,7 @@ def _get_warning_msg(): assert warn_str in msg -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) @pytest.mark.parametrize("num_workers", [0, 1]) def test_dataloader_warnings(tmpdir, num_workers): trainer = Trainer(default_root_dir=tmpdir, strategy="ddp_spawn", num_processes=2, fast_dev_run=4) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 1ffc957659ef01..272078b1d42064 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -1276,7 +1276,7 @@ def validation_step(self, batch, batch_idx): # the val dataloader on the first epoch because this only tracks the training epoch # meaning multiple passes through the validation data within a single training epoch # would not have the dataloader reloaded. 
- # This breaks the assumption behind reload_dataloaders_every_epoch=True + # This breaks the assumption behind reload_dataloaders_every_n_epochs=True call.val_dataloader(), call.train_dataloader(), call.val_dataloader(), diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index dc0ce2b68452c8..b9fb758321a0b3 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1809,7 +1809,7 @@ def on_predict_start(self) -> None: @pytest.mark.parametrize( - "strategy,num_processes", [(None, 1), pytest.param("ddp_spawn", 2, marks=RunIf(skip_windows=True))] + "strategy,num_processes", [(None, 1), pytest.param("ddp_spawn", 2, marks=RunIf(skip_windows=True, skip_49370=True))] ) def test_model_in_correct_mode_during_stages(tmpdir, strategy, num_processes): model = TrainerStagesModel() @@ -1830,7 +1830,7 @@ def validation_epoch_end(self, outputs) -> None: pass -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_fit_test_synchronization(tmpdir): """Test that the trainer synchronizes processes before returning control back to the caller.""" tutils.set_random_main_port() diff --git a/tests/utilities/test_all_gather_grad.py b/tests/utilities/test_all_gather_grad.py index 073468fc4cb289..2ed42b0b0f21a6 100644 --- a/tests/utilities/test_all_gather_grad.py +++ b/tests/utilities/test_all_gather_grad.py @@ -41,8 +41,8 @@ def _test_all_gather_ddp(rank, world_size): assert torch.allclose(grad2, tensor2.grad) -@RunIf(skip_windows=True) -def test_all_gather_ddp(): +@RunIf(skip_windows=True, skip_49370=True) +def test_all_gather_ddp_spawn(): world_size = 3 torch.multiprocessing.spawn(_test_all_gather_ddp, args=(world_size,), nprocs=world_size) diff --git a/tests/utilities/test_meta.py b/tests/utilities/test_meta.py index 8e36a86c3beef0..581b949d9167f5 100644 --- a/tests/utilities/test_meta.py +++ b/tests/utilities/test_meta.py @@ -14,7 +14,7 @@ from torch import nn from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.utilities.meta import init_meta_context, materialize_module +from pytorch_lightning.utilities.meta import init_meta_context, is_on_meta_device, materialize_module from tests.helpers.runif import RunIf @@ -31,18 +31,23 @@ def __init__(self, num_layers: int): self.layer = nn.Sequential(*[nn.Linear(1, 1) for _ in range(self.hparams.num_layers)]) -@RunIf(min_torch="1.10.0") +@RunIf(special=True, min_torch="1.10.0") def test_init_meta_context(): with init_meta_context(): m = nn.Linear(in_features=1, out_features=1) + assert isinstance(m, nn.Linear) assert m.weight.device.type == "meta" + assert is_on_meta_device(m) mlp = MLP(4) assert mlp.layer[0].weight.device.type == "meta" mlp = materialize_module(mlp) assert mlp.layer[0].weight.device.type == "cpu" + assert not is_on_meta_device(mlp) + assert not is_on_meta_device(nn.Module()) + model = BoringModel(4) assert model.layer[0].weight.device.type == "meta" materialize_module(model)