From 870b85c85c9fa9792297ec871c18247c3169462d Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Fri, 9 Apr 2021 18:34:11 +0200
Subject: [PATCH 1/6] Run checks on windows

---
 pytorch_lightning/trainer/data_loading.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 0480c8023c3f8..44f7cb8f0346a 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 import inspect
 import multiprocessing
-import platform
 from abc import ABC
 from copy import deepcopy
 from typing import Iterable, List, Tuple, Union
@@ -54,12 +53,10 @@ class TrainerDataLoadingMixin(ABC):
     dev_debugger: InternalDebugger
 
     def _worker_check(self, dataloader: DataLoader, name: str) -> None:
-        on_windows = platform.system() == 'Windows'
-
         # ddp_spawn + num_workers > 0 don't mix! tell the user
         is_dataloader = isinstance(dataloader, DataLoader)
         using_spawn = self.accelerator_connector.distributed_backend == "ddp_spawn"
-        if is_dataloader and not on_windows:
+        if is_dataloader:
             if dataloader.num_workers > 0 and using_spawn:
                 # checks for the attr persistent_workers available in pytorch >= 1.7
                 if hasattr(dataloader, "persistent_workers"):

From 7711ed6e04f674f6222043ddc814fc638bea0573 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Fri, 9 Apr 2021 18:48:33 +0200
Subject: [PATCH 2/6] Refactor code

---
 pytorch_lightning/trainer/data_loading.py | 70 ++++++++++++-----------
 1 file changed, 36 insertions(+), 34 deletions(-)

diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 44f7cb8f0346a..59944dada330c 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -53,51 +53,53 @@ class TrainerDataLoadingMixin(ABC):
     dev_debugger: InternalDebugger
 
     def _worker_check(self, dataloader: DataLoader, name: str) -> None:
-        # ddp_spawn + num_workers > 0 don't mix! tell the user
-        is_dataloader = isinstance(dataloader, DataLoader)
+        if not isinstance(dataloader, DataLoader):
+            return
+
         using_spawn = self.accelerator_connector.distributed_backend == "ddp_spawn"
-        if is_dataloader:
-            if dataloader.num_workers > 0 and using_spawn:
-                # checks for the attr persistent_workers available in pytorch >= 1.7
-                if hasattr(dataloader, "persistent_workers"):
-                    if not dataloader.persistent_workers:
-                        rank_zero_warn(
-                            'num_workers>0, persistent_workers=False, and accelerator=ddp_spawn'
-                            ' may result in data loading bottlenecks.'
-                            ' Consider setting persistent_workers=True'
-                            ' (this is a limitation of Python .spawn() and PyTorch)'
-                        )
-                else:
+        num_cpus = multiprocessing.cpu_count()
+
+        # ddp_spawn + num_workers > 0 don't mix! tell the user
+        if dataloader.num_workers > 0 and using_spawn:
+            # checks for the attr persistent_workers available in pytorch >= 1.7
+            if hasattr(dataloader, "persistent_workers"):
+                if not dataloader.persistent_workers:
                     rank_zero_warn(
-                        'num_workers>0 and accelerator=ddp_spawn do not mix well'
-                        ' and may result in data loading bottlenecks.'
-                        ' Consider setting accelerator=ddp to use num_workers>0'
+                        'num_workers>0, persistent_workers=False, and accelerator=ddp_spawn'
+                        ' may result in data loading bottlenecks.'
+                        ' Consider setting persistent_workers=True'
                         ' (this is a limitation of Python .spawn() and PyTorch)'
                     )
+            else:
+                rank_zero_warn(
+                    'num_workers>0 and accelerator=ddp_spawn do not mix well'
+                    ' and may result in data loading bottlenecks.'
+                    ' Consider setting accelerator=ddp to use num_workers>0'
+                    ' (this is a limitation of Python .spawn() and PyTorch)'
+                )
 
-            elif dataloader.num_workers == 0 and using_spawn:
-                # checks for the attr persistent_workers available in pytorch >= 1.7
-                if hasattr(dataloader, "persistent_workers"):
-                    if not dataloader.persistent_workers:
-                        rank_zero_warn(
-                            'accelerator=ddp_spawn and num_workers=0 may result in data loading bottlenecks.'
-                            ' Consider setting num_workers>0 and persistent_workers=True'
-                        )
-                else:
+        elif dataloader.num_workers == 0 and using_spawn:
+            # checks for the attr persistent_workers available in pytorch >= 1.7
+            if hasattr(dataloader, "persistent_workers"):
+                if not dataloader.persistent_workers:
                     rank_zero_warn(
                         'accelerator=ddp_spawn and num_workers=0 may result in data loading bottlenecks.'
-                        ' Consider setting accelerator=ddp and set num_workers>0'
+                        ' Consider setting num_workers>0 and persistent_workers=True'
                     )
-
-            elif dataloader.num_workers <= 2 and multiprocessing.cpu_count() > 2 and not using_spawn:
-                num_cpus = multiprocessing.cpu_count()
+            else:
                 rank_zero_warn(
-                    f'The dataloader, {name}, does not have many workers which may be a bottleneck.'
-                    ' Consider increasing the value of the `num_workers` argument`'
-                    f' (try {num_cpus} which is the number of cpus on this machine)'
-                    f' in the `DataLoader` init to improve performance.'
+                    'accelerator=ddp_spawn and num_workers=0 may result in data loading bottlenecks.'
+                    ' Consider setting accelerator=ddp and set num_workers>0'
                 )
 
+        elif dataloader.num_workers <= 2 < num_cpus and not using_spawn:
+            rank_zero_warn(
+                f'The dataloader, {name}, does not have many workers which may be a bottleneck.'
+                ' Consider increasing the value of the `num_workers` argument`'
+                f' (try {num_cpus} which is the number of cpus on this machine)'
+                f' in the `DataLoader` init to improve performance.'
+            )
+
     def auto_add_sampler(self, dataloader: DataLoader, shuffle: bool) -> DataLoader:
 
         # don't do anything if it's not a dataloader
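Note on the refactor in PATCH 2/6 above: with accelerator='ddp_spawn', the first branch of _worker_check only stays quiet when the user's DataLoader keeps its workers alive between epochs. A minimal user-side sketch of a configuration that satisfies it — the dataset, batch size, and worker count below are illustrative placeholders, not part of this patch:

import torch
from torch.utils.data import DataLoader, TensorDataset

# Any map-style dataset works; this toy TensorDataset is only for illustration.
dataset = TensorDataset(torch.randn(64, 3), torch.randint(0, 2, (64,)))

# With accelerator='ddp_spawn', num_workers > 0 avoids the warning above only when
# persistent_workers=True (available in PyTorch >= 1.7), so workers survive across epochs.
loader = DataLoader(
    dataset,
    batch_size=8,
    num_workers=2,
    persistent_workers=True,
)

The same loader with persistent_workers left at its default of False would hit the first warning branch when used together with ddp_spawn.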
From 90d091243939d9ab07ce48ad295c3b4314a1ee52 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Fri, 9 Apr 2021 19:26:27 +0200
Subject: [PATCH 3/6] Re-running the lottery

From 839fc30987172215ee1549db65e022fd817a66cf Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Fri, 9 Apr 2021 19:58:09 +0200
Subject: [PATCH 4/6] Maybe fix horovod?

---
 tests/models/test_horovod.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index 49e4b04933eab..abedf10bc2ea5 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -253,6 +253,7 @@ def test_horovod_multi_optimizer(tmpdir):
         limit_val_batches=0.2,
         deterministic=True,
         accelerator='horovod',
+        logger=False,
     )
     trainer.fit(model)
     assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
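For context on the one-line change in PATCH 4/6 above: logger=False disables the Trainer's default TensorBoard logger, so the Horovod workers create no log directory during the test — presumably an attempt to rule logging out as the source of the CI failure. A minimal sketch of the flag in isolation; the remaining argument is a placeholder, not taken from this test:

from pytorch_lightning import Trainer

# With logger=False the default TensorBoardLogger is not created and nothing is
# written to lightning_logs/; the rest of the Trainer behaviour is unchanged.
trainer = Trainer(max_epochs=1, logger=False)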
From b9475c1ae913b647c18cbf5f445c53c2f793a7f5 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Fri, 9 Apr 2021 20:12:04 +0200
Subject: [PATCH 5/6] Didn't work

---
 tests/models/test_horovod.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index abedf10bc2ea5..4187802d2f6d0 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -49,9 +49,9 @@ def _run_horovod(trainer_options, on_gpu=False):
     # for Horovod, we interpret `gpus` to be set per worker
     trainer_options.update(gpus=1 if on_gpu else None)
     tutils.reset_seed()
-    # todo: Find why coverage breaks CI.
+    # TODO: Find out why coverage breaks CI.
     # append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else ''
-    # str(num_processes), sys.executable, '-m', 'coverage', 'run', '--source', 'pytorch_lightning', append,  # noqa E265
+    # str(num_processes), sys.executable, '-m', 'coverage', 'run', '--source', 'pytorch_lightning', append,
     cmdline = [
         'horovodrun', '-np',
         str(num_processes), sys.executable, TEST_SCRIPT, '--trainer-options',
@@ -153,7 +153,7 @@ def test_horovod_multi_gpu_grad_by_value(tmpdir):
 
 # https://discuss.pytorch.org/t/torch-cuda-amp-vs-nvidia-apex/74994
 # Check with (tgaddair) on Horovod issues if this feature is needed
-@pytest.mark.skip(reason="Horovod currently doesn't work with Apex")  # todo
+@pytest.mark.skip(reason="TODO: Horovod currently doesn't work with Apex")
 @RunIf(min_gpus=2, skip_windows=True, amp_apex=True, horovod_nccl=True)
 def test_horovod_apex(tmpdir):
     """Test Horovod with multi-GPU support using apex amp."""
@@ -240,6 +240,7 @@ def validation_step(self, batch, *args, **kwargs):
     tpipes.run_model_test_without_loggers(trainer_options, model)
 
 
+@pytest.mark.skip('TODO: flaky test - Fatal Python error: Aborted')
 @RunIf(skip_windows=True, horovod=True)
 def test_horovod_multi_optimizer(tmpdir):
     model = BasicGAN()
@@ -253,7 +254,6 @@ def test_horovod_multi_optimizer(tmpdir):
         limit_val_batches=0.2,
         deterministic=True,
         accelerator='horovod',
-        logger=False,
     )
     trainer.fit(model)
     assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
@@ -273,7 +273,7 @@ def get_optimizer_params(optimizer):
     assert get_model_params(model.discriminator) == get_optimizer_params(trainer.optimizers[1])
 
 
-@pytest.mark.skipif(reason="CI agent.jobstatus=Succeeded: Permission denied")
+@pytest.mark.skip(reason="TODO: CI agent.jobstatus=Succeeded: Permission denied")
 @RunIf(skip_windows=True, horovod=True)
 def test_result_reduce_horovod(tmpdir):
     """Make sure result logging works with Horovod.
@@ -323,7 +323,7 @@ def training_epoch_end(self, outputs) -> None:
     horovod.run(hvd_test_fn, np=2)
 
 
-@pytest.mark.skipif(reason="CI agent.jobstatus=Succeeded: Permission denied")
+@pytest.mark.skip(reason="TODO: CI agent.jobstatus=Succeeded: Permission denied")
 @RunIf(skip_windows=True, horovod=True, num_gpus=2)
 def test_accuracy_metric_horovod():
     num_batches = 10
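As background for the _run_horovod helper touched in PATCH 5/6 above: the cmdline list in the first hunk is run in a subprocess so the test script executes under horovodrun with one process per worker. A rough sketch of how those pieces assemble — num_processes, TEST_SCRIPT, and the options dict are placeholders, and the final quoting step is an assumption about what the real helper does with the list:

import json
import shlex
import sys

# Placeholder values; the real helper derives them from the test's trainer options.
num_processes = 2
TEST_SCRIPT = 'path/to/horovod_test_script.py'  # hypothetical path
trainer_options = {'max_epochs': 1}

cmdline = [
    'horovodrun', '-np',
    str(num_processes), sys.executable, TEST_SCRIPT, '--trainer-options',
    shlex.quote(json.dumps(trainer_options)),
]
# The test would hand this list to a subprocess; here we only show the command.
print(' '.join(cmdline))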
From 90cf0647f63a61aff98424eae3a997398eabc12d Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Fri, 9 Apr 2021 20:39:28 +0200
Subject: [PATCH 6/6] Apply suggestions from code review

---
 tests/models/test_horovod.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index 4187802d2f6d0..d12a755ca5d32 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -151,6 +151,7 @@ def test_horovod_multi_gpu_grad_by_value(tmpdir):
     _run_horovod(trainer_options, on_gpu=True)
 
 
+# todo: need to be fixed :]
 # https://discuss.pytorch.org/t/torch-cuda-amp-vs-nvidia-apex/74994
 # Check with (tgaddair) on Horovod issues if this feature is needed
 @pytest.mark.skip(reason="TODO: Horovod currently doesn't work with Apex")
@@ -240,6 +241,7 @@ def validation_step(self, batch, *args, **kwargs):
     tpipes.run_model_test_without_loggers(trainer_options, model)
 
 
+# todo: need to be fixed :]
 @pytest.mark.skip('TODO: flaky test - Fatal Python error: Aborted')
 @RunIf(skip_windows=True, horovod=True)
 def test_horovod_multi_optimizer(tmpdir):
@@ -273,6 +275,7 @@ def get_optimizer_params(optimizer):
     assert get_model_params(model.discriminator) == get_optimizer_params(trainer.optimizers[1])
 
 
+# todo: need to be fixed :]
 @pytest.mark.skip(reason="TODO: CI agent.jobstatus=Succeeded: Permission denied")
 @RunIf(skip_windows=True, horovod=True)
 def test_result_reduce_horovod(tmpdir):
@@ -323,6 +326,7 @@ def training_epoch_end(self, outputs) -> None:
     horovod.run(hvd_test_fn, np=2)
 
 
+# todo: need to be fixed :]
 @pytest.mark.skip(reason="TODO: CI agent.jobstatus=Succeeded: Permission denied")
 @RunIf(skip_windows=True, horovod=True, num_gpus=2)
 def test_accuracy_metric_horovod():