From 870b85c85c9fa9792297ec871c18247c3169462d Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Fri, 9 Apr 2021 18:34:11 +0200
Subject: [PATCH 1/6] Run checks on windows

---
 pytorch_lightning/trainer/data_loading.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 0480c8023c3f8..44f7cb8f0346a 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 import inspect
 import multiprocessing
-import platform
 from abc import ABC
 from copy import deepcopy
 from typing import Iterable, List, Tuple, Union
@@ -54,12 +53,10 @@ class TrainerDataLoadingMixin(ABC):
     dev_debugger: InternalDebugger
 
     def _worker_check(self, dataloader: DataLoader, name: str) -> None:
-        on_windows = platform.system() == 'Windows'
-
         # ddp_spawn + num_workers > 0 don't mix! tell the user
         is_dataloader = isinstance(dataloader, DataLoader)
         using_spawn = self.accelerator_connector.distributed_backend == "ddp_spawn"
-        if is_dataloader and not on_windows:
+        if is_dataloader:
             if dataloader.num_workers > 0 and using_spawn:
                 # checks for the attr persistent_workers available in pytorch >= 1.7
                 if hasattr(dataloader, "persistent_workers"):

From 7711ed6e04f674f6222043ddc814fc638bea0573 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Fri, 9 Apr 2021 18:48:33 +0200
Subject: [PATCH 2/6] Refactor code

---
 pytorch_lightning/trainer/data_loading.py | 70 ++++++++++++-----------
 1 file changed, 36 insertions(+), 34 deletions(-)

diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 44f7cb8f0346a..59944dada330c 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -53,51 +53,53 @@ class TrainerDataLoadingMixin(ABC):
     dev_debugger: InternalDebugger
 
     def _worker_check(self, dataloader: DataLoader, name: str) -> None:
-        # ddp_spawn + num_workers > 0 don't mix! tell the user
-        is_dataloader = isinstance(dataloader, DataLoader)
+        if not isinstance(dataloader, DataLoader):
+            return
+
         using_spawn = self.accelerator_connector.distributed_backend == "ddp_spawn"
-        if is_dataloader:
-            if dataloader.num_workers > 0 and using_spawn:
-                # checks for the attr persistent_workers available in pytorch >= 1.7
-                if hasattr(dataloader, "persistent_workers"):
-                    if not dataloader.persistent_workers:
-                        rank_zero_warn(
-                            'num_workers>0, persistent_workers=False, and accelerator=ddp_spawn'
-                            ' may result in data loading bottlenecks.'
-                            ' Consider setting persistent_workers=True'
-                            ' (this is a limitation of Python .spawn() and PyTorch)'
-                        )
-                else:
+        num_cpus = multiprocessing.cpu_count()
+
+        # ddp_spawn + num_workers > 0 don't mix! tell the user
+        if dataloader.num_workers > 0 and using_spawn:
+            # checks for the attr persistent_workers available in pytorch >= 1.7
+            if hasattr(dataloader, "persistent_workers"):
+                if not dataloader.persistent_workers:
                     rank_zero_warn(
-                        'num_workers>0 and accelerator=ddp_spawn do not mix well'
-                        ' and may result in data loading bottlenecks.'
-                        ' Consider setting accelerator=ddp to use num_workers>0'
+                        'num_workers>0, persistent_workers=False, and accelerator=ddp_spawn'
+                        ' may result in data loading bottlenecks.'
+                        ' Consider setting persistent_workers=True'
                         ' (this is a limitation of Python .spawn() and PyTorch)'
                     )
+            else:
+                rank_zero_warn(
+                    'num_workers>0 and accelerator=ddp_spawn do not mix well'
+                    ' and may result in data loading bottlenecks.'
+                    ' Consider setting accelerator=ddp to use num_workers>0'
+                    ' (this is a limitation of Python .spawn() and PyTorch)'
+                )
 
-            elif dataloader.num_workers == 0 and using_spawn:
-                # checks for the attr persistent_workers available in pytorch >= 1.7
-                if hasattr(dataloader, "persistent_workers"):
-                    if not dataloader.persistent_workers:
-                        rank_zero_warn(
-                            'accelerator=ddp_spawn and num_workers=0 may result in data loading bottlenecks.'
-                            ' Consider setting num_workers>0 and persistent_workers=True'
-                        )
-                else:
+        elif dataloader.num_workers == 0 and using_spawn:
+            # checks for the attr persistent_workers available in pytorch >= 1.7
+            if hasattr(dataloader, "persistent_workers"):
+                if not dataloader.persistent_workers:
                     rank_zero_warn(
                         'accelerator=ddp_spawn and num_workers=0 may result in data loading bottlenecks.'
-                        ' Consider setting accelerator=ddp and set num_workers>0'
+                        ' Consider setting num_workers>0 and persistent_workers=True'
                     )
-
-            elif dataloader.num_workers <= 2 and multiprocessing.cpu_count() > 2 and not using_spawn:
-                num_cpus = multiprocessing.cpu_count()
+            else:
                 rank_zero_warn(
-                    f'The dataloader, {name}, does not have many workers which may be a bottleneck.'
-                    ' Consider increasing the value of the `num_workers` argument`'
-                    f' (try {num_cpus} which is the number of cpus on this machine)'
-                    f' in the `DataLoader` init to improve performance.'
+                    'accelerator=ddp_spawn and num_workers=0 may result in data loading bottlenecks.'
+                    ' Consider setting accelerator=ddp and set num_workers>0'
                 )
 
+        elif dataloader.num_workers <= 2 < num_cpus and not using_spawn:
+            rank_zero_warn(
+                f'The dataloader, {name}, does not have many workers which may be a bottleneck.'
+                ' Consider increasing the value of the `num_workers` argument`'
+                f' (try {num_cpus} which is the number of cpus on this machine)'
+                f' in the `DataLoader` init to improve performance.'
+            )
+
     def auto_add_sampler(self, dataloader: DataLoader, shuffle: bool) -> DataLoader:
 
         # don't do anything if it's not a dataloader
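Note on the refactor in PATCH 2/6 above: with accelerator='ddp_spawn', the first branch of _worker_check only stays quiet when the user's DataLoader keeps its workers alive between epochs. A minimal user-side sketch of a configuration that satisfies it — the dataset, batch size, and worker count below are illustrative placeholders, not part of this patch:

import torch
from torch.utils.data import DataLoader, TensorDataset

# Any map-style dataset works; this toy TensorDataset is only for illustration.
dataset = TensorDataset(torch.randn(64, 3), torch.randint(0, 2, (64,)))

# With accelerator='ddp_spawn', num_workers > 0 avoids the warning above only when
# persistent_workers=True (available in PyTorch >= 1.7), so workers survive across epochs.
loader = DataLoader(
    dataset,
    batch_size=8,
    num_workers=2,
    persistent_workers=True,
)

The same loader with persistent_workers left at its default of False would hit the first warning branch when used together with ddp_spawn.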
From 90d091243939d9ab07ce48ad295c3b4314a1ee52 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Fri, 9 Apr 2021 19:26:27 +0200
Subject: [PATCH 3/6] Re-running the lottery

From 839fc30987172215ee1549db65e022fd817a66cf Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Fri, 9 Apr 2021 19:58:09 +0200
Subject: [PATCH 4/6] Maybe fix horovod?

---
 tests/models/test_horovod.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index 49e4b04933eab..abedf10bc2ea5 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -253,6 +253,7 @@ def test_horovod_multi_optimizer(tmpdir):
         limit_val_batches=0.2,
         deterministic=True,
         accelerator='horovod',
+        logger=False,
     )
     trainer.fit(model)
     assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
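For context on the one-line change in PATCH 4/6 above: logger=False disables the Trainer's default TensorBoard logger, so the Horovod workers create no log directory during the test — presumably an attempt to rule logging out as the source of the CI failure. A minimal sketch of the flag in isolation; the remaining argument is a placeholder, not taken from this test:

from pytorch_lightning import Trainer

# With logger=False the default TensorBoardLogger is not created and nothing is
# written to lightning_logs/; the rest of the Trainer behaviour is unchanged.
trainer = Trainer(max_epochs=1, logger=False)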
From b9475c1ae913b647c18cbf5f445c53c2f793a7f5 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Fri, 9 Apr 2021 20:12:04 +0200
Subject: [PATCH 5/6] Didn't work

---
 tests/models/test_horovod.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index abedf10bc2ea5..4187802d2f6d0 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -49,9 +49,9 @@ def _run_horovod(trainer_options, on_gpu=False):
     # for Horovod, we interpret `gpus` to be set per worker
     trainer_options.update(gpus=1 if on_gpu else None)
     tutils.reset_seed()
-    # todo: Find why coverage breaks CI.
+    # TODO: Find out why coverage breaks CI.
     # append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else ''
-    # str(num_processes), sys.executable, '-m', 'coverage', 'run', '--source', 'pytorch_lightning', append,  # noqa E265
+    # str(num_processes), sys.executable, '-m', 'coverage', 'run', '--source', 'pytorch_lightning', append,
     cmdline = [
         'horovodrun', '-np',
         str(num_processes), sys.executable, TEST_SCRIPT, '--trainer-options',
@@ -153,7 +153,7 @@ def test_horovod_multi_gpu_grad_by_value(tmpdir):
 
 # https://discuss.pytorch.org/t/torch-cuda-amp-vs-nvidia-apex/74994
 # Check with (tgaddair) on Horovod issues if this feature is needed
-@pytest.mark.skip(reason="Horovod currently doesn't work with Apex")  # todo
+@pytest.mark.skip(reason="TODO: Horovod currently doesn't work with Apex")
 @RunIf(min_gpus=2, skip_windows=True, amp_apex=True, horovod_nccl=True)
 def test_horovod_apex(tmpdir):
     """Test Horovod with multi-GPU support using apex amp."""
@@ -240,6 +240,7 @@ def validation_step(self, batch, *args, **kwargs):
     tpipes.run_model_test_without_loggers(trainer_options, model)
 
 
+@pytest.mark.skip('TODO: flaky test - Fatal Python error: Aborted')
 @RunIf(skip_windows=True, horovod=True)
 def test_horovod_multi_optimizer(tmpdir):
     model = BasicGAN()
@@ -253,7 +254,6 @@ def test_horovod_multi_optimizer(tmpdir):
         limit_val_batches=0.2,
         deterministic=True,
         accelerator='horovod',
-        logger=False,
     )
     trainer.fit(model)
     assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
@@ -273,7 +273,7 @@ def get_optimizer_params(optimizer):
     assert get_model_params(model.discriminator) == get_optimizer_params(trainer.optimizers[1])
 
 
-@pytest.mark.skipif(reason="CI agent.jobstatus=Succeeded: Permission denied")
+@pytest.mark.skip(reason="TODO: CI agent.jobstatus=Succeeded: Permission denied")
 @RunIf(skip_windows=True, horovod=True)
 def test_result_reduce_horovod(tmpdir):
     """Make sure result logging works with Horovod.
@@ -323,7 +323,7 @@ def training_epoch_end(self, outputs) -> None:
     horovod.run(hvd_test_fn, np=2)
 
 
-@pytest.mark.skipif(reason="CI agent.jobstatus=Succeeded: Permission denied")
+@pytest.mark.skip(reason="TODO: CI agent.jobstatus=Succeeded: Permission denied")
 @RunIf(skip_windows=True, horovod=True, num_gpus=2)
 def test_accuracy_metric_horovod():
     num_batches = 10
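As background for the _run_horovod helper touched in PATCH 5/6 above: the cmdline list in the first hunk is run in a subprocess so the test script executes under horovodrun with one process per worker. A rough sketch of how those pieces assemble — num_processes, TEST_SCRIPT, and the options dict are placeholders, and the final quoting step is an assumption about what the real helper does with the list:

import json
import shlex
import sys

# Placeholder values; the real helper derives them from the test's trainer options.
num_processes = 2
TEST_SCRIPT = 'path/to/horovod_test_script.py'  # hypothetical path
trainer_options = {'max_epochs': 1}

cmdline = [
    'horovodrun', '-np',
    str(num_processes), sys.executable, TEST_SCRIPT, '--trainer-options',
    shlex.quote(json.dumps(trainer_options)),
]
# The test would hand this list to a subprocess; here we only show the command.
print(' '.join(cmdline))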
From 90cf0647f63a61aff98424eae3a997398eabc12d Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Fri, 9 Apr 2021 20:39:28 +0200
Subject: [PATCH 6/6] Apply suggestions from code review

---
 tests/models/test_horovod.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index 4187802d2f6d0..d12a755ca5d32 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -151,6 +151,7 @@ def test_horovod_multi_gpu_grad_by_value(tmpdir):
     _run_horovod(trainer_options, on_gpu=True)
 
 
+# todo: need to be fixed :]
 # https://discuss.pytorch.org/t/torch-cuda-amp-vs-nvidia-apex/74994
 # Check with (tgaddair) on Horovod issues if this feature is needed
 @pytest.mark.skip(reason="TODO: Horovod currently doesn't work with Apex")
@@ -240,6 +241,7 @@ def validation_step(self, batch, *args, **kwargs):
     tpipes.run_model_test_without_loggers(trainer_options, model)
 
 
+# todo: need to be fixed :]
 @pytest.mark.skip('TODO: flaky test - Fatal Python error: Aborted')
 @RunIf(skip_windows=True, horovod=True)
 def test_horovod_multi_optimizer(tmpdir):
@@ -273,6 +275,7 @@ def get_optimizer_params(optimizer):
     assert get_model_params(model.discriminator) == get_optimizer_params(trainer.optimizers[1])
 
 
+# todo: need to be fixed :]
 @pytest.mark.skip(reason="TODO: CI agent.jobstatus=Succeeded: Permission denied")
 @RunIf(skip_windows=True, horovod=True)
 def test_result_reduce_horovod(tmpdir):
@@ -323,6 +326,7 @@ def training_epoch_end(self, outputs) -> None:
     horovod.run(hvd_test_fn, np=2)
 
 
+# todo: need to be fixed :]
 @pytest.mark.skip(reason="TODO: CI agent.jobstatus=Succeeded: Permission denied")
 @RunIf(skip_windows=True, horovod=True, num_gpus=2)
 def test_accuracy_metric_horovod():