From e17712e5c35eb9f88589b6492d6537ad90fff78f Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 1 Oct 2020 12:34:12 -0400 Subject: [PATCH] part 5 of #3733 (#3774) * ref: part 4 of #3733 * ref: part 4 of #3733 * ref: part 4 of #3733 --- tests/backends/test_ddp.py | 57 +++++++++++++++++++ .../models/data/ddp/train_test_variations.py | 44 -------------- tests/models/test_gpu.py | 57 ------------------- 3 files changed, 57 insertions(+), 101 deletions(-) create mode 100644 tests/backends/test_ddp.py delete mode 100644 tests/models/data/ddp/train_test_variations.py diff --git a/tests/backends/test_ddp.py b/tests/backends/test_ddp.py new file mode 100644 index 0000000000000..91f22c4d7c59d --- /dev/null +++ b/tests/backends/test_ddp.py @@ -0,0 +1,57 @@ +import pytest +import torch +import os +from tests.backends import ddp_model +from tests.utilities.dist import call_training_script + + +@pytest.mark.parametrize('cli_args', [ + pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'), +]) +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_multi_gpu_model_ddp_fit_only(tmpdir, cli_args): + # call the script + std, err = call_training_script(ddp_model, cli_args, 'fit', tmpdir, timeout=120) + + # load the results of the script + result_path = os.path.join(tmpdir, 'ddp.result') + result = torch.load(result_path) + + # verify the file wrote the expected outputs + assert result['status'] == 'complete' + + +@pytest.mark.parametrize('cli_args', [ + pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'), +]) +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_multi_gpu_model_ddp_test_only(tmpdir, cli_args): + # call the script + call_training_script(ddp_model, cli_args, 'test', tmpdir) + + # load the results of the script + result_path = os.path.join(tmpdir, 'ddp.result') + result = torch.load(result_path) + + # verify the file wrote the expected outputs + assert result['status'] == 'complete' + + +# @pytest.mark.parametrize('cli_args', [ +# pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'), +# ]) +# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +# def test_multi_gpu_model_ddp_fit_test(tmpdir, cli_args): +# # call the script +# call_training_script(ddp_model, cli_args, 'fit_test', tmpdir, timeout=20) +# +# # load the results of the script +# result_path = os.path.join(tmpdir, 'ddp.result') +# result = torch.load(result_path) +# +# # verify the file wrote the expected outputs +# assert result['status'] == 'complete' +# +# model_outs = result['result'] +# for out in model_outs: +# assert out['test_acc'] > 0.90 diff --git a/tests/models/data/ddp/train_test_variations.py b/tests/models/data/ddp/train_test_variations.py deleted file mode 100644 index f37bd27e8a005..0000000000000 --- a/tests/models/data/ddp/train_test_variations.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Runs either `.fit()` or `.test()` on a single node across multiple gpus. 
-""" -from argparse import ArgumentParser - -from pytorch_lightning import Trainer, seed_everything -from tests.base import EvalModelTemplate - - -def variation_fit(trainer, model): - trainer.fit(model) - - -def variation_test(trainer, model): - trainer.test(model) - - -def get_variations(): - variations = [ - "variation_fit", - "variation_test", - ] - return variations - - -def main(): - seed_everything(1234) - parser = ArgumentParser(add_help=False) - parser = Trainer.add_argparse_args(parser) - parser.add_argument('--variation', default=variation_fit.__name__) - parser.set_defaults(gpus=2) - parser.set_defaults(distributed_backend="ddp") - args = parser.parse_args() - - model = EvalModelTemplate() - trainer = Trainer.from_argparse_args(args) - - # run the chosen variation - run_variation = globals()[args.variation] - run_variation(trainer, model) - - -if __name__ == '__main__': - main() diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 56a58760ee542..8c2be4cabc594 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -1,15 +1,10 @@ -import os -import subprocess -import sys from collections import namedtuple -from pathlib import Path from unittest.mock import patch import pytest import torch from torchtext.data import Batch, Dataset, Example, Field, LabelField -import pytorch_lightning import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer @@ -17,9 +12,7 @@ from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate -from tests.models.data.ddp import train_test_variations from pytorch_lightning.accelerators.gpu_backend import GPUBackend -from pytorch_lightning.accelerators.cpu_backend import CPUBackend PRETEND_N_OF_GPUS = 16 @@ -84,34 +77,6 @@ def test_multi_gpu_model_dp(tmpdir): memory.get_memory_profile('min_max') -@pytest.mark.parametrize('cli_args', [ - pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'), -]) -@pytest.mark.parametrize('variation', train_test_variations.get_variations()) -@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): - """ Runs a basic training and test run with distributed_backend=ddp. 
""" - file = Path(train_test_variations.__file__).absolute() - cli_args = cli_args.split(' ') if cli_args else [] - cli_args += ['--default_root_dir', str(tmpdir)] - cli_args += ['--variation', variation] - command = [sys.executable, str(file)] + cli_args - - # need to set the PYTHONPATH in case pytorch_lightning was not installed into the environment - env = os.environ.copy() - env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '') - - # for running in ddp mode, we need to lauch it's own process or pytest will get stuck - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) - - std, err = p.communicate(timeout=60) - std = std.decode('utf-8').strip() - err = err.decode('utf-8').strip() - assert std, f"{variation} produced no output" - if p.returncode > 0: - pytest.fail(err) - - @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") @pytest.mark.parametrize('gpus', [1, [0], [1]]) def test_single_gpu_model(tmpdir, gpus): @@ -129,28 +94,6 @@ def test_single_gpu_model(tmpdir, gpus): tpipes.run_model_test(trainer_options, model) -@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -def test_ddp_all_dataloaders_passed_to_fit(tmpdir): - """Make sure DDP works with dataloaders passed to fit()""" - tutils.set_random_master_port() - - model = EvalModelTemplate() - fit_options = dict(train_dataloader=model.train_dataloader(), - val_dataloaders=model.val_dataloader()) - - trainer = Trainer( - default_root_dir=tmpdir, - progress_bar_refresh_rate=0, - max_epochs=1, - limit_train_batches=0.2, - limit_val_batches=0.2, - gpus=[0, 1], - distributed_backend='ddp_spawn' - ) - result = trainer.fit(model, **fit_options) - assert result == 1, "DDP doesn't work with dataloaders passed to fit()." - - @pytest.fixture def mocked_device_count(monkeypatch): def device_count():