Skip to content

Commit

Permalink
3053 release *_dist.py tests memory to avoid OOM (#3537)
Browse files Browse the repository at this point in the history
* adds min. memory testing utils

Signed-off-by: Wenqi Li <wenqil@nvidia.com>

* include valueerror for robust outcome

Signed-off-by: Wenqi Li <wenqil@nvidia.com>

* ensure float

Signed-off-by: Wenqi Li <wenqil@nvidia.com>

* msg improvements

Signed-off-by: Wenqi Li <wenqil@nvidia.com>

* update threshold

Signed-off-by: Wenqi Li <wenqil@nvidia.com>

* remove ref

Signed-off-by: Wenqi Li <wenqil@nvidia.com>

* separate disttests

Signed-off-by: Wenqi Li <wenqil@nvidia.com>

* update based on comments

Signed-off-by: Wenqi Li <wenqil@nvidia.com>
  • Loading branch information
wyli authored Dec 24, 2021
1 parent 21c5f6d commit 7f23f38
Show file tree
Hide file tree
Showing 7 changed files with 27 additions and 13 deletions.
2 changes: 1 addition & 1 deletion .github/pull_request_template.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ A few sentences describing the changes proposed in this pull request.
- [ ] Breaking change (fix or new feature that would cause existing functionality to change).
- [ ] New tests added to cover the changes.
- [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`.
- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests`.
- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`.
- [ ] In-line docstrings updated.
- [ ] Documentation updated, tested `make html` command in the `docs/` folder.
6 changes: 3 additions & 3 deletions .github/workflows/cron.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ jobs:
python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
coverage xml
if pgrep python; then pkill python; fi
Expand Down Expand Up @@ -91,7 +91,7 @@ jobs:
python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
coverage xml
if pgrep python; then pkill python; fi
Expand Down Expand Up @@ -190,7 +190,7 @@ jobs:
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))'
ngc --version
BUILD_MONAI=1 ./runtests.sh --coverage --pytype --unittests # unit tests with pytype checks, coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --pytype --unittests --disttests # unit tests with pytype checks, coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
coverage xml
if pgrep python; then pkill python; fi
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))'
BUILD_MONAI=1 ./runtests.sh --net
BUILD_MONAI=1 ./runtests.sh --unittests
BUILD_MONAI=1 ./runtests.sh --unittests --disttests
if pgrep python; then pkill python; fi
shell: bash
- name: Add reaction
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pythonapp-gpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ jobs:
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
python -c "import monai; monai.config.print_config()"
# build for the current self-hosted CI Tesla V100
BUILD_MONAI=1 TORCH_CUDA_ARCH_LIST="7.0" ./runtests.sh --quick --unittests
BUILD_MONAI=1 TORCH_CUDA_ARCH_LIST="7.0" ./runtests.sh --quick --unittests --disttests
if [ ${{ matrix.environment }} = "PT110+CUDA102" ]; then
# test the clang-format tool downloading once
coverage run -m tests.clang_format_utils
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/setupapp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ jobs:
python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
coverage xml
if pgrep python; then pkill python; fi
Expand Down Expand Up @@ -104,7 +104,7 @@ jobs:
run: |
python -m pip list
python -c 'import torch; print(torch.__version__); print(torch.rand(5,3))'
BUILD_MONAI=1 ./runtests.sh --quick --unittests
BUILD_MONAI=1 ./runtests.sh --quick --unittests --disttests
coverage xml
- name: Upload coverage
uses: codecov/codecov-action@v1
Expand Down
8 changes: 6 additions & 2 deletions runtests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -567,15 +567,19 @@ if [ $doUnitTests = true ]
then
echo "${separator}${blue}unittests${noColor}"
torch_validate
${cmdPrefix}${cmd} ./tests/runner.py -p "test_((?!integration).)"
${cmdPrefix}${cmd} ./tests/runner.py -p "^(?!test_integration).*(?<!_dist)$" # excluding integration/dist tests
fi

# distributed test only
if [ $doDistTests = true ]
then
echo "${separator}${blue}run distributed unit test cases${noColor}"
torch_validate
${cmdPrefix}${cmd} ./tests/runner.py -p "test_.*_dist$"
for i in tests/test_*_dist.py
do
echo "$i"
${cmdPrefix}${cmd} "$i"
done
fi

# network training/inference/eval integration tests
Expand Down
16 changes: 13 additions & 3 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,8 +371,7 @@ def run_process(self, func, local_rank, args, kwargs, results):
os.environ["RANK"] = str(self.nproc_per_node * self.node_rank + local_rank)

if torch.cuda.is_available():
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
torch.cuda.set_device(int(local_rank))
torch.cuda.set_device(int(local_rank))  # using device ids from CUDA_VISIBLE_DEVICES

dist.init_process_group(
backend=self.backend,
Expand Down Expand Up @@ -427,6 +426,7 @@ def _wrapper(*args, **kwargs):
for p in processes:
p.join()
assert results.get(), "Distributed call failed."
_del_original_func(obj)

return _wrapper

Expand Down Expand Up @@ -508,6 +508,7 @@ def _wrapper(*args, **kwargs):
finally:
p.join()

_del_original_func(obj)
res = None
try:
res = results.get(block=False)
Expand All @@ -533,6 +534,15 @@ def _cache_original_func(obj) -> None:
_original_funcs[obj.__name__] = obj


def _del_original_func(obj):
"""pop the original function from cache."""
global _original_funcs
_original_funcs.pop(obj.__name__, None)
if torch.cuda.is_available(): # clean up the cached function
torch.cuda.synchronize()
torch.cuda.empty_cache()


def _call_original_func(name, module, *args, **kwargs):
if name not in _original_funcs:
_original_module = importlib.import_module(module) # reimport, refresh _original_funcs
Expand Down Expand Up @@ -621,7 +631,7 @@ def test_script_save(net, *inputs, device=None, rtol=1e-4, atol=0.0):

def query_memory(n=2):
"""
Find best n idle devices and return a string of device ids.
Find best n idle devices and return a string of device ids using the `nvidia-smi` command.
"""
bash_string = "nvidia-smi --query-gpu=power.draw,temperature.gpu,memory.used --format=csv,noheader,nounits"

Expand Down

0 comments on commit 7f23f38

Please sign in to comment.