3053 release *_dist.py tests memory to avoid OOM #3537

Merged (12 commits) on Dec 24, 2021
2 changes: 1 addition & 1 deletion .github/pull_request_template.md
@@ -12,6 +12,6 @@ A few sentences describing the changes proposed in this pull request.
- [ ] Breaking change (fix or new feature that would cause existing functionality to change).
- [ ] New tests added to cover the changes.
- [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`.
- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests`.
- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`.
- [ ] In-line docstrings updated.
- [ ] Documentation updated, tested `make html` command in the `docs/` folder.
6 changes: 3 additions & 3 deletions .github/workflows/cron.yml
@@ -48,7 +48,7 @@ jobs:
python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
coverage xml
if pgrep python; then pkill python; fi
@@ -91,7 +91,7 @@ jobs:
python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
coverage xml
if pgrep python; then pkill python; fi
@@ -190,7 +190,7 @@ jobs:
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))'
ngc --version
BUILD_MONAI=1 ./runtests.sh --coverage --pytype --unittests # unit tests with pytype checks, coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --pytype --unittests --disttests # unit tests with pytype checks, coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
coverage xml
if pgrep python; then pkill python; fi
2 changes: 1 addition & 1 deletion .github/workflows/integration.yml
@@ -47,7 +47,7 @@ jobs:
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))'
BUILD_MONAI=1 ./runtests.sh --net
BUILD_MONAI=1 ./runtests.sh --unittests
BUILD_MONAI=1 ./runtests.sh --unittests --disttests
if pgrep python; then pkill python; fi
shell: bash
- name: Add reaction
2 changes: 1 addition & 1 deletion .github/workflows/pythonapp-gpu.yml
@@ -123,7 +123,7 @@ jobs:
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
python -c "import monai; monai.config.print_config()"
# build for the current self-hosted CI Tesla V100
BUILD_MONAI=1 TORCH_CUDA_ARCH_LIST="7.0" ./runtests.sh --quick --unittests
BUILD_MONAI=1 TORCH_CUDA_ARCH_LIST="7.0" ./runtests.sh --quick --unittests --disttests
if [ ${{ matrix.environment }} = "PT110+CUDA102" ]; then
# test the clang-format tool downloading once
coverage run -m tests.clang_format_utils
4 changes: 2 additions & 2 deletions .github/workflows/setupapp.yml
@@ -59,7 +59,7 @@ jobs:
python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
coverage xml
if pgrep python; then pkill python; fi
@@ -104,7 +104,7 @@ run: |
run: |
python -m pip list
python -c 'import torch; print(torch.__version__); print(torch.rand(5,3))'
BUILD_MONAI=1 ./runtests.sh --quick --unittests
BUILD_MONAI=1 ./runtests.sh --quick --unittests --disttests
coverage xml
- name: Upload coverage
uses: codecov/codecov-action@v1
8 changes: 6 additions & 2 deletions runtests.sh
@@ -567,15 +567,19 @@ if [ $doUnitTests = true ]
then
echo "${separator}${blue}unittests${noColor}"
torch_validate
${cmdPrefix}${cmd} ./tests/runner.py -p "test_((?!integration).)"
${cmdPrefix}${cmd} ./tests/runner.py -p "^(?!test_integration).*(?<!_dist)$" # excluding integration/dist tests
fi

# distributed test only
if [ $doDistTests = true ]
then
echo "${separator}${blue}run distributed unit test cases${noColor}"
torch_validate
${cmdPrefix}${cmd} ./tests/runner.py -p "test_.*_dist$"
for i in tests/test_*_dist.py
do
echo "$i"
${cmdPrefix}${cmd} "$i"
done
fi

# network training/inference/eval integration tests
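
To illustrate the filtering logic above, a minimal sketch of how the new unit-test pattern partitions module names (sample names chosen for illustration; this assumes `tests/runner.py` treats its `-p` argument as a Python regular expression):

    import re

    # Integration tests and *_dist tests are excluded from the unit run; the
    # loop above then executes each tests/test_*_dist.py file in its own
    # Python process, so GPU memory is fully released between dist tests.
    unit_pattern = re.compile(r"^(?!test_integration).*(?<!_dist)$")

    for name in ("test_densenet", "test_integration_classification_2d", "test_utils_dist"):
        print(name, "->", "unit run" if unit_pattern.match(name) else "excluded")
    # test_densenet -> unit run
    # test_integration_classification_2d -> excluded
    # test_utils_dist -> excluded
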
3 changes: 2 additions & 1 deletion tests/test_densenet.py
@@ -19,7 +19,7 @@
from monai.networks import eval_mode
from monai.networks.nets import DenseNet121, Densenet169, DenseNet264, densenet201
from monai.utils import optional_import
from tests.utils import skip_if_quick, test_script_save
from tests.utils import SkipIfGPUMemoryLessThan, skip_if_quick, test_script_save

if TYPE_CHECKING:
import torchvision
@@ -90,6 +90,7 @@ def test_121_2d_shape_pretrain(self, model, input_param, input_shape, expected_s

@parameterized.expand([TEST_PRETRAINED_2D_CASE_3])
@skipUnless(has_torchvision, "Requires `torchvision` package.")
@SkipIfGPUMemoryLessThan(1024)
def test_pretrain_consistency(self, model, input_param, input_shape):
example = torch.randn(input_shape).to(device)
net = model(**input_param).to(device)
60 changes: 60 additions & 0 deletions tests/utils.py
@@ -186,6 +186,29 @@ def skip_if_windows(obj):
return unittest.skipIf(sys.platform == "win32", "Skipping tests on Windows")(obj)


class SkipIfGPUMemoryLessThan:
"""
Skip the decorated test if the selected GPU has less free memory than the required amount (in MB).
"""

def __init__(self, required_mb=1000, idx=0):
"""
Args:
required_mb: minimum free GPU memory in MB.
idx: index of the CUDA device to query.

"""
self.required_mb = required_mb
self.idx = idx

def __call__(self, obj):
_mem = get_gpu_memory(self.idx)
return unittest.skipIf(
_mem < self.required_mb,
f"Skipping because GPU has less than {self.required_mb} MB of memory on device {self.idx} (avail. {_mem}).",
)(obj)
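
For reference, a minimal usage sketch of this decorator (the test class, threshold, and tensor shape are illustrative, not part of the PR):

    import unittest
    import torch
    from tests.utils import SkipIfGPUMemoryLessThan

    class TestBigModel(unittest.TestCase):
        # hypothetical test: runs only when GPU 0 reports at least 2 GB free
        @SkipIfGPUMemoryLessThan(required_mb=2048, idx=0)
        def test_forward(self):
            x = torch.randn(1, 3, 224, 224, device="cuda:0")
            self.assertEqual(x.shape[0], 1)

Note that the free-memory query runs when the decorator is applied, i.e. at module import time, not when the test itself executes.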


class SkipIfBeforePyTorchVersion:
"""Decorator to be used if test should be skipped
with PyTorch versions older than that given."""
@@ -292,6 +315,7 @@ def __init__(
backend: Optional[str] = None,
daemon: Optional[bool] = None,
method: Optional[str] = "spawn",
min_gpu_memory=9000,
verbose: bool = False,
):
"""
@@ -311,6 +335,7 @@
When daemon=None, the initial value is inherited from the creating process.
method: set the method which should be used to start a child process.
method can be 'fork', 'spawn' or 'forkserver'.
min_gpu_memory: total GPU memory (in megabytes) required to run the test; the budget is split
evenly, so each of the ``nproc_per_node`` devices must have at least
``min_gpu_memory / nproc_per_node`` MB free, otherwise the test is skipped.
verbose: whether to print NCCL debug info.
"""
self.nnodes = int(nnodes)
@@ -333,6 +358,7 @@ def __init__(
self.timeout = datetime.timedelta(0, timeout)
self.daemon = daemon
self.method = method
self.min_gpu_memory = min_gpu_memory / self.nproc_per_node
self.verbose = verbose

def run_process(self, func, local_rank, args, kwargs, results):
@@ -387,6 +413,14 @@ def __call__(self, obj):
f"Skipping distributed tests because it requires {self.nnodes} devices "
f"but got {torch.cuda.device_count()}",
)(obj)
for i in range(self.nproc_per_node): # check free memory for the current node
free_mem = get_gpu_memory(i)
if free_mem < self.min_gpu_memory:
return unittest.skipIf(
True,
f"Skipping distributed tests because it requires at least {self.min_gpu_memory}MB gpu memory "
f"but got {free_mem}MB on gpu {i}",
)(obj)

_cache_original_func(obj)

@@ -406,6 +440,7 @@ def _wrapper(*args, **kwargs):
for p in processes:
p.join()
assert results.get(), "Distributed call failed."
_del_original_func(obj)

return _wrapper
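
To make the memory budget concrete, a small worked example (the process count is illustrative; 9000 MB is the constructor default above):

    # DistCall(..., min_gpu_memory=9000) on a single node with two processes:
    nproc_per_node = 2
    min_gpu_memory = 9000  # MB, shared across the node's processes
    per_process = min_gpu_memory / nproc_per_node  # 4500.0 MB
    # __call__ queries get_gpu_memory(i) for i in 0..nproc_per_node-1 and
    # skips the whole distributed test if any device has < 4500 MB free.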

@@ -487,6 +522,7 @@ def _wrapper(*args, **kwargs):
finally:
p.join()

_del_original_func(obj)
res = None
try:
res = results.get(block=False)
@@ -512,6 +548,15 @@ def _cache_original_func(obj) -> None:
_original_funcs[obj.__name__] = obj


def _del_original_func(obj):
"""pop the original function from cache."""
global _original_funcs
_original_funcs.pop(obj.__name__, None)
if torch.cuda.is_available(): # release GPU memory cached by the finished test
torch.cuda.synchronize()
torch.cuda.empty_cache()


def _call_original_func(name, module, *args, **kwargs):
if name not in _original_funcs:
_original_module = importlib.import_module(module) # reimport, refresh _original_funcs
@@ -616,6 +661,21 @@ def query_memory(n=2):
return ",".join(f"{int(x)}" for x in ids)


def get_gpu_memory(idx=0) -> float:
"""
Return the free memory of GPU ``idx`` in MB; 0.0 is returned when the query fails, so memory-gated tests are skipped conservatively.
"""
bash_string = f"nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits --id={idx}"

try:
p1 = Popen(bash_string.split(), stdout=PIPE)
output, error = p1.communicate()
free_memory = [x.split(",") for x in output.decode("utf-8").split("\n")[:-1]]
return float(np.asarray(free_memory, dtype=float).ravel()[0])
except (TypeError, IndexError, OSError, ValueError):
return float(0.0)
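
As a sanity check on the parsing above, a sketch with simulated nvidia-smi output (the memory value is made up):

    import numpy as np

    # simulated stdout of:
    #   nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits --id=0
    output = b"10240\n"
    rows = [x.split(",") for x in output.decode("utf-8").split("\n")[:-1]]
    print(float(np.asarray(rows, dtype=float).ravel()[0]))  # 10240.0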


TEST_NDARRAYS: Tuple[Callable] = (np.array, torch.as_tensor) # type: ignore
if torch.cuda.is_available():
gpu_tensor: Callable = partial(torch.as_tensor, device="cuda")