3053 release *_dist.py tests memory to avoid OOM #3537

Merged (12 commits) on Dec 24, 2021
2 changes: 1 addition & 1 deletion .github/pull_request_template.md
@@ -12,6 +12,6 @@ A few sentences describing the changes proposed in this pull request.
- [ ] Breaking change (fix or new feature that would cause existing functionality to change).
- [ ] New tests added to cover the changes.
- [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`.
- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests`.
- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`.
- [ ] In-line docstrings updated.
- [ ] Documentation updated, tested `make html` command in the `docs/` folder.
6 changes: 3 additions & 3 deletions .github/workflows/cron.yml
@@ -48,7 +48,7 @@ jobs:
python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
coverage xml
if pgrep python; then pkill python; fi
@@ -91,7 +91,7 @@ jobs:
python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
coverage xml
if pgrep python; then pkill python; fi
@@ -190,7 +190,7 @@ jobs:
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))'
ngc --version
BUILD_MONAI=1 ./runtests.sh --coverage --pytype --unittests # unit tests with pytype checks, coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --pytype --unittests --disttests # unit tests with pytype checks, coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
coverage xml
if pgrep python; then pkill python; fi
2 changes: 1 addition & 1 deletion .github/workflows/integration.yml
@@ -47,7 +47,7 @@ jobs:
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))'
BUILD_MONAI=1 ./runtests.sh --net
BUILD_MONAI=1 ./runtests.sh --unittests
BUILD_MONAI=1 ./runtests.sh --unittests --disttests
if pgrep python; then pkill python; fi
shell: bash
- name: Add reaction
2 changes: 1 addition & 1 deletion .github/workflows/pythonapp-gpu.yml
@@ -123,7 +123,7 @@ jobs:
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
python -c "import monai; monai.config.print_config()"
# build for the current self-hosted CI Tesla V100
BUILD_MONAI=1 TORCH_CUDA_ARCH_LIST="7.0" ./runtests.sh --quick --unittests
BUILD_MONAI=1 TORCH_CUDA_ARCH_LIST="7.0" ./runtests.sh --quick --unittests --disttests
if [ ${{ matrix.environment }} = "PT110+CUDA102" ]; then
# test the clang-format tool downloading once
coverage run -m tests.clang_format_utils
4 changes: 2 additions & 2 deletions .github/workflows/setupapp.yml
@@ -59,7 +59,7 @@ jobs:
python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
coverage xml
if pgrep python; then pkill python; fi
@@ -104,7 +104,7 @@ run: |
run: |
python -m pip list
python -c 'import torch; print(torch.__version__); print(torch.rand(5,3))'
BUILD_MONAI=1 ./runtests.sh --quick --unittests
BUILD_MONAI=1 ./runtests.sh --quick --unittests --disttests
coverage xml
- name: Upload coverage
uses: codecov/codecov-action@v1
8 changes: 6 additions & 2 deletions runtests.sh
@@ -567,15 +567,19 @@ if [ $doUnitTests = true ]
then
echo "${separator}${blue}unittests${noColor}"
torch_validate
${cmdPrefix}${cmd} ./tests/runner.py -p "test_((?!integration).)"
${cmdPrefix}${cmd} ./tests/runner.py -p "^(?!test_integration).*(?<!_dist)$" # excluding integration/dist tests
fi

# distributed test only
if [ $doDistTests = true ]
then
echo "${separator}${blue}run distributed unit test cases${noColor}"
torch_validate
${cmdPrefix}${cmd} ./tests/runner.py -p "test_.*_dist$"
for i in tests/test_*_dist.py
do
echo "$i"
${cmdPrefix}${cmd} "$i"
done
fi

# network training/inference/eval integration tests
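
To illustrate the filtering logic above, a minimal sketch of how the new unit-test pattern partitions module names (sample names chosen for illustration; this assumes `tests/runner.py` treats its `-p` argument as a Python regular expression):

    import re

    # Integration tests and *_dist tests are excluded from the unit run; the
    # loop above then executes each tests/test_*_dist.py file in its own
    # Python process, so GPU memory is fully released between dist tests.
    unit_pattern = re.compile(r"^(?!test_integration).*(?<!_dist)$")

    for name in ("test_densenet", "test_integration_classification_2d", "test_utils_dist"):
        print(name, "->", "unit run" if unit_pattern.match(name) else "excluded")
    # test_densenet -> unit run
    # test_integration_classification_2d -> excluded
    # test_utils_dist -> excluded
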
3 changes: 2 additions & 1 deletion tests/test_densenet.py
@@ -19,7 +19,7 @@
from monai.networks import eval_mode
from monai.networks.nets import DenseNet121, Densenet169, DenseNet264, densenet201
from monai.utils import optional_import
from tests.utils import skip_if_quick, test_script_save
from tests.utils import SkipIfGPUMemoryLessThan, skip_if_quick, test_script_save

if TYPE_CHECKING:
import torchvision
@@ -90,6 +90,7 @@ def test_121_2d_shape_pretrain(self, model, input_param, input_shape, expected_s

@parameterized.expand([TEST_PRETRAINED_2D_CASE_3])
@skipUnless(has_torchvision, "Requires `torchvision` package.")
@SkipIfGPUMemoryLessThan(1024)
def test_pretrain_consistency(self, model, input_param, input_shape):
example = torch.randn(input_shape).to(device)
net = model(**input_param).to(device)
60 changes: 60 additions & 0 deletions tests/utils.py
@@ -186,6 +186,29 @@ def skip_if_windows(obj):
return unittest.skipIf(sys.platform == "win32", "Skipping tests on Windows")(obj)


class SkipIfGPUMemoryLessThan:
"""
Skip the decorated test if the selected GPU has less free memory than the required amount (in MB).
"""

def __init__(self, required_mb=1000, idx=0):
"""
Args:
required_mb: minimum free GPU memory in MB.
idx: index of the CUDA device to query.

"""
self.required_mb = required_mb
self.idx = idx

def __call__(self, obj):
_mem = get_gpu_memory(self.idx)
return unittest.skipIf(
_mem < self.required_mb,
f"Skipping because GPU has less than {self.required_mb} MB of memory on device {self.idx} (avail. {_mem}).",
)(obj)
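
For reference, a minimal usage sketch of this decorator (the test class, threshold, and tensor shape are illustrative, not part of the PR):

    import unittest
    import torch
    from tests.utils import SkipIfGPUMemoryLessThan

    class TestBigModel(unittest.TestCase):
        # hypothetical test: runs only when GPU 0 reports at least 2 GB free
        @SkipIfGPUMemoryLessThan(required_mb=2048, idx=0)
        def test_forward(self):
            x = torch.randn(1, 3, 224, 224, device="cuda:0")
            self.assertEqual(x.shape[0], 1)

Note that the free-memory query runs when the decorator is applied, i.e. at module import time, not when the test itself executes.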


class SkipIfBeforePyTorchVersion:
"""Decorator to be used if test should be skipped
with PyTorch versions older than that given."""
@@ -292,6 +315,7 @@ def __init__(
backend: Optional[str] = None,
daemon: Optional[bool] = None,
method: Optional[str] = "spawn",
min_gpu_memory=9000,
verbose: bool = False,
):
"""
@@ -311,6 +335,7 @@
When daemon=None, the initial value is inherited from the creating process.
method: set the method which should be used to start a child process.
method can be 'fork', 'spawn' or 'forkserver'.
min_gpu_memory: total GPU memory (in megabytes) required to run the test; the budget is split
evenly, so each of the ``nproc_per_node`` devices must have at least
``min_gpu_memory / nproc_per_node`` MB free, otherwise the test is skipped.
verbose: whether to print NCCL debug info.
"""
self.nnodes = int(nnodes)
@@ -333,6 +358,7 @@ def __init__(
self.timeout = datetime.timedelta(0, timeout)
self.daemon = daemon
self.method = method
self.min_gpu_memory = min_gpu_memory / self.nproc_per_node
self.verbose = verbose

def run_process(self, func, local_rank, args, kwargs, results):
@@ -387,6 +413,14 @@ def __call__(self, obj):
f"Skipping distributed tests because it requires {self.nnodes} devices "
f"but got {torch.cuda.device_count()}",
)(obj)
for i in range(self.nproc_per_node): # check free memory for the current node
free_mem = get_gpu_memory(i)
if free_mem < self.min_gpu_memory:
return unittest.skipIf(
True,
f"Skipping distributed tests because it requires at least {self.min_gpu_memory}MB gpu memory "
f"but got {free_mem}MB on gpu {i}",
)(obj)

_cache_original_func(obj)

@@ -406,6 +440,7 @@ def _wrapper(*args, **kwargs):
for p in processes:
p.join()
assert results.get(), "Distributed call failed."
_del_original_func(obj)

return _wrapper
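
To make the memory budget concrete, a small worked example (the process count is illustrative; 9000 MB is the constructor default above):

    # DistCall(..., min_gpu_memory=9000) on a single node with two processes:
    nproc_per_node = 2
    min_gpu_memory = 9000  # MB, shared across the node's processes
    per_process = min_gpu_memory / nproc_per_node  # 4500.0 MB
    # __call__ queries get_gpu_memory(i) for i in 0..nproc_per_node-1 and
    # skips the whole distributed test if any device has < 4500 MB free.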

@@ -487,6 +522,7 @@ def _wrapper(*args, **kwargs):
finally:
p.join()

_del_original_func(obj)
res = None
try:
res = results.get(block=False)
@@ -512,6 +548,15 @@ def _cache_original_func(obj) -> None:
_original_funcs[obj.__name__] = obj


def _del_original_func(obj):
"""pop the original function from cache."""
global _original_funcs
_original_funcs.pop(obj.__name__, None)
if torch.cuda.is_available(): # release GPU memory cached by the finished test
torch.cuda.synchronize()
torch.cuda.empty_cache()


def _call_original_func(name, module, *args, **kwargs):
if name not in _original_funcs:
_original_module = importlib.import_module(module) # reimport, refresh _original_funcs
@@ -616,6 +661,21 @@ def query_memory(n=2):
return ",".join(f"{int(x)}" for x in ids)


def get_gpu_memory(idx=0) -> float:
"""
Return the free memory of GPU ``idx`` in MB; 0.0 is returned when the query fails, so memory-gated tests are skipped conservatively.
"""
bash_string = f"nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits --id={idx}"

try:
p1 = Popen(bash_string.split(), stdout=PIPE)
output, error = p1.communicate()
free_memory = [x.split(",") for x in output.decode("utf-8").split("\n")[:-1]]
return float(np.asarray(free_memory, dtype=float).ravel()[0])
except (TypeError, IndexError, OSError, ValueError):
return float(0.0)
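
As a sanity check on the parsing above, a sketch with simulated nvidia-smi output (the memory value is made up):

    import numpy as np

    # simulated stdout of:
    #   nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits --id=0
    output = b"10240\n"
    rows = [x.split(",") for x in output.decode("utf-8").split("\n")[:-1]]
    print(float(np.asarray(rows, dtype=float).ravel()[0]))  # 10240.0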


TEST_NDARRAYS: Tuple[Callable] = (np.array, torch.as_tensor) # type: ignore
if torch.cuda.is_available():
gpu_tensor: Callable = partial(torch.as_tensor, device="cuda")