From ca968673177705d3a5a907fd52483d9b0428c236 Mon Sep 17 00:00:00 2001
From: Qingpeng Li <43924785+qingpeng9802@users.noreply.github.com>
Date: Tue, 8 Aug 2023 02:07:39 +0800
Subject: [PATCH] Tf32 warnings (#6816)

Addresses #6754.

### Description

Show a warning if anything that may enable TF32 is detected.

### Types of changes
<!--- Put an `x` in all the boxes that apply, and remove the not
applicable items -->
- [x] Non-breaking change (fix or new feature that would not break
existing functionality).
- [ ] Breaking change (fix or new feature that would cause existing
functionality to change).
- [ ] New tests added to cover the changes.
- [ ] Integration tests passed locally by running `./runtests.sh -f -u
--net --coverage`.
- [ ] Quick tests passed locally by running `./runtests.sh --quick
--unittests --disttests`.
- [x] In-line docstrings updated.
- [ ] Documentation updated, tested `make html` command in the `docs/`
folder.

---------

Signed-off-by: Qingpeng Li <qingpeng9802@gmail.com>
---
 docs/source/index.rst                         |  4 +-
 docs/source/installation.md                   |  4 +-
 ...rformance.md => precision_accelerating.md} |  6 +-
 monai/__init__.py                             | 10 +++
 monai/utils/__init__.py                       |  2 +
 monai/utils/module.py                         | 68 ++++++++++----
 monai/utils/tf32.py                           | 89 +++++++++++++++++++
 requirements-dev.txt                          |  1 +
 setup.cfg                                     |  3 +
 .../{test_version_leq.py => test_version.py}  |  9 +-
 tests/utils.py                                | 16 ++--
 11 files changed, 177 insertions(+), 35 deletions(-)
 rename docs/source/{precision_performance.md => precision_accelerating.md} (87%)
 create mode 100644 monai/utils/tf32.py
 rename tests/{test_version_leq.py => test_version.py} (88%)

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 54dc6e6922..050ecf4dff 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -60,9 +60,9 @@ Technical documentation is available at `docs.monai.io <https://docs.monai.io>`_
 
 .. toctree::
   :maxdepth: 1
-  :caption: Precision and Performance
+  :caption: Precision and Accelerating
 
-  precision_performance
+  precision_accelerating
 
 .. toctree::
   :maxdepth: 1
diff --git a/docs/source/installation.md b/docs/source/installation.md
index 6d63fbf08f..6580ce6717 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -254,11 +254,11 @@ Since MONAI v0.2.0, the extras syntax such as `pip install 'monai[nibabel]'` is
 - The options are
 
 ```
-[nibabel, skimage, scipy, pillow, tensorboard, gdown, ignite, torchvision, itk, tqdm, lmdb, psutil, cucim, openslide, pandas, einops, transformers, mlflow, clearml, matplotlib, tensorboardX, tifffile, imagecodecs, pyyaml, fire, jsonschema, ninja, pynrrd, pydicom, h5py, nni, optuna, onnx, onnxruntime, zarr, lpips]
+[nibabel, skimage, scipy, pillow, tensorboard, gdown, ignite, torchvision, itk, tqdm, lmdb, psutil, cucim, openslide, pandas, einops, transformers, mlflow, clearml, matplotlib, tensorboardX, tifffile, imagecodecs, pyyaml, fire, jsonschema, ninja, pynrrd, pydicom, h5py, nni, optuna, onnx, onnxruntime, zarr, lpips, pynvml]
 ```
 
 which correspond to `nibabel`, `scikit-image`,`scipy`, `pillow`, `tensorboard`,
-`gdown`, `pytorch-ignite`, `torchvision`, `itk`, `tqdm`, `lmdb`, `psutil`, `cucim`, `openslide-python`, `pandas`, `einops`, `transformers`, `mlflow`, `clearml`, `matplotlib`, `tensorboardX`, `tifffile`, `imagecodecs`, `pyyaml`, `fire`, `jsonschema`, `ninja`, `pynrrd`, `pydicom`, `h5py`, `nni`, `optuna`, `onnx`, `onnxruntime`, `zarr` and `lpips` respectively.
+`gdown`, `pytorch-ignite`, `torchvision`, `itk`, `tqdm`, `lmdb`, `psutil`, `cucim`, `openslide-python`, `pandas`, `einops`, `transformers`, `mlflow`, `clearml`, `matplotlib`, `tensorboardX`, `tifffile`, `imagecodecs`, `pyyaml`, `fire`, `jsonschema`, `ninja`, `pynrrd`, `pydicom`, `h5py`, `nni`, `optuna`, `onnx`, `onnxruntime`, `zarr`, `lpips` and `nvidia-ml-py` respectively.
 
 
 - `pip install 'monai[all]'` installs all the optional dependencies.
diff --git a/docs/source/precision_performance.md b/docs/source/precision_accelerating.md
similarity index 87%
rename from docs/source/precision_performance.md
rename to docs/source/precision_accelerating.md
index 6e6c51d8c1..d6c6f4959a 100644
--- a/docs/source/precision_performance.md
+++ b/docs/source/precision_accelerating.md
@@ -29,11 +29,11 @@ by TF32 mode so the impact is very wide.
 torch.backends.cuda.matmul.allow_tf32 = False # in PyTorch 1.12 and later.
 torch.backends.cudnn.allow_tf32 = True
 ```
-Please note that there are environment variables that can override the flags above. For example, the environment variables mentioned in [Accelerating AI Training with NVIDIA TF32 Tensor Cores](https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/) and `TORCH_ALLOW_TF32_CUBLAS_OVERRIDE` used by PyTorch. Thus, in some cases, the flags may be accidentally changed or overridden.
-
-We recommend that users print out these two flags for confirmation when unsure.
+Please note that there are environment variables that can override the flags above. Examples include the environment variable `NVIDIA_TF32_OVERRIDE` mentioned in [Accelerating AI Training with NVIDIA TF32 Tensor Cores](https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/) and `TORCH_ALLOW_TF32_CUBLAS_OVERRIDE` used by PyTorch. Thus, in some cases, the flags may be accidentally changed or overridden.
 
 If you are using an [NGC PyTorch container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch), the container includes a layer `ENV TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=1`.
 The default value `torch.backends.cuda.matmul.allow_tf32` will be overridden to `True`.
 
+We recommend that users print out these two flags for confirmation when unsure.
+
 If you can confirm through experiments that your model has no accuracy or convergence issues in TF32 mode and you have NVIDIA Ampere GPUs or above, you can set the two flags above to `True` to speed up your model.
diff --git a/monai/__init__.py b/monai/__init__.py
index 9f542dd25c..638220f6df 100644
--- a/monai/__init__.py
+++ b/monai/__init__.py
@@ -78,3 +78,13 @@
     "utils",
     "visualize",
 ]
+
+try:  # run the TF32 detection once when the package is imported
+    from .utils.tf32 import detect_default_tf32
+
+    detect_default_tf32()
+except BaseException:  # best-effort: a failed import or check is suppressed here
+    from .utils.misc import MONAIEnvVars
+
+    if MONAIEnvVars.debug():  # the failure is re-raised only when MONAIEnvVars.debug() is truthy
+        raise
diff --git a/monai/utils/__init__.py b/monai/utils/__init__.py
index 58bde3f5e8..c973d4bfa1 100644
--- a/monai/utils/__init__.py
+++ b/monai/utils/__init__.py
@@ -115,6 +115,7 @@
     require_pkg,
     run_debug,
     run_eval,
+    version_geq,
     version_leq,
 )
 from .nvtx import Range
@@ -128,6 +129,7 @@
     torch_profiler_time_end_to_end,
 )
 from .state_cacher import StateCacher
+from .tf32 import detect_default_tf32, has_ampere_or_later
 from .type_conversion import (
     convert_data_type,
     convert_to_cupy,
diff --git a/monai/utils/module.py b/monai/utils/module.py
index fcd5e04145..f46ba7c1b3 100644
--- a/monai/utils/module.py
+++ b/monai/utils/module.py
@@ -25,7 +25,7 @@
 from pydoc import locate
 from re import match
 from types import FunctionType, ModuleType
-from typing import Any, cast
+from typing import Any, Iterable, cast
 
 import torch
 
@@ -55,6 +55,7 @@
     "get_package_version",
     "get_torch_version_tuple",
     "version_leq",
+    "version_geq",
     "pytorch_after",
 ]
 
@@ -518,24 +519,11 @@ def get_torch_version_tuple():
     return tuple(int(x) for x in torch.__version__.split(".")[:2])
 
 
-def version_leq(lhs: str, rhs: str) -> bool:
+def parse_version_strs(lhs: str, rhs: str) -> tuple[Iterable[int | str], Iterable[int | str]]:
     """
-    Returns True if version `lhs` is earlier or equal to `rhs`.
-
-    Args:
-        lhs: version name to compare with `rhs`, return True if earlier or equal to `rhs`.
-        rhs: version name to compare with `lhs`, return True if later or equal to `lhs`.
-
+    Parse the version strings.
     """
 
-    lhs, rhs = str(lhs), str(rhs)
-    pkging, has_ver = optional_import("pkg_resources", name="packaging")
-    if has_ver:
-        try:
-            return cast(bool, pkging.version.Version(lhs) <= pkging.version.Version(rhs))
-        except pkging.version.InvalidVersion:
-            return True
-
     def _try_cast(val: str) -> int | str:
         val = val.strip()
         try:
@@ -554,7 +542,28 @@ def _try_cast(val: str) -> int | str:
     # parse the version strings in this basic way without `packaging` package
     lhs_ = map(_try_cast, lhs.split("."))
     rhs_ = map(_try_cast, rhs.split("."))
+    return lhs_, rhs_
+
 
+def version_leq(lhs: str, rhs: str) -> bool:
+    """
+    Returns True if version `lhs` is earlier or equal to `rhs`.
+
+    Args:
+        lhs: version name to compare with `rhs`, return True if earlier or equal to `rhs`.
+        rhs: version name to compare with `lhs`, return True if later or equal to `lhs`.
+
+    """
+
+    lhs, rhs = str(lhs), str(rhs)
+    pkging, has_ver = optional_import("pkg_resources", name="packaging")
+    if has_ver:
+        try:
+            return cast(bool, pkging.version.Version(lhs) <= pkging.version.Version(rhs))
+        except pkging.version.InvalidVersion:
+            return True
+
+    lhs_, rhs_ = parse_version_strs(lhs, rhs)
     for l, r in zip(lhs_, rhs_):
         if l != r:
             if isinstance(l, int) and isinstance(r, int):
@@ -564,6 +573,33 @@ def _try_cast(val: str) -> int | str:
     return True
 
 
+def version_geq(lhs: str, rhs: str) -> bool:
+    """
+    Returns True if version `lhs` is later or equal to `rhs`.
+
+    Args:
+        lhs: version name to compare with `rhs`, return True if later or equal to `rhs`.
+        rhs: version name to compare with `lhs`, return True if earlier or equal to `lhs`.
+
+    """
+    lhs, rhs = str(lhs), str(rhs)  # accept non-string inputs by stringifying them first
+    pkging, has_ver = optional_import("pkg_resources", name="packaging")
+    if has_ver:
+        try:
+            return cast(bool, pkging.version.Version(lhs) >= pkging.version.Version(rhs))
+        except pkging.version.InvalidVersion:  # an unparsable version string is reported as True
+            return True
+
+    lhs_, rhs_ = parse_version_strs(lhs, rhs)  # basic dot-split fallback when `packaging` is unavailable
+    for l, r in zip(lhs_, rhs_):  # zip stops at the shorter version: only shared components are compared
+        if l != r:
+            if isinstance(l, int) and isinstance(r, int):
+                return l > r
+            return f"{l}" > f"{r}"  # components that are not both ints are compared as strings
+
+    return True  # every compared component is equal
+
+
 @functools.lru_cache(None)
 def pytorch_after(major: int, minor: int, patch: int = 0, current_ver_string: str | None = None) -> bool:
     """
diff --git a/monai/utils/tf32.py b/monai/utils/tf32.py
new file mode 100644
index 0000000000..9ef425ab8b
--- /dev/null
+++ b/monai/utils/tf32.py
@@ -0,0 +1,89 @@
+# Copyright (c) MONAI Consortium
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import functools
+import os
+import warnings
+
+__all__ = ["has_ampere_or_later", "detect_default_tf32"]
+
+
+@functools.lru_cache(None)
+def has_ampere_or_later() -> bool:
+    """Check if any GPU is Ampere or later (compute capability major >= 8); NVML failures are reported as False."""
+    import torch
+
+    from monai.utils.module import optional_import, version_geq
+
+    if not (torch.version.cuda and version_geq(f"{torch.version.cuda}", "11.0")):  # CUDA version unset or < 11.0
+        return False
+
+    pynvml, has_pynvml = optional_import("pynvml")
+    if not has_pynvml:  # assuming that the user has Ampere and later GPU
+        return True
+
+    try:
+        pynvml.nvmlInit()
+    except Exception:  # NVML did not start: nothing to query, and nvmlShutdown() must not be called
+        return False
+    try:
+        for i in range(pynvml.nvmlDeviceGetCount()):
+            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+            major, _ = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
+            if major >= 8:
+                return True
+        return False
+    except Exception:  # best-effort probe: a failed device query counts as "no Ampere GPU found"
+        return False
+    finally:
+        pynvml.nvmlShutdown()  # runs only after a successful nvmlInit(), including on the early `return True`
+
+
+@functools.lru_cache(None)  # the result is cached, so the checks and warnings run on the first completed call only
+def detect_default_tf32() -> bool:
+    """
+    Detect whether anything may enable TF32 mode by default and show a warning message for each finding.
+    Returns True if a warning was shown; False otherwise, or on any error unless `MONAIEnvVars.debug()` is set.
+    """
+    may_enable_tf32 = False
+    try:
+        if not has_ampere_or_later():  # skip all checks unless an Ampere-or-later GPU may be present
+            return False
+
+        from monai.utils.module import pytorch_after
+
+        if pytorch_after(1, 7, 0) and not pytorch_after(1, 12, 0):  # presumably 1.7 <= PyTorch version < 1.12
+            warnings.warn(
+                "torch.backends.cuda.matmul.allow_tf32 = True by default.\n"
+                "  This value defaults to True when PyTorch version in [1.7, 1.11] and may affect precision.\n"
+                "  See https://docs.monai.io/en/latest/precision_accelerating.html#precision-and-accelerating"
+            )
+            may_enable_tf32 = True
+
+        override_tf32_env_vars = {"NVIDIA_TF32_OVERRIDE": "1", "TORCH_ALLOW_TF32_CUBLAS_OVERRIDE": "1"}
+        for name, override_val in override_tf32_env_vars.items():
+            if os.environ.get(name) == override_val:  # only the exact string value "1" triggers the warning
+                warnings.warn(
+                    f"Environment variable `{name} = {override_val}` is set.\n"
+                    f"  This environment variable may enable TF32 mode accidentally and affect precision.\n"
+                    f"  See https://docs.monai.io/en/latest/precision_accelerating.html#precision-and-accelerating"
+                )
+                may_enable_tf32 = True
+
+        return may_enable_tf32
+    except BaseException:  # detection is best-effort: any failure is suppressed unless debugging
+        from monai.utils.misc import MONAIEnvVars
+
+        if MONAIEnvVars.debug():
+            raise
+        return False
diff --git a/requirements-dev.txt b/requirements-dev.txt
index f0e3a37c24..9620ea253d 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -55,3 +55,4 @@ typeguard<3  # https://github.com/microsoft/nni/issues/5457
 filelock!=3.12.0  # https://github.com/microsoft/nni/issues/5523
 zarr
 lpips==0.1.4
+nvidia-ml-py
diff --git a/setup.cfg b/setup.cfg
index 9cd1d59f95..9d5a22963c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -82,6 +82,7 @@ all =
     onnxruntime; python_version <= '3.10'
     zarr
     lpips==0.1.4
+    nvidia-ml-py
 nibabel =
     nibabel
 ninja =
@@ -153,6 +154,8 @@ zarr =
     zarr
 lpips =
     lpips==0.1.4
+pynvml =
+    nvidia-ml-py
 # # workaround https://github.com/Project-MONAI/MONAI/issues/5882
 # MetricsReloaded =
 #     MetricsReloaded @ git+https://github.com/Project-MONAI/MetricsReloaded@monai-support#egg=MetricsReloaded
diff --git a/tests/test_version_leq.py b/tests/test_version.py
similarity index 88%
rename from tests/test_version_leq.py
rename to tests/test_version.py
index ef9e70ad86..15f8cd36c6 100644
--- a/tests/test_version_leq.py
+++ b/tests/test_version.py
@@ -16,7 +16,7 @@
 
 from parameterized import parameterized
 
-from monai.utils import version_leq
+from monai.utils import version_geq, version_leq
 
 
 # from pkg_resources
@@ -76,10 +76,15 @@ def _pairwise(iterable):
 
 class TestVersionCompare(unittest.TestCase):
     @parameterized.expand(TEST_CASES)
-    def test_compare(self, a, b, expected=True):
+    def test_compare_leq(self, a, b, expected=True):
         """Test version_leq with `a` and `b`"""
         self.assertEqual(version_leq(a, b), expected)
 
+    @parameterized.expand(TEST_CASES)
+    def test_compare_geq(self, a, b, expected=True):
+        """Test version_geq with swapped arguments: `b >= a` must equal the `expected` value for `a <= b`."""
+        self.assertEqual(version_geq(b, a), expected)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/utils.py b/tests/utils.py
index bd357aa529..1dfd5e6e20 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -46,7 +46,8 @@
 from monai.data.meta_tensor import MetaTensor, get_track_meta
 from monai.networks import convert_to_onnx, convert_to_torchscript
 from monai.utils import optional_import
-from monai.utils.module import pytorch_after, version_leq
+from monai.utils.module import pytorch_after
+from monai.utils.tf32 import detect_default_tf32
 from monai.utils.type_conversion import convert_data_type
 
 nib, _ = optional_import("nibabel")
@@ -172,19 +173,14 @@ def test_is_quick():
 
 def is_tf32_env():
     """
-    The environment variable NVIDIA_TF32_OVERRIDE=0 will override any defaults
-    or programmatic configuration of NVIDIA libraries, and consequently,
-    cuBLAS will not accelerate FP32 computations with TF32 tensor cores.
+    When we may be using TF32 mode, check the precision of matrix operation.
+    If the checking result is greater than the threshold 0.001,
+    set _tf32_enabled=True (and relax _rtol for tests).
     """
     global _tf32_enabled
     if _tf32_enabled is None:
         _tf32_enabled = False
-        if (
-            torch.cuda.is_available()
-            and not version_leq(f"{torch.version.cuda}", "10.100")
-            and os.environ.get("NVIDIA_TF32_OVERRIDE", "1") != "0"
-            and torch.cuda.device_count() > 0  # at least 11.0
-        ):
+        if detect_default_tf32() or torch.backends.cuda.matmul.allow_tf32:
             try:
                 # with TF32 enabled, the speed is ~8x faster, but the precision has ~2 digits less in the result
                 g_gpu = torch.Generator(device="cuda")