From c33db977ce25a1ffa36da8bcc2f5ea240c452568 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 28 Jun 2024 04:41:21 -0700 Subject: [PATCH 1/5] Increase maximum UCX runtime pin to `<1.18.0` --- conda/recipes/ucx-py/meta.yaml | 2 +- dependencies.yaml | 8 ++++---- pyproject.toml | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda/recipes/ucx-py/meta.yaml b/conda/recipes/ucx-py/meta.yaml index da667d48..180203b2 100644 --- a/conda/recipes/ucx-py/meta.yaml +++ b/conda/recipes/ucx-py/meta.yaml @@ -39,7 +39,7 @@ requirements: {% endfor %} run: - python - - ucx >=1.15.0,<1.16.0 + - ucx >=1.15.0,<1.18.0 # 'libucx' wheel dependency is unnecessary... the 'ucx' conda-forge package is used here instead {% for req in data["project"]["dependencies"] if not req.startswith("libucx") %} - {{ req }} diff --git a/dependencies.yaml b/dependencies.yaml index 6fd64d4c..060fdece 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -157,7 +157,7 @@ dependencies: common: - output_types: conda packages: - - ucx>=1.15.0,<1.16 + - ucx>=1.15.0,<1.18 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -168,13 +168,13 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - libucx-cu12>=1.15.0,<1.16 + - libucx-cu12>=1.15.0,<1.18 - matrix: {cuda: "11.*"} packages: - - libucx-cu11>=1.15.0,<1.16 + - libucx-cu11>=1.15.0,<1.18 - matrix: null packages: - - libucx>=1.15.0,<1.16 + - libucx>=1.15.0,<1.18 test_python: common: - output_types: [conda, requirements, pyproject] diff --git a/pyproject.toml b/pyproject.toml index 18cc55e1..73d11a54 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ authors = [ license = { text = "BSD-3-Clause" } requires-python = ">=3.9" dependencies = [ - "libucx>=1.15.0,<1.16", + "libucx>=1.15.0,<1.18", "numpy>=1.23,<2.0a0", "pynvml>=11.4.1", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. From ae26bb81f14b81b98efa5ab106ecf72f8483954e Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 28 Jun 2024 04:43:09 -0700 Subject: [PATCH 2/5] Disable protov2 by default --- ucp/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ucp/__init__.py b/ucp/__init__.py index 390fbf45..31fc7f70 100644 --- a/ucp/__init__.py +++ b/ucp/__init__.py @@ -102,6 +102,11 @@ def _is_mig_device(handle): logger.info("Setting UCX_MAX_RNDV_RAILS=1") os.environ["UCX_MAX_RNDV_RAILS"] = "1" +if "UCX_PROTO_ENABLE" not in os.environ: + # UCX protov2 still doesn't support CUDA async/managed memory + logger.info("Setting UCX_PROTO_ENABLE=n") + os.environ["UCX_PROTO_ENABLE"] = "n" + __ucx_version__ = "%d.%d.%d" % get_ucx_version() From 02bc2a53c99ea6c0533e7464646a454ce179c48e Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 28 Jun 2024 13:33:41 -0700 Subject: [PATCH 3/5] Include `UCX_PROTO_ENABLE` in docs --- docs/source/configuration.rst | 23 ++++++++++++++++------- ucp/__init__.py | 8 ++------ 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst index 14a6ff17..9d7d1d3a 100644 --- a/docs/source/configuration.rst +++ b/docs/source/configuration.rst @@ -24,19 +24,13 @@ UCX-Py redefines some of the UCX defaults for a variety of reasons, including be Below is a list of the UCX-Py redefined default values, and what conditions are required for them to apply. -Apply to all UCX versions: - :: UCX_RNDV_THRESH=8192 UCX_RNDV_SCHEME=get_zcopy - -Apply to UCX >= 1.12.0, older UCX versions rely on UCX defaults: - -:: - UCX_CUDA_COPY_MAX_REG_RATIO=1.0 UCX_MAX_RNDV_RAILS=1 + UCX_PROTO_ENABLE=n Please note that ``UCX_CUDA_COPY_MAX_REG_RATIO=1.0`` is only set provided at least one GPU is present with a BAR1 size smaller than its total memory (e.g., NVIDIA T4). @@ -45,6 +39,21 @@ UCX Environment Variables in UCX-Py In this section we go over a brief overview of some of the more relevant variables for current UCX-Py usage, along with some comments on their uses and limitations. To see a complete list of UCX environment variables, their descriptions and default values, please run the command-line tool ``ucx_info -f``. +UCP CONTEXT CONFIGURATION +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Configuration variables applying to the UCP context. + +UCX_PROTO_ENABLE +```````````````` + +Values: y, n + +Enable the new protocol selection logic, also known as "protov2". Its default has been changed to ``y`` starting with UCX 1.16.0. + +The new protocol solves various limitations from the original "protov1" including, for example, invalid choice of transport in systems with hybrid interconnectivity, such as a DGX-1 where only a subset of GPU pairs are interconnected via NVLink. On the other hand, it may still lack proper support on not be as well tested for lesser common use cases, such as CUDA async and managed memory. + + DEBUG ~~~~~ diff --git a/ucp/__init__.py b/ucp/__init__.py index 31fc7f70..fc51f0b8 100644 --- a/ucp/__init__.py +++ b/ucp/__init__.py @@ -50,11 +50,7 @@ logger.info("Setting UCX_RNDV_FRAG_MEM_TYPE=cuda") os.environ["UCX_RNDV_FRAG_MEM_TYPE"] = "cuda" -if ( - pynvml is not None - and "UCX_CUDA_COPY_MAX_REG_RATIO" not in os.environ - and get_ucx_version() >= (1, 12, 0) -): +if pynvml is not None and "UCX_CUDA_COPY_MAX_REG_RATIO" not in os.environ: try: pynvml.nvmlInit() device_count = pynvml.nvmlDeviceGetCount() @@ -98,7 +94,7 @@ def _is_mig_device(handle): ): pass -if "UCX_MAX_RNDV_RAILS" not in os.environ and get_ucx_version() >= (1, 12, 0): +if "UCX_MAX_RNDV_RAILS" not in os.environ: logger.info("Setting UCX_MAX_RNDV_RAILS=1") os.environ["UCX_MAX_RNDV_RAILS"] = "1" From 02b148f873f81e0462c4b7b03913f1493b11a3e0 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 28 Jun 2024 13:56:30 -0700 Subject: [PATCH 4/5] Add back checks for v1.12.0 --- docs/source/configuration.rst | 7 +++++++ ucp/__init__.py | 12 ++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst index 9d7d1d3a..5777bdd6 100644 --- a/docs/source/configuration.rst +++ b/docs/source/configuration.rst @@ -24,10 +24,17 @@ UCX-Py redefines some of the UCX defaults for a variety of reasons, including be Below is a list of the UCX-Py redefined default values, and what conditions are required for them to apply. +Apply to all UCX versions: + :: UCX_RNDV_THRESH=8192 UCX_RNDV_SCHEME=get_zcopy + +Apply to UCX >= 1.12.0, older UCX versions rely on UCX defaults: + +:: + UCX_CUDA_COPY_MAX_REG_RATIO=1.0 UCX_MAX_RNDV_RAILS=1 UCX_PROTO_ENABLE=n diff --git a/ucp/__init__.py b/ucp/__init__.py index fc51f0b8..e8058ba6 100644 --- a/ucp/__init__.py +++ b/ucp/__init__.py @@ -50,7 +50,11 @@ logger.info("Setting UCX_RNDV_FRAG_MEM_TYPE=cuda") os.environ["UCX_RNDV_FRAG_MEM_TYPE"] = "cuda" -if pynvml is not None and "UCX_CUDA_COPY_MAX_REG_RATIO" not in os.environ: +if ( + pynvml is not None + and "UCX_CUDA_COPY_MAX_REG_RATIO" not in os.environ + and get_ucx_version() >= (1, 12, 0) +): try: pynvml.nvmlInit() device_count = pynvml.nvmlDeviceGetCount() @@ -94,11 +98,11 @@ def _is_mig_device(handle): ): pass -if "UCX_MAX_RNDV_RAILS" not in os.environ: +if "UCX_MAX_RNDV_RAILS" not in os.environ and get_ucx_version() >= (1, 12, 0): logger.info("Setting UCX_MAX_RNDV_RAILS=1") os.environ["UCX_MAX_RNDV_RAILS"] = "1" -if "UCX_PROTO_ENABLE" not in os.environ: +if "UCX_PROTO_ENABLE" not in os.environ and get_ucx_version() >= (1, 12, 0): # UCX protov2 still doesn't support CUDA async/managed memory logger.info("Setting UCX_PROTO_ENABLE=n") os.environ["UCX_PROTO_ENABLE"] = "n" @@ -106,7 +110,7 @@ def _is_mig_device(handle): __ucx_version__ = "%d.%d.%d" % get_ucx_version() -if get_ucx_version() < (1, 11, 1): +if get_ucx_version() < (1, 15, 0): raise ImportError( f"Support for UCX {__ucx_version__} has ended. Please upgrade to " "1.11.1 or newer." From 3a673ed00f8d89f5fb6d612f6f887905530561ae Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 31 Jul 2024 10:43:26 +0200 Subject: [PATCH 5/5] Fix docs typos Co-authored-by: Lawrence Mitchell --- docs/source/configuration.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst index 5777bdd6..f759f91d 100644 --- a/docs/source/configuration.rst +++ b/docs/source/configuration.rst @@ -58,7 +58,7 @@ Values: y, n Enable the new protocol selection logic, also known as "protov2". Its default has been changed to ``y`` starting with UCX 1.16.0. -The new protocol solves various limitations from the original "protov1" including, for example, invalid choice of transport in systems with hybrid interconnectivity, such as a DGX-1 where only a subset of GPU pairs are interconnected via NVLink. On the other hand, it may still lack proper support on not be as well tested for lesser common use cases, such as CUDA async and managed memory. +The new protocol solves various limitations from the original "protov1" including, for example, invalid choice of transport in systems with hybrid interconnectivity, such as a DGX-1 where only a subset of GPU pairs are interconnected via NVLink. On the other hand, it may still lack proper support or not be as well tested for less common use cases, such as CUDA async and managed memory. DEBUG