From 892e8bbe8c54573924bf9d26421c113369a72ae4 Mon Sep 17 00:00:00 2001 From: apbose Date: Thu, 13 Feb 2025 12:10:45 -0800 Subject: [PATCH 1/9] TRT-LLM installation tool --- .github/workflows/build-test-linux-x86_64.yml | 34 +++ dev_dep_versions.yml | 1 + py/torch_tensorrt/dynamo/_compiler.py | 9 + .../dynamo/conversion/converter_utils.py | 65 ----- .../conversion/custom_ops_converters.py | 12 +- py/torch_tensorrt/dynamo/utils.py | 244 +++++++++++++++++- setup.py | 4 + .../dynamo/distributed/distributed_utils.py | 1 - tests/py/dynamo/distributed/test_nccl_ops.py | 99 ++++--- tests/py/dynamo/distributed/test_nccl_ops.sh | 47 +--- 10 files changed, 352 insertions(+), 164 deletions(-) diff --git a/.github/workflows/build-test-linux-x86_64.yml b/.github/workflows/build-test-linux-x86_64.yml index a81bfcf039..e061aebbd1 100644 --- a/.github/workflows/build-test-linux-x86_64.yml +++ b/.github/workflows/build-test-linux-x86_64.yml @@ -344,6 +344,40 @@ jobs: python -m pytest -m "not critical" -ra -n auto --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_torch_compile_dyn_models_tests_results.xml --ir torch_compile models/test_dyn_models.py popd + L1-dynamo-distributed-tests: + name: Test dynamo distributed [Python] + needs: [filter-matrix, build] + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/tensorrt + package-name: torch_tensorrt + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: packaging/smoke_test_script.sh + uses: ./.github/workflows/linux-test.yml + with: + job-name: tests-py-dynamo-distributed + repository: "pytorch/tensorrt" + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.filter-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + script: | + set -euo pipefail + export USE_HOST_DEPS=1 + export CI_BUILD=1 + export USE_TRTLLM_PLUGINS=1 + dnf install -y mpich mpich-devel openmpi openmpi-devel + pushd . + cd tests/py + cd dynamo + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_distributed_test_results.xml distributed/test_nccl_ops.py + popd + + L2-dynamo-compile-tests: name: L2 dynamo compile tests needs: [filter-matrix, build, L1-dynamo-compile-tests, L1-dynamo-core-tests, L1-torch-compile-tests, L1-torchscript-tests] diff --git a/dev_dep_versions.yml b/dev_dep_versions.yml index 1159951385..8f3df6e509 100644 --- a/dev_dep_versions.yml +++ b/dev_dep_versions.yml @@ -1,3 +1,4 @@ __cuda_version__: "12.8" __tensorrt_version__: "10.13.3" __tensorrt_rtx_version__: "1.0.0" +__tensorrt_llm_version__: "0.17.0.post1" diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 0dc4654db0..a83a622fdf 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -103,6 +103,7 @@ def cross_compile_for_windows( tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL, l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING, offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU, + use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows @@ -176,6 +177,7 @@ def cross_compile_for_windows( enable_weight_streaming (bool): Enable weight streaming. tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. 
We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). + use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -330,6 +332,7 @@ def cross_compile_for_windows( "enable_weight_streaming": enable_weight_streaming, "tiling_optimization_level": tiling_optimization_level, "l2_limit_for_tiling": l2_limit_for_tiling, + "use_distributed_mode_trace": use_distributed_mode_trace, } # disable the following settings is not supported for cross compilation for windows feature @@ -430,6 +433,7 @@ def compile( tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL, l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING, offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU, + use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT @@ -506,6 +510,7 @@ def compile( tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage. + use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -674,6 +679,7 @@ def compile( "tiling_optimization_level": tiling_optimization_level, "l2_limit_for_tiling": l2_limit_for_tiling, "offload_module_to_cpu": offload_module_to_cpu, + "use_distributed_mode_trace": use_distributed_mode_trace, } settings = CompilationSettings(**compilation_options) @@ -1045,6 +1051,7 @@ def convert_exported_program_to_serialized_trt_engine( tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL, l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING, offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU, + use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE, **kwargs: Any, ) -> bytes: """Convert an ExportedProgram to a serialized TensorRT engine @@ -1118,6 +1125,7 @@ def convert_exported_program_to_serialized_trt_engine( tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage. + use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model. 
**kwargs: Any, Returns: bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs @@ -1286,6 +1294,7 @@ def convert_exported_program_to_serialized_trt_engine( "tiling_optimization_level": tiling_optimization_level, "l2_limit_for_tiling": l2_limit_for_tiling, "offload_module_to_cpu": offload_module_to_cpu, + "use_distributed_mode_trace": use_distributed_mode_trace, } settings = CompilationSettings(**compilation_options) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index 3828f97f99..094de488ec 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -1,8 +1,6 @@ import collections -import ctypes import functools import logging -import os from typing import ( Any, Callable, @@ -1124,69 +1122,6 @@ def args_bounds_check( return args[i] if len(args) > i and args[i] is not None else replacement -def load_tensorrt_llm() -> bool: - """ - Attempts to load the TensorRT-LLM plugin and initialize it. - - Returns: - bool: True if the plugin was successfully loaded and initialized, False otherwise. - """ - try: - import tensorrt_llm as trt_llm # noqa: F401 - - _LOGGER.info("TensorRT-LLM successfully imported") - return True - except (ImportError, AssertionError) as e_import_error: - # Check for environment variable for the plugin library path - plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") - if not plugin_lib_path: - _LOGGER.warning( - "TensorRT-LLM is not installed. Please install TensorRT-LLM or set TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops", - ) - return False - - _LOGGER.info(f"TensorRT-LLM Plugin lib path found: {plugin_lib_path}") - try: - # Load the shared library - handle = ctypes.CDLL(plugin_lib_path) - _LOGGER.info(f"Successfully loaded plugin library: {plugin_lib_path}") - except OSError as e_os_error: - _LOGGER.error( - f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}" - f"Ensure the path is correct and the library is compatible", - exc_info=e_os_error, - ) - return False - - try: - # Configure plugin initialization arguments - handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] - handle.initTrtLlmPlugins.restype = ctypes.c_bool - except AttributeError as e_plugin_unavailable: - _LOGGER.warning( - "Unable to initialize the TensorRT-LLM plugin library", - exc_info=e_plugin_unavailable, - ) - return False - - try: - # Initialize the plugin - TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm" - if handle.initTrtLlmPlugins(None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8")): - _LOGGER.info("TensorRT-LLM plugin successfully initialized") - return True - else: - _LOGGER.warning("TensorRT-LLM plugin library failed in initialization") - return False - except Exception as e_initialization_error: - _LOGGER.warning( - "Exception occurred during TensorRT-LLM plugin library initialization", - exc_info=e_initialization_error, - ) - return False - return False - - def promote_trt_tensors_to_same_dtype( ctx: ConversionContext, lhs: TRTTensor, rhs: TRTTensor, name_prefix: str ) -> tuple[TRTTensor, TRTTensor]: diff --git a/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py index 1442c2b17b..aecc99b1f1 100644 --- a/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py +++ 
b/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py @@ -11,15 +11,15 @@ from torch_tensorrt.dynamo.conversion._ConverterRegistry import ( dynamo_tensorrt_converter, ) -from torch_tensorrt.dynamo.conversion.converter_utils import load_tensorrt_llm +from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import ( + tensorrt_fused_nccl_all_gather_op, + tensorrt_fused_nccl_reduce_scatter_op, +) +from torch_tensorrt.dynamo.utils import load_tensorrt_llm_for_nccl _LOGGER: logging.Logger = logging.getLogger(__name__) -if load_tensorrt_llm(): - from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import ( - tensorrt_fused_nccl_all_gather_op, - tensorrt_fused_nccl_reduce_scatter_op, - ) +if load_tensorrt_llm_for_nccl(): @dynamo_tensorrt_converter(tensorrt_fused_nccl_all_gather_op) def fused_nccl_gather( diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 564250e5ae..14e9074702 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -1,11 +1,27 @@ from __future__ import annotations +import ctypes import gc +import getpass import logging +import os +import platform +import tempfile +import urllib.request import warnings from dataclasses import fields, replace from enum import Enum -from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union +from pathlib import Path +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Sequence, + Tuple, + Union, +) import numpy as np import sympy @@ -18,6 +34,7 @@ from torch_tensorrt._features import ENABLED_FEATURES from torch_tensorrt._Input import Input from torch_tensorrt._utils import is_tensorrt_version_supported +from torch_tensorrt._version import __tensorrt_llm_version__ from torch_tensorrt.dynamo import _defaults from torch_tensorrt.dynamo._defaults import default_device from torch_tensorrt.dynamo._engine_cache import BaseEngineCache @@ -34,6 +51,7 @@ RTOL = 5e-3 ATOL = 5e-3 CPU_DEVICE = "cpu" +_WHL_CPYTHON_VERSION = "cp310" class Frameworks(Enum): @@ -90,11 +108,9 @@ def unified_dtype_converter( ) -> Union[np.dtype, torch.dtype, TRTDataType]: """ Convert TensorRT, Numpy, or Torch data types to any other of those data types. - Args: dtype (TRTDataType, torch.dtype, np.dtype): A TensorRT, Numpy, or Torch data type. to (Frameworks): The framework to convert the data type to. - Returns: The equivalent data type in the requested framework. """ @@ -858,3 +874,225 @@ def is_thor() -> bool: if torch.cuda.get_device_capability() in [(11, 0)]: return True return False + + +def is_platform_supported_for_trtllm() -> bool: + """ + Checks if the current platform supports TensorRT-LLM plugins for the NCCL backend. + + Returns: + bool: True if supported, False otherwise. + + Unsupported: + - Windows platforms + - Jetson/Orin/Xavier (aarch64 architecture + 'tegra' in platform release) + """ + system = platform.system().lower() + machine = platform.machine().lower() + release = platform.release().lower() + + if "windows" in system: + logger.info( + "TensorRT-LLM plugins for NCCL backend are not supported on Windows." + ) + return False + + if machine == "aarch64" and "tegra" in release: + logger.info( + "TensorRT-LLM plugins for NCCL backend are not supported on Jetson/Orin/Xavier (Tegra) devices." 
+ ) + return False + + return True + + +def _cache_root() -> Path: + username = getpass.getuser() + return Path(tempfile.gettempdir()) / f"torch_tensorrt_{username}" + + +def _extracted_dir_trtllm(platform_system: str, platform_machine: str) -> Path: + return ( + _cache_root() + / "trtllm" + / f"{__tensorrt_llm_version__}_{platform_system}_{platform_machine}" + ) + + +def download_and_get_plugin_lib_path() -> Optional[str]: + """ + Returns the path to the TensorRT‑LLM shared library, downloading and extracting if necessary. + + Args: + platform (str): Platform identifier (e.g., 'linux_x86_64') + + Returns: + Optional[str]: Path to shared library or None if operation fails. + """ + platform_system = platform.system().lower() + platform_machine = platform.machine().lower() + wheel_filename = ( + f"tensorrt_llm-{__tensorrt_llm_version__}-{_WHL_CPYTHON_VERSION}-" + f"{_WHL_CPYTHON_VERSION}-{platform_system}_{platform_machine}.whl" + ) + wheel_path = _cache_root() / wheel_filename + extract_dir = _extracted_dir_trtllm(platform_system, platform_machine) + # else will never be met though + lib_filename = ( + "libnvinfer_plugin_tensorrt_llm.so" + if "linux" in platform_system + else "libnvinfer_plugin_tensorrt_llm.dll" + ) + # eg: /tmp/torch_tensorrt_/trtllm/0.17.0.post1_linux_x86_64/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so + plugin_lib_path = extract_dir / "tensorrt_llm" / "libs" / lib_filename + + if plugin_lib_path.exists(): + return str(plugin_lib_path) + + wheel_path.parent.mkdir(parents=True, exist_ok=True) + extract_dir.mkdir(parents=True, exist_ok=True) + + if not wheel_path.exists(): + base_url = "https://pypi.nvidia.com/tensorrt-llm/" + download_url = base_url + wheel_filename + try: + logger.debug(f"Downloading {download_url} ...") + urllib.request.urlretrieve(download_url, wheel_path) + logger.debug("Download succeeded and TRT-LLM wheel is now present") + except urllib.error.HTTPError as e: + logger.error( + f"HTTP error {e.code} when trying to download {download_url}: {e.reason}" + ) + except urllib.error.URLError as e: + logger.error( + f"URL error when trying to download {download_url}: {e.reason}" + ) + except OSError as e: + logger.error(f"Local file write error: {e}") + + try: + import zipfile + except ImportError as e: + raise ImportError( + "zipfile module is required but not found. 
Please install zipfile" + ) + try: + with zipfile.ZipFile(wheel_path) as zip_ref: + zip_ref.extractall(extract_dir) + logger.debug(f"Extracted wheel to {extract_dir}") + except FileNotFoundError as e: + # This should capture the errors in the download failure above + logger.error(f"Wheel file not found at {wheel_path}: {e}") + raise RuntimeError( + f"Failed to find downloaded wheel file at {wheel_path}" + ) from e + except zipfile.BadZipFile as e: + logger.error(f"Invalid or corrupted wheel file: {e}") + raise RuntimeError( + "Downloaded wheel file is corrupted or not a valid zip archive" + ) from e + except Exception as e: + logger.error(f"Unexpected error while extracting wheel: {e}") + raise RuntimeError( + "Unexpected error during extraction of TensorRT-LLM wheel" + ) from e + + try: + wheel_path.unlink(missing_ok=True) + logger.debug(f"Deleted wheel file: {wheel_path}") + except Exception as e: + logger.warning(f"Could not delete wheel file {wheel_path}: {e}") + if not plugin_lib_path.exists(): + logger.error( + f"Plugin library not found at expected location: {plugin_lib_path}" + ) + return None + + return str(plugin_lib_path) + + +def load_and_initialize_trtllm_plugin(plugin_lib_path: str) -> bool: + """ + Loads and initializes the TensorRT-LLM plugin from the given shared library path. + + Args: + plugin_lib_path (str): Path to the shared TensorRT-LLM plugin library. + + Returns: + bool: True if successful, False otherwise. + """ + try: + handle = ctypes.CDLL(plugin_lib_path) + logger.info(f"Successfully loaded plugin library: {plugin_lib_path}") + except OSError as e_os_error: + if "libmpi" in str(e_os_error): + logger.warning( + f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}, got error {e_os_error} (hint: libmpi.so is a necessary dependency; ensure that OpenMPI or MPICH is installed on your system)", + exc_info=e_os_error, + ) + else: + logger.warning( + f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}. " + f"Ensure the path is correct and the library is compatible.", + exc_info=e_os_error, + ) + return False + + try: + handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] + handle.initTrtLlmPlugins.restype = ctypes.c_bool + except AttributeError as e_plugin_unavailable: + logger.warning( + "Unable to initialize the TensorRT-LLM plugin library", + exc_info=e_plugin_unavailable, + ) + return False + + try: + if handle.initTrtLlmPlugins(None, b"tensorrt_llm"): + logger.info("TensorRT-LLM plugin successfully initialized") + return True + else: + logger.warning("TensorRT-LLM plugin library failed in initialization") + return False + except Exception as e_initialization_error: + logger.warning( + "Exception occurred during TensorRT-LLM plugin library initialization", + exc_info=e_initialization_error, + ) + return False + return False + + +def load_tensorrt_llm_for_nccl() -> bool: + """ + Attempts to load the TensorRT-LLM plugin and initialize it. + Either the env variable TRTLLM_PLUGINS_PATH can specify the path + Or the user can specify USE_TRTLLM_PLUGINS as either of (1, true, yes, on) to download the TRT-LLM distribution and load it + + Returns: + bool: True if the plugin was successfully loaded and initialized, False otherwise. 
+ """ + if not is_platform_supported_for_trtllm(): + return False + plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") + + if plugin_lib_path: + return load_and_initialize_trtllm_plugin(plugin_lib_path) + else: + # this option can be used by user if TRTLLM_PLUGINS_PATH is not set by user + use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( + "1", + "true", + "yes", + "on", + ) + if not use_trtllm_plugin: + logger.warning( + "Neither TRTLLM_PLUGIN_PATH is set nor is it directed to download the shared library. Please set either of the two to use TRT-LLM libraries in torchTRT" + ) + return False + + plugin_lib_path = download_and_get_plugin_lib_path() + return load_and_initialize_trtllm_plugin(plugin_lib_path) # type: ignore[arg-type] + return False diff --git a/setup.py b/setup.py index 878e4de4ca..d487530626 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,7 @@ __cuda_version__: str = "0.0" __tensorrt_version__: str = "0.0" __tensorrt_rtx_version__: str = "0.0" +__tensorrt_llm_version__: str = "0.0" LEGACY_BASE_VERSION_SUFFIX_PATTERN = re.compile("a0$") # CI_PIPELINE_ID is the environment variable set by DLFW ci build @@ -69,6 +70,7 @@ def load_dep_info(): global __cuda_version__ global __tensorrt_version__ global __tensorrt_rtx_version__ + global __tensorrt_llm_version__ with open("dev_dep_versions.yml", "r") as stream: versions = yaml.safe_load(stream) if (gpu_arch_version := os.environ.get("CU_VERSION")) is not None: @@ -79,6 +81,7 @@ def load_dep_info(): __cuda_version__ = versions["__cuda_version__"] __tensorrt_version__ = versions["__tensorrt_version__"] __tensorrt_rtx_version__ = versions["__tensorrt_rtx_version__"] + __tensorrt_llm_version__ = versions["__tensorrt_llm_version__"] load_dep_info() @@ -245,6 +248,7 @@ def gen_version_file(): f.write('__cuda_version__ = "' + __cuda_version__ + '"\n') f.write('__tensorrt_version__ = "' + __tensorrt_version__ + '"\n') f.write('__tensorrt_rtx_version__ = "' + __tensorrt_rtx_version__ + '"\n') + f.write('__tensorrt_llm_version__ = "' + __tensorrt_llm_version__ + '"\n') def copy_libtorchtrt(multilinux=False, rt_only=False): diff --git a/tests/py/dynamo/distributed/distributed_utils.py b/tests/py/dynamo/distributed/distributed_utils.py index e3062249fa..bc058aaaec 100644 --- a/tests/py/dynamo/distributed/distributed_utils.py +++ b/tests/py/dynamo/distributed/distributed_utils.py @@ -13,7 +13,6 @@ def set_environment_variables_pytest(): os.environ["RANK"] = str(0) os.environ["MASTER_ADDR"] = "127.0.0.1" os.environ["MASTER_PORT"] = str(29500) - os.environ["USE_TRTLLM_PLUGINS"] = "1" def initialize_logger(rank, logger_file_name): diff --git a/tests/py/dynamo/distributed/test_nccl_ops.py b/tests/py/dynamo/distributed/test_nccl_ops.py index 89c94300b7..3043954ece 100644 --- a/tests/py/dynamo/distributed/test_nccl_ops.py +++ b/tests/py/dynamo/distributed/test_nccl_ops.py @@ -1,42 +1,72 @@ import os +import unittest import torch import torch.distributed as dist import torch.nn as nn +from conversion.harness import DispatchTestCase from distributed_utils import set_environment_variables_pytest from parameterized import parameterized from torch.testing._internal.common_utils import run_tests +from torch_tensorrt.dynamo.utils import is_platform_supported_for_trtllm -set_environment_variables_pytest() -dist.init_process_group(backend="nccl", init_method="env://") -group = dist.new_group(ranks=[0]) -group_name = group.group_name -world_size = 1 -from conversion.harness import DispatchTestCase +class 
DistributedGatherModel(nn.Module): + def __init__(self, input_dim, world_size, group_name): + super().__init__() + self.fc = nn.Linear(input_dim, input_dim) + self.world_size = world_size + self.group_name = group_name + def forward(self, x): + x = self.fc(x) + gathered_tensor = torch.ops._c10d_functional.all_gather_into_tensor( + x, self.world_size, self.group_name + ) + return torch.ops._c10d_functional.wait_tensor(gathered_tensor) -class TestGatherNcclOpsConverter(DispatchTestCase): - @parameterized.expand([8]) - def test_nccl_ops(self, linear_layer_dim): - class DistributedGatherModel(nn.Module): - def __init__(self, input_dim): - super().__init__() - self.fc = torch.nn.Linear(input_dim, input_dim) - def forward(self, x): - x = self.fc(x) - gathered_tensor = torch.ops._c10d_functional.all_gather_into_tensor( - x, world_size, group_name - ) - gathered_tensor = torch.ops._c10d_functional.wait_tensor( - gathered_tensor - ) - return gathered_tensor +class DistributedReduceScatterModel(nn.Module): + def __init__(self, input_dim, world_size, group_name): + super().__init__() + self.fc = nn.Linear(input_dim, input_dim) + self.world_size = world_size + self.group_name = group_name + + def forward(self, x): + x = self.fc(x) + out = torch.ops._c10d_functional.reduce_scatter_tensor( + x, "sum", self.world_size, self.group_name + ) + return torch.ops._c10d_functional.wait_tensor(out) + +class TestNcclOpsConverter(DispatchTestCase): + @unittest.skipIf( + not is_platform_supported_for_trtllm(), + "Skipped on Windows, Jetson: NCCL backend is not supported.", + ) + @classmethod + def setUpClass(cls): + set_environment_variables_pytest() + cls.world_size = 1 + if not dist.is_initialized(): + dist.init_process_group(backend="nccl") + cls.group = dist.new_group(ranks=[0]) + cls.group_name = cls.group.group_name + + @classmethod + def tearDownClass(cls): + if dist.is_initialized(): + dist.destroy_process_group() + + @parameterized.expand([8]) + def test_nccl_ops_gather(self, linear_layer_dim): inputs = [torch.randn(1, linear_layer_dim).to("cuda")] self.run_test( - DistributedGatherModel(linear_layer_dim).cuda(), + DistributedGatherModel( + linear_layer_dim, self.world_size, self.group_name + ).cuda(), inputs, use_dynamo_tracer=True, enable_passes=True, @@ -44,28 +74,11 @@ def forward(self, x): @parameterized.expand([8]) def test_nccl_ops_scatter(self, linear_layer_dim): - - class DistributedReduceScatterModel(nn.Module): - def __init__(self, input_dim): - super().__init__() - self.fc = torch.nn.Linear(input_dim, input_dim) - - def forward(self, x): - x = self.fc(x) - scatter_reduce_tensor = ( - torch.ops._c10d_functional.reduce_scatter_tensor( - x, "sum", world_size, group_name - ) - ) - scatter_reduce_tensor = torch.ops._c10d_functional.wait_tensor( - scatter_reduce_tensor - ) - return scatter_reduce_tensor - inputs = [torch.zeros(1, linear_layer_dim).to("cuda")] - self.run_test( - DistributedReduceScatterModel(linear_layer_dim).cuda(), + DistributedReduceScatterModel( + linear_layer_dim, self.world_size, self.group_name + ).cuda(), inputs, use_dynamo_tracer=True, enable_passes=True, diff --git a/tests/py/dynamo/distributed/test_nccl_ops.sh b/tests/py/dynamo/distributed/test_nccl_ops.sh index dd54700048..677d0cb9bc 100644 --- a/tests/py/dynamo/distributed/test_nccl_ops.sh +++ b/tests/py/dynamo/distributed/test_nccl_ops.sh @@ -70,51 +70,6 @@ ensure_pytest_installed(){ echo "Setting up the environment" -OS="$(uname -s)" -ARCH="$(uname -m)" - - -#getting the file name for TensorRT-LLM download -if [[ "$OS" 
== "Linux" && "$ARCH" == "x86_64"]]; then - FILE="tensorrt_llm-0.17.0.post1-cp312-cp312-linux_x86_64.whl" -elif [[ "$OS" == "Linux" && "$ARCH" == "aarch64"]]; then - FILE="tensorrt_llm-0.17.0.post1-cp312-cp312-linux_aarch64.whl" -else: - echo "Unsupported platform: OS=$OS ARCH=$ARCH - exit 1 -fi - -# Download the selected file -URL="https://pypi.nvidia.com/tensorrt-llm/$FILE" -echo "Downloading $FILE from $URL..." - -#Installing wget -ensure_installed wget - -#Downloading the file -filename=$(basename "$URL") -if [ -f "$filename" ]; then - echo "File already exists: $filename" -else - wget "$URL" -fi -echo "Download complete: $FILE" - -UNZIP_DIR="tensorrt_llm_unzip" -if [[ ! -d "$UNZIP_DIR" ]]; then - echo "Creating directory: $UNZIP_DIR" - mkdir -p "$UNZIP_DIR" - echo "extracting $FILE to $UNZIP_DIR ..." - #Installing unzip - ensure_installed unzip - #unzip the TensorRT-LLM package - unzip -q "$FILE" -d "$UNZIP_DIR" - echo "Unzip complete" -fi - - -export TRTLLM_PLUGINS_PATH="$(pwd)/${UNZIP_DIR}/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so" -echo ${TRTLLM_PLUGINS_PATH} ensure_mpi_installed libmpich-dev ensure_mpi_installed libopenmpi-dev @@ -123,7 +78,7 @@ run_tests() { cd .. export PYTHONPATH=$(pwd) echo "Running pytest on distributed/test_nccl_ops.py..." - pytest distributed/test_nccl_ops.py + USE_TRTLLM_PLUGINS=1 pytest distributed/test_nccl_ops.py } run_mpi_tests(){ From 9b45f43d52682ef901bd40e3065e2d0b28dcd924 Mon Sep 17 00:00:00 2001 From: apbose Date: Tue, 23 Sep 2025 00:39:10 -0700 Subject: [PATCH 2/9] Changes for CUDA13 --- py/torch_tensorrt/dynamo/utils.py | 18 ++++++++++++++++++ tests/py/dynamo/distributed/test_nccl_ops.py | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 14e9074702..8fb31122d0 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -886,6 +886,7 @@ def is_platform_supported_for_trtllm() -> bool: Unsupported: - Windows platforms - Jetson/Orin/Xavier (aarch64 architecture + 'tegra' in platform release) + - CUDA 13 not supported """ system = platform.system().lower() machine = platform.machine().lower() @@ -903,6 +904,23 @@ def is_platform_supported_for_trtllm() -> bool: ) return False + try: + cuda_version = torch.version.cuda # e.g., "12.4" or "13.0" + if cuda_version is None: + logger.warning("No CUDA runtime detected — TRT-LLM plugins unavailable.") + return False + + major, minor = map(int, cuda_version.split(".")) + if major != 12: + logger.warning("CUDA 13 is not supported for TRT-LLM plugins.") + return False + + return True + + except Exception as e: + logger.warning(f"Failed to detect CUDA version: {e}") + return False + return True diff --git a/tests/py/dynamo/distributed/test_nccl_ops.py b/tests/py/dynamo/distributed/test_nccl_ops.py index 3043954ece..0aca81d506 100644 --- a/tests/py/dynamo/distributed/test_nccl_ops.py +++ b/tests/py/dynamo/distributed/test_nccl_ops.py @@ -44,7 +44,7 @@ def forward(self, x): class TestNcclOpsConverter(DispatchTestCase): @unittest.skipIf( not is_platform_supported_for_trtllm(), - "Skipped on Windows, Jetson: NCCL backend is not supported.", + "Skipped on Windows, Jetson and CUDA13: NCCL backend is not supported.", ) @classmethod def setUpClass(cls): From cee5c7a93a82c73de11b8674d2799c2b0d5d38d1 Mon Sep 17 00:00:00 2001 From: apbose Date: Tue, 30 Sep 2025 13:17:16 -0700 Subject: [PATCH 3/9] Addressing review comments- include in enabled feature and error logging. 
Pending- check support on Thor and sbsa --- py/torch_tensorrt/_features.py | 16 +++++ .../conversion/custom_ops_converters.py | 67 +++++++++---------- py/torch_tensorrt/dynamo/utils.py | 8 ++- 3 files changed, 54 insertions(+), 37 deletions(-) diff --git a/py/torch_tensorrt/_features.py b/py/torch_tensorrt/_features.py index f7e4e91626..b11be05535 100644 --- a/py/torch_tensorrt/_features.py +++ b/py/torch_tensorrt/_features.py @@ -9,6 +9,7 @@ check_cross_compile_trt_win_lib, sanitized_torch_version, ) +from torch_tensorrt.dynamo.utils import load_tensorrt_llm_for_nccl from packaging import version @@ -23,6 +24,7 @@ "qdp_plugin", "windows_cross_compile", "tensorrt_rtx", + "trtllm_for_nccl", ], ) @@ -48,6 +50,7 @@ _FX_FE_AVAIL = False if _TENSORRT_RTX else True _REFIT_AVAIL = True _WINDOWS_CROSS_COMPILE = check_cross_compile_trt_win_lib() +_TRTLLM_AVAIL = load_tensorrt_llm_for_nccl() if importlib.util.find_spec("tensorrt.plugin"): _QDP_PLUGIN_AVAIL = True @@ -63,6 +66,7 @@ _QDP_PLUGIN_AVAIL, _WINDOWS_CROSS_COMPILE, _TENSORRT_RTX, + _TRTLLM_AVAIL, ) T = TypeVar("T") @@ -158,6 +162,18 @@ def not_implemented(*args: List[Any], **kwargs: Dict[str, Any]) -> Any: return wrapper +def needs_trtllm_for_nccl(f: Callable[..., Any]) -> Callable[..., Any]: + def wrapper(*args: List[Any], **kwargs: Dict[str, Any]) -> Any: + if ENABLED_FEATURES.trtllm_for_nccl: + return f(*args, **kwargs) + else: + raise NotImplementedError( + "TensorRT-LLM plugins for NCCL backend could not be loaded" + ) + + return wrapper + + def for_all_methods( decorator: Callable[..., Any], exclude: Optional[List[str]] = None ) -> Callable[..., Any]: diff --git a/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py index aecc99b1f1..db14e3528b 100644 --- a/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py @@ -5,6 +5,7 @@ import tensorrt as trt from torch.fx.node import Argument, Target +from torch_tensorrt._features import needs_trtllm_for_nccl from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion import impl from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -15,45 +16,41 @@ tensorrt_fused_nccl_all_gather_op, tensorrt_fused_nccl_reduce_scatter_op, ) -from torch_tensorrt.dynamo.utils import load_tensorrt_llm_for_nccl _LOGGER: logging.Logger = logging.getLogger(__name__) -if load_tensorrt_llm_for_nccl(): - @dynamo_tensorrt_converter(tensorrt_fused_nccl_all_gather_op) - def fused_nccl_gather( - ctx: ConversionContext, - target: Target, - args: Tuple[Argument, ...], - kwargs: Dict[str, Argument], - name: str, - ) -> Union[trt.ITensor, Sequence[trt.ITensor]]: - return impl.nccl_ops.nccl_gather( - ctx, - target, - SourceIR.ATEN, - name, - [args[0]], - ) +@needs_trtllm_for_nccl +@dynamo_tensorrt_converter(tensorrt_fused_nccl_all_gather_op) +def fused_nccl_gather( + ctx: ConversionContext, + target: Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + name: str, +) -> Union[trt.ITensor, Sequence[trt.ITensor]]: + return impl.nccl_ops.nccl_gather( + ctx, + target, + SourceIR.ATEN, + name, + [args[0]], + ) - @dynamo_tensorrt_converter(tensorrt_fused_nccl_reduce_scatter_op) - def fused_nccl_reduce_scatter( - ctx: ConversionContext, - target: Target, - args: Tuple[Argument, ...], - kwargs: Dict[str, Argument], - name: str, - ) -> Union[trt.ITensor, Sequence[trt.ITensor]]: - return 
impl.nccl_ops.nccl_reduce_scatter( - ctx, - target, - SourceIR.ATEN, - name, - [args[0]], - ) -else: - _LOGGER.debug( - "Did not load torch.distributed converters since TensorRT-LLM is not available" +@needs_trtllm_for_nccl +@dynamo_tensorrt_converter(tensorrt_fused_nccl_reduce_scatter_op) +def fused_nccl_reduce_scatter( + ctx: ConversionContext, + target: Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + name: str, +) -> Union[trt.ITensor, Sequence[trt.ITensor]]: + return impl.nccl_ops.nccl_reduce_scatter( + ctx, + target, + SourceIR.ATEN, + name, + [args[0]], ) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 8fb31122d0..8d4f3b59be 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -907,12 +907,16 @@ def is_platform_supported_for_trtllm() -> bool: try: cuda_version = torch.version.cuda # e.g., "12.4" or "13.0" if cuda_version is None: - logger.warning("No CUDA runtime detected — TRT-LLM plugins unavailable.") + logger.error( + "This pytorch build does not support CUDA, please reinstall pytorch with CUDA support" + ) return False major, minor = map(int, cuda_version.split(".")) if major != 12: - logger.warning("CUDA 13 is not supported for TRT-LLM plugins.") + logger.error( + "CUDA 13 is not supported for TRT-LLM plugins. Please install pytorch with CUDA 12.x support" + ) return False return True From 6bbd852d9bad53e13024403b750ddfae78135530 Mon Sep 17 00:00:00 2001 From: apbose Date: Tue, 30 Sep 2025 19:30:04 -0700 Subject: [PATCH 4/9] excluding thor from the supported platform of TRTLLM wheel --- py/torch_tensorrt/dynamo/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 8d4f3b59be..8ea6baf5b8 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -898,9 +898,9 @@ def is_platform_supported_for_trtllm() -> bool: ) return False - if machine == "aarch64" and "tegra" in release: + if machine == "aarch64" and "tegra" in release or is_thor(): logger.info( - "TensorRT-LLM plugins for NCCL backend are not supported on Jetson/Orin/Xavier (Tegra) devices." + "TensorRT-LLM plugins for NCCL backend are not supported on Jetson/Orin/Xavier (Tegra) or Thor devices." 
) return False From 7046c6d1b25c5538f4e43ab44b1021250f22caca Mon Sep 17 00:00:00 2001 From: apbose Date: Thu, 2 Oct 2025 12:42:49 -0700 Subject: [PATCH 5/9] fixing circular imports --- py/torch_tensorrt/_features.py | 12 +- py/torch_tensorrt/_utils.py | 265 +++++++++++++++++++++++++++++- py/torch_tensorrt/dynamo/utils.py | 259 ----------------------------- 3 files changed, 272 insertions(+), 264 deletions(-) diff --git a/py/torch_tensorrt/_features.py b/py/torch_tensorrt/_features.py index b11be05535..03cf4256ec 100644 --- a/py/torch_tensorrt/_features.py +++ b/py/torch_tensorrt/_features.py @@ -7,9 +7,9 @@ import tensorrt from torch_tensorrt._utils import ( check_cross_compile_trt_win_lib, + load_tensorrt_llm_for_nccl, sanitized_torch_version, ) -from torch_tensorrt.dynamo.utils import load_tensorrt_llm_for_nccl from packaging import version @@ -167,9 +167,13 @@ def wrapper(*args: List[Any], **kwargs: Dict[str, Any]) -> Any: if ENABLED_FEATURES.trtllm_for_nccl: return f(*args, **kwargs) else: - raise NotImplementedError( - "TensorRT-LLM plugins for NCCL backend could not be loaded" - ) + + def not_implemented(*args: List[Any], **kwargs: Dict[str, Any]) -> Any: + raise NotImplementedError( + "Refit feature is currently not available in Python 3.13 or higher" + ) + + return not_implemented(*args, **kwargs) return wrapper diff --git a/py/torch_tensorrt/_utils.py b/py/torch_tensorrt/_utils.py index b981fb325a..740b09740c 100644 --- a/py/torch_tensorrt/_utils.py +++ b/py/torch_tensorrt/_utils.py @@ -1,9 +1,22 @@ +import ctypes +import getpass +import logging +import os +import platform import sys -from typing import Any +import tempfile +import urllib.request +from pathlib import Path +from typing import Any, Optional import tensorrt as trt import torch +logger = logging.getLogger(__name__) + +_WHL_CPYTHON_VERSION = "cp310" +_TENSORRT_LLM_VERSION = "0.17.0.post1" + def sanitized_torch_version() -> Any: return ( @@ -50,3 +63,253 @@ def is_tensorrt_version_supported(min_version: str) -> bool: except (ImportError, ValueError): # If tensorrt is not installed or version cannot be determined return False + + +def is_thor() -> bool: + if torch.cuda.get_device_capability() in [(11, 0)]: + return True + return False + + +def is_platform_supported_for_trtllm() -> bool: + """ + Checks if the current platform supports TensorRT-LLM plugins for the NCCL backend. + + Returns: + bool: True if supported, False otherwise. + + Unsupported: + - Windows platforms + - Jetson/Orin/Xavier (aarch64 architecture + 'tegra' in platform release) + - CUDA 13 not supported + """ + system = platform.system().lower() + machine = platform.machine().lower() + release = platform.release().lower() + + if "windows" in system: + logger.info( + "TensorRT-LLM plugins for NCCL backend are not supported on Windows." + ) + return False + + if machine == "aarch64" and "tegra" in release or is_thor(): + logger.info( + "TensorRT-LLM plugins for NCCL backend are not supported on Jetson/Orin/Xavier (Tegra) or Thor devices." + ) + return False + + try: + cuda_version = torch.version.cuda # e.g., "12.4" or "13.0" + if cuda_version is None: + logger.error( + "This pytorch build does not support CUDA, please reinstall pytorch with CUDA support" + ) + return False + + major, minor = map(int, cuda_version.split(".")) + if major != 12: + logger.error( + "CUDA 13 is not supported for TRT-LLM plugins. 
Please install pytorch with CUDA 12.x support" + ) + return False + + return True + + except Exception as e: + logger.warning(f"Failed to detect CUDA version: {e}") + return False + + return True + + +def _cache_root() -> Path: + username = getpass.getuser() + return Path(tempfile.gettempdir()) / f"torch_tensorrt_{username}" + + +def _extracted_dir_trtllm(platform_system: str, platform_machine: str) -> Path: + return ( + _cache_root() + / "trtllm" + / f"{__TENSORRT_LLM_VERSION__}_{platform_system}_{platform_machine}" + ) + + +def download_and_get_plugin_lib_path() -> Optional[str]: + """ + Returns the path to the TensorRT‑LLM shared library, downloading and extracting if necessary. + + Args: + platform (str): Platform identifier (e.g., 'linux_x86_64') + + Returns: + Optional[str]: Path to shared library or None if operation fails. + """ + platform_system = platform.system().lower() + platform_machine = platform.machine().lower() + wheel_filename = ( + f"tensorrt_llm-{__TENSORRT_LLM_VERSION__}-{_WHL_CPYTHON_VERSION}-" + f"{_WHL_CPYTHON_VERSION}-{platform_system}_{platform_machine}.whl" + ) + wheel_path = _cache_root() / wheel_filename + extract_dir = _extracted_dir_trtllm(platform_system, platform_machine) + # else will never be met though + lib_filename = ( + "libnvinfer_plugin_tensorrt_llm.so" + if "linux" in platform_system + else "libnvinfer_plugin_tensorrt_llm.dll" + ) + # eg: /tmp/torch_tensorrt_/trtllm/0.17.0.post1_linux_x86_64/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so + plugin_lib_path = extract_dir / "tensorrt_llm" / "libs" / lib_filename + + if plugin_lib_path.exists(): + return str(plugin_lib_path) + + wheel_path.parent.mkdir(parents=True, exist_ok=True) + extract_dir.mkdir(parents=True, exist_ok=True) + + if not wheel_path.exists(): + base_url = "https://pypi.nvidia.com/tensorrt-llm/" + download_url = base_url + wheel_filename + try: + logger.debug(f"Downloading {download_url} ...") + urllib.request.urlretrieve(download_url, wheel_path) + logger.debug("Download succeeded and TRT-LLM wheel is now present") + except urllib.error.HTTPError as e: + logger.error( + f"HTTP error {e.code} when trying to download {download_url}: {e.reason}" + ) + except urllib.error.URLError as e: + logger.error( + f"URL error when trying to download {download_url}: {e.reason}" + ) + except OSError as e: + logger.error(f"Local file write error: {e}") + + try: + import zipfile + except ImportError as e: + raise ImportError( + "zipfile module is required but not found. 
Please install zipfile" + ) + try: + with zipfile.ZipFile(wheel_path) as zip_ref: + zip_ref.extractall(extract_dir) + logger.debug(f"Extracted wheel to {extract_dir}") + except FileNotFoundError as e: + # This should capture the errors in the download failure above + logger.error(f"Wheel file not found at {wheel_path}: {e}") + raise RuntimeError( + f"Failed to find downloaded wheel file at {wheel_path}" + ) from e + except zipfile.BadZipFile as e: + logger.error(f"Invalid or corrupted wheel file: {e}") + raise RuntimeError( + "Downloaded wheel file is corrupted or not a valid zip archive" + ) from e + except Exception as e: + logger.error(f"Unexpected error while extracting wheel: {e}") + raise RuntimeError( + "Unexpected error during extraction of TensorRT-LLM wheel" + ) from e + + try: + wheel_path.unlink(missing_ok=True) + logger.debug(f"Deleted wheel file: {wheel_path}") + except Exception as e: + logger.warning(f"Could not delete wheel file {wheel_path}: {e}") + if not plugin_lib_path.exists(): + logger.error( + f"Plugin library not found at expected location: {plugin_lib_path}" + ) + return None + + return str(plugin_lib_path) + + +def load_and_initialize_trtllm_plugin(plugin_lib_path: str) -> bool: + """ + Loads and initializes the TensorRT-LLM plugin from the given shared library path. + + Args: + plugin_lib_path (str): Path to the shared TensorRT-LLM plugin library. + + Returns: + bool: True if successful, False otherwise. + """ + try: + handle = ctypes.CDLL(plugin_lib_path) + logger.info(f"Successfully loaded plugin library: {plugin_lib_path}") + except OSError as e_os_error: + if "libmpi" in str(e_os_error): + logger.warning( + f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}, got error {e_os_error} (hint: libmpi.so is a necessary dependency; ensure that OpenMPI or MPICH is installed on your system)", + exc_info=e_os_error, + ) + else: + logger.warning( + f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}. " + f"Ensure the path is correct and the library is compatible.", + exc_info=e_os_error, + ) + return False + + try: + handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] + handle.initTrtLlmPlugins.restype = ctypes.c_bool + except AttributeError as e_plugin_unavailable: + logger.warning( + "Unable to initialize the TensorRT-LLM plugin library", + exc_info=e_plugin_unavailable, + ) + return False + + try: + if handle.initTrtLlmPlugins(None, b"tensorrt_llm"): + logger.info("TensorRT-LLM plugin successfully initialized") + return True + else: + logger.warning("TensorRT-LLM plugin library failed in initialization") + return False + except Exception as e_initialization_error: + logger.warning( + "Exception occurred during TensorRT-LLM plugin library initialization", + exc_info=e_initialization_error, + ) + return False + return False + + +def load_tensorrt_llm_for_nccl() -> bool: + """ + Attempts to load the TensorRT-LLM plugin and initialize it. + Either the env variable TRTLLM_PLUGINS_PATH can specify the path + Or the user can specify USE_TRTLLM_PLUGINS as either of (1, true, yes, on) to download the TRT-LLM distribution and load it + + Returns: + bool: True if the plugin was successfully loaded and initialized, False otherwise. 
+ """ + if not is_platform_supported_for_trtllm(): + return False + plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") + + if plugin_lib_path: + return load_and_initialize_trtllm_plugin(plugin_lib_path) + else: + # this option can be used by user if TRTLLM_PLUGINS_PATH is not set by user + use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( + "1", + "true", + "yes", + "on", + ) + if not use_trtllm_plugin: + logger.warning( + "Neither TRTLLM_PLUGIN_PATH is set nor is it directed to download the shared library. Please set either of the two to use TRT-LLM libraries in torchTRT" + ) + return False + + plugin_lib_path = download_and_get_plugin_lib_path() + return load_and_initialize_trtllm_plugin(plugin_lib_path) # type: ignore[arg-type] + return False diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 8ea6baf5b8..e7927bef84 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -1,17 +1,10 @@ from __future__ import annotations -import ctypes import gc -import getpass import logging -import os -import platform -import tempfile -import urllib.request import warnings from dataclasses import fields, replace from enum import Enum -from pathlib import Path from typing import ( Any, Callable, @@ -34,7 +27,6 @@ from torch_tensorrt._features import ENABLED_FEATURES from torch_tensorrt._Input import Input from torch_tensorrt._utils import is_tensorrt_version_supported -from torch_tensorrt._version import __tensorrt_llm_version__ from torch_tensorrt.dynamo import _defaults from torch_tensorrt.dynamo._defaults import default_device from torch_tensorrt.dynamo._engine_cache import BaseEngineCache @@ -51,7 +43,6 @@ RTOL = 5e-3 ATOL = 5e-3 CPU_DEVICE = "cpu" -_WHL_CPYTHON_VERSION = "cp310" class Frameworks(Enum): @@ -868,253 +859,3 @@ def is_tegra_platform() -> bool: if torch.cuda.get_device_capability() in [(8, 7), (7, 2)]: return True return False - - -def is_thor() -> bool: - if torch.cuda.get_device_capability() in [(11, 0)]: - return True - return False - - -def is_platform_supported_for_trtllm() -> bool: - """ - Checks if the current platform supports TensorRT-LLM plugins for the NCCL backend. - - Returns: - bool: True if supported, False otherwise. - - Unsupported: - - Windows platforms - - Jetson/Orin/Xavier (aarch64 architecture + 'tegra' in platform release) - - CUDA 13 not supported - """ - system = platform.system().lower() - machine = platform.machine().lower() - release = platform.release().lower() - - if "windows" in system: - logger.info( - "TensorRT-LLM plugins for NCCL backend are not supported on Windows." - ) - return False - - if machine == "aarch64" and "tegra" in release or is_thor(): - logger.info( - "TensorRT-LLM plugins for NCCL backend are not supported on Jetson/Orin/Xavier (Tegra) or Thor devices." - ) - return False - - try: - cuda_version = torch.version.cuda # e.g., "12.4" or "13.0" - if cuda_version is None: - logger.error( - "This pytorch build does not support CUDA, please reinstall pytorch with CUDA support" - ) - return False - - major, minor = map(int, cuda_version.split(".")) - if major != 12: - logger.error( - "CUDA 13 is not supported for TRT-LLM plugins. 
Please install pytorch with CUDA 12.x support" - ) - return False - - return True - - except Exception as e: - logger.warning(f"Failed to detect CUDA version: {e}") - return False - - return True - - -def _cache_root() -> Path: - username = getpass.getuser() - return Path(tempfile.gettempdir()) / f"torch_tensorrt_{username}" - - -def _extracted_dir_trtllm(platform_system: str, platform_machine: str) -> Path: - return ( - _cache_root() - / "trtllm" - / f"{__tensorrt_llm_version__}_{platform_system}_{platform_machine}" - ) - - -def download_and_get_plugin_lib_path() -> Optional[str]: - """ - Returns the path to the TensorRT‑LLM shared library, downloading and extracting if necessary. - - Args: - platform (str): Platform identifier (e.g., 'linux_x86_64') - - Returns: - Optional[str]: Path to shared library or None if operation fails. - """ - platform_system = platform.system().lower() - platform_machine = platform.machine().lower() - wheel_filename = ( - f"tensorrt_llm-{__tensorrt_llm_version__}-{_WHL_CPYTHON_VERSION}-" - f"{_WHL_CPYTHON_VERSION}-{platform_system}_{platform_machine}.whl" - ) - wheel_path = _cache_root() / wheel_filename - extract_dir = _extracted_dir_trtllm(platform_system, platform_machine) - # else will never be met though - lib_filename = ( - "libnvinfer_plugin_tensorrt_llm.so" - if "linux" in platform_system - else "libnvinfer_plugin_tensorrt_llm.dll" - ) - # eg: /tmp/torch_tensorrt_/trtllm/0.17.0.post1_linux_x86_64/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so - plugin_lib_path = extract_dir / "tensorrt_llm" / "libs" / lib_filename - - if plugin_lib_path.exists(): - return str(plugin_lib_path) - - wheel_path.parent.mkdir(parents=True, exist_ok=True) - extract_dir.mkdir(parents=True, exist_ok=True) - - if not wheel_path.exists(): - base_url = "https://pypi.nvidia.com/tensorrt-llm/" - download_url = base_url + wheel_filename - try: - logger.debug(f"Downloading {download_url} ...") - urllib.request.urlretrieve(download_url, wheel_path) - logger.debug("Download succeeded and TRT-LLM wheel is now present") - except urllib.error.HTTPError as e: - logger.error( - f"HTTP error {e.code} when trying to download {download_url}: {e.reason}" - ) - except urllib.error.URLError as e: - logger.error( - f"URL error when trying to download {download_url}: {e.reason}" - ) - except OSError as e: - logger.error(f"Local file write error: {e}") - - try: - import zipfile - except ImportError as e: - raise ImportError( - "zipfile module is required but not found. 
Please install zipfile" - ) - try: - with zipfile.ZipFile(wheel_path) as zip_ref: - zip_ref.extractall(extract_dir) - logger.debug(f"Extracted wheel to {extract_dir}") - except FileNotFoundError as e: - # This should capture the errors in the download failure above - logger.error(f"Wheel file not found at {wheel_path}: {e}") - raise RuntimeError( - f"Failed to find downloaded wheel file at {wheel_path}" - ) from e - except zipfile.BadZipFile as e: - logger.error(f"Invalid or corrupted wheel file: {e}") - raise RuntimeError( - "Downloaded wheel file is corrupted or not a valid zip archive" - ) from e - except Exception as e: - logger.error(f"Unexpected error while extracting wheel: {e}") - raise RuntimeError( - "Unexpected error during extraction of TensorRT-LLM wheel" - ) from e - - try: - wheel_path.unlink(missing_ok=True) - logger.debug(f"Deleted wheel file: {wheel_path}") - except Exception as e: - logger.warning(f"Could not delete wheel file {wheel_path}: {e}") - if not plugin_lib_path.exists(): - logger.error( - f"Plugin library not found at expected location: {plugin_lib_path}" - ) - return None - - return str(plugin_lib_path) - - -def load_and_initialize_trtllm_plugin(plugin_lib_path: str) -> bool: - """ - Loads and initializes the TensorRT-LLM plugin from the given shared library path. - - Args: - plugin_lib_path (str): Path to the shared TensorRT-LLM plugin library. - - Returns: - bool: True if successful, False otherwise. - """ - try: - handle = ctypes.CDLL(plugin_lib_path) - logger.info(f"Successfully loaded plugin library: {plugin_lib_path}") - except OSError as e_os_error: - if "libmpi" in str(e_os_error): - logger.warning( - f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}, got error {e_os_error} (hint: libmpi.so is a necessary dependency; ensure that OpenMPI or MPICH is installed on your system)", - exc_info=e_os_error, - ) - else: - logger.warning( - f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}. " - f"Ensure the path is correct and the library is compatible.", - exc_info=e_os_error, - ) - return False - - try: - handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] - handle.initTrtLlmPlugins.restype = ctypes.c_bool - except AttributeError as e_plugin_unavailable: - logger.warning( - "Unable to initialize the TensorRT-LLM plugin library", - exc_info=e_plugin_unavailable, - ) - return False - - try: - if handle.initTrtLlmPlugins(None, b"tensorrt_llm"): - logger.info("TensorRT-LLM plugin successfully initialized") - return True - else: - logger.warning("TensorRT-LLM plugin library failed in initialization") - return False - except Exception as e_initialization_error: - logger.warning( - "Exception occurred during TensorRT-LLM plugin library initialization", - exc_info=e_initialization_error, - ) - return False - return False - - -def load_tensorrt_llm_for_nccl() -> bool: - """ - Attempts to load the TensorRT-LLM plugin and initialize it. - Either the env variable TRTLLM_PLUGINS_PATH can specify the path - Or the user can specify USE_TRTLLM_PLUGINS as either of (1, true, yes, on) to download the TRT-LLM distribution and load it - - Returns: - bool: True if the plugin was successfully loaded and initialized, False otherwise. 
- """ - if not is_platform_supported_for_trtllm(): - return False - plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") - - if plugin_lib_path: - return load_and_initialize_trtllm_plugin(plugin_lib_path) - else: - # this option can be used by user if TRTLLM_PLUGINS_PATH is not set by user - use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( - "1", - "true", - "yes", - "on", - ) - if not use_trtllm_plugin: - logger.warning( - "Neither TRTLLM_PLUGIN_PATH is set nor is it directed to download the shared library. Please set either of the two to use TRT-LLM libraries in torchTRT" - ) - return False - - plugin_lib_path = download_and_get_plugin_lib_path() - return load_and_initialize_trtllm_plugin(plugin_lib_path) # type: ignore[arg-type] - return False From a0286010ed9c9000461abd2d20a1fca4bd84b887 Mon Sep 17 00:00:00 2001 From: apbose Date: Thu, 2 Oct 2025 15:02:49 -0700 Subject: [PATCH 6/9] fixing typo --- py/torch_tensorrt/_utils.py | 6 +++--- tests/py/dynamo/distributed/test_nccl_ops.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/py/torch_tensorrt/_utils.py b/py/torch_tensorrt/_utils.py index 740b09740c..e56c6f0742 100644 --- a/py/torch_tensorrt/_utils.py +++ b/py/torch_tensorrt/_utils.py @@ -15,7 +15,7 @@ logger = logging.getLogger(__name__) _WHL_CPYTHON_VERSION = "cp310" -_TENSORRT_LLM_VERSION = "0.17.0.post1" +_TENSORRT_LLM_VERSION_ = "0.17.0.post1" def sanitized_torch_version() -> Any: @@ -132,7 +132,7 @@ def _extracted_dir_trtllm(platform_system: str, platform_machine: str) -> Path: return ( _cache_root() / "trtllm" - / f"{__TENSORRT_LLM_VERSION__}_{platform_system}_{platform_machine}" + / f"{_TENSORRT_LLM_VERSION_}_{platform_system}_{platform_machine}" ) @@ -149,7 +149,7 @@ def download_and_get_plugin_lib_path() -> Optional[str]: platform_system = platform.system().lower() platform_machine = platform.machine().lower() wheel_filename = ( - f"tensorrt_llm-{__TENSORRT_LLM_VERSION__}-{_WHL_CPYTHON_VERSION}-" + f"tensorrt_llm-{_TENSORRT_LLM_VERSION_}-{_WHL_CPYTHON_VERSION}-" f"{_WHL_CPYTHON_VERSION}-{platform_system}_{platform_machine}.whl" ) wheel_path = _cache_root() / wheel_filename diff --git a/tests/py/dynamo/distributed/test_nccl_ops.py b/tests/py/dynamo/distributed/test_nccl_ops.py index 0aca81d506..eafe16d455 100644 --- a/tests/py/dynamo/distributed/test_nccl_ops.py +++ b/tests/py/dynamo/distributed/test_nccl_ops.py @@ -8,7 +8,7 @@ from distributed_utils import set_environment_variables_pytest from parameterized import parameterized from torch.testing._internal.common_utils import run_tests -from torch_tensorrt.dynamo.utils import is_platform_supported_for_trtllm +from torch_tensorrt._utils import is_platform_supported_for_trtllm class DistributedGatherModel(nn.Module): From 226ed04d041f9ee87858f502bd36c8999ae2c895 Mon Sep 17 00:00:00 2001 From: apbose Date: Mon, 6 Oct 2025 10:18:42 -0700 Subject: [PATCH 7/9] addressing the review comments- comments and error message --- py/torch_tensorrt/_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/py/torch_tensorrt/_utils.py b/py/torch_tensorrt/_utils.py index e56c6f0742..6ac22e3895 100644 --- a/py/torch_tensorrt/_utils.py +++ b/py/torch_tensorrt/_utils.py @@ -81,6 +81,7 @@ def is_platform_supported_for_trtllm() -> bool: Unsupported: - Windows platforms - Jetson/Orin/Xavier (aarch64 architecture + 'tegra' in platform release) + - Thor devices - CUDA 13 not supported """ system = platform.system().lower() @@ -110,7 +111,7 @@ def is_platform_supported_for_trtllm() 
-> bool: major, minor = map(int, cuda_version.split(".")) if major != 12: logger.error( - "CUDA 13 is not supported for TRT-LLM plugins. Please install pytorch with CUDA 12.x support" + "CUDA 13 is not currently supported for TRT-LLM plugins. Please install pytorch with CUDA 12.x support" ) return False From 2f2cd311e321070c0169a9eccf4d1c897cfb32a3 Mon Sep 17 00:00:00 2001 From: apbose Date: Mon, 6 Oct 2025 13:37:56 -0700 Subject: [PATCH 8/9] changing location of the L1 distributed tests --- .github/workflows/build-test-linux-x86_64.yml | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/.github/workflows/build-test-linux-x86_64.yml b/.github/workflows/build-test-linux-x86_64.yml index e061aebbd1..466eb2ac9a 100644 --- a/.github/workflows/build-test-linux-x86_64.yml +++ b/.github/workflows/build-test-linux-x86_64.yml @@ -315,9 +315,9 @@ jobs: python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_ts_models_tests_results.xml -n auto models/ popd - L2-torch-compile-tests: - name: L2 torch compile tests - needs: [filter-matrix, build, L1-torch-compile-tests, L1-dynamo-compile-tests, L1-dynamo-core-tests, L1-torchscript-tests] + L1-dynamo-distributed-tests: + name: L1 dynamo distributed tests + needs: [filter-matrix, build,L0-dynamo-core-tests, L0-dynamo-converter-tests, L0-py-core-tests, L0-torchscript-tests] strategy: fail-fast: false matrix: @@ -329,7 +329,7 @@ jobs: smoke-test-script: packaging/smoke_test_script.sh uses: ./.github/workflows/linux-test.yml with: - job-name: L2-torch-compile-tests + job-name: L1-dynamo-distributed-tests repository: "pytorch/tensorrt" ref: "" test-infra-repository: pytorch/test-infra @@ -338,15 +338,19 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | set -euo pipefail + export USE_HOST_DEPS=1 + export CI_BUILD=1 + export USE_TRTLLM_PLUGINS=1 + dnf install -y mpich mpich-devel openmpi openmpi-devel pushd . - cd tests/py/dynamo/ - python -m pytest -m "not critical" -ra -n auto --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_torch_compile_models_tests_results.xml --ir torch_compile models/test_models.py - python -m pytest -m "not critical" -ra -n auto --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_torch_compile_dyn_models_tests_results.xml --ir torch_compile models/test_dyn_models.py + cd tests/py + cd dynamo + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_distributed_test_results.xml distributed/test_nccl_ops.py popd - L1-dynamo-distributed-tests: - name: Test dynamo distributed [Python] - needs: [filter-matrix, build] + L2-torch-compile-tests: + name: L2 torch compile tests + needs: [filter-matrix, build, L1-torch-compile-tests, L1-dynamo-compile-tests, L1-dynamo-core-tests, L1-torchscript-tests] strategy: fail-fast: false matrix: @@ -358,7 +362,7 @@ jobs: smoke-test-script: packaging/smoke_test_script.sh uses: ./.github/workflows/linux-test.yml with: - job-name: tests-py-dynamo-distributed + job-name: L2-torch-compile-tests repository: "pytorch/tensorrt" ref: "" test-infra-repository: pytorch/test-infra @@ -367,17 +371,12 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - export USE_TRTLLM_PLUGINS=1 - dnf install -y mpich mpich-devel openmpi openmpi-devel pushd . 
- cd tests/py - cd dynamo - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_distributed_test_results.xml distributed/test_nccl_ops.py + cd tests/py/dynamo/ + python -m pytest -m "not critical" -ra -n auto --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_torch_compile_models_tests_results.xml --ir torch_compile models/test_models.py + python -m pytest -m "not critical" -ra -n auto --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_torch_compile_dyn_models_tests_results.xml --ir torch_compile models/test_dyn_models.py popd - L2-dynamo-compile-tests: name: L2 dynamo compile tests needs: [filter-matrix, build, L1-dynamo-compile-tests, L1-dynamo-core-tests, L1-torch-compile-tests, L1-torchscript-tests] From 24264e512d50010f0ae58f1c91a6497b2c65e0e6 Mon Sep 17 00:00:00 2001 From: apbose Date: Tue, 7 Oct 2025 16:10:12 -0700 Subject: [PATCH 9/9] moving tests to L2 --- .github/workflows/build-test-linux-x86_64.yml | 66 +++++++++---------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/.github/workflows/build-test-linux-x86_64.yml b/.github/workflows/build-test-linux-x86_64.yml index 466eb2ac9a..0e770590f3 100644 --- a/.github/workflows/build-test-linux-x86_64.yml +++ b/.github/workflows/build-test-linux-x86_64.yml @@ -315,39 +315,6 @@ jobs: python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_ts_models_tests_results.xml -n auto models/ popd - L1-dynamo-distributed-tests: - name: L1 dynamo distributed tests - needs: [filter-matrix, build,L0-dynamo-core-tests, L0-dynamo-converter-tests, L0-py-core-tests, L0-torchscript-tests] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - pre-script: packaging/pre_build_script.sh - post-script: packaging/post_build_script.sh - smoke-test-script: packaging/smoke_test_script.sh - uses: ./.github/workflows/linux-test.yml - with: - job-name: L1-dynamo-distributed-tests - repository: "pytorch/tensorrt" - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.filter-matrix.outputs.matrix }} - pre-script: ${{ matrix.pre-script }} - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - export USE_TRTLLM_PLUGINS=1 - dnf install -y mpich mpich-devel openmpi openmpi-devel - pushd . 
- cd tests/py - cd dynamo - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_distributed_test_results.xml distributed/test_nccl_ops.py - popd - L2-torch-compile-tests: name: L2 torch compile tests needs: [filter-matrix, build, L1-torch-compile-tests, L1-dynamo-compile-tests, L1-dynamo-core-tests, L1-torchscript-tests] @@ -494,6 +461,39 @@ jobs: python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_ts_integrations_tests_results.xml -n auto integrations/ popd + L2-dynamo-distributed-tests: + name: L2 dynamo distributed tests + needs: [filter-matrix, build, L1-dynamo-core-tests, L1-dynamo-compile-tests, L1-torch-compile-tests, L1-torchscript-tests] + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/tensorrt + package-name: torch_tensorrt + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: packaging/smoke_test_script.sh + uses: ./.github/workflows/linux-test.yml + with: + job-name: L2-dynamo-distributed-tests + repository: "pytorch/tensorrt" + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.filter-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + script: | + set -euo pipefail + export USE_HOST_DEPS=1 + export CI_BUILD=1 + export USE_TRTLLM_PLUGINS=1 + dnf install -y mpich mpich-devel openmpi openmpi-devel + pushd . + cd tests/py + cd dynamo + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml distributed/test_nccl_ops.py + popd + concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-tensorrt-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }} cancel-in-progress: true
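
For anyone reproducing the new L2 distributed job locally, the following is a minimal sketch of how the environment-variable flow added by this series is expected to be driven. It assumes the helper stays in torch_tensorrt._utils as patch 6/9 shows; the exact entry point that consumes these variables is inferred from the removed dynamo/utils.py code rather than spelled out here, and the plugin library path in the commented-out line is a hypothetical example, not a real location.

    import os

    # Imported the same way the updated tests/py/dynamo/distributed/test_nccl_ops.py
    # imports it in this series.
    from torch_tensorrt._utils import is_platform_supported_for_trtllm

    # Option 1 (hypothetical path, for illustration only): point directly at an
    # already-available TRT-LLM plugin shared library.
    # os.environ["TRTLLM_PLUGINS_PATH"] = "/opt/trtllm/libnvinfer_plugin_tensorrt_llm.so"

    # Option 2: let torch_tensorrt download the tensorrt_llm 0.17.0.post1 wheel and
    # cache the extracted plugin library, mirroring USE_TRTLLM_PLUGINS=1 in the CI
    # job above. The variable must be set before compilation triggers plugin loading.
    os.environ["USE_TRTLLM_PLUGINS"] = "1"

    if not is_platform_supported_for_trtllm():
        # Windows, Jetson/Orin/Xavier, Thor, and CUDA 13 builds are rejected up front.
        raise SystemExit("TRT-LLM plugins are not supported on this platform")

The L2-dynamo-distributed-tests job performs the equivalent setup by installing mpich/openmpi, exporting USE_TRTLLM_PLUGINS=1, and then running pytest on tests/py/dynamo/distributed/test_nccl_ops.py.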