[CI/Build] build on empty device for better dev experience #4773

Merged · 13 commits · Aug 11, 2024
Changes from 5 commits
3 changes: 1 addition & 2 deletions requirements-cpu.txt
@@ -2,5 +2,4 @@
-r requirements-common.txt

# Dependencies for x86_64 CPUs
-torch == 2.3.0+cpu
-triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
+torch == 2.3.0+cpu

Contributor Author (review comment): Dealt with triton import errors in code
4 changes: 2 additions & 2 deletions requirements-cuda.txt
@@ -6,5 +6,5 @@ ray >= 2.9
nvidia-ml-py # for pynvml package
vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library
torch == 2.3.0
-xformers == 0.0.26.post1 # Requires PyTorch 2.3.0
-vllm-flash-attn == 2.5.8.post1 # Requires PyTorch 2.3.0
+xformers == 0.0.26.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.3.0
+vllm-flash-attn == 2.5.8.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.3.0
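The `; platform_system == 'Linux' and platform_machine == 'x86_64'` suffixes are PEP 508 environment markers: pip installs the requirement only when the marker is true on the target machine, so resolving requirements-cuda.txt on, say, macOS or an ARM host no longer fails on these two packages. A minimal sketch of how such a marker evaluates, using the third-party `packaging` library (the Darwin/arm64 environment passed in is just an illustrative override, not part of this PR):

```python
from packaging.markers import Marker

marker = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")

# Evaluate against the running interpreter's environment ...
print(marker.evaluate())  # True on x86_64 Linux, False elsewhere

# ... or against a hypothetical one (values are merged over the defaults).
print(marker.evaluate({"platform_system": "Darwin",
                       "platform_machine": "arm64"}))  # False
```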
8 changes: 7 additions & 1 deletion setup.py
@@ -36,6 +36,8 @@ def load_module_from_path(module_name, path):
assert sys.platform.startswith(
    "linux"), "vLLM only supports Linux platform (including WSL)."

+PLATFORM_AGNOSTIC_BUILD = envs.PLATFORM_AGNOSTIC_BUILD

MAIN_CUDA_VERSION = "12.1"


@@ -398,6 +400,9 @@ def _read_requirements(filename: str) -> List[str]:
ext_modules = []
package_data["vllm"].append("*.so")

+if PLATFORM_AGNOSTIC_BUILD:
+    ext_modules = []

setup(
name="vllm",
version=get_vllm_version(),
@@ -428,6 +433,7 @@ def _read_requirements(filename: str) -> List[str]:
    extras_require={
        "tensorizer": ["tensorizer==2.9.0"],
    },
-    cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
+    cmdclass={"build_ext": cmake_build_ext}
+    if not (_is_neuron() or PLATFORM_AGNOSTIC_BUILD) else {},
    package_data=package_data,
)
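Taken together with the envs.py change below, the effect is: when PLATFORM_AGNOSTIC_BUILD is set, setup.py registers no native extension modules and no CMake-backed build_ext command, so the package installs as pure Python even on a machine without a CUDA/ROCm/Neuron toolchain. A minimal standalone sketch of that gating pattern (this is not vLLM's actual setup.py; the package name and placeholder comments are illustrative):

```python
import os
from setuptools import setup

# Any non-empty value of the variable enables the platform-agnostic build.
PLATFORM_AGNOSTIC_BUILD = bool(os.environ.get("PLATFORM_AGNOSTIC_BUILD", False))

ext_modules = []   # would normally hold CMakeExtension entries
cmdclass = {}      # would normally map "build_ext" to a CMake-driven command

if not PLATFORM_AGNOSTIC_BUILD:
    # Device-specific extensions and the custom build_ext command would be
    # registered here; skipped entirely for the pure-Python build.
    pass

setup(
    name="example-package",        # hypothetical name, for illustration only
    version="0.0.1",
    ext_modules=ext_modules,
    cmdclass=cmdclass,
)
```

With that in place, something like `PLATFORM_AGNOSTIC_BUILD=1 pip install -e .` (assuming the variable is visible to the build step) should produce an editable install without compiling any kernels, which is the "empty device" dev workflow the PR title describes.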
16 changes: 14 additions & 2 deletions vllm/attention/ops/prefix_prefill.py
@@ -2,8 +2,20 @@
# https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py

import torch
-import triton
-import triton.language as tl

+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+try:
+    import triton
+    import triton.language as tl
+except ImportError as e:
+    logger.warning(
+        "Failed to import triton with %r. To enable vllm execution, "
+        "please install triton with `pip install triton` "
+        "(not available on macos)", e)
+    triton = type('triton', tuple(), {"__version__": "0.0.0"})()

if triton.__version__ >= "2.1.0":

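The fallback object on the last added line is what keeps this module importable: `type(name, bases, dict)` builds a throwaway class on the fly, and instantiating it yields an object whose `__version__` satisfies the version check below, so the Triton kernels are simply never defined when triton is missing. A standalone sketch of the same idea:

```python
# Stand-in object with just enough surface area for the module-level checks.
fake_triton = type("triton", tuple(), {"__version__": "0.0.0"})()

print(fake_triton.__version__)             # "0.0.0"
print(fake_triton.__version__ >= "2.1.0")  # False -> kernel definitions skipped
```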
30 changes: 28 additions & 2 deletions vllm/attention/ops/triton_flash_attention.py
@@ -21,8 +21,34 @@
"""

import torch
-import triton
-import triton.language as tl

+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+try:
+    import triton
+    import triton.language as tl
+except ImportError as e:
+    logger.warning(
+        "Failed to import triton with %r. To enable vllm execution, "
+        "please install triton with `pip install triton` "
+        "(not available on macos)", e)
+
+    def dummy_decorator(*args, **kwargs):
+        return args[0]
+
+    def dummy_callable(*args, **kwargs):
+        return None
+
+    triton = type(
+        "triton", tuple(), {
+            "jit": dummy_decorator,
+            "autotune": dummy_decorator,
+            "Config": dummy_callable,
+            "__call__": dummy_callable
+        })()
+    tl = type("tl", tuple(), {"constexpr": None})()

torch_dtype: tl.constexpr = torch.float16

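Here the stand-in needs more than a version attribute, because this module decorates kernels with `@triton.jit` and `@triton.autotune(...)` and annotates arguments with `tl.constexpr` at import time. The dummy attributes let all of that resolve without raising; the decorated names end up bound to inert placeholders rather than real kernels, which is harmless since they are never called on platforms where triton is unavailable. A minimal standalone sketch of the `jit`/`constexpr` part of the pattern:

```python
def dummy_decorator(*args, **kwargs):
    return args[0]

# Stand-ins for the real modules.
triton = type("triton", tuple(), {"jit": dummy_decorator})()
tl = type("tl", tuple(), {"constexpr": None})()

@triton.jit                              # resolves to a bound method of the stand-in
def kernel(x_ptr, BLOCK: tl.constexpr):  # the annotation is just None, also harmless
    ...

# The module imports fine; `kernel` is now an inert placeholder object
# (the stand-in itself), not something that will ever be executed here.
print(kernel)
```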
5 changes: 5 additions & 0 deletions vllm/envs.py
@@ -28,6 +28,7 @@
VLLM_USE_RAY_COMPILED_DAG: bool = False
VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
VLLM_TARGET_DEVICE: str = "cuda"
+PLATFORM_AGNOSTIC_BUILD: bool = False
MAX_JOBS: Optional[str] = None
NVCC_THREADS: Optional[str] = None
VLLM_BUILD_WITH_NEURON: bool = False
@@ -49,6 +50,10 @@
"VLLM_TARGET_DEVICE":
lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),

+# If set (to any non-empty value), build vLLM without device-specific
+# native extensions, so it can be installed on machines with no supported
+# accelerator (the "empty device" dev workflow)
+"PLATFORM_AGNOSTIC_BUILD":
+lambda: bool(os.environ.get("PLATFORM_AGNOSTIC_BUILD", False)),

# Maximum number of compilation jobs to run in parallel.
# By default this is the number of CPUs
"MAX_JOBS":
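One behavior worth noting with this lookup: `bool()` on a string is True for any non-empty string, so the flag is effectively "set vs. unset" rather than parsed as a boolean, and `PLATFORM_AGNOSTIC_BUILD=0` would still enable it. A quick sketch of how the lambda above evaluates for a few values:

```python
import os

def platform_agnostic_build() -> bool:
    # Same expression as the lambda registered in envs.py above.
    return bool(os.environ.get("PLATFORM_AGNOSTIC_BUILD", False))

for value in (None, "", "1", "0", "false"):
    if value is None:
        os.environ.pop("PLATFORM_AGNOSTIC_BUILD", None)   # unset
    else:
        os.environ["PLATFORM_AGNOSTIC_BUILD"] = value
    print(repr(value), "->", platform_agnostic_build())

# None -> False, '' -> False, '1' -> True, '0' -> True, 'false' -> True
```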
17 changes: 15 additions & 2 deletions vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -5,15 +5,28 @@
from typing import Any, Dict, Optional, Tuple

import torch
-import triton
-import triton.language as tl

from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.utils import is_hip

logger = init_logger(__name__)

+try:
+    import triton
+    import triton.language as tl
+except ImportError as e:
+    logger.warning(
+        "Failed to import triton with %r. To enable vllm execution, "
+        "please install triton with `pip install triton` "
+        "(not available on macos)", e)
+
+    def dummy_decorator(*args, **kwargs):
+        return args[0]
+
+    triton = type("triton", tuple(), {"jit": dummy_decorator})()
+    tl = type("tl", tuple(), {"constexpr": None, "dtype": None})()


@triton.jit
def fused_moe_kernel(
21 changes: 19 additions & 2 deletions vllm/model_executor/layers/ops/rand.py
@@ -1,8 +1,25 @@
from typing import Optional, Union

import torch
-import triton
-import triton.language as tl

+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+try:
+    import triton
+    import triton.language as tl
+except ImportError as e:
+    logger.warning(
+        "Failed to import triton with %r. To enable vllm execution, "
+        "please install triton with `pip install triton` "
+        "(not available on macos)", e)
+
+    def dummy_decorator(*args, **kwargs):
+        return args[0]
+
+    triton = type("triton", tuple(), {"jit": dummy_decorator})()
+    tl = type("tl", tuple(), {"constexpr": None})()


def seeded_uniform(
20 changes: 18 additions & 2 deletions vllm/model_executor/layers/ops/sample.py
@@ -2,11 +2,27 @@
from typing import Optional, Tuple

import torch
-import triton
-import triton.language as tl

+from vllm.logger import init_logger
from vllm.model_executor.layers.ops.rand import seeded_uniform

+logger = init_logger(__name__)
+
+try:
+    import triton
+    import triton.language as tl
+except ImportError as e:
+    logger.warning(
+        "Failed to import triton with %r. To enable vllm execution, "
+        "please install triton with `pip install triton` "
+        "(not available on macos)", e)
+
+    def dummy_decorator(*args, **kwargs):
+        return args[0]
+
+    triton = type("triton", tuple(), {"jit": dummy_decorator})()
+    tl = type("tl", tuple(), {"constexpr": None})()

_EPS = 1e-6

# This is a hardcoded limit in Triton (max block size).