From 03850aa98c81d26d07009ddcad8fc93462343bcf Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 12 May 2024 16:02:46 +0300
Subject: [PATCH 01/10] mark xformers and vllm-flash-attn as installable only
 on x86_64 Linux

---
 requirements-cuda.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index ba8c614d205d..e6caa863fc2b 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -6,5 +6,5 @@ ray >= 2.9
 nvidia-ml-py # for pynvml package
 vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library
 torch == 2.3.0
-xformers == 0.0.26.post1 # Requires PyTorch 2.3.0
-vllm-flash-attn == 2.5.8.post1 # Requires PyTorch 2.3.0
+xformers == 0.0.26.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.3.0
+vllm-flash-attn == 2.5.8.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.3.0
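
Note: the `;` clauses added above are PEP 508 environment markers. pip evaluates
them at install time and silently skips any requirement whose marker is false,
so the CUDA-only wheels are never requested on macOS or non-x86_64 hosts. A
minimal sketch of that evaluation, using the `packaging` library that pip
vendors (only the marker string is taken from the diff; the rest is
illustrative):

    from packaging.markers import Marker

    marker = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")
    # True on x86_64 Linux; False on e.g. arm64 macOS, where pip then
    # drops the xformers / vllm-flash-attn pins entirely.
    print(marker.evaluate())
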
From f47bc9c6c118829e50f6e9e33bf1acc2bc605106 Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 12 May 2024 16:54:31 +0300
Subject: [PATCH 02/10] changes in vllm to mock triton in case it can't be
 imported

---
 vllm/attention/ops/prefix_prefill.py         | 15 ++++++++++--
 vllm/attention/ops/triton_flash_attention.py | 23 +++++++++++++++++--
 .../layers/fused_moe/fused_moe.py            | 19 +++++++++++----
 vllm/model_executor/layers/ops/rand.py       | 18 +++++++++++++--
 vllm/model_executor/layers/ops/sample.py     | 18 +++++++++++++--
 5 files changed, 81 insertions(+), 12 deletions(-)

diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index 997b25e887e3..3c7e241ee395 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -2,8 +2,19 @@
 # https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py
 
 import torch
-import triton
-import triton.language as tl
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+try:
+    import triton
+    import triton.language as tl
+except ImportError as e:
+    logger.warning(
+        "Failed to import triton with %r. To enable vllm execution, "
+        "please install triton with `pip install triton` (not available on macos)", e)
+    triton = type('triton', tuple(), {"__version__": "0.0.0"})()
 
 if triton.__version__ >= "2.1.0":
diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
index 1147664183ff..4056f7219653 100644
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -21,8 +21,27 @@
 """
 
 import torch
-import triton
-import triton.language as tl
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+try:
+    import triton
+    import triton.language as tl
+except ImportError as e:
+    logger.warning(
+        "Failed to import triton with %r. To enable vllm execution, "
+        "please install triton with `pip install triton` (not available on macos)", e)
+    def dummy_decorator(*args, **kwargs):
+        return args[0]
+    def dummy_callable(*args, **kwargs):
+        return None
+    triton = type("triton", tuple(), {"jit": dummy_decorator,
+                                      "autotune": dummy_decorator,
+                                      "Config": dummy_callable,
+                                      "__call__": dummy_callable})()
+    tl = type("tl", tuple(), {"constexpr": None})()
 
 torch_dtype: tl.constexpr = torch.float16
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index bb7938b3715b..25a5ed51fee0 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -5,15 +5,26 @@
 from typing import Any, Dict, Optional, Tuple
 
 import torch
-import triton
-import triton.language as tl
 
-from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.utils import is_hip
 
 logger = init_logger(__name__)
 
+try:
+    import triton
+    import triton.language as tl
+except ImportError as e:
+    logger.warning(
+        "Failed to import triton with %r. To enable vllm execution, "
+        "please install triton with `pip install triton` (not available on macos)", e)
+    def dummy_decorator(*args, **kwargs):
+        return args[0]
+    triton = type("triton", tuple(), {"jit": dummy_decorator})()
+    tl = type("tl", tuple(), {"constexpr": None, "dtype": None})()
+
+from vllm import _custom_ops as ops
+from vllm.utils import is_hip
+
 
 @triton.jit
 def fused_moe_kernel(
diff --git a/vllm/model_executor/layers/ops/rand.py b/vllm/model_executor/layers/ops/rand.py
index 4a429e329567..b983a8cf9265 100644
--- a/vllm/model_executor/layers/ops/rand.py
+++ b/vllm/model_executor/layers/ops/rand.py
@@ -1,8 +1,22 @@
 from typing import Optional, Union
 
 import torch
-import triton
-import triton.language as tl
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+try:
+    import triton
+    import triton.language as tl
+except ImportError as e:
+    logger.warning(
+        "Failed to import triton with %r. To enable vllm execution, "
+        "please install triton with `pip install triton` (not available on macos)", e)
+    def dummy_decorator(*args, **kwargs):
+        return args[0]
+    triton = type("triton", tuple(), {"jit": dummy_decorator})()
+    tl = type("tl", tuple(), {"constexpr": None})()
 
 
 def seeded_uniform(
diff --git a/vllm/model_executor/layers/ops/sample.py b/vllm/model_executor/layers/ops/sample.py
index d08ae6064aa2..c6a18ec3b003 100644
--- a/vllm/model_executor/layers/ops/sample.py
+++ b/vllm/model_executor/layers/ops/sample.py
@@ -2,8 +2,22 @@
 from typing import Optional, Tuple
 
 import torch
-import triton
-import triton.language as tl
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+try:
+    import triton
+    import triton.language as tl
+except ImportError as e:
+    logger.warning(
+        "Failed to import triton with %r. To enable vllm execution, "
+        "please install triton with `pip install triton` (not available on macos)", e)
+    def dummy_decorator(*args, **kwargs):
+        return args[0]
+    triton = type("triton", tuple(), {"jit": dummy_decorator})()
+    tl = type("tl", tuple(), {"constexpr": None})()
 
 from vllm.model_executor.layers.ops.rand import seeded_uniform
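
Note: the mock relies on the three-argument form of `type()`, which builds a
class on the fly; instantiating it yields a stand-in object whose attributes
replace just enough of triton's module-level API (`jit`, `autotune`, `Config`,
`__version__`) for the files above to import. A self-contained sketch of the
idea, with the stub contents abridged from the diff:

    def dummy_decorator(*args, **kwargs):
        return args[0]

    # type(name, bases, namespace) creates a class; calling it gives the stub.
    triton = type("triton", tuple(), {"jit": dummy_decorator,
                                      "__version__": "0.0.0"})()

    print(triton.__version__ >= "2.1.0")  # False: triton-only paths are skipped

    @triton.jit        # does not raise at import time; note that jit is looked
    def kernel():      # up as a bound method, so args[0] is the stub itself --
        pass           # harmless, since the kernel is never run without triton
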
From bc83673d2c69708d53d1547dd7ec8d15cebb77f3 Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 12 May 2024 16:55:40 +0300
Subject: [PATCH 03/10] changes to setup.py to allow PLATFORM_AGNOSTIC_BUILD

---
 setup.py     | 7 ++++++-
 vllm/envs.py | 5 +++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 0dc8818b44a9..3ab30d40c9a3 100644
--- a/setup.py
+++ b/setup.py
@@ -36,6 +36,8 @@ def load_module_from_path(module_name, path):
 assert sys.platform.startswith(
     "linux"), "vLLM only supports Linux platform (including WSL)."
 
+PLATFORM_AGNOSTIC_BUILD = envs.PLATFORM_AGNOSTIC_BUILD
+
 MAIN_CUDA_VERSION = "12.1"
 
 
@@ -398,6 +400,9 @@ def _read_requirements(filename: str) -> List[str]:
     ext_modules = []
     package_data["vllm"].append("*.so")
 
+if PLATFORM_AGNOSTIC_BUILD:
+    ext_modules = []
+
 setup(
     name="vllm",
     version=get_vllm_version(),
@@ -428,6 +433,6 @@ def _read_requirements(filename: str) -> List[str]:
     extras_require={
         "tensorizer": ["tensorizer==2.9.0"],
     },
-    cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
+    cmdclass={"build_ext": cmake_build_ext} if not (_is_neuron() or PLATFORM_AGNOSTIC_BUILD) else {},
     package_data=package_data,
 )
diff --git a/vllm/envs.py b/vllm/envs.py
index 91cc8f3be775..7a9bc8589c49 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -28,6 +28,7 @@
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
     VLLM_TARGET_DEVICE: str = "cuda"
+    PLATFORM_AGNOSTIC_BUILD: bool = False
     MAX_JOBS: Optional[str] = None
     NVCC_THREADS: Optional[str] = None
     VLLM_BUILD_WITH_NEURON: bool = False
@@ -49,6 +50,10 @@
     "VLLM_TARGET_DEVICE":
     lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),
 
+    # Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
+    "PLATFORM_AGNOSTIC_BUILD":
+    lambda: bool(os.environ.get("PLATFORM_AGNOSTIC_BUILD", False)),
+
     # Maximum number of compilation jobs to run in parallel.
     # By default this is the number of CPUs
     "MAX_JOBS":
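
Note: one caveat in the env-var reader added above. Since
`bool(os.environ.get(...))` treats any non-empty string as true, setting
`PLATFORM_AGNOSTIC_BUILD=0` still enables the flag; only unsetting the
variable disables it. A quick demonstration:

    import os

    os.environ["PLATFORM_AGNOSTIC_BUILD"] = "0"
    print(bool(os.environ.get("PLATFORM_AGNOSTIC_BUILD", False)))  # True: "0" is non-empty

    os.environ.pop("PLATFORM_AGNOSTIC_BUILD")
    print(bool(os.environ.get("PLATFORM_AGNOSTIC_BUILD", False)))  # False: unset

This becomes moot in patch 07, which drops the flag in favor of
VLLM_TARGET_DEVICE.
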
From 947c2faa104de6085bfa59ced90cf8e2056226c4 Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 12 May 2024 17:06:51 +0300
Subject: [PATCH 04/10] format and lint fixes

---
 setup.py                                     |  3 ++-
 vllm/attention/ops/prefix_prefill.py         |  3 ++-
 vllm/attention/ops/triton_flash_attention.py | 17 ++++++++++++-----
 .../layers/fused_moe/fused_moe.py            | 10 ++++++----
 vllm/model_executor/layers/ops/rand.py       |  5 ++++-
 vllm/model_executor/layers/ops/sample.py     |  8 +++++---
 6 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/setup.py b/setup.py
index 3ab30d40c9a3..efa338a3cff8 100644
--- a/setup.py
+++ b/setup.py
@@ -433,6 +433,7 @@ def _read_requirements(filename: str) -> List[str]:
     extras_require={
         "tensorizer": ["tensorizer==2.9.0"],
     },
-    cmdclass={"build_ext": cmake_build_ext} if not (_is_neuron() or PLATFORM_AGNOSTIC_BUILD) else {},
+    cmdclass={"build_ext": cmake_build_ext}
+    if not (_is_neuron() or PLATFORM_AGNOSTIC_BUILD) else {},
     package_data=package_data,
 )
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index 3c7e241ee395..adfc0bcef04a 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -13,7 +13,8 @@
 except ImportError as e:
     logger.warning(
         "Failed to import triton with %r. To enable vllm execution, "
-        "please install triton with `pip install triton` (not available on macos)", e)
+        "please install triton with `pip install triton` "
+        "(not available on macos)", e)
     triton = type('triton', tuple(), {"__version__": "0.0.0"})()
 
 if triton.__version__ >= "2.1.0":
diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
index 4056f7219653..f58fe4654ff7 100644
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -32,15 +32,22 @@
 except ImportError as e:
     logger.warning(
         "Failed to import triton with %r. To enable vllm execution, "
-        "please install triton with `pip install triton` (not available on macos)", e)
+        "please install triton with `pip install triton` "
+        "(not available on macos)", e)
+
     def dummy_decorator(*args, **kwargs):
         return args[0]
+
     def dummy_callable(*args, **kwargs):
         return None
-    triton = type("triton", tuple(), {"jit": dummy_decorator,
-                                      "autotune": dummy_decorator,
-                                      "Config": dummy_callable,
-                                      "__call__": dummy_callable})()
+
+    triton = type(
+        "triton", tuple(), {
+            "jit": dummy_decorator,
+            "autotune": dummy_decorator,
+            "Config": dummy_callable,
+            "__call__": dummy_callable
+        })()
     tl = type("tl", tuple(), {"constexpr": None})()
 
 torch_dtype: tl.constexpr = torch.float16
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 25a5ed51fee0..ccf8d99f002a 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -6,7 +6,9 @@
 
 import torch
 
+from vllm import _custom_ops as ops
 from vllm.logger import init_logger
+from vllm.utils import is_hip
 
 logger = init_logger(__name__)
 
@@ -16,15 +18,15 @@
 except ImportError as e:
     logger.warning(
         "Failed to import triton with %r. To enable vllm execution, "
-        "please install triton with `pip install triton` (not available on macos)", e)
+        "please install triton with `pip install triton` "
+        "(not available on macos)", e)
+
     def dummy_decorator(*args, **kwargs):
         return args[0]
+
     triton = type("triton", tuple(), {"jit": dummy_decorator})()
     tl = type("tl", tuple(), {"constexpr": None, "dtype": None})()
 
-from vllm import _custom_ops as ops
-from vllm.utils import is_hip
-
 
 @triton.jit
 def fused_moe_kernel(
diff --git a/vllm/model_executor/layers/ops/rand.py b/vllm/model_executor/layers/ops/rand.py
index b983a8cf9265..d90efccb73eb 100644
--- a/vllm/model_executor/layers/ops/rand.py
+++ b/vllm/model_executor/layers/ops/rand.py
@@ -12,9 +12,12 @@
 except ImportError as e:
     logger.warning(
         "Failed to import triton with %r. To enable vllm execution, "
-        "please install triton with `pip install triton` (not available on macos)", e)
+        "please install triton with `pip install triton` "
+        "(not available on macos)", e)
+
     def dummy_decorator(*args, **kwargs):
         return args[0]
+
     triton = type("triton", tuple(), {"jit": dummy_decorator})()
     tl = type("tl", tuple(), {"constexpr": None})()
diff --git a/vllm/model_executor/layers/ops/sample.py b/vllm/model_executor/layers/ops/sample.py
index c6a18ec3b003..497c4d2f3ce5 100644
--- a/vllm/model_executor/layers/ops/sample.py
+++ b/vllm/model_executor/layers/ops/sample.py
@@ -4,6 +4,7 @@
 import torch
 
 from vllm.logger import init_logger
+from vllm.model_executor.layers.ops.rand import seeded_uniform
 
 logger = init_logger(__name__)
 
@@ -13,14 +14,15 @@
 except ImportError as e:
     logger.warning(
         "Failed to import triton with %r. To enable vllm execution, "
-        "please install triton with `pip install triton` (not available on macos)", e)
+        "please install triton with `pip install triton` "
+        "(not available on macos)", e)
+
     def dummy_decorator(*args, **kwargs):
         return args[0]
+
     triton = type("triton", tuple(), {"jit": dummy_decorator})()
     tl = type("tl", tuple(), {"constexpr": None})()
 
-from vllm.model_executor.layers.ops.rand import seeded_uniform
-
 _EPS = 1e-6
 
 # This is a hardcoded limit in Triton (max block size).
From 24c513eb84ecaf060b45a978299ac0ef19136dcc Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 12 May 2024 17:22:45 +0300
Subject: [PATCH 05/10] no need for triton installation in CPU requirements.
 Import errors are fixed

---
 requirements-cpu.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index b739642d8d34..603489aebf42 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -2,5 +2,4 @@
 -r requirements-common.txt
 
 # Dependencies for x86_64 CPUs
-torch == 2.3.0+cpu
-triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
+torch == 2.3.0+cpu
\ No newline at end of file
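
Note: a hypothetical smoke test for a CPU-only environment, checking that the
modules guarded in patch 02 import cleanly once triton is removed from the
requirements (the module path is taken from the diffs above; the assertion is
purely illustrative):

    import importlib.util

    # Confirm triton really is absent from this environment.
    assert importlib.util.find_spec("triton") is None, "triton unexpectedly installed"

    # Before patch 02 this raised ImportError; now it should only log a warning.
    import vllm.model_executor.layers.ops.rand  # noqa: F401
    print("imported rand.py without triton")
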
From cd0729faab872be572281cc9b93c261da33843e4 Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 11 Aug 2024 12:53:33 +0300
Subject: [PATCH 06/10] reduce diffs from main

---
 vllm/attention/ops/prefix_prefill.py         | 16 ++--------
 vllm/attention/ops/triton_flash_attention.py | 30 ++-----------------
 .../layers/fused_moe/fused_moe.py            | 17 ++---------
 vllm/model_executor/layers/ops/rand.py       | 21 ++-------------
 4 files changed, 8 insertions(+), 76 deletions(-)

diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index 3f31e8c72d73..4577d84db18a 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -2,20 +2,8 @@
 # https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py
 
 import torch
-
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
-try:
-    import triton
-    import triton.language as tl
-except ImportError as e:
-    logger.warning(
-        "Failed to import triton with %r. To enable vllm execution, "
-        "please install triton with `pip install triton` "
-        "(not available on macos)", e)
-    triton = type('triton', tuple(), {"__version__": "0.0.0"})()
+import triton
+import triton.language as tl
 
 from vllm.platforms import current_platform
diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
index c0ac8a843f94..f94211116a74 100644
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -21,34 +21,8 @@
 """
 
 import torch
-
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
-try:
-    import triton
-    import triton.language as tl
-except ImportError as e:
-    logger.warning(
-        "Failed to import triton with %r. To enable vllm execution, "
-        "please install triton with `pip install triton` "
-        "(not available on macos)", e)
-
-    def dummy_decorator(*args, **kwargs):
-        return args[0]
-
-    def dummy_callable(*args, **kwargs):
-        return None
-
-    triton = type(
-        "triton", tuple(), {
-            "jit": dummy_decorator,
-            "autotune": dummy_decorator,
-            "Config": dummy_callable,
-            "__call__": dummy_callable
-        })()
-    tl = type("tl", tuple(), {"constexpr": None})()
+import triton
+import triton.language as tl
 
 torch_dtype: tl.constexpr = torch.float16
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 4ced23ce5c21..413c0b6d0924 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -5,6 +5,8 @@
 from typing import Any, Dict, Optional, Tuple
 
 import torch
+import triton
+import triton.language as tl
 
 import vllm.envs as envs
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
@@ -12,21 +14,6 @@
 
 logger = init_logger(__name__)
 
-try:
-    import triton
-    import triton.language as tl
-except ImportError as e:
-    logger.warning(
-        "Failed to import triton with %r. To enable vllm execution, "
-        "please install triton with `pip install triton` "
-        "(not available on macos)", e)
-
-    def dummy_decorator(*args, **kwargs):
-        return args[0]
-
-    triton = type("triton", tuple(), {"jit": dummy_decorator})()
-    tl = type("tl", tuple(), {"constexpr": None, "dtype": None})()
-
 
 @triton.jit
 def fused_moe_kernel(
diff --git a/vllm/model_executor/layers/ops/rand.py b/vllm/model_executor/layers/ops/rand.py
index d90efccb73eb..4a429e329567 100644
--- a/vllm/model_executor/layers/ops/rand.py
+++ b/vllm/model_executor/layers/ops/rand.py
@@ -1,25 +1,8 @@
 from typing import Optional, Union
 
 import torch
-
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
-try:
-    import triton
-    import triton.language as tl
-except ImportError as e:
-    logger.warning(
-        "Failed to import triton with %r. To enable vllm execution, "
-        "please install triton with `pip install triton` "
-        "(not available on macos)", e)
-
-    def dummy_decorator(*args, **kwargs):
-        return args[0]
-
-    triton = type("triton", tuple(), {"jit": dummy_decorator})()
-    tl = type("tl", tuple(), {"constexpr": None})()
+import triton
+import triton.language as tl
 
 
 def seeded_uniform(
From 18b33ff33bbb788265d1a2ac40aa29c785effaf4 Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 11 Aug 2024 15:42:52 +0300
Subject: [PATCH 07/10] implement using VLLM_TARGET_DEVICE='empty'

---
 setup.py     | 16 +++++++++++-----
 vllm/envs.py |  5 -----
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/setup.py b/setup.py
index 0188b8e71761..b0db146e16ff 100644
--- a/setup.py
+++ b/setup.py
@@ -65,8 +65,6 @@ def embed_commit_hash():
 assert sys.platform.startswith(
     "linux"), "vLLM only supports Linux platform (including WSL)."
 
-PLATFORM_AGNOSTIC_BUILD = envs.PLATFORM_AGNOSTIC_BUILD
-
 MAIN_CUDA_VERSION = "12.1"
 
 
@@ -233,6 +231,10 @@ def build_extensions(self) -> None:
             subprocess.check_call(["cmake", *build_args], cwd=self.build_temp)
 
 
+def _no_device() -> bool:
+    return VLLM_TARGET_DEVICE == "empty"
+
+
 def _is_cuda() -> bool:
     has_cuda = torch.version.cuda is not None
     return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
@@ -352,7 +354,9 @@ def find_version(filepath: str) -> str:
 def get_vllm_version() -> str:
     version = find_version(get_path("vllm", "version.py"))
 
-    if _is_cuda():
+    if _no_device():
+        return version
+    elif _is_cuda():
         cuda_version = str(get_nvcc_cuda_version())
         if cuda_version != MAIN_CUDA_VERSION:
             cuda_version_str = cuda_version.replace(".", "")[:3]
@@ -406,7 +410,9 @@ def _read_requirements(filename: str) -> List[str]:
                 resolved_requirements.append(line)
         return resolved_requirements
 
-    if _is_cuda():
+    if _no_device():
+        requirements = _read_requirements("requirements-cuda.txt")
+    elif _is_cuda():
         requirements = _read_requirements("requirements-cuda.txt")
         cuda_major, cuda_minor = torch.version.cuda.split(".")
         modified_requirements = []
@@ -455,7 +461,7 @@ def _read_requirements(filename: str) -> List[str]:
     ext_modules = []
     package_data["vllm"].append("*.so")
 
-if PLATFORM_AGNOSTIC_BUILD:
+if _no_device():
     ext_modules = []
 
 setup(
diff --git a/vllm/envs.py b/vllm/envs.py
index 85ba71f1ead6..26d0c33707fe 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -45,7 +45,6 @@
     VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_TARGET_DEVICE: str = "cuda"
-    PLATFORM_AGNOSTIC_BUILD: bool = False
     MAX_JOBS: Optional[str] = None
     NVCC_THREADS: Optional[str] = None
     VLLM_USE_PRECOMPILED: bool = False
@@ -85,10 +84,6 @@ def get_default_config_root():
     "VLLM_TARGET_DEVICE":
     lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),
 
-    # Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
-    "PLATFORM_AGNOSTIC_BUILD":
-    lambda: bool(os.environ.get("PLATFORM_AGNOSTIC_BUILD", False)),
-
     # Maximum number of compilation jobs to run in parallel.
     # By default this is the number of CPUs
     "MAX_JOBS":
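
Note: with this commit the platform-agnostic build rides on the existing
VLLM_TARGET_DEVICE variable, e.g. `VLLM_TARGET_DEVICE=empty pip install .`
from a source checkout, which yields a Python-only package with no compiled
extensions. This also composes with patch 01: the `_no_device()` branch still
reads requirements-cuda.txt, but its marked lines evaluate to false off
x86_64 Linux. A sketch of that marker evaluation with an overridden
environment (the override dict is illustrative):

    from packaging.markers import Marker

    m = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")
    # Pretend we are on an arm64 Mac: the xformers / vllm-flash-attn pins
    # from requirements-cuda.txt evaluate to False and are skipped by pip.
    print(m.evaluate({"platform_system": "Darwin",
                      "platform_machine": "arm64"}))  # False
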
From f632c8ea3cf2c321c9c9499cfe3c5d6a42c55e70 Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 11 Aug 2024 19:22:31 +0300
Subject: [PATCH 08/10] make build of .tar.gz possible on mac

---
 setup.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index b0db146e16ff..cef419c6a56b 100644
--- a/setup.py
+++ b/setup.py
@@ -61,9 +61,11 @@ def embed_commit_hash():
 
 VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
 
-# vLLM only supports Linux platform
-assert sys.platform.startswith(
-    "linux"), "vLLM only supports Linux platform (including WSL)."
+if not sys.platform.startswith("linux"):
+    logger.info(f"vLLM only supports Linux platform (including WSL). "
+                f"Building on {sys.platform}, "
+                f"so vLLM may not be able to run correctly",)
+    VLLM_TARGET_DEVICE = "empty"
 
 MAIN_CUDA_VERSION = "12.1"

From 37c40ed61a51b8c092c9d0a16b0d2ed1e0cd9628 Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 11 Aug 2024 19:22:56 +0300
Subject: [PATCH 09/10] Add "+empty" to version for build with
 VLLM_TARGET_DEVICE="empty"

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index cef419c6a56b..3040333d22bd 100644
--- a/setup.py
+++ b/setup.py
@@ -357,7 +357,7 @@ def get_vllm_version() -> str:
     version = find_version(get_path("vllm", "version.py"))
 
     if _no_device():
-        return version
+        version += "+empty"
     elif _is_cuda():
         cuda_version = str(get_nvcc_cuda_version())
         if cuda_version != MAIN_CUDA_VERSION:

From cf7503731889279f9fa4240a684f3f58bdb76232 Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 11 Aug 2024 19:52:30 +0300
Subject: [PATCH 10/10] (1) no f-strings in logs (2) warning instead of info
 log

---
 setup.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 3040333d22bd..9e34433eff0d 100644
--- a/setup.py
+++ b/setup.py
@@ -62,9 +62,10 @@ def embed_commit_hash():
 VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
 
 if not sys.platform.startswith("linux"):
-    logger.info(f"vLLM only supports Linux platform (including WSL). "
-                f"Building on {sys.platform}, "
-                f"so vLLM may not be able to run correctly",)
+    logger.warning(
+        "vLLM only supports Linux platform (including WSL). "
+        "Building on %s, "
+        "so vLLM may not be able to run correctly", sys.platform)
     VLLM_TARGET_DEVICE = "empty"
 
 MAIN_CUDA_VERSION = "12.1"
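
Note: the rationale for the final commit is that with %-style arguments the
log message is only rendered when the record is actually emitted, and linters
can check the argument count, both of which f-strings defeat. Raising the
level from info to warning also makes the message harder to miss in build
output. A minimal sketch of the deferred formatting (standard-library logging
shown for illustration):

    import logging

    logging.basicConfig(level=logging.WARNING)
    logger = logging.getLogger("setup")

    # "%s" is substituted lazily, only because WARNING is actually emitted:
    logger.warning(
        "vLLM only supports Linux platform (including WSL). "
        "Building on %s, so vLLM may not be able to run correctly", "darwin")
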