From 03850aa98c81d26d07009ddcad8fc93462343bcf Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 12 May 2024 16:02:46 +0300
Subject: [PATCH 01/10] mark xformers and vllm-flash-attn as installable only
 on x86_64 Linux

---
 requirements-cuda.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index ba8c614d205d..e6caa863fc2b 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -6,5 +6,5 @@ ray >= 2.9
 nvidia-ml-py # for pynvml package
 vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library
 torch == 2.3.0
-xformers == 0.0.26.post1 # Requires PyTorch 2.3.0
-vllm-flash-attn == 2.5.8.post1 # Requires PyTorch 2.3.0
+xformers == 0.0.26.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.3.0
+vllm-flash-attn == 2.5.8.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.3.0
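
Note: the `;` clauses added above are PEP 508 environment markers. pip evaluates
them at install time and silently skips any requirement whose marker is false,
so the CUDA-only wheels are never requested on macOS or non-x86_64 hosts. A
minimal sketch of that evaluation, using the `packaging` library that pip
vendors (only the marker string is taken from the diff; the rest is
illustrative):

    from packaging.markers import Marker

    marker = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")
    # True on x86_64 Linux; False on e.g. arm64 macOS, where pip then
    # drops the xformers / vllm-flash-attn pins entirely.
    print(marker.evaluate())
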
From f47bc9c6c118829e50f6e9e33bf1acc2bc605106 Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 12 May 2024 16:54:31 +0300
Subject: [PATCH 02/10] changes in vllm to mock triton in case it can't be
 imported

---
 vllm/attention/ops/prefix_prefill.py         | 15 ++++++++++--
 vllm/attention/ops/triton_flash_attention.py | 23 +++++++++++++++++--
 .../layers/fused_moe/fused_moe.py            | 19 +++++++++++----
 vllm/model_executor/layers/ops/rand.py       | 18 +++++++++++++--
 vllm/model_executor/layers/ops/sample.py     | 18 +++++++++++++--
 5 files changed, 81 insertions(+), 12 deletions(-)

diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index 997b25e887e3..3c7e241ee395 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -2,8 +2,19 @@
 # https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py
 
 import torch
-import triton
-import triton.language as tl
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+try:
+    import triton
+    import triton.language as tl
+except ImportError as e:
+    logger.warning(
+        "Failed to import triton with %r. To enable vllm execution, "
+        "please install triton with `pip install triton` (not available on macos)", e)
+    triton = type('triton', tuple(), {"__version__": "0.0.0"})()
 
 if triton.__version__ >= "2.1.0":
diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
index 1147664183ff..4056f7219653 100644
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -21,8 +21,27 @@
 """
 
 import torch
-import triton
-import triton.language as tl
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+try:
+    import triton
+    import triton.language as tl
+except ImportError as e:
+    logger.warning(
+        "Failed to import triton with %r. To enable vllm execution, "
+        "please install triton with `pip install triton` (not available on macos)", e)
+    def dummy_decorator(*args, **kwargs):
+        return args[0]
+    def dummy_callable(*args, **kwargs):
+        return None
+    triton = type("triton", tuple(), {"jit": dummy_decorator,
+                                      "autotune": dummy_decorator,
+                                      "Config": dummy_callable,
+                                      "__call__": dummy_callable})()
+    tl = type("tl", tuple(), {"constexpr": None})()
 
 torch_dtype: tl.constexpr = torch.float16
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index bb7938b3715b..25a5ed51fee0 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -5,15 +5,26 @@
 from typing import Any, Dict, Optional, Tuple
 
 import torch
-import triton
-import triton.language as tl
 
-from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.utils import is_hip
 
 logger = init_logger(__name__)
 
+try:
+    import triton
+    import triton.language as tl
+except ImportError as e:
+    logger.warning(
+        "Failed to import triton with %r. To enable vllm execution, "
+        "please install triton with `pip install triton` (not available on macos)", e)
+    def dummy_decorator(*args, **kwargs):
+        return args[0]
+    triton = type("triton", tuple(), {"jit": dummy_decorator})()
+    tl = type("tl", tuple(), {"constexpr": None, "dtype": None})()
+
+from vllm import _custom_ops as ops
+from vllm.utils import is_hip
+
 
 @triton.jit
 def fused_moe_kernel(
diff --git a/vllm/model_executor/layers/ops/rand.py b/vllm/model_executor/layers/ops/rand.py
index 4a429e329567..b983a8cf9265 100644
--- a/vllm/model_executor/layers/ops/rand.py
+++ b/vllm/model_executor/layers/ops/rand.py
@@ -1,8 +1,22 @@
 from typing import Optional, Union
 
 import torch
-import triton
-import triton.language as tl
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+try:
+    import triton
+    import triton.language as tl
+except ImportError as e:
+    logger.warning(
+        "Failed to import triton with %r. To enable vllm execution, "
+        "please install triton with `pip install triton` (not available on macos)", e)
+    def dummy_decorator(*args, **kwargs):
+        return args[0]
+    triton = type("triton", tuple(), {"jit": dummy_decorator})()
+    tl = type("tl", tuple(), {"constexpr": None})()
 
 
 def seeded_uniform(
diff --git a/vllm/model_executor/layers/ops/sample.py b/vllm/model_executor/layers/ops/sample.py
index d08ae6064aa2..c6a18ec3b003 100644
--- a/vllm/model_executor/layers/ops/sample.py
+++ b/vllm/model_executor/layers/ops/sample.py
@@ -2,8 +2,22 @@
 from typing import Optional, Tuple
 
 import torch
-import triton
-import triton.language as tl
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+try:
+    import triton
+    import triton.language as tl
+except ImportError as e:
+    logger.warning(
+        "Failed to import triton with %r. To enable vllm execution, "
+        "please install triton with `pip install triton` (not available on macos)", e)
+    def dummy_decorator(*args, **kwargs):
+        return args[0]
+    triton = type("triton", tuple(), {"jit": dummy_decorator})()
+    tl = type("tl", tuple(), {"constexpr": None})()
 
 from vllm.model_executor.layers.ops.rand import seeded_uniform
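
Note: the mock relies on the three-argument form of `type()`, which builds a
class on the fly; instantiating it yields a stand-in object whose attributes
replace just enough of triton's module-level API (`jit`, `autotune`, `Config`,
`__version__`) for the files above to import. A self-contained sketch of the
idea, with the stub contents abridged from the diff:

    def dummy_decorator(*args, **kwargs):
        return args[0]

    # type(name, bases, namespace) creates a class; calling it gives the stub.
    triton = type("triton", tuple(), {"jit": dummy_decorator,
                                      "__version__": "0.0.0"})()

    print(triton.__version__ >= "2.1.0")  # False: triton-only paths are skipped

    @triton.jit        # does not raise at import time; note that jit is looked
    def kernel():      # up as a bound method, so args[0] is the stub itself --
        pass           # harmless, since the kernel is never run without triton
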
From bc83673d2c69708d53d1547dd7ec8d15cebb77f3 Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 12 May 2024 16:55:40 +0300
Subject: [PATCH 03/10] changes to setup.py to allow PLATFORM_AGNOSTIC_BUILD

---
 setup.py     | 7 ++++++-
 vllm/envs.py | 5 +++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 0dc8818b44a9..3ab30d40c9a3 100644
--- a/setup.py
+++ b/setup.py
@@ -36,6 +36,8 @@ def load_module_from_path(module_name, path):
 assert sys.platform.startswith(
     "linux"), "vLLM only supports Linux platform (including WSL)."
 
+PLATFORM_AGNOSTIC_BUILD = envs.PLATFORM_AGNOSTIC_BUILD
+
 MAIN_CUDA_VERSION = "12.1"
 
 
@@ -398,6 +400,9 @@ def _read_requirements(filename: str) -> List[str]:
     ext_modules = []
     package_data["vllm"].append("*.so")
 
+if PLATFORM_AGNOSTIC_BUILD:
+    ext_modules = []
+
 setup(
     name="vllm",
     version=get_vllm_version(),
@@ -428,6 +433,6 @@ def _read_requirements(filename: str) -> List[str]:
     extras_require={
         "tensorizer": ["tensorizer==2.9.0"],
     },
-    cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
+    cmdclass={"build_ext": cmake_build_ext} if not (_is_neuron() or PLATFORM_AGNOSTIC_BUILD) else {},
     package_data=package_data,
 )
diff --git a/vllm/envs.py b/vllm/envs.py
index 91cc8f3be775..7a9bc8589c49 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -28,6 +28,7 @@
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
     VLLM_TARGET_DEVICE: str = "cuda"
+    PLATFORM_AGNOSTIC_BUILD: bool = False
     MAX_JOBS: Optional[str] = None
     NVCC_THREADS: Optional[str] = None
     VLLM_BUILD_WITH_NEURON: bool = False
@@ -49,6 +50,10 @@
     "VLLM_TARGET_DEVICE":
     lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),
 
+    # Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
+    "PLATFORM_AGNOSTIC_BUILD":
+    lambda: bool(os.environ.get("PLATFORM_AGNOSTIC_BUILD", False)),
+
     # Maximum number of compilation jobs to run in parallel.
     # By default this is the number of CPUs
     "MAX_JOBS":
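
Note: one caveat in the env-var reader added above. Since
`bool(os.environ.get(...))` treats any non-empty string as true, setting
`PLATFORM_AGNOSTIC_BUILD=0` still enables the flag; only unsetting the
variable disables it. A quick demonstration:

    import os

    os.environ["PLATFORM_AGNOSTIC_BUILD"] = "0"
    print(bool(os.environ.get("PLATFORM_AGNOSTIC_BUILD", False)))  # True: "0" is non-empty

    os.environ.pop("PLATFORM_AGNOSTIC_BUILD")
    print(bool(os.environ.get("PLATFORM_AGNOSTIC_BUILD", False)))  # False: unset

This becomes moot in patch 07, which drops the flag in favor of
VLLM_TARGET_DEVICE.
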
From 947c2faa104de6085bfa59ced90cf8e2056226c4 Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 12 May 2024 17:06:51 +0300
Subject: [PATCH 04/10] format and lint fixes

---
 setup.py                                     |  3 ++-
 vllm/attention/ops/prefix_prefill.py         |  3 ++-
 vllm/attention/ops/triton_flash_attention.py | 17 ++++++++++++-----
 .../layers/fused_moe/fused_moe.py            | 10 ++++++----
 vllm/model_executor/layers/ops/rand.py       |  5 ++++-
 vllm/model_executor/layers/ops/sample.py     |  8 +++++---
 6 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/setup.py b/setup.py
index 3ab30d40c9a3..efa338a3cff8 100644
--- a/setup.py
+++ b/setup.py
@@ -433,6 +433,7 @@ def _read_requirements(filename: str) -> List[str]:
     extras_require={
         "tensorizer": ["tensorizer==2.9.0"],
     },
-    cmdclass={"build_ext": cmake_build_ext} if not (_is_neuron() or PLATFORM_AGNOSTIC_BUILD) else {},
+    cmdclass={"build_ext": cmake_build_ext}
+    if not (_is_neuron() or PLATFORM_AGNOSTIC_BUILD) else {},
     package_data=package_data,
 )
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index 3c7e241ee395..adfc0bcef04a 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -13,7 +13,8 @@
 except ImportError as e:
     logger.warning(
         "Failed to import triton with %r. To enable vllm execution, "
-        "please install triton with `pip install triton` (not available on macos)", e)
+        "please install triton with `pip install triton` "
+        "(not available on macos)", e)
     triton = type('triton', tuple(), {"__version__": "0.0.0"})()
 
 if triton.__version__ >= "2.1.0":
diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
index 4056f7219653..f58fe4654ff7 100644
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -32,15 +32,22 @@
 except ImportError as e:
     logger.warning(
         "Failed to import triton with %r. To enable vllm execution, "
-        "please install triton with `pip install triton` (not available on macos)", e)
+        "please install triton with `pip install triton` "
+        "(not available on macos)", e)
+
     def dummy_decorator(*args, **kwargs):
         return args[0]
+
     def dummy_callable(*args, **kwargs):
         return None
-    triton = type("triton", tuple(), {"jit": dummy_decorator,
-                                      "autotune": dummy_decorator,
-                                      "Config": dummy_callable,
-                                      "__call__": dummy_callable})()
+
+    triton = type(
+        "triton", tuple(), {
+            "jit": dummy_decorator,
+            "autotune": dummy_decorator,
+            "Config": dummy_callable,
+            "__call__": dummy_callable
+        })()
     tl = type("tl", tuple(), {"constexpr": None})()
 
 torch_dtype: tl.constexpr = torch.float16
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 25a5ed51fee0..ccf8d99f002a 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -6,7 +6,9 @@
 
 import torch
 
+from vllm import _custom_ops as ops
 from vllm.logger import init_logger
+from vllm.utils import is_hip
 
 logger = init_logger(__name__)
 
@@ -16,15 +18,15 @@
 except ImportError as e:
     logger.warning(
         "Failed to import triton with %r. To enable vllm execution, "
-        "please install triton with `pip install triton` (not available on macos)", e)
+        "please install triton with `pip install triton` "
+        "(not available on macos)", e)
+
     def dummy_decorator(*args, **kwargs):
         return args[0]
+
     triton = type("triton", tuple(), {"jit": dummy_decorator})()
     tl = type("tl", tuple(), {"constexpr": None, "dtype": None})()
 
-from vllm import _custom_ops as ops
-from vllm.utils import is_hip
-
 
 @triton.jit
 def fused_moe_kernel(
diff --git a/vllm/model_executor/layers/ops/rand.py b/vllm/model_executor/layers/ops/rand.py
index b983a8cf9265..d90efccb73eb 100644
--- a/vllm/model_executor/layers/ops/rand.py
+++ b/vllm/model_executor/layers/ops/rand.py
@@ -12,9 +12,12 @@
 except ImportError as e:
     logger.warning(
         "Failed to import triton with %r. To enable vllm execution, "
-        "please install triton with `pip install triton` (not available on macos)", e)
+        "please install triton with `pip install triton` "
+        "(not available on macos)", e)
+
     def dummy_decorator(*args, **kwargs):
         return args[0]
+
     triton = type("triton", tuple(), {"jit": dummy_decorator})()
     tl = type("tl", tuple(), {"constexpr": None})()
diff --git a/vllm/model_executor/layers/ops/sample.py b/vllm/model_executor/layers/ops/sample.py
index c6a18ec3b003..497c4d2f3ce5 100644
--- a/vllm/model_executor/layers/ops/sample.py
+++ b/vllm/model_executor/layers/ops/sample.py
@@ -4,6 +4,7 @@
 import torch
 
 from vllm.logger import init_logger
+from vllm.model_executor.layers.ops.rand import seeded_uniform
 
 logger = init_logger(__name__)
 
@@ -13,14 +14,15 @@
 except ImportError as e:
     logger.warning(
         "Failed to import triton with %r. To enable vllm execution, "
-        "please install triton with `pip install triton` (not available on macos)", e)
+        "please install triton with `pip install triton` "
+        "(not available on macos)", e)
+
     def dummy_decorator(*args, **kwargs):
         return args[0]
+
     triton = type("triton", tuple(), {"jit": dummy_decorator})()
     tl = type("tl", tuple(), {"constexpr": None})()
 
-from vllm.model_executor.layers.ops.rand import seeded_uniform
-
 _EPS = 1e-6
 
 # This is a hardcoded limit in Triton (max block size).
From 24c513eb84ecaf060b45a978299ac0ef19136dcc Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 12 May 2024 17:22:45 +0300
Subject: [PATCH 05/10] no need for triton installation in CPU requirements.
 Import errors are fixed

---
 requirements-cpu.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index b739642d8d34..603489aebf42 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -2,5 +2,4 @@
 -r requirements-common.txt
 
 # Dependencies for x86_64 CPUs
-torch == 2.3.0+cpu
-triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
+torch == 2.3.0+cpu
\ No newline at end of file
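
Note: a hypothetical smoke test for a CPU-only environment, checking that the
modules guarded in patch 02 import cleanly once triton is removed from the
requirements (the module path is taken from the diffs above; the assertion is
purely illustrative):

    import importlib.util

    # Confirm triton really is absent from this environment.
    assert importlib.util.find_spec("triton") is None, "triton unexpectedly installed"

    # Before patch 02 this raised ImportError; now it should only log a warning.
    import vllm.model_executor.layers.ops.rand  # noqa: F401
    print("imported rand.py without triton")
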
From cd0729faab872be572281cc9b93c261da33843e4 Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 11 Aug 2024 12:53:33 +0300
Subject: [PATCH 06/10] reduce diffs from main

---
 vllm/attention/ops/prefix_prefill.py         | 16 ++--------
 vllm/attention/ops/triton_flash_attention.py | 30 ++-----------------
 .../layers/fused_moe/fused_moe.py            | 17 ++---------
 vllm/model_executor/layers/ops/rand.py       | 21 ++-------------
 4 files changed, 8 insertions(+), 76 deletions(-)

diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index 3f31e8c72d73..4577d84db18a 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -2,20 +2,8 @@
 # https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py
 
 import torch
-
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
-try:
-    import triton
-    import triton.language as tl
-except ImportError as e:
-    logger.warning(
-        "Failed to import triton with %r. To enable vllm execution, "
-        "please install triton with `pip install triton` "
-        "(not available on macos)", e)
-    triton = type('triton', tuple(), {"__version__": "0.0.0"})()
+import triton
+import triton.language as tl
 
 from vllm.platforms import current_platform
diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
index c0ac8a843f94..f94211116a74 100644
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -21,34 +21,8 @@
 """
 
 import torch
-
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
-try:
-    import triton
-    import triton.language as tl
-except ImportError as e:
-    logger.warning(
-        "Failed to import triton with %r. To enable vllm execution, "
-        "please install triton with `pip install triton` "
-        "(not available on macos)", e)
-
-    def dummy_decorator(*args, **kwargs):
-        return args[0]
-
-    def dummy_callable(*args, **kwargs):
-        return None
-
-    triton = type(
-        "triton", tuple(), {
-            "jit": dummy_decorator,
-            "autotune": dummy_decorator,
-            "Config": dummy_callable,
-            "__call__": dummy_callable
-        })()
-    tl = type("tl", tuple(), {"constexpr": None})()
+import triton
+import triton.language as tl
 
 torch_dtype: tl.constexpr = torch.float16
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 4ced23ce5c21..413c0b6d0924 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -5,6 +5,8 @@
 from typing import Any, Dict, Optional, Tuple
 
 import torch
+import triton
+import triton.language as tl
 
 import vllm.envs as envs
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
@@ -12,21 +14,6 @@
 
 logger = init_logger(__name__)
 
-try:
-    import triton
-    import triton.language as tl
-except ImportError as e:
-    logger.warning(
-        "Failed to import triton with %r. To enable vllm execution, "
-        "please install triton with `pip install triton` "
-        "(not available on macos)", e)
-
-    def dummy_decorator(*args, **kwargs):
-        return args[0]
-
-    triton = type("triton", tuple(), {"jit": dummy_decorator})()
-    tl = type("tl", tuple(), {"constexpr": None, "dtype": None})()
-
 
 @triton.jit
 def fused_moe_kernel(
diff --git a/vllm/model_executor/layers/ops/rand.py b/vllm/model_executor/layers/ops/rand.py
index d90efccb73eb..4a429e329567 100644
--- a/vllm/model_executor/layers/ops/rand.py
+++ b/vllm/model_executor/layers/ops/rand.py
@@ -1,25 +1,8 @@
 from typing import Optional, Union
 
 import torch
-
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
-try:
-    import triton
-    import triton.language as tl
-except ImportError as e:
-    logger.warning(
-        "Failed to import triton with %r. To enable vllm execution, "
-        "please install triton with `pip install triton` "
-        "(not available on macos)", e)
-
-    def dummy_decorator(*args, **kwargs):
-        return args[0]
-
-    triton = type("triton", tuple(), {"jit": dummy_decorator})()
-    tl = type("tl", tuple(), {"constexpr": None})()
+import triton
+import triton.language as tl
 
 
 def seeded_uniform(
From 18b33ff33bbb788265d1a2ac40aa29c785effaf4 Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 11 Aug 2024 15:42:52 +0300
Subject: [PATCH 07/10] implement using VLLM_TARGET_DEVICE='empty'

---
 setup.py     | 16 +++++++++++-----
 vllm/envs.py |  5 -----
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/setup.py b/setup.py
index 0188b8e71761..b0db146e16ff 100644
--- a/setup.py
+++ b/setup.py
@@ -65,8 +65,6 @@ def embed_commit_hash():
 assert sys.platform.startswith(
     "linux"), "vLLM only supports Linux platform (including WSL)."
 
-PLATFORM_AGNOSTIC_BUILD = envs.PLATFORM_AGNOSTIC_BUILD
-
 MAIN_CUDA_VERSION = "12.1"
 
 
@@ -233,6 +231,10 @@ def build_extensions(self) -> None:
             subprocess.check_call(["cmake", *build_args], cwd=self.build_temp)
 
 
+def _no_device() -> bool:
+    return VLLM_TARGET_DEVICE == "empty"
+
+
 def _is_cuda() -> bool:
     has_cuda = torch.version.cuda is not None
     return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
@@ -352,7 +354,9 @@ def find_version(filepath: str) -> str:
 def get_vllm_version() -> str:
     version = find_version(get_path("vllm", "version.py"))
 
-    if _is_cuda():
+    if _no_device():
+        return version
+    elif _is_cuda():
         cuda_version = str(get_nvcc_cuda_version())
         if cuda_version != MAIN_CUDA_VERSION:
             cuda_version_str = cuda_version.replace(".", "")[:3]
@@ -406,7 +410,9 @@ def _read_requirements(filename: str) -> List[str]:
                 resolved_requirements.append(line)
         return resolved_requirements
 
-    if _is_cuda():
+    if _no_device():
+        requirements = _read_requirements("requirements-cuda.txt")
+    elif _is_cuda():
         requirements = _read_requirements("requirements-cuda.txt")
         cuda_major, cuda_minor = torch.version.cuda.split(".")
         modified_requirements = []
@@ -455,7 +461,7 @@ def _read_requirements(filename: str) -> List[str]:
     ext_modules = []
     package_data["vllm"].append("*.so")
 
-if PLATFORM_AGNOSTIC_BUILD:
+if _no_device():
     ext_modules = []
 
 setup(
diff --git a/vllm/envs.py b/vllm/envs.py
index 85ba71f1ead6..26d0c33707fe 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -45,7 +45,6 @@
     VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_TARGET_DEVICE: str = "cuda"
-    PLATFORM_AGNOSTIC_BUILD: bool = False
     MAX_JOBS: Optional[str] = None
     NVCC_THREADS: Optional[str] = None
     VLLM_USE_PRECOMPILED: bool = False
@@ -85,10 +84,6 @@ def get_default_config_root():
     "VLLM_TARGET_DEVICE":
     lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),
 
-    # Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
-    "PLATFORM_AGNOSTIC_BUILD":
-    lambda: bool(os.environ.get("PLATFORM_AGNOSTIC_BUILD", False)),
-
     # Maximum number of compilation jobs to run in parallel.
     # By default this is the number of CPUs
     "MAX_JOBS":
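
Note: with this commit the platform-agnostic build rides on the existing
VLLM_TARGET_DEVICE variable, e.g. `VLLM_TARGET_DEVICE=empty pip install .`
from a source checkout, which yields a Python-only package with no compiled
extensions. This also composes with patch 01: the `_no_device()` branch still
reads requirements-cuda.txt, but its marked lines evaluate to false off
x86_64 Linux. A sketch of that marker evaluation with an overridden
environment (the override dict is illustrative):

    from packaging.markers import Marker

    m = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")
    # Pretend we are on an arm64 Mac: the xformers / vllm-flash-attn pins
    # from requirements-cuda.txt evaluate to False and are skipped by pip.
    print(m.evaluate({"platform_system": "Darwin",
                      "platform_machine": "arm64"}))  # False
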
From f632c8ea3cf2c321c9c9499cfe3c5d6a42c55e70 Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 11 Aug 2024 19:22:31 +0300
Subject: [PATCH 08/10] make build of .tar.gz possible on mac

---
 setup.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index b0db146e16ff..cef419c6a56b 100644
--- a/setup.py
+++ b/setup.py
@@ -61,9 +61,11 @@ def embed_commit_hash():
 
 VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
 
-# vLLM only supports Linux platform
-assert sys.platform.startswith(
-    "linux"), "vLLM only supports Linux platform (including WSL)."
+if not sys.platform.startswith("linux"):
+    logger.info(f"vLLM only supports Linux platform (including WSL). "
+                f"Building on {sys.platform}, "
+                f"so vLLM may not be able to run correctly",)
+    VLLM_TARGET_DEVICE = "empty"
 
 MAIN_CUDA_VERSION = "12.1"

From 37c40ed61a51b8c092c9d0a16b0d2ed1e0cd9628 Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 11 Aug 2024 19:22:56 +0300
Subject: [PATCH 09/10] Add "+empty" to version for build with
 VLLM_TARGET_DEVICE="empty"

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index cef419c6a56b..3040333d22bd 100644
--- a/setup.py
+++ b/setup.py
@@ -357,7 +357,7 @@ def get_vllm_version() -> str:
     version = find_version(get_path("vllm", "version.py"))
 
     if _no_device():
-        return version
+        version += "+empty"
     elif _is_cuda():
         cuda_version = str(get_nvcc_cuda_version())
         if cuda_version != MAIN_CUDA_VERSION:

From cf7503731889279f9fa4240a684f3f58bdb76232 Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Sun, 11 Aug 2024 19:52:30 +0300
Subject: [PATCH 10/10] (1) no f-strings in logs (2) warning instead of info
 log

---
 setup.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 3040333d22bd..9e34433eff0d 100644
--- a/setup.py
+++ b/setup.py
@@ -62,9 +62,10 @@ def embed_commit_hash():
 VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
 
 if not sys.platform.startswith("linux"):
-    logger.info(f"vLLM only supports Linux platform (including WSL). "
-                f"Building on {sys.platform}, "
-                f"so vLLM may not be able to run correctly",)
+    logger.warning(
+        "vLLM only supports Linux platform (including WSL). "
+        "Building on %s, "
+        "so vLLM may not be able to run correctly", sys.platform)
     VLLM_TARGET_DEVICE = "empty"
 
 MAIN_CUDA_VERSION = "12.1"
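
Note: the rationale for the final commit is that with %-style arguments the
log message is only rendered when the record is actually emitted, and linters
can check the argument count, both of which f-strings defeat. Raising the
level from info to warning also makes the message harder to miss in build
output. A minimal sketch of the deferred formatting (standard-library logging
shown for illustration):

    import logging

    logging.basicConfig(level=logging.WARNING)
    logger = logging.getLogger("setup")

    # "%s" is substituted lazily, only because WARNING is actually emitted:
    logger.warning(
        "vLLM only supports Linux platform (including WSL). "
        "Building on %s, so vLLM may not be able to run correctly", "darwin")
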