macOS / ARM support for vLLM #2244

Open · wants to merge 6 commits into base: main
Empty file added CMakeLists.txt
Empty file.
Empty file added FindModules/FindMKL.cmake
Empty file.
Empty file added Makefile
Empty file.
8 changes: 7 additions & 1 deletion benchmarks/benchmark_latency.py
@@ -23,6 +23,8 @@ def main(args: argparse.Namespace):
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
swap_space=args.swap_space,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
device=args.device,
@@ -144,12 +146,16 @@ def run_to_completion(profile_dir: Optional[str] = None):
"--device",
type=str,
default="cuda",
choices=["cuda"],
choices=["cuda", "cpu"],
help='device type for vLLM execution; CUDA and CPU are supported.')
parser.add_argument(
"--ray-workers-use-nsight",
action='store_true',
help="If specified, use nsight to profile ray workers",
)
parser.add_argument("--swap-space",
type=int,
default=4,
help="memory space available for CPU (GB).")
args = parser.parse_args()
main(args)
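Note (illustrative): the two flags added above parse independently of vLLM; a minimal standalone sketch, using a hypothetical argv, shows the defaults and the CPU override:

```python
import argparse

# Mirrors the argument definitions added in benchmark_latency.py above.
parser = argparse.ArgumentParser()
parser.add_argument("--device", type=str, default="cuda", choices=["cuda", "cpu"],
                    help="device type for vLLM execution; CUDA and CPU are supported.")
parser.add_argument("--swap-space", type=int, default=4,
                    help="CPU memory available for swap space, in GB.")

# Hypothetical invocation: python benchmark_latency.py --device cpu --swap-space 8 ...
args = parser.parse_args(["--device", "cpu", "--swap-space", "8"])
print(args.device, args.swap_space)  # -> cpu 8
```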
23 changes: 12 additions & 11 deletions benchmarks/benchmark_throughput.py
@@ -73,6 +73,7 @@ def run_vllm(
enforce_eager: bool,
kv_cache_dtype: str,
device: str,
swap_space: int,
enable_prefix_caching: bool,
gpu_memory_utilization: float = 0.9,
) -> float:
@@ -89,6 +90,7 @@
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
device=device,
swap_space=swap_space,
enable_prefix_caching=enable_prefix_caching)

# Add the requests to the engine.
@@ -115,21 +117,16 @@
return end - start


def run_hf(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: PreTrainedTokenizerBase,
n: int,
use_beam_search: bool,
max_batch_size: int,
trust_remote_code: bool,
) -> float:
def run_hf(requests: List[Tuple[str, int, int]], model: str,
tokenizer: PreTrainedTokenizerBase, n: int, use_beam_search: bool,
max_batch_size: int, trust_remote_code: bool) -> float:
assert not use_beam_search
llm = AutoModelForCausalLM.from_pretrained(
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
if llm.config.model_type == "llama":
# To enable padding in the HF backend.
tokenizer.pad_token = tokenizer.eos_token

llm = llm.cuda()

pbar = tqdm(total=len(requests))
@@ -212,7 +209,7 @@ def main(args: argparse.Namespace):
requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype, args.device,
args.enforce_eager, args.kv_cache_dtype, args.device, args.swap_space,
args.enable_prefix_caching, args.gpu_memory_utilization)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
@@ -308,12 +305,16 @@ def main(args: argparse.Namespace):
"--device",
type=str,
default="cuda",
choices=["cuda"],
choices=["cuda", "cpu"],
help='device type for vLLM execution; CUDA and CPU are supported.')
parser.add_argument(
"--enable-prefix-caching",
action='store_true',
help="enable automatic prefix caching for vLLM backend.")
parser.add_argument("--swap-space",
type=int,
default=4,
help="memory space available for CPU (GB).")
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
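Note (illustrative): run_vllm above just forwards device and swap_space into the LLM constructor. A minimal sketch of the resulting call, assuming this PR's CPU backend is built and installed; the model name is a placeholder:

```python
from vllm import LLM, SamplingParams

# Placeholder model; device="cpu" is only meaningful with the CPU backend
# introduced by this PR, and swap_space is the GB value wired from --swap-space.
llm = LLM(
    model="facebook/opt-125m",
    device="cpu",
    swap_space=8,
    enforce_eager=True,
)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```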
Empty file.
Empty file added benchmarks/kernels/attention.py
Empty file.
Empty file added benchmarks/kernels/benchmark.py
Empty file.
Empty file added benchmarks/kernels/cache_op.py
Empty file.
Empty file.
Empty file added benchmarks/kernels/rmsnorm.py
Empty file.
77 changes: 77 additions & 0 deletions cpu.Dockerfile
@@ -0,0 +1,77 @@
FROM python:3.10 AS dev

RUN apt-get update -y \
&& apt-get install -y python3-pip

WORKDIR /workspace

# install build and runtime dependencies
COPY requirements-cpu.txt requirements-cpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-cpu.txt

# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt

# image to build pytorch extensions
FROM dev AS build

# install build dependencies
COPY requirements-build-cpu.txt requirements-build-cpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-build-cpu.txt

# copy input files
COPY csrc csrc
COPY setup.py setup.py
COPY requirements-cpu.txt requirements-cpu.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py

# max jobs used by Ninja to build extensions
# declare the build arg so MAX_JOBS can be set with --build-arg max_jobs=N
ARG max_jobs
ENV MAX_JOBS=$max_jobs
RUN python3 setup.py build_ext --inplace

# image to run unit testing suite
FROM dev AS test

# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY tests tests
COPY vllm vllm

ENTRYPOINT ["python3", "-m", "pytest", "tests"]

# runtime base image; this is a CPU build, so the plain Python image is sufficient
FROM python:3.10 AS vllm-base

RUN apt-get update -y \
&& apt-get install -y python3-pip

WORKDIR /workspace
COPY requirements-cpu.txt requirements-cpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-cpu.txt

FROM vllm-base AS vllm
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

EXPOSE 8000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]

# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate fschat

COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

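Note (illustrative): a smoke test against a running container built from this Dockerfile, assuming the demo /generate endpoint of vllm.entrypoints.api_server and that port 8000 is published on the host:

```python
import requests

# Assumes the container was started with port 8000 published and the default
# api_server entrypoint; the prompt and sampling values are arbitrary.
resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "Hello, my name is", "max_tokens": 32, "temperature": 0.0},
)
resp.raise_for_status()
print(resp.json())  # expected shape: {"text": ["Hello, my name is ..."]}
```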
Empty file added csrc/cpu/CMakeLists.txt
Empty file.
186 changes: 186 additions & 0 deletions csrc/cpu/activation_impl.cpp
@@ -0,0 +1,186 @@
#include "cpu_types.hpp"

namespace {
template <typename scalar_t>
void silu_and_mul_cpu_impl(int num_tokens, int d, scalar_t *__restrict__ input,
scalar_t *__restrict__ output) {
using scalar_vec_t = vec_op::vec_t<scalar_t>;
constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();

TORCH_CHECK(d % VEC_ELEM_NUM == 0);

const vec_op::FP32Vec8 zeros(0.0);
const vec_op::FP32Vec8 ones(1.0);

#pragma omp parallel for
for (int i = 0; i < num_tokens; ++i) {
for (int j = 0; j < d; j += VEC_ELEM_NUM) {
const int start = i * 2 * d;
const scalar_vec_t x(input + start + j);
const scalar_vec_t y(input + start + d + j);

const vec_op::FP32Vec8 f32_x(x.reg);
const vec_op::FP32Vec8 f32_y(y.reg);

const vec_op::FP32Vec8 f32_ans =
f32_y * (f32_x / (ones + (zeros - f32_x).exp()));

const scalar_vec_t ans(f32_ans.reg);
ans.save(output + i * d + j);
}
}
}
} // namespace

/*

def _forward(self, x: torch.Tensor) -> torch.Tensor:
"""PyTorch-native implementation equivalent to forward()."""
d = x.shape[-1] // 2
return F.silu(x[..., :d]) * x[..., d:]

def forward(self, x: torch.Tensor) -> torch.Tensor:
d = x.shape[-1] // 2
output_shape = (x.shape[:-1] + (d, ))
out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
ops.silu_and_mul(out, x)
return out
*/

void silu_and_mul_cpu(torch::Tensor &out, torch::Tensor &input) {
int num_tokens = input.numel() / input.size(-1);
int d = input.size(-1) / 2;

VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "silu_and_mul_cpu_impl", [&] {
CPU_KERNEL_GUARD_IN(silu_and_mul_cpu_impl)
silu_and_mul_cpu_impl(num_tokens, d, input.data_ptr<scalar_t>(),
out.data_ptr<scalar_t>());
CPU_KERNEL_GUARD_OUT(silu_and_mul_cpu_impl)
});
}

namespace {
template <typename scalar_t>
void gelu_and_mul_cpu_impl(int num_tokens, int d, scalar_t *__restrict__ input,
scalar_t *__restrict__ output) {
using scalar_vec_t = vec_op::vec_t<scalar_t>;
constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();

TORCH_CHECK(d % VEC_ELEM_NUM == 0);

const vec_op::FP32Vec8 half(0.5);
const vec_op::FP32Vec8 ones(1.0);
const vec_op::FP32Vec8 sqrt_two_over_pi(sqrtf(2.0 / M_PI));
const vec_op::FP32Vec8 gelu_const(0.044715);

#pragma omp parallel for
for (int i = 0; i < num_tokens; ++i) {
for (int j = 0; j < d; j += VEC_ELEM_NUM) {
const int start = i * 2 * d;
const scalar_vec_t x(input + start + j);
const scalar_vec_t y(input + start + d + j);

const vec_op::FP32Vec8 f32_x(x.reg);
const vec_op::FP32Vec8 f32_y(y.reg);

const vec_op::FP32Vec8 f32_ans =
f32_y * half * f32_x * (ones + (sqrt_two_over_pi * (f32_x + gelu_const * f32_x * f32_x * f32_x)).tanh());

const scalar_vec_t ans(f32_ans.reg);
ans.save(output + i * d + j);
}
}
}
}

/*

def _forward(self, x: torch.Tensor) -> torch.Tensor:
"""PyTorch-native implementation equivalent to forward()."""
d = x.shape[-1] // 2
return F.gelu(x[..., :d]) * x[..., d:]

def forward(self, x: torch.Tensor) -> torch.Tensor:
d = x.shape[-1] // 2
output_shape = (x.shape[:-1] + (d, ))
out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
ops.gelu_and_mul(out, x)
return out
*/

void gelu_and_mul_cpu(torch::Tensor &out, torch::Tensor &input) {
int num_tokens = input.numel() / input.size(-1);
int d = input.size(-1) / 2;

VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "gelu_and_mul_cpu_impl", [&] {
CPU_KERNEL_GUARD_IN(gelu_and_mul_cpu_impl)
gelu_and_mul_cpu_impl(num_tokens, d, input.data_ptr<scalar_t>(),
out.data_ptr<scalar_t>());
CPU_KERNEL_GUARD_OUT(gelu_and_mul_cpu_impl)
});
}

namespace {

template <typename scalar_t>
void gelu_new_cpu_impl(int num_tokens, int d, scalar_t *__restrict__ input,
scalar_t *__restrict__ output) {
using scalar_vec_t = vec_op::vec_t<scalar_t>;
constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();

TORCH_CHECK(d % VEC_ELEM_NUM == 0);

const vec_op::FP32Vec8 half(0.5);
const vec_op::FP32Vec8 ones(1.0);
const vec_op::FP32Vec8 sqrt_two_over_pi(sqrtf(2.0 / M_PI));
const vec_op::FP32Vec8 gelu_const(0.044715);

#pragma omp parallel for
for (int i = 0; i < num_tokens; ++i) {
for (int j = 0; j < d; j += VEC_ELEM_NUM) {
const int start = i * d;
const scalar_vec_t x(input + start + j);

const vec_op::FP32Vec8 f32_x(x.reg);

const vec_op::FP32Vec8 f32_ans =
half * f32_x * (ones + (sqrt_two_over_pi * (f32_x + gelu_const * f32_x * f32_x * f32_x)).tanh());

const scalar_vec_t ans(f32_ans.reg);
ans.save(output + i * d + j);
}
}
}
}

/*
def _forward(self, x: torch.Tensor) -> torch.Tensor:
"""PyTorch-native implementation equivalent to forward()."""
c = math.sqrt(2.0 / math.pi)
return 0.5 * x * (1.0 + torch.tanh(c *
(x + 0.044715 * torch.pow(x, 3.0))))

def forward(self, x: torch.Tensor) -> torch.Tensor:
out = torch.empty_like(x)
ops.gelu_new(out, x)
return out
*/

void gelu_new_cpu(torch::Tensor &out, torch::Tensor &input) {
int num_tokens = input.numel() / input.size(-1);
int d = input.size(-1);

VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "gelu_new_cpu_impl", [&] {
CPU_KERNEL_GUARD_IN(gelu_new_cpu_impl)
gelu_new_cpu_impl(num_tokens, d, input.data_ptr<scalar_t>(),
out.data_ptr<scalar_t>());
CPU_KERNEL_GUARD_OUT(gelu_new_cpu_impl)
});
}

void gelu_fast_cpu(torch::Tensor &out, torch::Tensor &input) {
gelu_new_cpu(out, input);
}
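Note (illustrative): the PyTorch-native references quoted in the comments above can be collected into a small standalone check; the tolerance and the comparison against the extension's output are assumptions, not part of this PR:

```python
import math
import torch
import torch.nn.functional as F

def ref_silu_and_mul(x: torch.Tensor) -> torch.Tensor:
    # mirrors silu_and_mul_cpu: silu(x[..., :d]) * x[..., d:]
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

def ref_gelu_new(x: torch.Tensor) -> torch.Tensor:
    # mirrors gelu_new_cpu / gelu_fast_cpu (tanh approximation of GELU)
    c = math.sqrt(2.0 / math.pi)
    return 0.5 * x * (1.0 + torch.tanh(c * (x + 0.044715 * torch.pow(x, 3.0))))

x = torch.randn(4, 2 * 64, dtype=torch.float32)
assert ref_silu_and_mul(x).shape == (4, 64)
assert ref_gelu_new(x[..., :64]).shape == (4, 64)
# e.g. torch.testing.assert_close(cpu_ext_out, ref_silu_and_mul(x), atol=1e-2, rtol=1e-2)
```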