
Commit

Merge branch 'vllm-project:main' into main
noooop authored Aug 28, 2024
2 parents 5245f4f + c166e7e commit 370de52
Showing 118 changed files with 4,393 additions and 1,094 deletions.
1 change: 1 addition & 0 deletions .buildkite/run-amd-test.sh
@@ -75,6 +75,7 @@ docker run \
--network host \
--shm-size=16gb \
--rm \
+ -e HIP_VISIBLE_DEVICES=0 \
-e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \
-e HF_HOME=${HF_MOUNT} \
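
A note on the line added above: HIP_VISIBLE_DEVICES=0 pins the AMD test container to a single ROCm device. A minimal, hypothetical way to confirm that only one device is visible inside such a container (the container name placeholder and the presence of a ROCm build of PyTorch are assumptions, not part of this diff):

    # Hypothetical check, assuming a ROCm PyTorch build inside the container.
    # With HIP_VISIBLE_DEVICES=0 this should print 1.
    docker exec <container> python3 -c "import torch; print(torch.cuda.device_count())"
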
3 changes: 1 addition & 2 deletions .buildkite/run-tpu-test.sh
@@ -12,5 +12,4 @@ remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
- docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu \
-     python3 /workspace/vllm/examples/offline_inference_tpu.py
+ docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
3 changes: 2 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -233,12 +233,13 @@ steps:
parallelism: 4

- label: Tensorizer Test # 11min
mirror_hardwares: [amd]
+ soft_fail: true
source_file_dependencies:
- vllm/model_executor/model_loader
- tests/tensorizer_loader
commands:
- - apt-get install -y curl libsodium23
+ - apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader

1 change: 0 additions & 1 deletion .github/workflows/mypy.yaml
@@ -35,7 +35,6 @@ jobs:
mypy
mypy tests --follow-imports skip
mypy vllm/attention --follow-imports skip
- mypy vllm/core --follow-imports skip
mypy vllm/distributed --follow-imports skip
mypy vllm/engine --follow-imports skip
mypy vllm/executor --follow-imports skip
5 changes: 5 additions & 0 deletions CMakeLists.txt
@@ -296,6 +296,11 @@ set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/topk_softmax_kernels.cu")

+ if(VLLM_GPU_LANG STREQUAL "CUDA")
+ list(APPEND VLLM_MOE_EXT_SRC
+ "csrc/moe/marlin_moe_ops.cu")
+ endif()

define_gpu_extension_target(
_moe_C
DESTINATION vllm
10 changes: 9 additions & 1 deletion benchmarks/benchmark_throughput.py
@@ -86,6 +86,7 @@ def run_vllm(
use_v2_block_manager: bool = False,
download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
+ disable_async_output_proc: bool = False,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(
@@ -110,6 +111,7 @@
load_format=load_format,
num_scheduler_steps=num_scheduler_steps,
use_v2_block_manager=use_v2_block_manager,
+ disable_async_output_proc=disable_async_output_proc,
)

# Add the requests to the engine.
@@ -237,7 +239,8 @@ def main(args: argparse.Namespace):
args.enable_prefix_caching, args.enable_chunked_prefill,
args.max_num_batched_tokens, args.distributed_executor_backend,
args.gpu_memory_utilization, args.num_scheduler_steps,
- args.use_v2_block_manager, args.download_dir, args.load_format)
+ args.use_v2_block_manager, args.download_dir, args.load_format,
+ args.disable_async_output_proc)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -418,6 +421,11 @@ def main(args: argparse.Namespace):
'section for more information.\n'
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.\n')
+ parser.add_argument(
+ "--disable-async-output-proc",
+ action='store_true',
+ default=False,
+ help="Disable async output processor for vLLM backend.")
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
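
For reference, the --disable-async-output-proc flag introduced above can be passed when invoking the throughput benchmark; a minimal sketch (the model name and request sizes below are illustrative placeholders, only the new flag itself comes from this diff):

    # Illustrative invocation; model and sizes are placeholders.
    python3 benchmarks/benchmark_throughput.py \
        --backend vllm \
        --model facebook/opt-125m \
        --input-len 128 --output-len 128 --num-prompts 100 \
        --disable-async-output-proc
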
2 changes: 1 addition & 1 deletion benchmarks/launch_tgi_server.sh
@@ -6,7 +6,7 @@ TOKENS=$2

docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
-v $PWD/data:/data \
- ghcr.io/huggingface/text-generation-inference:1.4.0 \
+ ghcr.io/huggingface/text-generation-inference:2.2.0 \
--model-id $MODEL \
--sharded false \
--max-input-length 1024 \
3 changes: 2 additions & 1 deletion csrc/core/scalar_type.hpp
@@ -387,7 +387,8 @@ class ScalarTypeTorch : public torch::CustomClassHolder, public ScalarType {
// This needs to be implemented and throw a TypeError in order for
// PyTorch's opcheck to work on ops that use ScalarTypes.
int64_t len() const {
- throw c10::TypeError("__len__ not implemented");
+ throw c10::TypeError({__func__, __FILE__, static_cast<uint32_t>(__LINE__)},
+ "__len__ not implemented");
return 0;
}
