WIP upstream #53

Closed
wants to merge 102 commits
Commits
3827084
[Bugfix] Fix for inconsistent behaviour related to sampling and repet…
tdoublep Jun 18, 2024
c9e728e
[Doc] Added cerebrium as Integration option (#5553)
milo157 Jun 18, 2024
acd3ed8
[Bugfix] Fix CUDA version check for mma warning suppression (#5642)
tlrmchlsmth Jun 18, 2024
4138d6a
[Bugfix] Fix w8a8 benchmarks for int8 case (#5643)
tlrmchlsmth Jun 19, 2024
edd595d
[Bugfix] Fix Phi-3 Long RoPE scaling implementation (#5628)
ShukantPal Jun 19, 2024
6bddc7c
[Bugfix] Added test for sampling repetition penalty bug. (#5659)
tdoublep Jun 19, 2024
70b2a3a
[Bugfix][CI/Build][AMD][ROCm]Fixed the cmake build bug which generate…
hongxiayang Jun 19, 2024
59305a2
[misc][distributed] use 127.0.0.1 for single-node (#5619)
youkaichao Jun 19, 2024
974e116
[Model] Add FP8 kv cache for Qwen2 (#5656)
mgoin Jun 19, 2024
f1e5642
[Bugfix] Fix sampling_params passed incorrectly in Phi3v example (#5684)
Isotr0py Jun 19, 2024
5da4334
[Misc]Add param max-model-len in benchmark_latency.py (#5629)
DearPlanet Jun 19, 2024
ebd2a34
[CI/Build] Add tqdm to dependencies (#5680)
DarkLight1337 Jun 19, 2024
cac911b
[ci] Add A100 queue into AWS CI template (#5648)
khluu Jun 19, 2024
c171a59
[Frontend][Bugfix] Fix preemption_mode -> preemption-mode for CLI arg…
mgoin Jun 19, 2024
641d67f
[ci][distributed] add tests for custom allreduce (#5689)
youkaichao Jun 19, 2024
2e8f069
[Bugfix] AsyncLLMEngine hangs with asyncio.run (#5654)
zifeitong Jun 19, 2024
36847f7
[Doc] Update docker references (#5614)
rafvasq Jun 19, 2024
86a7ee1
[Misc] Add per channel support for static activation quantization; up…
dsikka Jun 19, 2024
1933ff9
[ci] Limit num gpus if specified for A100 (#5694)
khluu Jun 19, 2024
212ff7b
[Misc] Improve conftest (#5681)
DarkLight1337 Jun 20, 2024
ff63325
[Bugfix][Doc] FIx Duplicate Explicit Target Name Errors (#5703)
ywang96 Jun 20, 2024
38518ab
[Kernel] Update Cutlass int8 kernel configs for SM90 (#5514)
varun-sundar-rabindranath Jun 20, 2024
492bbcf
[Model] Port over CLIPVisionModel for VLMs (#5591)
ywang96 Jun 20, 2024
1ecb8fa
[Kernel] Update Cutlass int8 kernel configs for SM80 (#5275)
varun-sundar-rabindranath Jun 20, 2024
5ac2be8
[Bugfix] Fix the CUDA version check for FP8 support in the CUTLASS ke…
tlrmchlsmth Jun 20, 2024
5f45fa3
[Frontend] Add FlexibleArgumentParser to support both underscore and …
mgoin Jun 20, 2024
ecd1d41
[distributed][misc] use fork by default for mp (#5669)
youkaichao Jun 21, 2024
426c751
[Model] MLPSpeculator speculative decoding support (#4947)
JRosenkranz Jun 21, 2024
fe3073f
[Kernel] Add punica dimension for Qwen2 LoRA (#5441)
jinzhen-lin Jun 21, 2024
0b31513
[BugFix] Fix test_phi3v.py (#5725)
CatherineSue Jun 21, 2024
9cbb3b2
[Bugfix] Add fully sharded layer for QKVParallelLinearWithLora (#5665)
jeejeelee Jun 21, 2024
c77e009
[Core][Distributed] add shm broadcast (#5399)
youkaichao Jun 21, 2024
7e6f607
[Kernel][CPU] Add Quick `gelu` to CPU (#5717)
ywang96 Jun 21, 2024
84aecf4
[Doc] Documentation on supported hardware for quantization methods (#…
mgoin Jun 21, 2024
33ef5ce
[BugFix] exclude version 1.15.0 for modelscope (#5668)
zhyncs Jun 21, 2024
8059621
[ci][test] fix ca test in main (#5746)
youkaichao Jun 21, 2024
39cd6f7
[LoRA] Add support for pinning lora adapters in the LRU cache (#5603)
rohithkrn Jun 21, 2024
d21609d
[CI][Hardware][Intel GPU] add Intel GPU(XPU) ci pipeline (#5616)
jikunshang Jun 22, 2024
eddfce3
[Model] Support Qwen-VL and Qwen-VL-Chat models with text-only inputs…
DamonFool Jun 22, 2024
a8e2142
[Misc] Remove #4789 workaround left in vllm/entrypoints/openai/run_ba…
zifeitong Jun 22, 2024
0223d42
[Bugfix] Fix pin_lora error in TPU executor (#5760)
WoosukKwon Jun 22, 2024
c86bf89
[Docs][TPU] Add installation tip for TPU (#5761)
WoosukKwon Jun 22, 2024
cdcddca
[core][distributed] improve shared memory broadcast (#5754)
youkaichao Jun 22, 2024
6083490
[BugFix] [Kernel] Add Cutlass2x fallback kernels (#5744)
varun-sundar-rabindranath Jun 23, 2024
0e0e955
[Distributed] Add send and recv helpers (#5719)
andoorve Jun 23, 2024
3e8a070
[Bugfix] Add phi3v resize for dynamic shape and fix torchvision requi…
Isotr0py Jun 24, 2024
46626dd
[doc][faq] add warning to download models for every nodes (#5783)
youkaichao Jun 24, 2024
2b255cd
[Doc] Add "Suggest edit" button to doc pages (#5789)
mgoin Jun 24, 2024
ccaeb0c
[Doc] Add Phi-3-medium to list of supported models (#5788)
mgoin Jun 24, 2024
e0e5db9
[Bugfix] Fix FlexibleArgumentParser replaces _ with - for actual args…
CatherineSue Jun 24, 2024
ca77a04
[ci] Remove aws template (#5757)
khluu Jun 25, 2024
971e0b6
[Doc] Add notice about breaking changes to VLMs (#5818)
DarkLight1337 Jun 25, 2024
9b35cf9
[Speculative Decoding] Support draft model on different tensor-paral…
wooyeonlee0 Jun 25, 2024
b1f3979
[Misc] Remove useless code in cpu_worker (#5824)
DamonFool Jun 25, 2024
667f8af
[Core] Add fault tolerance for `RayTokenizerGroupPool` (#5748)
Yard1 Jun 25, 2024
f13d9fa
[doc][distributed] add both gloo and nccl tests (#5834)
youkaichao Jun 25, 2024
f6b5eed
[CI/Build] Add unit testing for FlexibleArgumentParser (#5798)
mgoin Jun 25, 2024
88f53e8
[Misc] Update `w4a16` `compressed-tensors` support to include `w8a16`…
dsikka Jun 25, 2024
6873c10
[Hardware][TPU] Refactor TPU backend (#5831)
WoosukKwon Jun 25, 2024
a420c75
[Hardware][AMD][CI/Build][Doc] Upgrade to ROCm 6.1, Dockerfile improv…
mawong-amd Jun 25, 2024
c329632
[Hardware][TPU] Raise errors for unsupported sampling params (#5850)
WoosukKwon Jun 25, 2024
62c44b6
[CI/Build] Add E2E tests for MLPSpeculator (#5791)
tdoublep Jun 26, 2024
c43d5a6
[Bugfix] Fix assertion in NeuronExecutor (#5841)
aws-patlange Jun 26, 2024
a576089
[Core] Refactor Worker and ModelRunner to consolidate control plane c…
stephanie-wang Jun 26, 2024
a06a453
[Misc][Doc] Add Example of using OpenAI Server with VLM (#5832)
ywang96 Jun 26, 2024
226da66
[bugfix][distributed] fix shm broadcast when the queue size is full (…
youkaichao Jun 26, 2024
3732d40
[Bugfix] Fix embedding to support 2D inputs (#5829)
WoosukKwon Jun 26, 2024
3b046b6
[Bugfix][TPU] Fix KV cache size calculation (#5860)
WoosukKwon Jun 26, 2024
c2b5da8
[CI/Build] Refactor image test assets (#5821)
DarkLight1337 Jun 26, 2024
353584f
[Kernel] Adding bias epilogue support for `cutlass_scaled_mm` (#5560)
ProExpertProg Jun 26, 2024
1d2addf
[Frontend] Add tokenize/detokenize endpoints (#5054)
sasha0552 Jun 26, 2024
a2b7e0b
[Hardware][TPU] Support parallel sampling & Swapping (#5855)
WoosukKwon Jun 26, 2024
3b6979e
[Bugfix][TPU] Fix CPU cache allocation (#5869)
WoosukKwon Jun 26, 2024
3784c2a
Support CPU inference with VSX PowerPC ISA (#5652)
ChipKerchner Jun 26, 2024
1222b26
[doc] update usage of env var to avoid conflict (#5873)
youkaichao Jun 26, 2024
e4bb671
[Misc] Add example for LLaVA-NeXT (#5879)
ywang96 Jun 27, 2024
41c6083
[BugFix] Fix cuda graph for MLPSpeculator (#5875)
njhill Jun 27, 2024
cb6f147
[Doc] Add note about context length in Phi-3-Vision example (#5887)
DarkLight1337 Jun 27, 2024
5681b46
[VLM][Bugfix] Make sure that `multi_modal_kwargs` is broadcasted prop…
xwjiang2010 Jun 27, 2024
ce99923
[Model] Add base class for LoRA-supported models (#5018)
DarkLight1337 Jun 27, 2024
b5860a4
[Bugfix] Fix img_sizes Parsing in Phi3-Vision (#5888)
ywang96 Jun 27, 2024
cfa98e5
[CI/Build] [1/3] Reorganize entrypoints tests (#5526)
DarkLight1337 Jun 27, 2024
6816d85
[Model][Bugfix] Implicit model flags and reenable Phi-3-Vision (#5896)
DarkLight1337 Jun 27, 2024
2bd11ea
[doc][misc] add note for Kubernetes users (#5916)
youkaichao Jun 27, 2024
a13f99d
[BugFix] Fix `MLPSpeculator` handling of `num_speculative_tokens` (#5…
njhill Jun 27, 2024
d7b8ec2
[BugFix] Fix `min_tokens` behaviour for multiple eos tokens (#5849)
njhill Jun 27, 2024
807b752
[CI/Build] Fix Args for `_get_logits_warper` in Sampler Test (#5922)
ywang96 Jun 27, 2024
22b74a7
[Model] Add Gemma 2 (#5908)
WoosukKwon Jun 27, 2024
72879ae
[core][misc] remove logical block (#5882)
youkaichao Jun 27, 2024
f1d3956
[Kernel][ROCm][AMD] fused_moe Triton configs v2 for mi300X (#5932)
divakar-amd Jun 27, 2024
d2de291
[Hardware][TPU] Optimize KV cache swapping (#5878)
WoosukKwon Jun 28, 2024
a9c1105
[VLM][BugFix] Make sure that `multi_modal_kwargs` can broadcast prope…
xwjiang2010 Jun 28, 2024
d98dc55
[Bugfix][Hardware][Intel CPU] Fix unpassed multi_modal_kwargs for CPU…
Isotr0py Jun 28, 2024
bfdb6cb
[Core] Registry for processing model inputs (#5214)
DarkLight1337 Jun 28, 2024
8f1a950
Unmark fused_moe config json file as executable (#5960)
tlrmchlsmth Jun 28, 2024
b9fc37f
[Hardware][Intel] OpenVINO vLLM backend (#5379)
ilya-lavrenov Jun 28, 2024
f0c65ad
[Bugfix] Better error message for MLPSpeculator when `num_speculative…
tdoublep Jun 28, 2024
f496ed6
[CI/Build] [2/3] Reorganize entrypoints tests (#5904)
DarkLight1337 Jun 28, 2024
84b75bf
[Distributed] Make it clear that % should not be in tensor dict keys.…
xwjiang2010 Jun 28, 2024
767b2a0
[Spec Decode] Introduce DraftModelRunner (#5799)
comaniac Jun 28, 2024
ad0a000
[Bugfix] Fix compute datatype for cutlass 3.x epilogues (#5931)
tlrmchlsmth Jun 28, 2024
5f6524e
🚧 push images
prashantgupta24 Jun 28, 2024
1 change: 1 addition & 0 deletions .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -17,6 +17,7 @@ steps:
plugins:
- kubernetes:
podSpec:
priorityClassName: perf-benchmark
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
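Note: the new priorityClassName only has an effect if the target cluster already defines a PriorityClass with that name; this PR does not add one. A minimal sketch of such an object, with an illustrative priority value, could be applied like this:

  kubectl apply -f - <<'EOF'
  apiVersion: scheduling.k8s.io/v1
  kind: PriorityClass
  metadata:
    name: perf-benchmark        # must match podSpec.priorityClassName above
  value: 1000000                # illustrative priority value, not taken from this PR
  globalDefault: false
  description: "Priority class for vLLM nightly benchmark pods"
  EOF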
14 changes: 14 additions & 0 deletions .buildkite/run-openvino-test.sh
@@ -0,0 +1,14 @@
# This script builds the OpenVINO docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t openvino-test -f Dockerfile.openvino .

# Setup cleanup
remove_docker_container() { docker rm -f openvino-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
40 changes: 35 additions & 5 deletions .buildkite/test-pipeline.yaml
@@ -1,7 +1,10 @@
# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.
# This script will be fed into the Jinja template in `test-template-aws.j2` to generate
# the final pipeline yaml file.

# This script will be fed into the Jinja template in `test-template-aws.j2` at
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.


steps:
- label: Regression Test
@@ -24,19 +27,26 @@ steps:

- label: Core Test
mirror_hardwares: [amd]
command: pytest -v -s core
commands:
- pytest -v -s core
- pytest -v -s distributed/test_parallel_state.py

- label: Distributed Comm Ops Test
#mirror_hardwares: [amd]
command: pytest -v -s distributed/test_comm_ops.py
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py

- label: Distributed Tests (2 GPUs)
mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
# FIXIT: find out which code initializes CUDA before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
@@ -46,7 +56,7 @@ steps:
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- pytest -v -s spec_decode/e2e/test_integration_dist.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

@@ -55,11 +65,15 @@ steps:
working_dir: "/vllm-workspace/tests"
num_gpus: 4
commands:
# FIXIT: find out which code initializes CUDA before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s distributed/test_pynccl.py
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
# See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py

- label: Engine Test
mirror_hardwares: [amd]
@@ -145,6 +159,9 @@ steps:
num_gpus: 4
# This test runs llama 13B, so it is required to run on 4 GPUs.
commands:
# FIXIT: find out which code initializes CUDA before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s -x lora/test_long_context.py

- label: Tensorizer Test
@@ -181,3 +198,16 @@ steps:
commands:
- pip install -r requirements-docs.txt
- SPHINXOPTS=\"-W\" make html

- label: Distributed Tests (A100)
gpu: a100
num_gpus: 4
commands:
# FIXIT: find out which code initializes CUDA before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
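For reference, reproducing one of these distributed-correctness commands outside Buildkite would look roughly like the following sketch, assuming a source checkout with the tests/ directory and at least two GPUs (model choice mirrors the pipeline above):

  cd tests
  export VLLM_WORKER_MULTIPROC_METHOD=spawn   # matches the FIXIT workaround above
  TEST_DIST_MODEL=facebook/opt-125m \
    DISTRIBUTED_EXECUTOR_BACKEND=ray \
    pytest -v -s distributed/test_basic_distributed_correctness.py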
93 changes: 0 additions & 93 deletions .buildkite/test-template-aws.j2

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/build.yml
@@ -5,7 +5,7 @@ on:

push:
branches:
- release
- wip-upstream
paths-ignore:
- "**.md"
- "proto/**"
23 changes: 8 additions & 15 deletions CMakeLists.txt
@@ -2,7 +2,8 @@ cmake_minimum_required(VERSION 3.21)

project(vllm_extensions LANGUAGES CXX)

option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")

message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
@@ -32,8 +33,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
set(TORCH_SUPPORTED_VERSION_ROCM "2.4.0")

#
# Try to find python package with an executable that exactly matches
@@ -98,18 +98,11 @@ elseif(HIP_FOUND)
# .hip extension automatically, HIP must be enabled explicitly.
enable_language(HIP)

# ROCm 5.x
if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
"expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.")
endif()

# ROCm 6.x
if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
"expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.")
# ROCm 5.X and 6.X
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM} "
"expected for ROCm build, saw ${Torch_VERSION} instead.")
endif()
else()
message(FATAL_ERROR "Can't find CUDA or HIP installation.")
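Because VLLM_TARGET_DEVICE is now a CACHE STRING rather than an option(), it can be overridden from outside the project. A hedged sketch of both paths (values are illustrative; the env-var form mirrors the Dockerfile.openvino added later in this PR):

  # let setup.py forward the device choice to CMake
  VLLM_TARGET_DEVICE=openvino python3 -m pip install .

  # or override the cache variable in a manual CMake configure
  cmake -DVLLM_TARGET_DEVICE=cpu -S . -B build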
2 changes: 1 addition & 1 deletion Dockerfile
@@ -172,7 +172,7 @@ FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer modelscope
pip install accelerate hf_transfer 'modelscope!=1.15.0'

ENV VLLM_USAGE_SOURCE production-docker-image

26 changes: 26 additions & 0 deletions Dockerfile.openvino
@@ -0,0 +1,26 @@
# The vLLM Dockerfile is used to construct a vLLM image that can be directly used
# to run the OpenAI compatible server.

FROM ubuntu:22.04 AS dev

RUN apt-get update -y && \
apt-get install -y python3-pip git
WORKDIR /workspace

# copy requirements
COPY requirements-build.txt /workspace/vllm/
COPY requirements-common.txt /workspace/vllm/
COPY requirements-openvino.txt /workspace/vllm/

COPY vllm/ /workspace/vllm/vllm
COPY setup.py /workspace/vllm/

# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
# build vLLM with OpenVINO backend
RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/

COPY examples/ /workspace/vllm/examples
COPY benchmarks/ /workspace/vllm/benchmarks

CMD ["/bin/bash"]
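Building and smoke-testing this image by hand would look roughly like the .buildkite/run-openvino-test.sh script above (the tag name is arbitrary):

  docker build -t openvino-test -f Dockerfile.openvino .
  docker run --rm --network host \
    --env VLLM_OPENVINO_KVCACHE_SPACE=1 \
    openvino-test python3 /workspace/vllm/examples/offline_inference.py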
22 changes: 22 additions & 0 deletions Dockerfile.ppc64le
@@ -0,0 +1,22 @@
FROM mambaorg/micromamba
ARG MAMBA_DOCKERFILE_ACTIVATE=1
USER root

RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

# Some packages in requirements-cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
# Currently these may not be available for venv or pip directly
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

# These packages will be in rocketce eventually
RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing

RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install

WORKDIR /vllm-workspace
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
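A hedged example of building this ppc64le image and starting the OpenAI-compatible server through its entrypoint (model and port are illustrative; any api_server flags can be appended after the image name):

  docker build -t vllm-ppc64le -f Dockerfile.ppc64le .
  docker run --rm -p 8000:8000 vllm-ppc64le \
    --model facebook/opt-125m --port 8000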