Skip to content

Commit

Permalink
Adaptiations to vllm-project#6484 and Merge remote-tracking branch 'g…
Browse files Browse the repository at this point in the history
…ithub/main' into continous_batching_mamba_from_scratch
  • Loading branch information
mzusman committed Oct 13, 2024
2 parents 906379d + f519902 commit fa1162e
Show file tree
Hide file tree
Showing 113 changed files with 4,134 additions and 895 deletions.
8 changes: 7 additions & 1 deletion .buildkite/run-cpu-test-ppc64le.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,13 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
pytest -v -s tests/models -m \"not vlm\" \
--ignore=tests/models/test_embedding.py \
--ignore=tests/models/test_oot_registration.py \
--ignore=tests/models/test_registry.py \
--ignore=tests/models/test_jamba.py \
--ignore=tests/models/test_mamba.py \
--ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported

# online inference
docker exec cpu-test bash -c "
Expand Down
1 change: 1 addition & 0 deletions .buildkite/run-cpu-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ docker exec cpu-test bash -c "
pytest -v -s tests/models/decoder_only/language \
--ignore=tests/models/test_fp8.py \
--ignore=tests/models/decoder_only/language/test_jamba.py \
--ignore=tests/models/decoder_only/language/test_mamba.py \
--ignore=tests/models/decoder_only/language/test_granitemoe.py \
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported

Expand Down
23 changes: 14 additions & 9 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,9 @@ steps:
- vllm/core/
- tests/distributed
- tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile
commands:
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py

Expand Down Expand Up @@ -231,14 +233,16 @@ steps:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_full_graph_smoke.py
- pytest -v -s compile/test_basic_correctness.py

- label: "PyTorch Fullgraph Test" # 18min
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_full_graph.py
# TODO: re-write in comparison tests, and fix symbolic shape
# for quantization ops.
# - label: "PyTorch Fullgraph Test" # 18min
# source_file_dependencies:
# - vllm/
# - tests/compile
# commands:
# - pytest -v -s compile/test_full_graph.py

- label: Kernels Test %N # 1h each
mirror_hardwares: [amd]
Expand Down Expand Up @@ -343,10 +347,11 @@ steps:
- pytest -v -s models/encoder_decoder/language
- pytest -v -s models/encoder_decoder/vision_language

# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
#mirror_hardwares: [amd]
optional: true
commands:
- echo 'Testing custom models...'
# PR authors can temporarily add commands below to test individual models
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
Expand Down Expand Up @@ -394,7 +399,7 @@ steps:
- tests/distributed/
- vllm/compilation
commands:
- pytest -v -s ./compile/test_full_graph_multi_gpu.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest basic_correctness/ -v -s -m distributed_2_gpus
Expand Down
21 changes: 16 additions & 5 deletions .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
@@ -1,19 +1,30 @@
# See https://help.github.com/articles/about-codeowners/
# for more info about CODEOWNERS file

# This lists cover the "core" components of vLLM that require careful review
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
CMakeLists.txt @tlrmchlsmth @WoosukKwon

# Test ownership
/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
/tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96
/tests/prefix_caching @comaniac @KuntaiDu
/tests/prefix_caching @comaniac @KuntaiDu
/tests/spec_decode @njhill @LiuXiaoxuanPKU
/tests/kernels @tlrmchlsmth @WoosukKwon
/tests/kernels @tlrmchlsmth @WoosukKwon
/tests/quantization @mgoin @robertgshaw2-neuralmagic
/.buildkite/lm-eval-harness @mgoin @simon-mo
/.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/multi_step @alexm-neuralmagic @SolitaryThinker @comaniac
/tests/multi_step @alexm-neuralmagic @comaniac
/tests/weight_loading @mgoin @youkaichao
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
2 changes: 1 addition & 1 deletion .github/workflows/actionlint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: "Checkout"
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
with:
fetch-depth: 0

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/add_label_automerge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Add label
uses: actions/github-script@v6
uses: actions/github-script@v7
with:
script: |
github.rest.issues.addLabels({
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/clang-format.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ jobs:
matrix:
python-version: ["3.11"]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/mypy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ jobs:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
upload_url: ${{ steps.create_release.outputs.upload_url }}
steps:
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Extract branch info
shell: bash
Expand All @@ -30,7 +30,7 @@ jobs:
- name: Create Release
id: create_release
uses: "actions/github-script@v6"
uses: "actions/github-script@v7"
env:
RELEASE_TAG: ${{ env.release_tag }}
with:
Expand All @@ -54,7 +54,7 @@ jobs:

steps:
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Setup ccache
uses: hendrikmuhs/ccache-action@v1.2
Expand All @@ -68,7 +68,7 @@ jobs:
bash -x .github/workflows/scripts/env.sh
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/reminder_comment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Remind to run full CI on PR
uses: actions/github-script@v6
uses: actions/github-script@v7
with:
script: |
github.rest.issues.createComment({
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/ruff.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ jobs:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/yapf.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ jobs:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand Down
35 changes: 20 additions & 15 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -144,27 +144,32 @@ else()
endif()


#
# For cuda we want to be able to control which architectures we compile for on
# a per-file basis in order to cut down on compile time. So here we extract
# the set of architectures we want to compile for and remove the from the
# CMAKE_CUDA_FLAGS so that they are not applied globally.
#
if(VLLM_GPU_LANG STREQUAL "CUDA")
#
# For cuda we want to be able to control which architectures we compile for on
# a per-file basis in order to cut down on compile time. So here we extract
# the set of architectures we want to compile for and remove the from the
# CMAKE_CUDA_FLAGS so that they are not applied globally.
#
clear_cuda_arches(CUDA_ARCH_FLAGS)
extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
# Filter the target architectures by the supported supported archs
# since for some files we will build for all CUDA_ARCHS.
cuda_archs_loose_intersection(CUDA_ARCHS
"${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
else()
#
# For other GPU targets override the GPU architectures detected by cmake/torch
# and filter them by the supported versions for the current language.
# The final set of arches is stored in `VLLM_GPU_ARCHES`.
#
override_gpu_arches(VLLM_GPU_ARCHES
${VLLM_GPU_LANG}
"${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
endif()

#
# Override the GPU architectures detected by cmake/torch and filter them by
# the supported versions for the current language.
# The final set of arches is stored in `VLLM_GPU_ARCHES`.
#
override_gpu_arches(VLLM_GPU_ARCHES
${VLLM_GPU_LANG}
"${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")

#
# Query torch for additional GPU compilation flags for the given
# `VLLM_GPU_LANG`.
Expand Down
3 changes: 2 additions & 1 deletion Dockerfile.cpu
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ RUN pip install intel_extension_for_pytorch==2.4.0

WORKDIR /workspace

ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
pip install --upgrade pip && \
Expand Down
8 changes: 4 additions & 4 deletions benchmarks/benchmark_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,9 +176,9 @@ def sample_sonnet_requests(
# Sample the rest of lines per request.
sampled_requests: List[Tuple[str, int, int]] = []
for _ in range(num_requests):
sampled_lines = "".join(
prefix_lines +
random.sample(poem_lines, num_input_lines - num_prefix_lines))
num_lines_needed = num_input_lines - num_prefix_lines
sampled_lines = "".join(prefix_lines +
random.choices(poem_lines, k=num_lines_needed))

prompt = f"{base_prompt}{sampled_lines}"
message = [
Expand Down Expand Up @@ -536,7 +536,7 @@ def process_one_metric(
# E.g., "Time to First Token"
metric_header: str,
):
# This function print and add statistics of the specified
# This function prints and adds statistics of the specified
# metric.
if metric_attribute_name not in selected_percentile_metrics:
return
Expand Down
4 changes: 4 additions & 0 deletions csrc/quantization/machete/machete_pytorch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ torch::Tensor prepack_B(torch::Tensor const& B,
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("machete_prepack_B", &prepack_B);
m.impl("machete_gemm", &gemm);
}

// use CatchAll since supported_schedules has no tensor arguments
TORCH_LIBRARY_IMPL(TORCH_EXTENSION_NAME, CatchAll, m) {
m.impl("machete_supported_schedules", &supported_schedules);
}

Expand Down
Loading

0 comments on commit fa1162e

Please sign in to comment.