diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index 686f70dbece6c..69b6b146b3549 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -43,7 +43,7 @@ main() { - # The figures should be genereated by a separate process outside the CI/CD pipeline + # The figures should be generated by a separate process outside the CI/CD pipeline # # generate figures # python3 -m pip install tabulate pandas matplotlib diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index 3f38cf5137535..32bd34c431c89 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -301,6 +301,104 @@ run_serving_tests() { kill_gpu_processes } +run_genai_perf_tests() { + # run genai-perf tests + + # $1: a json file specifying genai-perf test cases + local genai_perf_test_file + genai_perf_test_file=$1 + + # Iterate over genai-perf tests + jq -c '.[]' "$genai_perf_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # prepend the current serving engine to the test name + test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} + + # get common parameters + common_params=$(echo "$params" | jq -r '.common_parameters') + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + reuse_server=$(echo "$common_params" | jq -r '.reuse_server') + + # get client and server arguments + server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + if [[ $gpu_count -lt $tp ]]; then + echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + + if [[ $reuse_server == "true" ]]; then + echo "Reuse previous server for test case $test_name" + else + kill_gpu_processes + bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ + "$server_params" "$common_params" + fi + + if wait_for_server; then + echo "" + echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." + else + echo "" + echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." + break + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps=$num_prompts + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + backend=$CURRENT_LLM_SERVING_ENGINE + + if [[ "$backend" == *"vllm"* ]]; then + backend="vllm" + fi + #TODO: add output dir. 
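+            # One possible way to handle the output dir (a sketch only, assuming
+            # the installed genai-perf supports --artifact-dir): genai-perf writes
+            # its results under ./artifacts by default, which the later
+            # `mv artifacts/ $RESULTS_FOLDER/` relies on; a per-test directory
+            # could instead be passed explicitly, e.g.
+            #   --artifact-dir "$RESULTS_FOLDER/$new_test_name"
+            # Verify the flag name against the genai-perf version in use.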
+ client_command="genai-perf profile \ + -m $model \ + --service-kind openai \ + --backend vllm \ + --endpoint-type chat \ + --streaming \ + --url localhost:$port \ + --request-rate $qps \ + --num-prompts $num_prompts \ + " + + echo "Client command: $client_command" + + eval "$client_command" + + #TODO: process/record outputs + done + done + + kill_gpu_processes + +} prepare_dataset() { @@ -328,12 +426,17 @@ main() { pip install -U transformers + pip install -r requirements-dev.txt + which genai-perf + # check storage df -h ensure_installed wget ensure_installed curl ensure_installed jq + # genai-perf dependency + ensure_installed libb64-0d prepare_dataset @@ -345,6 +448,10 @@ main() { # run the test run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" + # run genai-perf tests + run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json" + mv artifacts/ $RESULTS_FOLDER/ + # upload benchmark results to buildkite python3 -m pip install tabulate pandas python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json new file mode 100644 index 0000000000000..edbe9f2df0ce0 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json @@ -0,0 +1,23 @@ +[ + { + "test_name": "llama8B_tp1_genai_perf", + "qps_list": [4,8,16,32], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "tp": 1, + "port": 8000, + "num_prompts": 500, + "reuse_server": false + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "genai_perf_input_parameters": { + } + } +] \ No newline at end of file diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 9925db7bea593..e19ace782feb5 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -83,6 +83,6 @@ function cpu_tests() { tests/lora/test_qwen2vl.py" } -# All of CPU tests are expected to be finished less than 25 mins. +# All of CPU tests are expected to be finished less than 40 mins. export -f cpu_tests -timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" +timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 189714ebb6d75..0590dad4f311f 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -25,8 +25,11 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then last_build=$(cat /tmp/neuron-docker-build-timestamp) current_time=$(date +%s) if [ $((current_time - last_build)) -gt 86400 ]; then + # Remove dangling images (those that are not tagged and not used by any container) docker image prune -f - docker system prune -f + # Remove unused volumes / force the system prune for old images as well. 
+ docker volume prune -f && docker system prune -f + # Remove huggingface model artifacts and compiler cache rm -rf "${HF_MOUNT:?}/*" rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*" echo "$current_time" > /tmp/neuron-docker-build-timestamp diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 74b287c7adbfa..daec46760117d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -52,7 +52,6 @@ steps: - tests/worker - tests/standalone_tests/lazy_torch_compile.py commands: - - pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git # Used by multimoda processing test - python3 standalone_tests/lazy_torch_compile.py - pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s async_engine # AsyncLLMEngine @@ -77,7 +76,9 @@ steps: - tests/basic_correctness/test_basic_correctness - tests/basic_correctness/test_cpu_offload - tests/basic_correctness/test_preemption + - tests/basic_correctness/test_cumem.py commands: + - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_cpu_offload.py - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py @@ -107,7 +108,7 @@ steps: source_file_dependencies: - vllm/ commands: - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process @@ -126,11 +127,15 @@ steps: - tests/distributed - tests/spec_decode/e2e/test_integration_dist_tp4 - tests/compile + - examples/offline_inference/rlhf.py commands: - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py + # TODO: create a dedicated test section for multi-GPU example tests + # when we have multiple distributed example tests + - python3 ../examples/offline_inference/rlhf.py - label: Metrics, Tracing Test # 10min num_gpus: 2 @@ -462,7 +467,10 @@ steps: - vllm/worker/worker_base.py - vllm/worker/worker.py - vllm/worker/model_runner.py + - entrypoints/llm/test_collective_rpc.py commands: + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' @@ -471,7 +479,9 @@ steps: - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' - - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py + # this test fails consistently. 
+ # TODO: investigate and fix + # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py @@ -509,7 +519,9 @@ steps: - vllm/engine - tests/multi_step commands: - - pytest -v -s multi_step/test_correctness_async_llm.py + # this test is quite flaky + # TODO: investigate and fix. + # - pytest -v -s multi_step/test_correctness_async_llm.py - pytest -v -s multi_step/test_correctness_llm.py - label: Pipeline Parallelism Test # 45min diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 3cb91fc0f8232..bc324d8b988b1 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,32 +2,35 @@ # for more info about CODEOWNERS file # This lists cover the "core" components of vLLM that require careful review -/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth +/vllm/model_executor/guided_decoding @mgoin +/vllm/multimodal @DarkLight1337 @ywang96 CMakeLists.txt @tlrmchlsmth # vLLM V1 -/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic +/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat # Test ownership -/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo +/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo /tests/test_inputs.py @DarkLight1337 @ywang96 -/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo +/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo /tests/models @DarkLight1337 @ywang96 /tests/multimodal @DarkLight1337 @ywang96 /tests/prefix_caching @comaniac @KuntaiDu /tests/spec_decode @njhill @LiuXiaoxuanPKU /tests/kernels @tlrmchlsmth @WoosukKwon -/tests/quantization @mgoin @robertgshaw2-neuralmagic +/tests/quantization @mgoin @robertgshaw2-redhat /.buildkite/lm-eval-harness @mgoin @simon-mo /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao -/tests/multi_step @alexm-neuralmagic @comaniac +/tests/multi_step @alexm-redhat @comaniac /tests/weight_loading @mgoin @youkaichao /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac diff --git 
a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml deleted file mode 100644 index 0226cf0ca00e9..0000000000000 --- a/.github/workflows/actionlint.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: Lint GitHub Actions workflows -on: - push: - branches: - - "main" - paths: - - '.github/workflows/*.ya?ml' - - '.github/workflows/actionlint.*' - - '.github/workflows/matchers/actionlint.json' - pull_request: - branches: - - "main" - paths: - - '.github/workflows/*.ya?ml' - - '.github/workflows/actionlint.*' - - '.github/workflows/matchers/actionlint.json' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - actionlint: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Run actionlint" - run: | - echo "::add-matcher::.github/workflows/matchers/actionlint.json" - tools/actionlint.sh -color diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml deleted file mode 100644 index 68149d2dc019f..0000000000000 --- a/.github/workflows/clang-format.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: clang-format - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - '**/*.h' - - '**/*.cpp' - - '**/*.cu' - - '**/*.cuh' - - '.github/workflows/clang-format.yml' - pull_request: - branches: - - main - paths: - - '**/*.h' - - '**/*.cpp' - - '**/*.cu' - - '**/*.cuh' - - '.github/workflows/clang-format.yml' - -jobs: - clang-format: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.11"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install clang-format==18.1.5 - - name: Running clang-format - run: | - EXCLUDES=( - 'csrc/moe/topk_softmax_kernels.cu' - 'csrc/quantization/gguf/ggml-common.h' - 'csrc/quantization/gguf/dequantize.cuh' - 'csrc/quantization/gguf/vecdotq.cuh' - 'csrc/quantization/gguf/mmq.cuh' - 'csrc/quantization/gguf/mmvq.cuh' - ) - find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ - | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \ - | xargs clang-format --dry-run --Werror diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml deleted file mode 100644 index 68887adaae54b..0000000000000 --- a/.github/workflows/codespell.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: codespell - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - "**/*.py" - - "**/*.md" - - "**/*.rst" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/codespell.yml - pull_request: - branches: - - main - paths: - - "**/*.py" - - "**/*.md" - - "**/*.rst" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/codespell.yml - -jobs: - codespell: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - 
- name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Spelling check with codespell - run: | - codespell --toml pyproject.toml diff --git a/.github/workflows/doc-lint.yml b/.github/workflows/doc-lint.yml deleted file mode 100644 index 2f5ee8bbfd8c5..0000000000000 --- a/.github/workflows/doc-lint.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: Lint documentation - -on: - push: - branches: - - main - paths: - - "docs/**" - pull_request: - branches: - - main - paths: - - "docs/**" - -jobs: - doc-lint: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Linting docs - run: tools/doc-lint.sh diff --git a/.github/workflows/matchers/ruff.json b/.github/workflows/matchers/ruff.json deleted file mode 100644 index f6d4479ee1996..0000000000000 --- a/.github/workflows/matchers/ruff.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "problemMatcher": [ - { - "owner": "ruff", - "pattern": [ - { - "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$", - "file": 1, - "line": 2, - "column": 3, - "code": 4, - "message": 5 - } - ] - } - ] - } diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml deleted file mode 100644 index 73eeacf1fa562..0000000000000 --- a/.github/workflows/mypy.yaml +++ /dev/null @@ -1,51 +0,0 @@ -name: mypy - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - '**/*.py' - - '.github/workflows/mypy.yaml' - - 'tools/mypy.sh' - - 'pyproject.toml' - pull_request: - branches: - - main - # This workflow is only relevant when one of the following files changes. - # However, we have github configured to expect and require this workflow - # to run and pass before github with auto-merge a pull request. Until github - # allows more flexible auto-merge policy, we can just run this on every PR. - # It doesn't take that long to run, anyway. 
- #paths: - # - '**/*.py' - # - '.github/workflows/mypy.yaml' - # - 'tools/mypy.sh' - # - 'pyproject.toml' - -jobs: - mypy: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install mypy==1.11.1 - pip install types-setuptools - pip install types-PyYAML - pip install types-requests - pip install types-setuptools - - name: Mypy - run: | - echo "::add-matcher::.github/workflows/matchers/mypy.json" - tools/mypy.sh 1 ${{ matrix.python-version }} diff --git a/.github/workflows/png-lint.yml b/.github/workflows/png-lint.yml deleted file mode 100644 index 4932af943a07b..0000000000000 --- a/.github/workflows/png-lint.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Lint PNG exports from excalidraw -on: - push: - branches: - - "main" - paths: - - '*.excalidraw.png' - - '.github/workflows/png-lint.yml' - pull_request: - branches: - - "main" - paths: - - '*.excalidraw.png' - - '.github/workflows/png-lint.yml' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - actionlint: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Run png-lint.sh to check excalidraw exported images" - run: | - tools/png-lint.sh diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000000000..06564969dc778 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,19 @@ +name: pre-commit + +on: + pull_request: + push: + branches: [main] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: "3.12" + - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" + - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 + with: + extra_args: --all-files --hook-stage manual diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml deleted file mode 100644 index 7266cc378cfb0..0000000000000 --- a/.github/workflows/ruff.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: ruff - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - "**/*.py" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/matchers/ruff.json - - .github/workflows/ruff.yml - pull_request: - branches: - - main - # This workflow is only relevant when one of the following files changes. - # However, we have github configured to expect and require this workflow - # to run and pass before github with auto-merge a pull request. Until github - # allows more flexible auto-merge policy, we can just run this on every PR. - # It doesn't take that long to run, anyway. 
- #paths: - # - "**/*.py" - # - pyproject.toml - # - requirements-lint.txt - # - .github/workflows/matchers/ruff.json - # - .github/workflows/ruff.yml - -jobs: - ruff: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Analysing the code with ruff - run: | - echo "::add-matcher::.github/workflows/matchers/ruff.json" - ruff check --output-format github . - - name: Run isort - run: | - isort . --check-only diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml deleted file mode 100644 index 4b1587e373e17..0000000000000 --- a/.github/workflows/shellcheck.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Lint shell scripts -on: - push: - branches: - - "main" - paths: - - '**/*.sh' - - '.github/workflows/shellcheck.yml' - pull_request: - branches: - - "main" - paths: - - '**/*.sh' - - '.github/workflows/shellcheck.yml' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - shellcheck: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Check shell scripts" - run: | - tools/shellcheck.sh diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml deleted file mode 100644 index ff441f94435ad..0000000000000 --- a/.github/workflows/yapf.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: yapf - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - "**/*.py" - - .github/workflows/yapf.yml - pull_request: - branches: - - main - paths: - - "**/*.py" - - .github/workflows/yapf.yml - -jobs: - yapf: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install yapf==0.32.0 - pip install toml==0.10.2 - - name: Running yapf - run: | - yapf --diff --recursive . 
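Note: the per-tool lint workflows removed above (actionlint, clang-format, codespell, doc-lint, mypy, png-lint, ruff, shellcheck, yapf) are consolidated into the pre-commit setup added below. A minimal local usage sketch, assuming pre-commit is installed from PyPI:

    pip install pre-commit
    pre-commit install            # run the hooks automatically on every git commit
    pre-commit run --all-files    # one-off run over the whole repository

The new CI job (.github/workflows/pre-commit.yml) invokes the same hooks with --all-files --hook-stage manual, so hooks marked stages: [manual] (the versioned mypy runs) execute only in CI, while the stages: [pre-commit] hook (local mypy) runs only on developer machines.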
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000..432bf5ed18dbc --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,92 @@ +default_stages: + - pre-commit # Run locally + - manual # Run in CI +repos: +- repo: https://github.com/google/yapf + rev: v0.32.0 + hooks: + - id: yapf + args: [--in-place, --verbose] + additional_dependencies: [toml] # TODO: Remove when yapf is upgraded +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.5 + hooks: + - id: ruff + args: [--output-format, github] +- repo: https://github.com/codespell-project/codespell + rev: v2.3.0 + hooks: + - id: codespell + exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*' +- repo: https://github.com/PyCQA/isort + rev: 5.13.2 + hooks: + - id: isort +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v18.1.5 + hooks: + - id: clang-format + exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))' + types_or: [c++, cuda] + args: [--style=file, --verbose] +- repo: https://github.com/jackdewinter/pymarkdown + rev: v0.9.27 + hooks: + - id: pymarkdown + files: docs/.* +- repo: https://github.com/rhysd/actionlint + rev: v1.7.6 + hooks: + - id: actionlint +- repo: local + hooks: + - id: mypy-local + name: Run mypy for local Python installation + entry: tools/mypy.sh 0 "local" + language: python + types: [python] + additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests] + stages: [pre-commit] # Don't run in CI + - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.9 + entry: tools/mypy.sh 1 "3.9" + language: python + types: [python] + additional_dependencies: *mypy_deps + stages: [manual] # Only run in CI + - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.10 + entry: tools/mypy.sh 1 "3.10" + language: python + types: [python] + additional_dependencies: *mypy_deps + stages: [manual] # Only run in CI + - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.11 + entry: tools/mypy.sh 1 "3.11" + language: python + types: [python] + additional_dependencies: *mypy_deps + stages: [manual] # Only run in CI + - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.12 + entry: tools/mypy.sh 1 "3.12" + language: python + types: [python] + additional_dependencies: *mypy_deps + stages: [manual] # Only run in CI + - id: shellcheck + name: Lint shell scripts + entry: tools/shellcheck.sh + language: script + types: [shell] + - id: png-lint + name: Lint PNG exports from excalidraw + entry: tools/png-lint.sh + language: script + types: [png] + - id: suggestion + name: Suggestion + entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."' + language: system + verbose: true diff --git a/CMakeLists.txt b/CMakeLists.txt index f4b9c3ec9c14f..0945905104f32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,6 +181,31 @@ message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") # Define other extension targets # +# +# cumem_allocator extension +# + +set(VLLM_CUMEM_EXT_SRC + "csrc/cumem_allocator.cpp") + +set_gencode_flags_for_srcs( + SRCS "${VLLM_CUMEM_EXT_SRC}" + CUDA_ARCHS 
"${CUDA_ARCHS}") + +if(VLLM_GPU_LANG STREQUAL "CUDA") + message(STATUS "Enabling cumem allocator extension.") + # link against cuda driver library + list(APPEND CUMEM_LIBS cuda) + define_gpu_extension_target( + cumem_allocator + DESTINATION vllm + LANGUAGE CXX + SOURCES ${VLLM_CUMEM_EXT_SRC} + LIBRARIES ${CUMEM_LIBS} + USE_SABI 3.8 + WITH_SOABI) +endif() + # # _C extension # diff --git a/Dockerfile b/Dockerfile index 4542bc9cf0bd2..261f5440aee47 100644 --- a/Dockerfile +++ b/Dockerfile @@ -52,7 +52,7 @@ WORKDIR /workspace # after this step RUN --mount=type=cache,target=/root/.cache/pip \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ + python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \ fi COPY requirements-common.txt requirements-common.txt diff --git a/Dockerfile.cpu b/Dockerfile.cpu index f163edc27cba8..ebe226cf6d148 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -26,10 +26,10 @@ RUN pip install intel_extension_for_pytorch==2.5.0 WORKDIR /workspace -COPY requirements-build.txt requirements-build.txt ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ pip install --upgrade pip && \ pip install -r requirements-build.txt @@ -37,9 +37,9 @@ FROM cpu-test-1 AS build WORKDIR /workspace/vllm -COPY requirements-common.txt requirements-common.txt -COPY requirements-cpu.txt requirements-cpu.txt RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ + --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ pip install -v -r requirements-cpu.txt COPY . . diff --git a/Dockerfile.rocm b/Dockerfile.rocm index e733994f8c33e..14c522afd7f9e 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,174 +1,119 @@ -# Default ROCm 6.2 base image -ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0" +# default base image +ARG REMOTE_VLLM="0" +ARG USE_CYTHON="0" +ARG BUILD_RPD="1" +ARG COMMON_WORKDIR=/app +ARG BASE_IMAGE=rocm/vllm-dev:base -# Default ROCm ARCHes to build vLLM for. 
-ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100" +FROM ${BASE_IMAGE} AS base -# Whether to install CK-based flash-attention -# If 0, will not install flash-attention -ARG BUILD_FA="1" -ARG FA_GFX_ARCHS="gfx90a;gfx942" -ARG FA_BRANCH="3cea2fb" - -# Whether to build triton on rocm -ARG BUILD_TRITON="1" -ARG TRITON_BRANCH="e192dba" - -### Base image build stage -FROM $BASE_IMAGE AS base - -# Import arg(s) defined before this build stage -ARG PYTORCH_ROCM_ARCH +ARG ARG_PYTORCH_ROCM_ARCH +ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}} # Install some basic utilities -RUN apt-get update && apt-get install python3 python3-pip -y -RUN apt-get update && apt-get install -y \ - curl \ - ca-certificates \ - sudo \ - git \ - bzip2 \ - libx11-6 \ - build-essential \ - wget \ - unzip \ - tmux \ - ccache \ - && rm -rf /var/lib/apt/lists/* - -# When launching the container, mount the code directory to /vllm-workspace -ARG APP_MOUNT=/vllm-workspace -WORKDIR ${APP_MOUNT} - -RUN python3 -m pip install --upgrade pip -# Remove sccache so it doesn't interfere with ccache -# TODO: implement sccache support across components +RUN apt-get update -q -y && apt-get install -q -y \ + sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev +# Remove sccache +RUN python3 -m pip install --upgrade pip && pip install setuptools_scm RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" +ARG COMMON_WORKDIR +WORKDIR ${COMMON_WORKDIR} + + +# ----------------------- +# vLLM fetch stages +FROM base AS fetch_vllm_0 +ONBUILD COPY ./ vllm/ +FROM base AS fetch_vllm_1 +ARG VLLM_REPO="https://github.com/vllm-project/vllm.git" +ARG VLLM_BRANCH="main" +ONBUILD RUN git clone ${VLLM_REPO} \ + && cd vllm \ + && git checkout ${VLLM_BRANCH} +FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm + +# ----------------------- +# vLLM build stages +FROM fetch_vllm AS build_vllm +ARG USE_CYTHON +# Build vLLM +RUN cd vllm \ + && python3 -m pip install -r requirements-rocm.txt \ + && python3 setup.py clean --all \ + && if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \ + && python3 setup.py bdist_wheel --dist-dir=dist +FROM scratch AS export_vllm +ARG COMMON_WORKDIR +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl / +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt / +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite + +# ----------------------- +# Test vLLM image +FROM base AS test + +RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* + +# Install vLLM +RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ + cd /install \ + && pip install -U -r requirements-rocm.txt \ + && pip uninstall -y vllm \ + && pip install *.whl + +WORKDIR /vllm-workspace +ARG COMMON_WORKDIR +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace + +# install development dependencies (for testing) +RUN cd /vllm-workspace \ + && rm -rf vllm \ + && python3 -m pip install -e tests/vllm_test_utils \ + && python3 -m pip install lm-eval[api]==0.4.4 \ + && python3 -m pip install pytest-shard + +# ----------------------- +# Final vLLM image +FROM base AS final -# Install torch == 2.6.0 on ROCm -RUN --mount=type=cache,target=/root/.cache/pip \ - case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ - *"rocm-6.2"*) 
\ - python3 -m pip uninstall -y torch torchvision \ - && python3 -m pip install --pre \ - torch==2.6.0.dev20241113+rocm6.2 \ - 'setuptools-scm>=8' \ - torchvision==0.20.0.dev20241113+rocm6.2 \ - --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \ +RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* +# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. +# Manually remove it so that later steps of numpy upgrade can continue +RUN case "$(which python3)" in \ + *"/opt/conda/envs/py_3.9"*) \ + rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ *) ;; esac -ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer -ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin: -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: -ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: - -ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} -ENV CCACHE_DIR=/root/.cache/ccache - - -### AMD-SMI build stage -FROM base AS build_amdsmi -# Build amdsmi wheel always -RUN cd /opt/rocm/share/amd_smi \ - && python3 -m pip wheel . --wheel-dir=/install - - -### Flash-Attention wheel build stage -FROM base AS build_fa -ARG BUILD_FA -ARG FA_GFX_ARCHS -ARG FA_BRANCH -# Build ROCm flash-attention wheel if `BUILD_FA = 1` -RUN --mount=type=cache,target=${CCACHE_DIR} \ - if [ "$BUILD_FA" = "1" ]; then \ - mkdir -p libs \ - && cd libs \ - && git clone https://github.com/ROCm/flash-attention.git \ - && cd flash-attention \ - && git checkout "${FA_BRANCH}" \ - && git submodule update --init \ - && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \ - # Create an empty directory otherwise as later build stages expect one - else mkdir -p /install; \ - fi - - -### Triton wheel build stage -FROM base AS build_triton -ARG BUILD_TRITON -ARG TRITON_BRANCH -# Build triton wheel if `BUILD_TRITON = 1` -RUN --mount=type=cache,target=${CCACHE_DIR} \ - if [ "$BUILD_TRITON" = "1" ]; then \ - mkdir -p libs \ - && cd libs \ - && python3 -m pip install ninja cmake wheel pybind11 \ - && git clone https://github.com/OpenAI/triton.git \ - && cd triton \ - && git checkout "${TRITON_BRANCH}" \ - && cd python \ - && python3 setup.py bdist_wheel --dist-dir=/install; \ - # Create an empty directory otherwise as later build stages expect one - else mkdir -p /install; \ - fi - - -### Final vLLM build stage -FROM base AS final -# Import the vLLM development directory from the build context -COPY . . 
-ARG GIT_REPO_CHECK=0 -RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi +RUN python3 -m pip install --upgrade huggingface-hub[cli] +ARG BUILD_RPD +RUN if [ ${BUILD_RPD} -eq "1" ]; then \ + git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git \ + && cd rocmProfileData/rpd_tracer \ + && pip install -r requirements.txt && cd ../ \ + && make && make install \ + && cd hipMarker && python3 setup.py install ; fi -RUN python3 -m pip install --upgrade pip +# Install vLLM +RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ + cd /install \ + && pip install -U -r requirements-rocm.txt \ + && pip uninstall -y vllm \ + && pip install *.whl -# Package upgrades for useful functionality or to avoid dependency issues -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard +ARG COMMON_WORKDIR +# Copy over the benchmark scripts as well +COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks +COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples -# Workaround for ray >= 2.10.0 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 -# Silences the HF Tokenizers warning ENV TOKENIZERS_PARALLELISM=false -RUN --mount=type=cache,target=${CCACHE_DIR} \ - --mount=type=bind,source=.git,target=.git \ - --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install -Ur requirements-rocm.txt \ - && python3 setup.py clean --all \ - && python3 setup.py develop - -# Copy amdsmi wheel into final image -RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \ - mkdir -p libs \ - && cp /install/*.whl libs \ - # Preemptively uninstall to avoid same-version no-installs - && python3 -m pip uninstall -y amdsmi; - -# Copy triton wheel(s) into final image if they were built -RUN --mount=type=bind,from=build_triton,src=/install,target=/install \ - mkdir -p libs \ - && if ls /install/*.whl; then \ - cp /install/*.whl libs \ - # Preemptively uninstall to avoid same-version no-installs - && python3 -m pip uninstall -y triton; fi - -# Copy flash-attn wheel(s) into final image if they were built -RUN --mount=type=bind,from=build_fa,src=/install,target=/install \ - mkdir -p libs \ - && if ls /install/*.whl; then \ - cp /install/*.whl libs \ - # Preemptively uninstall to avoid same-version no-installs - && python3 -m pip uninstall -y flash-attn; fi - -# Install wheels that were built to the final image -RUN --mount=type=cache,target=/root/.cache/pip \ - if ls libs/*.whl; then \ - python3 -m pip install libs/*.whl; fi - -# install development dependencies (for testing) -RUN python3 -m pip install -e tests/vllm_test_utils +# Performance environment variable. 
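+# (HIP_FORCE_DEV_KERNARG=1 asks HIP to pass kernel arguments through device
+# memory, which can reduce kernel launch overhead on recent AMD GPUs.)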
+ENV HIP_FORCE_DEV_KERNARG=1 CMD ["/bin/bash"] + diff --git a/Dockerfile.rocm_base b/Dockerfile.rocm_base new file mode 100644 index 0000000000000..5bbe98b0c2204 --- /dev/null +++ b/Dockerfile.rocm_base @@ -0,0 +1,158 @@ +ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete +ARG HIPBLASLT_BRANCH="4d40e36" +ARG HIPBLAS_COMMON_BRANCH="7c1566b" +ARG LEGACY_HIPBLASLT_OPTION= +ARG RCCL_BRANCH="648a58d" +ARG RCCL_REPO="https://github.com/ROCm/rccl" +ARG TRITON_BRANCH="e5be006" +ARG TRITON_REPO="https://github.com/triton-lang/triton.git" +ARG PYTORCH_BRANCH="8d4926e" +ARG PYTORCH_VISION_BRANCH="v0.19.1" +ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" +ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" +ARG FA_BRANCH="b7d29fb" +ARG FA_REPO="https://github.com/ROCm/flash-attention.git" + +FROM ${BASE_IMAGE} AS base + +ENV PATH=/opt/rocm/llvm/bin:$PATH +ENV ROCM_PATH=/opt/rocm +ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib: +ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942 +ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} + +ARG PYTHON_VERSION=3.12 + +RUN mkdir -p /app +WORKDIR /app +ENV DEBIAN_FRONTEND=noninteractive + +# Install Python and other dependencies +RUN apt-get update -y \ + && apt-get install -y software-properties-common git curl sudo vim less \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + python${PYTHON_VERSION}-lib2to3 python-is-python3 \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version + +RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython + +FROM base AS build_hipblaslt +ARG HIPBLASLT_BRANCH +ARG HIPBLAS_COMMON_BRANCH +# Set to "--legacy_hipblas_direct" for ROCm<=6.2 +ARG LEGACY_HIPBLASLT_OPTION +RUN git clone https://github.com/ROCm/hipBLAS-common.git +RUN cd hipBLAS-common \ + && git checkout ${HIPBLAS_COMMON_BRANCH} \ + && mkdir build \ + && cd build \ + && cmake .. \ + && make package \ + && dpkg -i ./*.deb +RUN git clone https://github.com/ROCm/hipBLASLt +RUN cd hipBLASLt \ + && git checkout ${HIPBLASLT_BRANCH} \ + && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \ + && cd build/release \ + && make package +RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install + +FROM base AS build_rccl +ARG RCCL_BRANCH +ARG RCCL_REPO +RUN git clone ${RCCL_REPO} +RUN cd rccl \ + && git checkout ${RCCL_BRANCH} \ + && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH} +RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install + +FROM base AS build_triton +ARG TRITON_BRANCH +ARG TRITON_REPO +RUN git clone ${TRITON_REPO} +RUN cd triton \ + && git checkout ${TRITON_BRANCH} \ + && cd python \ + && python3 setup.py bdist_wheel --dist-dir=dist +RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install + +FROM base AS build_amdsmi +RUN cd /opt/rocm/share/amd_smi \ + && pip wheel . 
--wheel-dir=dist +RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install + +FROM base AS build_pytorch +ARG PYTORCH_BRANCH +ARG PYTORCH_VISION_BRANCH +ARG PYTORCH_REPO +ARG PYTORCH_VISION_REPO +ARG FA_BRANCH +ARG FA_REPO +RUN git clone ${PYTORCH_REPO} pytorch +RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \ + pip install -r requirements.txt && git submodule update --init --recursive \ + && python3 tools/amd_build/build_amd.py \ + && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \ + && pip install dist/*.whl +RUN git clone ${PYTORCH_VISION_REPO} vision +RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \ + && python3 setup.py bdist_wheel --dist-dir=dist \ + && pip install dist/*.whl +RUN git clone ${FA_REPO} +RUN cd flash-attention \ + && git checkout ${FA_BRANCH} \ + && git submodule update --init \ + && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist +RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ + && cp /app/vision/dist/*.whl /app/install \ + && cp /app/flash-attention/dist/*.whl /app/install + +FROM base AS final +RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ + dpkg -i /install/*deb \ + && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \ + && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status +RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \ + dpkg -i /install/*deb \ + && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \ + && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status +RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ + pip install /install/*.whl +RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ + pip install /install/*.whl +RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ + pip install /install/*.whl + +ARG BASE_IMAGE +ARG HIPBLASLT_BRANCH +ARG LEGACY_HIPBLASLT_OPTION +ARG RCCL_BRANCH +ARG RCCL_REPO +ARG TRITON_BRANCH +ARG TRITON_REPO +ARG PYTORCH_BRANCH +ARG PYTORCH_VISION_BRANCH +ARG PYTORCH_REPO +ARG PYTORCH_VISION_REPO +ARG FA_BRANCH +ARG FA_REPO +RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ + && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \ + && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \ + && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \ + && echo "RCCL_BRANCH: ${RCCL_BRANCH}" >> /app/versions.txt \ + && echo "RCCL_REPO: ${RCCL_REPO}" >> /app/versions.txt \ + && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \ + && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \ + && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \ + && echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \ + && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ + && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ + && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ + && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt diff --git a/SECURITY.md b/SECURITY.md index de0032d26c87b..47196a1f1221e 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -4,7 +4,7 @@ If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. 
We will investigate all legitimate reports and do our best to quickly fix the problem. -Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/contributing/vulnerability_management/). +Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). --- diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 9d71e4ecc4a37..f415d109bdfc8 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -22,6 +22,7 @@ class RequestFuncInput: prompt_len: int output_len: int model: str + model_name: Optional[str] = None best_of: int = 1 logprobs: Optional[int] = None extra_body: Optional[dict] = None @@ -34,6 +35,7 @@ class RequestFuncOutput: generated_text: str = "" success: bool = False latency: float = 0.0 + output_tokens: int = 0 ttft: float = 0.0 # Time to first token itl: List[float] = field( default_factory=list) # List of inter-token latencies @@ -78,7 +80,7 @@ async def async_request_tgi( continue chunk_bytes = chunk_bytes.decode("utf-8") - #NOTE: Sometimes TGI returns a ping response without + # NOTE: Sometimes TGI returns a ping response without # any data, we should skip it. if chunk_bytes.startswith(":"): continue @@ -155,7 +157,7 @@ async def async_request_trt_llm( timestamp = time.perf_counter() # First token if ttft == 0.0: - ttft = time.perf_counter() - st + ttft = timestamp - st output.ttft = ttft # Decoding phase @@ -235,7 +237,8 @@ async def async_request_openai_completions( async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: payload = { - "model": request_func_input.model, + "model": request_func_input.model_name \ + if request_func_input.model_name else request_func_input.model, "prompt": request_func_input.prompt, "temperature": 0.0, "best_of": request_func_input.best_of, @@ -243,6 +246,9 @@ async def async_request_openai_completions( "logprobs": request_func_input.logprobs, "stream": True, "ignore_eos": request_func_input.ignore_eos, + "stream_options": { + "include_usage": True, + }, } if request_func_input.extra_body: payload.update(request_func_input.extra_body) @@ -254,7 +260,6 @@ async def async_request_openai_completions( output.prompt_len = request_func_input.prompt_len generated_text = "" - ttft = 0.0 st = time.perf_counter() most_recent_timestamp = st try: @@ -269,15 +274,16 @@ async def async_request_openai_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk == "[DONE]": - latency = time.perf_counter() - st - else: + if chunk != "[DONE]": data = json.loads(chunk) # NOTE: Some completion API might have a last # usage summary response without a token so we # want to check a token was generated - if data["choices"][0]["text"]: + if choices := data.get("choices"): + # Note that text could be empty here + # e.g. 
for special tokens + text = choices[0].get("text") timestamp = time.perf_counter() # First token if not first_chunk_received: @@ -291,7 +297,10 @@ async def async_request_openai_completions( most_recent_timestamp) most_recent_timestamp = timestamp - generated_text += data["choices"][0]["text"] + generated_text += text + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") if first_chunk_received: output.success = True else: @@ -300,7 +309,7 @@ async def async_request_openai_completions( "Never received a valid chunk to calculate TTFT." "This response will be marked as failed!") output.generated_text = generated_text - output.latency = latency + output.latency = most_recent_timestamp - st else: output.error = response.reason or "" output.success = False @@ -328,7 +337,8 @@ async def async_request_openai_chat_completions( if request_func_input.multi_modal_content: content.append(request_func_input.multi_modal_content) payload = { - "model": request_func_input.model, + "model": request_func_input.model_name \ + if request_func_input.model_name else request_func_input.model, "messages": [ { "role": "user", @@ -339,6 +349,9 @@ async def async_request_openai_chat_completions( "max_completion_tokens": request_func_input.output_len, "stream": True, "ignore_eos": request_func_input.ignore_eos, + "stream_options": { + "include_usage": True, + }, } if request_func_input.extra_body: payload.update(request_func_input.extra_body) @@ -365,17 +378,15 @@ async def async_request_openai_chat_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk == "[DONE]": - latency = time.perf_counter() - st - else: + if chunk != "[DONE]": timestamp = time.perf_counter() data = json.loads(chunk) - delta = data["choices"][0]["delta"] - if delta.get("content", None): + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") # First token if ttft == 0.0: - ttft = time.perf_counter() - st + ttft = timestamp - st output.ttft = ttft # Decoding phase @@ -383,13 +394,16 @@ async def async_request_openai_chat_completions( output.itl.append(timestamp - most_recent_timestamp) - generated_text += delta["content"] + generated_text += content + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") most_recent_timestamp = timestamp output.generated_text = generated_text output.success = True - output.latency = latency + output.latency = most_recent_timestamp - st else: output.error = response.reason or "" output.success = False diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 4eb0e1f8ac903..bc026b0ec1ca6 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -25,6 +25,7 @@ import argparse import asyncio import base64 +import gc import io import json import os @@ -423,7 +424,7 @@ def calculate_metrics( tokenizer: PreTrainedTokenizerBase, selected_percentile_metrics: List[str], selected_percentiles: List[float], - gootput_config_dict: Dict[str, float], + goodput_config_dict: Dict[str, float], ) -> Tuple[BenchmarkMetrics, List[int]]: actual_output_lens: List[int] = [] total_input = 0 @@ -436,19 +437,23 @@ def calculate_metrics( e2els: List[float] = [] for i in range(len(outputs)): if outputs[i].success: - # We use the tokenizer to count the number of output tokens for all - # serving backends instead of looking at len(outputs[i].itl) since - # multiple output tokens may be bundled together - # Note : this may inflate the output token count 
slightly - output_len = len( - tokenizer(outputs[i].generated_text, - add_special_tokens=False).input_ids) + output_len = outputs[i].output_tokens + + if output_len is None: + # We use the tokenizer to count the number of output tokens + # for some serving backends instead of looking at + # len(outputs[i].itl) since multiple output tokens may be + # bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer(outputs[i].generated_text, + add_special_tokens=False).input_ids) actual_output_lens.append(output_len) total_input += input_requests[i][1] tpot = 0 if output_len > 1: - tpot = (outputs[i].latency - outputs[i].ttft) / (output_len - - 1) + latency_minus_ttft = outputs[i].latency - outputs[i].ttft + tpot = latency_minus_ttft / (output_len - 1) tpots.append(tpot) # Note: if output_len <= 1, we regard tpot as 0 for goodput all_tpots.append(tpot) @@ -459,21 +464,21 @@ def calculate_metrics( else: actual_output_lens.append(0) - if gootput_config_dict: + if goodput_config_dict: valid_metrics = [] slo_values = [] - if "ttft" in gootput_config_dict: + if "ttft" in goodput_config_dict: valid_metrics.append(ttfts) - slo_values.append(gootput_config_dict["ttft"] / + slo_values.append(goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION) - if "tpot" in gootput_config_dict: + if "tpot" in goodput_config_dict: valid_metrics.append(all_tpots) - slo_values.append(gootput_config_dict["tpot"] / + slo_values.append(goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION) - if "e2el" in gootput_config_dict: + if "e2el" in goodput_config_dict: valid_metrics.append(e2els) - slo_values.append(gootput_config_dict["e2el"] / + slo_values.append(goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION) for req_metric in zip(*valid_metrics): @@ -525,6 +530,7 @@ async def benchmark( api_url: str, base_url: str, model_id: str, + model_name: str, tokenizer: PreTrainedTokenizerBase, input_requests: List[Tuple[str, int, int]], logprobs: Optional[int], @@ -536,7 +542,7 @@ async def benchmark( selected_percentile_metrics: List[str], selected_percentiles: List[str], ignore_eos: bool, - gootput_config_dict: Dict[str, float], + goodput_config_dict: Dict[str, float], max_concurrency: Optional[int], ): if backend in ASYNC_REQUEST_FUNCS: @@ -553,6 +559,7 @@ async def benchmark( "Multi-modal content is only supported on 'openai-chat' backend.") test_input = RequestFuncInput( model=model_id, + model_name=model_name, prompt=test_prompt, api_url=api_url, prompt_len=test_prompt_len, @@ -573,6 +580,7 @@ async def benchmark( if profile: print("Starting profiler...") profile_input = RequestFuncInput(model=model_id, + model_name=model_name, prompt=test_prompt, api_url=base_url + "/start_profile", prompt_len=test_prompt_len, @@ -616,6 +624,7 @@ async def limited_request_func(request_func_input, pbar): async for request in get_request(input_requests, request_rate, burstiness): prompt, prompt_len, output_len, mm_content = request request_func_input = RequestFuncInput(model=model_id, + model_name=model_name, prompt=prompt, api_url=api_url, prompt_len=prompt_len, @@ -657,7 +666,7 @@ async def limited_request_func(request_func_input, pbar): tokenizer=tokenizer, selected_percentile_metrics=selected_percentile_metrics, selected_percentiles=selected_percentiles, - gootput_config_dict=gootput_config_dict, + goodput_config_dict=goodput_config_dict, ) print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) @@ -669,7 +678,7 @@ async def 
limited_request_func(request_func_input, pbar): metrics.total_output)) print("{:<40} {:<10.2f}".format("Request throughput (req/s):", metrics.request_throughput)) - if gootput_config_dict: + if goodput_config_dict: print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput)) print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", @@ -684,7 +693,7 @@ async def limited_request_func(request_func_input, pbar): "total_output_tokens": metrics.total_output, "request_throughput": metrics.request_throughput, "request_goodput:": - metrics.request_goodput if gootput_config_dict else None, + metrics.request_goodput if goodput_config_dict else None, "output_throughput": metrics.output_throughput, "total_token_throughput": metrics.total_token_throughput, "input_lens": [output.prompt_len for output in outputs], @@ -740,11 +749,11 @@ def process_one_metric( def check_goodput_args(args): # Check and parse goodput arguments - gootput_config_dict = {} + goodput_config_dict = {} VALID_NAMES = ["ttft", "tpot", "e2el"] if args.goodput: - gootput_config_dict = parse_goodput(args.goodput) - for slo_name, slo_val in gootput_config_dict.items(): + goodput_config_dict = parse_goodput(args.goodput) + for slo_name, slo_val in goodput_config_dict.items(): if slo_name not in VALID_NAMES: raise ValueError( f"Invalid metric name found, {slo_name}: {slo_val}. " @@ -755,22 +764,22 @@ def check_goodput_args(args): f"Invalid value found, {slo_name}: {slo_val}. " "The service level objective value should be " "non-negative.") - return gootput_config_dict + return goodput_config_dict def parse_goodput(slo_pairs): - gootput_config_dict = {} + goodput_config_dict = {} try: for slo_pair in slo_pairs: slo_name, slo_val = slo_pair.split(":") - gootput_config_dict[slo_name] = float(slo_val) + goodput_config_dict[slo_name] = float(slo_val) except ValueError as err: raise argparse.ArgumentTypeError( "Invalid format found for service level objectives. " "Specify service level objectives for goodput as \"KEY:VALUE\" " "pairs, where the key is a metric name, and the value is a " "number in milliseconds.") from err - return gootput_config_dict + return goodput_config_dict def main(args: argparse.Namespace): @@ -780,6 +789,7 @@ def main(args: argparse.Namespace): backend = args.backend model_id = args.model + model_name = args.served_model_name tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model tokenizer_mode = args.tokenizer_mode @@ -869,7 +879,11 @@ def main(args: argparse.Namespace): else: raise ValueError(f"Unknown dataset: {args.dataset_name}") - gootput_config_dict = check_goodput_args(args) + goodput_config_dict = check_goodput_args(args) + + # Avoid GC processing "static" data - reduce pause times. + gc.collect() + gc.freeze() benchmark_result = asyncio.run( benchmark( @@ -877,6 +891,7 @@ def main(args: argparse.Namespace): api_url=api_url, base_url=base_url, model_id=model_id, + model_name=model_name, tokenizer=tokenizer, input_requests=input_requests, logprobs=args.logprobs, @@ -890,7 +905,7 @@ def main(args: argparse.Namespace): float(p) for p in args.metric_percentiles.split(",") ], ignore_eos=args.ignore_eos, - gootput_config_dict=gootput_config_dict, + goodput_config_dict=goodput_config_dict, max_concurrency=args.max_concurrency, )) @@ -1222,5 +1237,12 @@ def main(args: argparse.Namespace): 'always use the slow tokenizer. 
\n* ' '"mistral" will always use the `mistral_common` tokenizer.') + parser.add_argument("--served-model-name", + type=str, + default=None, + help="The model name used in the API. " + "If not specified, the model name will be the " + "same as the ``--model`` argument. ") + args = parser.parse_args() main(args) diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py new file mode 100644 index 0000000000000..e1f613e1da509 --- /dev/null +++ b/benchmarks/kernels/benchmark_lora.py @@ -0,0 +1,1147 @@ +import argparse +import copy +import json +import pickle +import time +from dataclasses import dataclass +from enum import Enum, auto +from itertools import product +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Tuple + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from utils import ArgPool, Bench, CudaGraphBenchParams +from weight_shapes import WEIGHT_SHAPES + +from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand +from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice +from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink +from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink +from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) +DEFAULT_TP_SIZES = [1] +DEFAULT_BATCH_SIZES = [ + 1, 16, 32, 64, 128, 192, 256, 320, 384, 448, 512, 640, 768, 896, 1024, + 2048, 3072, 4096, 5120, 6144, 7168, 8192 +] +DEFAULT_HIDDEN_SIZES = [1024, 2048, 4096, 8192, 16384] +DEFAULT_LORA_RANKS = [16] +DEFAULT_NUM_LORAS = [1, 2, 3, 4] +DEFAULT_SORT_BY_LORA_IDS = [False, True] +DEFAULT_SEQ_LENGTHS = [1] +DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False] + + +# Utilities +def dtype_to_str(dtype: torch.dtype): + if dtype == torch.float16: + return "f16" + if dtype == torch.bfloat16: + return "bf16" + if dtype == torch.float32: + return "f32" + raise ValueError(f"Unsupported dtype {dtype}") + + +def make_rand_lora_weight_tensor(k: int, + n: int, + num_loras: int, + dtype: torch.dtype, + device: str = "cuda") -> torch.Tensor: + + # LoRA weights column major + return torch.rand((num_loras, n, k), dtype=dtype).to(device) + + +def make_rand_tensors( + a_shape: Tuple[int], + b_shape: Tuple[int], + c_shape: Tuple[int], + a_dtype: torch.dtype, + b_dtype: torch.dtype, + c_dtype: torch.dtype, + num_slices: int, + device: str = "cuda", +) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: + """ + Make LoRA input/output matrices. + """ + A = torch.rand(a_shape, dtype=a_dtype).to(device) + + # LoRA weights column major + Bs = [ + torch.rand(b_shape, dtype=b_dtype).to(device) + for _ in range(num_slices) + ] + + C = torch.zeros(c_shape, dtype=c_dtype).to(device) + return A, Bs, C + + +def make_prompt_lora_mapping(num_prompts: int, num_active_loras: int, + sort_by_lora_id: bool, + device: str) -> torch.Tensor: + """ + All prompts are mapped to a Lora ID in range [0, num_active_loras). + where 0 refers to first lora, 1 refers to second lora and so on. + """ + assert num_active_loras > 0 + + if not sort_by_lora_id: + return torch.randint(0, + num_active_loras, (num_prompts, ), + dtype=torch.long) + + # Divide LoRAs equally and in order. 
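+ # For example, with num_prompts=8 and num_active_loras=3, part_size is 2 and
+ # the loop below produces [0, 0, 1, 1, 2, 2, 2, 2]: the last active LoRA
+ # absorbs the remainder because lora_id stops incrementing at
+ # num_active_loras - 1.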
+ part_size = num_prompts // num_active_loras + part_size = max(part_size, 1) + + lora_id = 0 + prompt_lora_mapping = [] + while len(prompt_lora_mapping) < num_prompts: + prompt_lora_mapping.extend([lora_id] * part_size) + lora_id = lora_id + 1 if lora_id + 1 < num_active_loras else lora_id + return torch.tensor(prompt_lora_mapping[:num_prompts], + dtype=torch.long, + device=device) + + +def make_token_lora_mapping(num_tokens: int, num_prompts: int, + prompt_lora_mapping: torch.Tensor, + seq_len_tensor: torch.Tensor, device: str): + """ + Make token_lora_mapping from prompt_lora_mapping and seq_lens_tensor + """ + assert prompt_lora_mapping.shape[0] == num_prompts + + # token to lora index mapping + token_lora_mapping = [0] * num_tokens + current_offset = 0 + for b_id in range(num_prompts): + lora_index = prompt_lora_mapping[b_id].item() + s = current_offset + e = s + seq_len_tensor[b_id].item() + token_lora_mapping[s:e] = [lora_index] * (e - s) + current_offset += seq_len_tensor[b_id].item() + + return torch.tensor(token_lora_mapping, dtype=torch.long, device=device) + + +def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor, + lora_weights: List[torch.Tensor], + seq_lens_cpu: torch.Tensor, + prompt_lora_mapping_cpu: torch.Tensor, scaling: float, + add_inputs: Optional[bool]): + """ + Torch group gemm reference implementation to test correctness of + benchmarking operations. + """ + batches = seq_lens_cpu.size(0) + out_list = [] + current_offset = 0 + for lora_index, b_length in zip(range(batches), seq_lens_cpu): + x = input[current_offset:b_length + current_offset, :] + current_offset += b_length + w = lora_weights[prompt_lora_mapping_cpu[lora_index]] + result = torch.nn.functional.linear(x, w) + result *= scaling + out_list.append(result) + torch.cat(out_list, dim=0) + + cat_result = torch.cat(out_list, dim=0) + + if add_inputs: + ref_out += cat_result + else: + ref_out.copy_(cat_result) + + +class OpType(Enum): + """ + LoRA Ops to benchmark and its properties. 
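+ Shrink ops multiply a hidden_size-wide input by the LoRA A weights to
+ produce lora_rank outputs; expand ops multiply a lora_rank-wide input by
+ the LoRA B weights to produce hidden_size outputs (see mkn() and
+ matmul_shapes() below). The multi-slice expand cases (SGMV_EXPAND with
+ num_slices > 1 and BGMV_EXPAND_SLICE) write each slice into its own
+ hidden_size-wide chunk of a shared output tensor.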
+ """ + SGMV_SHRINK = auto() + BGMV_SHRINK = auto() + SGMV_EXPAND = auto() + BGMV_EXPAND = auto() + BGMV_EXPAND_SLICE = auto() + + @staticmethod + def from_str(s: str) -> "OpType": + if s.lower() == 'sgmv_shrink': + return OpType.SGMV_SHRINK + if s.lower() == 'sgmv_expand': + return OpType.SGMV_EXPAND + if s.lower() == 'bgmv_shrink': + return OpType.BGMV_SHRINK + if s.lower() == 'bgmv_expand': + return OpType.BGMV_EXPAND + if s.lower() == "bgmv_expand_slice": + return OpType.BGMV_EXPAND_SLICE + raise ValueError(f"Unrecognized str {s} to convert to OpType") + + def is_shrink_fn(self) -> bool: + return self in [OpType.SGMV_SHRINK, OpType.BGMV_SHRINK] + + def is_expand_fn(self) -> bool: + return self in [OpType.SGMV_EXPAND, OpType.BGMV_EXPAND] + + def is_prefill_op(self) -> bool: + return self in [OpType.SGMV_SHRINK, OpType.SGMV_EXPAND] + + def is_decode_op(self) -> bool: + return self in [ + OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE + ] + + def is_expand_slice_fn(self) -> bool: + return self in [OpType.BGMV_EXPAND_SLICE] + + def num_slices(self) -> List[int]: + if self in [OpType.SGMV_EXPAND, OpType.SGMV_SHRINK]: + # SGMV kernels supports slices + return [1, 2, 3] + if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]: + return [1] + if self in [OpType.BGMV_EXPAND_SLICE]: + return [2, 3] + raise ValueError(f"Unrecognized OpType {self}") + + def mkn(self, batch_size: int, seq_length: int, hidden_size: int, + lora_rank: int) -> Tuple[int, int, int]: + num_tokens = batch_size * seq_length + if self.is_shrink_fn(): + m = num_tokens + k = hidden_size + n = lora_rank + else: + assert self.is_expand_fn() or self.is_expand_slice_fn() + m = num_tokens + k = lora_rank + n = hidden_size + return m, k, n + + def matmul_dtypes( + self, op_dtype: torch.dtype + ) -> Tuple[torch.dtype, torch.dtype, torch.dtype]: + """ + return a type, b type and c type for A x B = C + """ + if self.is_shrink_fn(): + return op_dtype, op_dtype, torch.float32 + else: + assert self.is_expand_fn() or self.is_expand_slice_fn() + return torch.float32, op_dtype, op_dtype + + def matmul_shapes( + self, batch_size: int, seq_length: int, hidden_size: int, + lora_rank: int, num_loras: int, + num_slices: int) -> Tuple[Tuple[int], Tuple[int], Tuple[int]]: + """ + Given num_slices, return the shapes of the A, B, and C matrices + in A x B = C, for the op_type + """ + m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank) + + b_shape = (num_loras, n, k) # col-major + if self == OpType.SGMV_SHRINK: + # SGMV shrink supports num_slices inherently in the kernel + return ((m, k), b_shape, (num_slices, m, n)) + if self == OpType.SGMV_EXPAND: + # SGMV expand supports num_slices inherently in the kernel + return ((num_slices, m, k), b_shape, (m, n * num_slices)) + if self == OpType.BGMV_SHRINK: + return ((m, k), b_shape, (m, n)) + if self == OpType.BGMV_EXPAND: + return ((m, k), b_shape, (m, n)) + if self == OpType.BGMV_EXPAND_SLICE: + return ((num_slices, m, k), b_shape, (m, n * num_slices)) + + raise ValueError(f"Unrecognized op_type {self}") + + def bench_fn(self) -> Callable: + + def emulate_bgmv_expand_slice(kwargs_list: List[Dict[str, Any]]): + for x in kwargs_list: + bgmv_expand_slice(**x) + + if self == OpType.SGMV_SHRINK: + return sgmv_shrink + if self == OpType.SGMV_EXPAND: + return sgmv_expand + if self == OpType.BGMV_SHRINK: + return bgmv_shrink + if self == OpType.BGMV_EXPAND: + return bgmv_expand + if self == OpType.BGMV_EXPAND_SLICE: + return emulate_bgmv_expand_slice + raise 
ValueError(f"Unrecognized optype {self}") + + def run_ref_group_gemm(self, output: torch.Tensor, input: torch.Tensor, + lora_weights: List[torch.Tensor], + **kwargs) -> Callable: + """Each benchmark operation expected the input, lora_weights and outputs + in a slightly different format. Refer to self.matmul_shapes(). + run_ref_group_gemm accounts for those differences in executing a + reference group gemm for correctness testing. + """ + w_dtype = lora_weights[0].dtype + num_slices = len(lora_weights) + if self == OpType.SGMV_SHRINK: + for slice_idx in range(num_slices): + ref_group_gemm(ref_out=output[slice_idx, :], + input=input, + lora_weights=lora_weights[slice_idx], + **kwargs) + if self == OpType.SGMV_EXPAND: + hidden_size = lora_weights[0].shape[1] + for slice_idx in range(num_slices): + slice_offset = slice_idx * hidden_size + ref_group_gemm( + ref_out=output[:, slice_offset:slice_offset + hidden_size], + input=input[slice_idx].clone().to(dtype=w_dtype), + lora_weights=lora_weights[slice_idx], + **kwargs) + if self == OpType.BGMV_SHRINK: + assert num_slices == 1 + ref_group_gemm(ref_out=output, + input=input, + lora_weights=lora_weights[0], + **kwargs) + if self == OpType.BGMV_EXPAND: + assert num_slices == 1 + ref_group_gemm(ref_out=output, + input=input.clone().to(dtype=w_dtype), + lora_weights=lora_weights[0], + **kwargs) + if self == OpType.BGMV_EXPAND_SLICE: + hidden_size = lora_weights[0].shape[1] + for slice_idx in range(num_slices): + slice_offset = slice_idx * hidden_size + ref_group_gemm( + ref_out=output[:, slice_offset:slice_offset + hidden_size], + input=input[slice_idx].clone().to(dtype=w_dtype), + lora_weights=lora_weights[slice_idx], + **kwargs) + raise ValueError(f"Unrecognized optype {self}") + + +@dataclass +class BenchmarkContext: + """ + LoRA benchmark context + """ + batch_size: int + hidden_size: int + num_loras: int + num_active_loras: int + lora_rank: int + sort_by_lora_id: bool + dtype: torch.dtype + seq_length: Optional[int] = None + num_slices: Optional[int] = None # num_slices for slice based ops + + def with_seq_length(self, seq_length: int) -> "BenchmarkContext": + ctx = copy.copy(self) + ctx.seq_length = seq_length + return ctx + + def with_num_slices(self, num_slices: int) -> "BenchmarkContext": + ctx = copy.copy(self) + ctx.num_slices = num_slices + return ctx + + def bench_label(self) -> str: + return f"lora-{self.dtype}" + + def bench_sublabel(self, op_type: OpType) -> str: + m, k, n = op_type.mkn(self.batch_size, self.seq_length, + self.hidden_size, self.lora_rank) + desc = { + 'bs': self.batch_size, + 'sl': self.seq_length, + 'm': m, + 'k': k, + 'n': n, + 'num_loras': self.num_loras, + 'sort_by_lora': self.sort_by_lora_id, + 'num_slices': self.num_slices, + } + return json.dumps(desc) + + +@dataclass +class BenchmarkTensors: + """ + Input/Output tensors used for benchmarks + """ + # matmul tensors + input: torch.Tensor + lora_weights_lst: List[torch.Tensor] + output: torch.Tensor + # metadata tensors + seq_lens: torch.Tensor + seq_start_loc: torch.Tensor + prompt_lora_mapping: torch.Tensor + token_lora_mapping: torch.Tensor + + def io_types(self) -> str: + return (f"{dtype_to_str(self.input.dtype)}x" + f"{dtype_to_str(self.lora_weights_lst[0].dtype)}=>" + f"{dtype_to_str(self.output.dtype)}") + + @staticmethod + def make(ctx: BenchmarkContext, + op_type: OpType, + device: str = "cuda") -> "BenchmarkTensors": + + # Make input / output matmul tensors. 
+ a_shape, b_shape, c_shape = op_type.matmul_shapes( + ctx.batch_size, ctx.seq_length, ctx.hidden_size, ctx.lora_rank, + ctx.num_loras, ctx.num_slices) + a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype) + input_tensor, lora_weights, output_tensor = \ + make_rand_tensors(a_shape, b_shape, c_shape, a_type, b_type, c_type, + num_slices = ctx.num_slices) + + # Make metadata tensors. + # Keep the metadata tensors in the CPU for further processing if needed. + # The tensors get moved to the GPU before benchmarking. + assert ctx.num_active_loras <= ctx.num_loras + total_tokens = ctx.batch_size * ctx.seq_length + + # Prepare seq lens tensor + seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1, + (ctx.batch_size, )) + # Prepare seq_start_loc tensor + seq_start_loc_tensor = torch.cumsum(torch.tensor( + [0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), + dim=0) + assert total_tokens == seq_len_tensor.sum() + # Prepare prompt lora indices tensor + prompt_lora_indices_tensor = make_prompt_lora_mapping( + ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu") + # Prepare token lora indices tensor + token_lora_indices_tensor = make_token_lora_mapping( + total_tokens, ctx.batch_size, prompt_lora_indices_tensor, + seq_len_tensor, "cpu") + + return BenchmarkTensors(input_tensor, lora_weights, output_tensor, + seq_len_tensor, seq_start_loc_tensor, + prompt_lora_indices_tensor, + token_lora_indices_tensor) + + def sanity_check(self) -> None: + """ + Fails asserts when non-conformality is detected. + """ + num_tokens = self.input.shape[-2] + # check metadata tensors + assert torch.sum(self.seq_lens) == num_tokens + num_seqs = self.seq_lens.shape[0] + assert self.seq_start_loc.shape[0] == num_seqs + assert self.prompt_lora_mapping.shape[0] == num_seqs + assert self.token_lora_mapping.shape[0] == num_tokens + + def to_device(self, device: str): + """ + Transfer tensors to device if the tensors aren't already on the device + """ + + def to_device(tensor: torch.Tensor): + if tensor.device != device: + tensor = tensor.to(device=device) + return tensor + + self.input = to_device(self.input) + self.output = to_device(self.output) + self.seq_lens = to_device(self.seq_lens) + self.seq_start_loc = to_device(self.seq_start_loc) + self.prompt_lora_mapping = to_device(self.prompt_lora_mapping) + self.token_lora_mapping = to_device(self.token_lora_mapping) + for i in range(len(self.lora_weights_lst)): + self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i]) + + def metadata(self) -> Tuple[int, int, int]: + """ + Return num_seqs, num_tokens and max_seq_len + """ + num_seqs = self.seq_lens.shape[0] + num_tokens = self.token_lora_mapping.shape[0] + max_seq_len = torch.max(self.seq_lens).item() + num_slices = len(self.lora_weights_lst) + return num_seqs, num_tokens, max_seq_len, num_slices + + def convert_to_sgmv_benchmark_tensors(self): + """ + For sgmv punica kernels, when consecutive sequences have the + same LoRA ID, we just merge them together. 
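+ For example, three consecutive prompts mapped to LoRA IDs [0, 0, 1] with a
+ seq_len of 2 each collapse into seq_lens [4, 2], seq_start_loc [0, 4] and
+ prompt_lora_mapping [0, 1].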
+ This happens in punica.py::compute_metadata + """ + + # Collapse seq_lens and seq_start_loc + _, seq_lens = torch.unique_consecutive(self.token_lora_mapping, + return_counts=True) + cum_result = torch.cumsum(seq_lens, dim=0) + seq_start_loc = torch.zeros_like(seq_lens) + seq_start_loc[1:].copy_(cum_result[:-1]) + + # Collapse prompt mapping + prompt_lora_mapping = torch.unique_consecutive( + self.prompt_lora_mapping) + + assert torch.sum(seq_lens) == torch.sum(self.seq_lens), \ + f"dont match - new {torch.sum(seq_lens)} vs {torch.sum(self.seq_lens)}" + + self.prompt_lora_mapping = prompt_lora_mapping.to( + dtype=self.prompt_lora_mapping.dtype) + self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype) + self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype) + + def as_sgmv_shrink_kwargs(self) -> Dict[str, Any]: + self.convert_to_sgmv_benchmark_tensors() + self.sanity_check() + self.to_device(self.input.device) + + num_seqs, num_tokens, max_seq_len, num_slices = self.metadata() + + # Sanity check matrix shapes. + i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape [num_tokens, hidden_size] + assert len(i_shape) == 2 + assert i_shape[0] == num_tokens + hidden_size = i_shape[1] + # Expected lora weight shape [num_loras, lora_rank, hidden_size] + assert len(lw_shape) == 3 + assert lw_shape[2] == hidden_size + lora_rank = lw_shape[1] + # Expected output shape [num_slices, num_tokens, lora_rank] + assert len(o_shape) == 3 + assert o_shape == (num_slices, num_tokens, lora_rank) + + return { + 'inputs': self.input, + 'lora_a_weights': self.lora_weights_lst, + 'output_tensor': self.output, + 'b_seq_start_loc': self.seq_start_loc, + 'seq_len_tensor': self.seq_lens, + 'lora_indices_tensor': self.prompt_lora_mapping, + 'batches': num_seqs, + 'max_seq_length': max_seq_len, + 'token_nums': num_tokens, + 'scaling': 1.0, + } + + def as_sgmv_expand_kwargs(self, add_inputs: bool) -> Dict[str, Any]: + + self.convert_to_sgmv_benchmark_tensors() + self.sanity_check() + self.to_device(self.input.device) + + num_seqs, num_tokens, max_seq_len, num_slices = self.metadata() + + # Sanity check matrix shapes. 
+ i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape : [num_slices, num_tokens, lora_rank] + assert len(i_shape) == 3 + assert i_shape[0] == num_slices + assert i_shape[1] == num_tokens + lora_rank = i_shape[2] + # Expected lora weight shape : [num_lora, hidden_size, lora_rank] + assert len(lw_shape) == 3 + assert lw_shape[2] == lora_rank + hidden_size = lw_shape[1] + # Expected output shape : [num_tokens, hidden_size * num_slices] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, hidden_size * num_slices) + + return { + 'inputs': self.input, + 'lora_b_weights': self.lora_weights_lst, + 'output_tensor': self.output, + 'b_seq_start_loc': self.seq_start_loc, + 'seq_len_tensor': self.seq_lens, + 'lora_indices_tensor': self.prompt_lora_mapping, + 'batches': num_seqs, + 'max_seq_length': max_seq_len, + 'token_nums': num_tokens, + 'offset_start': 0, + 'add_inputs': add_inputs, + } + + def as_bgmv_shrink_kwargs(self) -> Dict[str, Any]: + assert len(self.lora_weights_lst) == 1 + self.to_device(self.input.device) + + _, num_tokens, _, _ = self.metadata() + # Sanity check shapes + i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape [num_tokens, hidden_size] + assert len(i_shape) == 2 + assert i_shape[0] == num_tokens + hidden_size = i_shape[1] + # Expected lora weight shape [num_loras, lora_rank, hidden_size] + assert len(lw_shape) == 3 + assert lw_shape[2] == hidden_size + lora_rank = lw_shape[1] + # Expected output shape [num_tokens, lora_rank] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, lora_rank) + + return { + 'inputs': self.input, + 'lora_a_weights': self.lora_weights_lst[0], + 'output_tensor': self.output, + 'lora_indices_tensor': self.token_lora_mapping, + 'scaling': 1.0 + } + + def as_bgmv_expand_kwargs(self, add_inputs: bool): + assert len(self.lora_weights_lst) == 1 + self.to_device(self.input.device) + + _, num_tokens, _, _ = self.metadata() + # Sanity check shapes + i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape [num_tokens, lora_rank] + assert len(i_shape) == 2 + assert i_shape[0] == num_tokens + lora_rank = i_shape[1] + # Expected lora weight shape [num_loras, hidden_size, lora_rank] + assert len(lw_shape) == 3 + assert lw_shape[2] == lora_rank + hidden_size = lw_shape[1] + # Expected output shape [num_tokens, hidden_size] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, hidden_size) + + return { + 'inputs': self.input, + 'lora_b_weights': self.lora_weights_lst[0], + 'output_tensor': self.output, + 'lora_indices_tensor': self.token_lora_mapping, + 'add_inputs': add_inputs + } + + def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> Dict[str, Any]: + + _, num_tokens, _, num_slices = self.metadata() + # Sanity check shapes + i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape [num_slices, num_tokens, lora_rank] + assert len(i_shape) == 3 + assert i_shape[0] == num_slices + assert i_shape[1] == num_tokens + lora_rank = i_shape[2] + # Expected lora weight shape [num_loras, hidden_size, lora_rank] + assert len(lw_shape) == 3 + assert lw_shape[2] == lora_rank + hidden_size = lw_shape[1] + # Expected output shape [num_tokens, hidden_size * num_slices] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, hidden_size * num_slices) + + 
self.to_device(self.input.device) + + kwargs_list = [] + for i in range(num_slices): + kwargs_list.append({ + 'inputs': self.input[i], + 'lora_b_weights': self.lora_weights_lst[i], + 'output_tensor': self.output, + 'lora_indices_tensor': self.token_lora_mapping, + 'slice_offset': i * hidden_size, + 'slice_size': hidden_size, + 'add_inputs': add_inputs, + }) + return {'kwargs_list': kwargs_list} + + def bench_fn_kwargs(self, + op_type: OpType, + add_inputs: Optional[bool] = None) -> Dict[str, Any]: + if op_type.is_shrink_fn(): + assert add_inputs is None + else: + assert add_inputs is not None + + if op_type == OpType.SGMV_SHRINK: + return self.as_sgmv_shrink_kwargs() + if op_type == OpType.SGMV_EXPAND: + return self.as_sgmv_expand_kwargs(add_inputs) + if op_type == OpType.BGMV_SHRINK: + return self.as_bgmv_shrink_kwargs() + if op_type == OpType.BGMV_EXPAND: + return self.as_bgmv_expand_kwargs(add_inputs) + if op_type == OpType.BGMV_EXPAND_SLICE: + return self.as_bgmv_expand_slice_kwargs(add_inputs) + raise ValueError(f"Unrecognized optype {self}") + + def test_correctness(self, op_type: OpType, + expand_fn_add_inputs: Optional[bool]) -> bool: + """ + Test correctness of op_type implementation against a grouped gemm + reference implementation. + """ + seq_lens_cpu = self.seq_lens.to(device="cpu") + prompt_lora_mapping_cpu = self.prompt_lora_mapping.to(device="cpu") + ref_output = self.output.clone() + + self.output.zero_() + op_type.bench_fn()( + **self.bench_fn_kwargs(op_type, expand_fn_add_inputs)) + + op_type.run_ref_group_gemm( + ref_output, + self.input, + self.lora_weights_lst, + seq_lens_cpu=seq_lens_cpu, + prompt_lora_mapping_cpu=prompt_lora_mapping_cpu, + scaling=1.0, + add_inputs=expand_fn_add_inputs) + + rtol, atol = { + torch.float16: (6e-2, 6e-2), + torch.bfloat16: (6e-2, 6e-2), + torch.float32: (1e-2, 1e-2), + }[self.output.dtype] + + return torch.allclose(ref_output, self.output, rtol=rtol, atol=atol) + + +def bench_optype(ctx: BenchmarkContext, + arg_pool_size: int, + op_type: OpType, + cuda_graph_nops: Optional[int] = None, + expand_fn_add_inputs: Optional[bool] = None, + test_correctness: bool = False) -> TMeasurement: + + assert arg_pool_size >= 1 + if op_type.is_shrink_fn(): + assert expand_fn_add_inputs is None + else: + assert expand_fn_add_inputs is not None + + # BenchmarkContext -> BenchmarkTensors + bench_tensors : List[BenchmarkTensors] = \ + [BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)] + for bt in bench_tensors: + bt.sanity_check() + + # Test correctness of our implementation. + if test_correctness: + assert all([ + bt.test_correctness(op_type, expand_fn_add_inputs) + for bt in bench_tensors + ]) + + # BenchmarkTensors -> Dict (kwargs) + kwargs_list = [ + bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs) + for bt in bench_tensors + ] + + # Clear LoRA optimization hash-maps. 
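+ # The triton ops populate these dicts lazily on their first call for a given
+ # set of weight tensors, so clearing them here and doing the warmup calls
+ # below keeps that one-time setup cost out of the timed region.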
+ _LORA_A_PTR_DICT.clear() + _LORA_B_PTR_DICT.clear() + # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup + for kwargs in kwargs_list: + op_type.bench_fn()(**kwargs) + torch.cuda.synchronize() + + # Merge into a single kwargs and qualify arguments as ArgPool + kwargs = {k: ArgPool([]) for k in kwargs_list[0]} + for _kwargs in kwargs_list: + for k, v in _kwargs.items(): + kwargs[k].values.append(v) + + describe_args = (f"add_inputs={expand_fn_add_inputs}" + if expand_fn_add_inputs is not None else "") + description = ( + f"{op_type.name}({describe_args}) ({bench_tensors[0].io_types()})") + + cuda_graph_params = None + if cuda_graph_nops: + cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops) + timer = None + with Bench(cuda_graph_params, + ctx.bench_label(), ctx.bench_sublabel(op_type), description, + op_type.bench_fn(), **kwargs) as bench: + timer = bench.run() + return timer + + +def bench_torch_mm(ctx: BenchmarkContext, + arg_pool_size: int, + op_type: OpType, + cuda_graph_nops: Optional[int] = None) -> TMeasurement: + """ + Benchmark basic torch.mm as a roofline. + + When all the input tokens have the same LoRA ID, the LoRA kernels are just + a matmul. This torch.mm benchmark serves as a roofline for that case. + + input op_type is used in determining the m, k, n dimensions for the matmul. + """ + + batch_size, hidden_size, lora_rank, seq_length, dtype = (ctx.batch_size, + ctx.hidden_size, + ctx.lora_rank, + ctx.seq_length, + ctx.dtype) + + m, k, n = op_type.mkn(batch_size, seq_length, hidden_size, lora_rank) + # For a fairer comparison. + n = n * ctx.num_slices + + # Get matmul input and output tensors for A x B = C + As, Bs, Cs = [], [], [] + for _ in range(arg_pool_size): + As.append(torch.rand((m, k), dtype=dtype).to("cuda")) + Bs.append(torch.rand((n, k), dtype=dtype).to("cuda").t()) + Cs.append(torch.rand((m, n), dtype=dtype).to("cuda")) + + # Make torch.mm kwargs + mm_kwargs = {'input': ArgPool(As), 'mat2': ArgPool(Bs), 'out': ArgPool(Cs)} + + description = ( + f"single-lora roofline using torch.mm ({dtype_to_str(dtype)}" + f"x{dtype_to_str(dtype)}" + f"=>{dtype_to_str(dtype)})") + cuda_graph_params = None + if cuda_graph_nops: + cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops) + with Bench(cuda_graph_params, ctx.bench_label(), + ctx.bench_sublabel(op_type), description, torch.mm, + **mm_kwargs) as bench: + return bench.run() + + +# runner +def use_cuda_graph_recommendation() -> str: + return """ + Triton kernels have a significant launch overhead with + launched directly via python. This overhead is more noticeable + for small the problem sizes. For these cases, it is recommended + to use the script with `--cuda-graph-nops N` to benchmark N + consecutive invocations of the benchmarking operations from + inside a CUDA Graph. Note that the returned measurement is for N + invocations of the operation. + """ + + +def print_timers(timers: List[TMeasurement], + args: Optional[argparse.Namespace] = None): + compare = TBenchmark.Compare(timers) + compare.print() + + if args and args.cuda_graph_nops: + print( + f"Note : The timings reported above is for {args.cuda_graph_nops} " + "consecutive invocations of the benchmarking functions. " + f"Please divide by {args.cuda_graph_nops} for single invocation " + "timings.") + + print("Note on Comparison with torch.mm : The torch.mm numbers are " + "benchmark numbers of a simple matmul emulating the single lora " + "case. It is provided as a roofline for comparing our LoRA Kernel " + "implementations. 
It is expected that the LoRA kernels will be " + "slower than torch.mm in cases where num_loras is big. But for " + "small num_loras the goal should be to match the torch.mm numbers.") + + +def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]): + + if args.cuda_graph_nops is not None: + assert args.cuda_graph_nops > 0 + print(f"Benchmarking {args.cuda_graph_nops} invocations inside a CUDA " + "Graph") + else: + print(f"CUDA Graphs not enabled.\n{use_cuda_graph_recommendation()}") + + timers = [] + for bench_ctx in bench_ctxs: + for seq_len in args.seq_lengths: + bench_ops: List[OpType] = [] + if seq_len == 1: + # bench all decode ops + bench_ops = [op for op in args.op_types if op.is_decode_op()] + else: + # bench all prefill ops + bench_ops = [op for op in args.op_types if op.is_prefill_op()] + + seq_len_timers = [] + for bench_op in bench_ops: + for num_slices in bench_op.num_slices(): + _ctx = bench_ctx.with_seq_length(seq_len).with_num_slices( + num_slices) + # Benchmark torch.mm as a roofline + seq_len_timers.append( + bench_torch_mm(_ctx, args.arg_pool_size, bench_op, + args.cuda_graph_nops)) + + # Benchmark bench_op + expand_fn_add_inputs = [ + None + ] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs + for add_input_arg in expand_fn_add_inputs: + seq_len_timers.append( + bench_optype(_ctx, args.arg_pool_size, bench_op, + args.cuda_graph_nops, add_input_arg, + args.test_correctness)) + + print_timers(seq_len_timers) + timers.extend(seq_len_timers) + + # Result stdout dump + print("== All Results ====") + print_timers(timers, args) + + if args.output_directory: + # Result file dump + od = Path(args.output_directory) + if not od.exists(): + od.mkdir() + + timestamp = int(time.time()) + pkl_file = od / f"lora_bench-{timestamp}.pkl" + print(f"Writing benchmarks to {pkl_file}") + with open(pkl_file, "wb") as f: + pickle.dump(timers, f) + + +def as_benchmark_contexts(hidden_sizes: List[int], lora_ranks: List[int], + args: argparse.Namespace) -> List[BenchmarkContext]: + + ctxs: List[BenchmarkContext] = [] + for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product( # noqa + args.batch_sizes, list(hidden_sizes), lora_ranks, args.num_loras, + args.sort_by_lora_id): + ctxs.append( + BenchmarkContext( + batch_size=batch_size, + hidden_size=hidden_size, + lora_rank=lora_rank, + num_loras=num_loras, + num_active_loras=args.num_active_loras + if args.num_active_loras else num_loras, + # To be filled based on the OpType to benchmark + seq_length=None, + sort_by_lora_id=sort_by_lora_id, + dtype=args.dtype, + # To be filled based on the OpType to benchmark + num_slices=None)) + + return ctxs + + +def run_list_bench(args: argparse.Namespace): + print(args) + + print("List bench :\n" + f" Hidden Sizes {args.hidden_sizes}" + f" LoRA Ranks {args.lora_ranks}") + + # Get all benchmarking contexts + bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args) + + run(args, bench_contexts) + + +def run_range_bench(args: argparse.Namespace): + print(args) + + hidden_sizes = list( + range(args.hidden_sizes_start, args.hidden_sizes_end + 1, + args.hidden_sizes_increment)) + lora_ranks = list( + range(args.lora_ranks_start, args.lora_ranks_end + 1, + args.lora_ranks_increment)) + + print("Range bench :\n" + f" Hidden Sizes {hidden_sizes}" + f" LoRA Ranks {lora_ranks}") + + # Get all benchmarking contexts + bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + 
hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args) + + run(args, bench_contexts) + + +def run_model_bench(args: argparse.Namespace): + print(args) + + def hidden_sizes_from_model(model: str, tp_size: int) -> set[int]: + hidden_sizes = set() + for KN, tp_split_dim in WEIGHT_SHAPES[model]: + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + hidden_sizes.add(KN[1]) + return hidden_sizes + + # Get all hidden sizes + hidden_sizes: set[int] = set() + for model_name, tp_size in product(args.models, args.tp_sizes): + hidden_sizes = hidden_sizes.union( + hidden_sizes_from_model(model_name, tp_size)) + + print("Model bench :\n" + f" Hidden Sizes {hidden_sizes}" + f" LoRA Ranks {args.lora_ranks}") + + # Get all benchmarking contexts + bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args) + + run(args, bench_contexts) + + +if __name__ == '__main__': + + def to_torch_dtype(dt): + if dt == "torch.float16": + return torch.float16 + if dt == "torch.bfloat16": + return torch.bfloat16 + raise ValueError("unsupported dtype") + + def get_bool(s: str) -> bool: + return s.lower() in ['true', '1'] + + def add_common_command_args(p: argparse.ArgumentParser): + p.add_argument( + "--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['torch.float16', 'torch.bfloat16']") + + p.add_argument( + "--arg-pool-size", + type=int, + default=32, + help="Run profiles with a pool of input/output/meta tensors instead" + "of simply reusing the same tensors for all runs. A bigger arg-pool" + "mitigates hardware caching effects during benchmarking.") + + p.add_argument( + "--cuda-graph-nops", + type=int, + help=("when set profiling is done using cudagraph, " + "with the given number of operations in a graph." + "Note that the measurement returned is the time " + "taken for N consecutive executions of the benchmarking " + "functions, where N is the value of this argument.")) + p.add_argument("--num-loras", + nargs="+", + type=int, + default=DEFAULT_NUM_LORAS) + p.add_argument("--num-active-loras", + type=int, + default=None, + help="Active LoRAs. 
When None, all LoRAs are active") + p.add_argument("--sort-by-lora-id", + nargs="+", + type=get_bool, + default=DEFAULT_SORT_BY_LORA_IDS) + p.add_argument("--op-types", + nargs="+", + type=OpType.from_str, + default=list(OpType)) + p.add_argument('--seq-lengths', + nargs="+", + type=int, + default=DEFAULT_SEQ_LENGTHS) + p.add_argument("--batch-sizes", + nargs="+", + type=int, + default=DEFAULT_BATCH_SIZES) + p.add_argument("--expand-fn-add-inputs", + nargs="+", + type=get_bool, + default=DEFAULT_EXPAND_FN_ADD_INPUTS) + p.add_argument( + '-o', + '--output-directory', + type=str, + help=("Output directory to store a the list of benchmarking" + "TMeasurement objects as a pickle file")) + + p.add_argument( + "--test-correctness", + action='store_true', + help=("When enabled, the benchmarking functions are tested" + "for correctness before the actual benchmarking")) + + parser = FlexibleArgumentParser( + description=f""" +Benchmark LoRA kernels: + {use_cuda_graph_recommendation()} + + list_bench example: + python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 + + model_bench example: + python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 + + range_bench example: + python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 + """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter) + + subparsers = parser.add_subparsers(dest="cmd", required=True) + + list_parser = subparsers.add_parser("list_bench") + list_parser.add_argument("--hidden-sizes", + nargs="+", + type=int, + default=DEFAULT_HIDDEN_SIZES) + list_parser.add_argument("--lora-ranks", + nargs="+", + type=int, + default=DEFAULT_LORA_RANKS) + add_common_command_args(list_parser) + list_parser.set_defaults(func=run_list_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument("--hidden-sizes-start", type=int, required=True) + range_parser.add_argument("--hidden-sizes-end", type=int, required=True) + range_parser.add_argument("--hidden-sizes-increment", + type=int, + required=True) + range_parser.add_argument("--lora-ranks-start", type=int, required=True) + range_parser.add_argument("--lora-ranks-end", type=int, required=True) + range_parser.add_argument("--lora-ranks-increment", + type=int, + required=True) + add_common_command_args(range_parser) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument("--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys()) + model_parser.add_argument("--tp-sizes", + nargs="+", + type=int, + default=DEFAULT_TP_SIZES) + model_parser.add_argument("--lora-ranks", + nargs="+", + type=int, + 
default=DEFAULT_LORA_RANKS) + add_common_command_args(model_parser) + model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + args.func(args) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 8f538c21f7f7e..1fa0da75c79d2 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,6 +1,7 @@ import argparse import time from datetime import datetime +from itertools import product from typing import Any, Dict, List, Tuple, TypedDict import ray @@ -13,6 +14,9 @@ from vllm.platforms import current_platform from vllm.utils import FlexibleArgumentParser +FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm( +) else torch.float8_e4m3fn + class BenchmarkConfig(TypedDict): BLOCK_SIZE_M: int @@ -80,8 +84,8 @@ def benchmark_config( a1_scale = torch.randn(1, dtype=torch.float32) a2_scale = torch.randn(1, dtype=torch.float32) - w1 = w1.to(torch.float8_e4m3fn) - w2 = w2.to(torch.float8_e4m3fn) + w1 = w1.to(FP8_DTYPE) + w2 = w2.to(FP8_DTYPE) input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32) @@ -141,28 +145,172 @@ def run(): return avg -def get_configs_compute_bound() -> List[Dict[str, int]]: - # Reduced search space for faster tuning. - # TODO(woosuk): Increase the search space and use a performance model to - # prune the search space. +def get_rocm_tuning_space(use_fp16): + block_mn_range = [16, 32, 64, 128, 256] + block_k_range = [16, 32, 64, 128, 256] + if not use_fp16: + block_k_range.remove(16) # BLOCK_K=16 not supported for fp8 + num_warps_range = [1, 2, 4, 8] + group_m_range = [1, 4, 8, 16, 32] + num_stage_range = [2] + waves_per_eu_range = [0] + matrix_instr_nonkdim_range = [16, 32] if use_fp16 else [] + kpack_range = [1, 2] if use_fp16 else [] + + param_ranges = { + "BLOCK_SIZE_M": block_mn_range, + "BLOCK_SIZE_N": block_mn_range, + "BLOCK_SIZE_K": block_k_range, + "GROUP_SIZE_M": group_m_range, + "num_warps": num_warps_range, + "num_stages": num_stage_range, + "waves_per_eu": waves_per_eu_range, + } + if use_fp16: + param_ranges["matrix_instr_nonkdim"] = matrix_instr_nonkdim_range + param_ranges["kpack"] = kpack_range + + return param_ranges + + +def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]: configs: List[BenchmarkConfig] = [] - for num_stages in [2, 3, 4, 5]: - for block_m in [16, 32, 64, 128, 256]: - for block_k in [64, 128, 256]: - for block_n in [32, 64, 128, 256]: - for num_warps in [4, 8]: - for group_size in [1, 16, 32, 64]: - configs.append({ - "BLOCK_SIZE_M": block_m, - "BLOCK_SIZE_N": block_n, - "BLOCK_SIZE_K": block_k, - "GROUP_SIZE_M": group_size, - "num_warps": num_warps, - "num_stages": num_stages, - }) + + if current_platform.is_rocm(): + param_ranges = get_rocm_tuning_space(use_fp16) + else: + # Reduced search space for faster tuning. + # TODO(woosuk): Increase the search space and use a performance model to + # prune the search space. 
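+ # The ranges below enumerate 5 * 4 * 3 * 4 * 2 * 4 = 1920 candidate configs.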
+ block_m_range = [16, 32, 64, 128, 256] + block_n_range = [32, 64, 128, 256] + block_k_range = [64, 128, 256] + num_warps_range = [4, 8] + group_m_range = [1, 16, 32, 64] + num_stage_range = [2, 3, 4, 5] + + param_ranges = { + "BLOCK_SIZE_M": block_m_range, + "BLOCK_SIZE_N": block_n_range, + "BLOCK_SIZE_K": block_k_range, + "GROUP_SIZE_M": group_m_range, + "num_warps": num_warps_range, + "num_stages": num_stage_range, + } + + keys, values = zip(*param_ranges.items()) + for config_values in product(*values): + config = dict(zip(keys, config_values)) + configs.append(config) return configs +def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size, + search_space, is_fp16): + N1, K1 = shard_intermediate_size, hidden_size + N2, K2 = hidden_size, shard_intermediate_size // 2 + pruned_space_1 = prune_rocm_configs(num_tokens * 2, N1, K1, search_space, + is_fp16) + pruned_space_2 = prune_rocm_configs(num_tokens * 2, N2, K2, search_space, + is_fp16) + search_space = merge_unique_dicts(pruned_space_1, pruned_space_2) + return search_space + + +# The following code is inspired by ROCm/Triton GEMM tuning script: +# https://github.com/ROCm/triton/blob/triton-mlir/scripts/amd/gemm/tune_gemm.py#L89 +def prune_rocm_configs(M, N, K, configs, is_fp16=True): + pruned_configs = [] + elemBytes_a = 2 if is_fp16 else 1 + elemBytes_b = 2 if is_fp16 else 1 + + mfma = 16 if M < 32 or N < 32 else 32 + + # TODO (zhanglx): figure out the boundary between large and small gemms + large_gemm = False + if M >= 2048 and N >= 2048: + large_gemm = True + + for config in configs: + BLOCK_SIZE_M = config.get("BLOCK_SIZE_M") + BLOCK_SIZE_N = config.get("BLOCK_SIZE_N") + BLOCK_SIZE_K = config.get("BLOCK_SIZE_K") + num_warps = config.get("num_warps") + + if is_fp16: + matrix_instr_nonkdim = config.get("matrix_instr_nonkdim") + if matrix_instr_nonkdim > mfma: + continue + if mfma == 4 and BLOCK_SIZE_K < 64: + continue + # some layouts could not work properly in case + # number elements per thread is less 1 + if BLOCK_SIZE_M * BLOCK_SIZE_N < 64: + continue + SPLIT_K = config.get("SPLIT_K", 1) + GROUP_M = config.get("GROUP_SIZE_M") + if is_fp16: + if (matrix_instr_nonkdim > BLOCK_SIZE_M + or matrix_instr_nonkdim > BLOCK_SIZE_N): + continue + if (matrix_instr_nonkdim >= M + and matrix_instr_nonkdim != BLOCK_SIZE_M): + continue + if (matrix_instr_nonkdim >= N + and matrix_instr_nonkdim != BLOCK_SIZE_N): + continue + # Skip BLOCK_SIZE that is too large compare to M/N + # unless BLOCK_SIZE is already small enough + if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16: + continue + if N * 2 < BLOCK_SIZE_N and BLOCK_SIZE_N != 16: + continue + # skip large split_k when not necessary + if SPLIT_K != 1 and not need_split_k(M, N, K): + continue + # skip split_k that leads to EVEN_K = false + leap = SPLIT_K * BLOCK_SIZE_K + modv = K % leap + if modv != 0: + continue + # skip large GROUP_M + if GROUP_M * BLOCK_SIZE_M > M and GROUP_M != 1: + continue + # out of shared memory resource + # TODO (zhanglx): This does not consider the LDS usage in the epilogue + LDS = (BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a + + BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b) + if LDS > 65536: + continue + # Skip small block sizes and num_warps for large gemm + # For fp16 and f8, we want to only use BLOCK_SIZE >= 64 + if large_gemm: + if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64: + continue + if BLOCK_SIZE_K < 64: + continue + if num_warps < 4: + continue + + pruned_configs.append(config) + + return pruned_configs + + +def need_split_k(SIZE_M, SIZE_N, 
SIZE_K): + return (SIZE_M < 64 or SIZE_N < 64) and SIZE_K > 1024 + + +def merge_unique_dicts(list1, list2): + result = [] + combined_list = list1.copy() + combined_list.extend(list2) + for dictionary in combined_list: + if dictionary not in result: + result.append(dictionary) + return result + + @ray.remote(num_gpus=1) class BenchmarkWorker: @@ -170,6 +318,10 @@ def __init__(self, seed: int) -> None: torch.set_default_device("cuda") current_platform.seed_everything(seed) self.seed = seed + # Get the device ID to allocate tensors and kernels + # on the respective GPU. This is required for Ray to work + # correctly with multi-GPU tuning on the ROCm platform. + self.device_id = int(ray.get_gpu_ids()[0]) def benchmark( self, @@ -217,25 +369,33 @@ def tune( ) -> Dict[str, int]: best_config = None best_time = float("inf") - for config in tqdm(search_space): - try: - kernel_time = benchmark_config(config, - num_tokens, - num_experts, - shard_intermediate_size, - hidden_size, - topk, - dtype, - use_fp8_w8a8, - use_int8_w8a16, - num_iters=10) - except triton.runtime.autotuner.OutOfResources: - # Some configurations may be invalid and fail to compile. - continue - - if kernel_time < best_time: - best_time = kernel_time - best_config = config + if current_platform.is_rocm(): + is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) + search_space = prune_rocm_search_space(num_tokens, + shard_intermediate_size, + hidden_size, search_space, + is_fp16) + + with torch.cuda.device(self.device_id): + for config in tqdm(search_space): + try: + kernel_time = benchmark_config(config, + num_tokens, + num_experts, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=20) + except triton.runtime.autotuner.OutOfResources: + # Some configurations may be invalid and fail to compile. 
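+ # (triton raises OutOfResources when, e.g., a config would need more
+ # shared memory than the device provides)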
+ continue + + if kernel_time < best_time: + best_time = kernel_time + best_config = config now = datetime.now() print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}") assert best_config is not None @@ -244,12 +404,27 @@ def tune( def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: return { - "BLOCK_SIZE_M": config["BLOCK_SIZE_M"], - "BLOCK_SIZE_N": config["BLOCK_SIZE_N"], - "BLOCK_SIZE_K": config["BLOCK_SIZE_K"], - "GROUP_SIZE_M": config["GROUP_SIZE_M"], - "num_warps": config["num_warps"], - "num_stages": config["num_stages"], + "BLOCK_SIZE_M": + config["BLOCK_SIZE_M"], + "BLOCK_SIZE_N": + config["BLOCK_SIZE_N"], + "BLOCK_SIZE_K": + config["BLOCK_SIZE_K"], + "GROUP_SIZE_M": + config["GROUP_SIZE_M"], + "num_warps": + config["num_warps"], + "num_stages": + config["num_stages"], + **({ + "waves_per_eu": config["waves_per_eu"] + } if "waves_per_eu" in config else {}), + **({ + "matrix_instr_nonkdim": config["matrix_instr_nonkdim"] + } if "matrix_instr_nonkdim" in config else {}), + **({ + "kpack": config["kpack"] + } if "kpack" in config else {}), } @@ -294,7 +469,7 @@ def main(args: argparse.Namespace): shard_intermediate_size = 2 * intermediate_size // args.tp_size hidden_size = config.hidden_size - dtype = config.torch_dtype + dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" @@ -322,7 +497,8 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: return ray.get(outputs) if args.tune: - search_space = get_configs_compute_bound() + is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) + search_space = get_configs_compute_bound(is_fp16) print(f"Start tuning over {len(search_space)} configurations...") start = time.time() diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py new file mode 100644 index 0000000000000..fee877b6f76fa --- /dev/null +++ b/benchmarks/kernels/utils.py @@ -0,0 +1,210 @@ +import dataclasses +from typing import Any, Callable, Iterable, Optional + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement + + +@dataclasses.dataclass +class CudaGraphBenchParams: + num_ops_in_cuda_graph: int + + +@dataclasses.dataclass +class ArgPool: + """ + When some argument of the benchmarking function is annotated with this type, + the benchmarking class (BenchMM) will collapse the argument to a pick a + single value from the given list of values, during function invocation. + For every invocation during a benchmarking run, it will choose a + different value from the list. 
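+ For example, passing input=ArgPool([t0, t1, t2]) makes successive
+ invocations receive t0, t1, t2, t0, ... in a round-robin fashion.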
+ """ + values: Iterable[Any] + + def __getitem__(self, index): + return self.values[index] + + +class Bench: + + class ArgsIterator: + + def __init__(self, args_list, kwargs_list): + assert len(args_list) == len(kwargs_list) + self.args_list = args_list + self.kwargs_list = kwargs_list + self.n = len(self.args_list) + self.idx = 0 + + def __next__(self): + while True: + yield (self.args_list[self.idx], self.kwargs_list[self.idx]) + self.idx += 1 + self.idx = self.idx % self.n + + def reset(self): + self.idx = 0 + + @property + def n_args(self): + return self.n + + def __init__(self, cuda_graph_params: Optional[CudaGraphBenchParams], + label: str, sub_label: str, description: str, fn: Callable, + *args, **kwargs): + + self.cuda_graph_params = cuda_graph_params + self.use_cuda_graph = self.cuda_graph_params is not None + self.label = label + self.sub_label = sub_label + self.description = description + self.fn = fn + + # Process args + self._args = args + self._kwargs = kwargs + self.args_list, self.kwargs_list = self.collapse_argpool( + *args, **kwargs) + self.args_iterator = self.ArgsIterator(self.args_list, + self.kwargs_list) + + # Cudagraph runner + self.g = None + if self.use_cuda_graph: + self.g = self.get_cuda_graph_runner() + + # benchmark run params + self.min_run_time = 1 + + def collapse_argpool(self, *args, **kwargs): + argpool_args = [arg for arg in args if isinstance(arg, ArgPool)] + [ + arg for arg in kwargs.values() if isinstance(arg, ArgPool) + ] + if len(argpool_args) == 0: + return [args], [kwargs] + + # Make sure all argpools are of the same size + argpool_size = len(argpool_args[0].values) + assert all([argpool_size == len(arg.values) for arg in argpool_args]) + + # create copies of the args + args_list = [] + kwargs_list = [] + for _ in range(argpool_size): + args_list.append(args) + kwargs_list.append(kwargs.copy()) + + for i in range(argpool_size): + # collapse args; Just pick the ith value + args_list[i] = tuple([ + arg[i] if isinstance(arg, ArgPool) else arg + for arg in args_list[i] + ]) + + # collapse kwargs + kwargs_i = kwargs_list[i] + arg_pool_keys = [ + k for k, v in kwargs_i.items() if isinstance(v, ArgPool) + ] + for k in arg_pool_keys: + # again just pick the ith value + kwargs_i[k] = kwargs_i[k][i] + kwargs_list[i] = kwargs_i + + return args_list, kwargs_list + + def get_cuda_graph_runner(self): + assert self.use_cuda_graph + assert self.args_iterator is not None + + num_graph_ops = self.cuda_graph_params.num_ops_in_cuda_graph + + # warmup + args_it = self.args_iterator.__next__() + for _ in range(2): + args, kwargs = next(args_it) + self.fn(*args, **kwargs) + + self.args_iterator.reset() + args_it = self.args_iterator.__next__() + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + for _ in range(num_graph_ops): + args, kwargs = next(args_it) + self.fn(*args, **kwargs) + return g + + def run_cudagrah(self) -> TMeasurement: + assert self.use_cuda_graph + globals = {'g': self.g} + + return TBenchmark.Timer( + stmt="g.replay()", + globals=globals, + label=( + f"{self.label}" + f" | cugraph {self.cuda_graph_params.num_ops_in_cuda_graph} ops" + ), + sub_label=self.sub_label, + description=self.description, + ).blocked_autorange(min_run_time=self.min_run_time) + + def run_eager(self) -> TMeasurement: + setup = None + stmt = None + globals = None + + has_arg_pool = self.args_iterator.n_args > 1 + if has_arg_pool: + setup = ''' + args_iterator.reset() + args_it = args_iterator.__next__() + 
''' + stmt = ''' + args, kwargs = next(args_it) + fn(*args, **kwargs) + ''' + globals = {'fn': self.fn, 'args_iterator': self.args_iterator} + else: + # no arg pool. Just use the args and kwargs directly + self.args_iterator.reset() + args_it = self.args_iterator.__next__() + args, kwargs = next(args_it) + + setup = "" + stmt = ''' + fn(*args, **kwargs) + ''' + globals = {'fn': self.fn, 'args': args, 'kwargs': kwargs} + + return TBenchmark.Timer( + stmt=stmt, + setup=setup, + globals=globals, + label=self.label, + sub_label=self.sub_label, + description=self.description, + ).blocked_autorange(min_run_time=self.min_run_time) + + def run(self) -> TMeasurement: + timer = None + if self.use_cuda_graph: # noqa SIM108 + timer = self.run_cudagrah() + else: + timer = self.run_eager() + if not timer.meets_confidence() or timer.has_warnings: + print("Doesn't meet confidence - re-running bench ...") + return self.run() + return timer + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + if exc_type: + print(f"exc type {exc_type}") + print(f"exc value {exc_value}") + print(f"exc traceback {traceback}") diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 40430dae10c5b..15b09395a889f 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -58,8 +58,8 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) # set(SRCS ${ORIG_SRCS}) set(CXX_SRCS ${ORIG_SRCS}) - list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$") - list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$") + list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)|(hip)$") + list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)|(hip)$") # # Generate ROCm/HIP source file names from CUDA file names. diff --git a/csrc/core/scalar_type.hpp b/csrc/core/scalar_type.hpp index 408e736d5bc0f..c2ae554c9f8e8 100644 --- a/csrc/core/scalar_type.hpp +++ b/csrc/core/scalar_type.hpp @@ -32,7 +32,7 @@ class ScalarType { signed_(signed_), bias(bias), finite_values_only(finite_values_only), - nan_repr(nan_repr){}; + nan_repr(nan_repr) {}; static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) { return ScalarType(0, size_bits - 1, true, bias); diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index 28db0479748bf..a71815106133a 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -2,13 +2,13 @@ #define CPU_TYPES_HPP #if defined(__x86_64__) - //x86 implementation + // x86 implementation #include "cpu_types_x86.hpp" #elif defined(__POWER9_VECTOR__) - //ppc implementation + // ppc implementation #include "cpu_types_vsx.hpp" #elif defined(__aarch64__) - //arm implementation + // arm implementation #include "cpu_types_arm.hpp" #else #warning "unsupported vLLM cpu implementation" diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp index ae062a5b86892..990e99f2fc069 100644 --- a/csrc/cpu/cpu_types_arm.hpp +++ b/csrc/cpu/cpu_types_arm.hpp @@ -1,48 +1,50 @@ #include -#include +#include #include namespace vec_op { #ifdef ARM_BF16_SUPPORT - #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) #else - #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) 
\ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) #endif -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) #ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - std::cout << #NAME << " invoked." << std::endl; -#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." << std::endl; #endif #define FORCE_INLINE __attribute__((always_inline)) inline namespace { - template - constexpr void unroll_loop_item(std::integer_sequence, F &&f) { - (f(std::integral_constant{}), ...); - }; -}; +template +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { + (f(std::integral_constant{}), ...); +}; +}; // namespace template >> -constexpr void unroll_loop(F &&f) { +constexpr void unroll_loop(F&& f) { unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); } -template struct Vec { +template +struct Vec { constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }; }; @@ -54,127 +56,124 @@ struct FP16Vec8 : public Vec { float16x8_t reg; - explicit FP16Vec8(const void *ptr) - : reg(vld1q_f16(static_cast(ptr))) {}; + explicit FP16Vec8(const void* ptr) + : reg(vld1q_f16(static_cast(ptr))) {}; - explicit FP16Vec8(const FP32Vec8 &); + explicit FP16Vec8(const FP32Vec8&); - void save(void *ptr) const { - vst1q_f16(static_cast<__fp16 *>(ptr), reg); - } + void save(void* ptr) const { vst1q_f16(static_cast<__fp16*>(ptr), reg); } }; struct FP16Vec16 : public Vec { - constexpr static int VEC_ELEM_NUM = 16; - - float16x8x2_t reg; - - explicit FP16Vec16(const void *ptr) { - reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); - reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); - } - - explicit FP16Vec16(const FP32Vec16& vec); - - void save(void *ptr) const { - vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); - vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + constexpr static int VEC_ELEM_NUM = 16; + + float16x8x2_t reg; + + explicit FP16Vec16(const void* ptr) { + reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); + reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); + } + + explicit FP16Vec16(const FP32Vec16& vec); + + void save(void* ptr) const { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } + + void save(void* ptr, const int elem_num) const { + int full_blocks = elem_num / 8; + int remainder = elem_num % 8; + + if (full_blocks > 0) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + if (full_blocks > 1) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } } - - void save(void *ptr, const int elem_num) const { - int full_blocks = elem_num / 8; - int remainder = elem_num % 8; - - if (full_blocks > 0) { - vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); - if (full_blocks > 1) { - vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); - } - } - - // Note: below is the unrolled version of the following code: - // - // for (int i = 0; i < remainder; 
++i) { - // reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = - // vgetq_lane_f16(temp, i); - // } - // - // For macOS build (Clang), the arm/neon intrinsics function - // `vgetq_lane_f16` needs the parameter `i` to be constant at compile - // time. - - if (remainder > 0) { - float16x8_t temp = reg.val[full_blocks]; - __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr); - switch (remainder) - { - case 1: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - break; - case 2: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - break; - case 3: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - break; - case 4: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - break; - case 5: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); - break; - case 6: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); - fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); - break; - case 7: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); - fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); - fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6); - break; - - default: - break; - } - } + + // Note: below is the unrolled version of the following code: + // + // for (int i = 0; i < remainder; ++i) { + // reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = + // vgetq_lane_f16(temp, i); + // } + // + // For macOS build (Clang), the arm/neon intrinsics function + // `vgetq_lane_f16` needs the parameter `i` to be constant at compile + // time. 
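// [Editor's note: illustrative sketch, not part of this patch.] The unrolled
// switch below exists because vgetq_lane_f16 needs a compile-time lane index
// (see the note above). A hedged alternative for the tail, using a
// hypothetical helper that spills the last vector to a stack buffer and
// memcpy's only the remaining lanes:
#include <arm_neon.h>
#include <cstring>
// Store only `remainder` (0..7) fp16 lanes of `v` to `dst`.
static inline void save_fp16_tail(float16x8_t v, __fp16* dst, int remainder) {
  __fp16 tmp[8];
  vst1q_f16(tmp, v);                                  // spill all 8 lanes
  std::memcpy(dst, tmp, remainder * sizeof(__fp16));  // copy just the tail
}
// The switch below trades that extra 16-byte spill for straight-line stores.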
+ + if (remainder > 0) { + float16x8_t temp = reg.val[full_blocks]; + __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr); + switch (remainder) { + case 1: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + break; + case 2: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + break; + case 3: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + break; + case 4: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + break; + case 5: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + break; + case 6: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + break; + case 7: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6); + break; + + default: + break; + } } + } }; - #ifdef ARM_BF16_SUPPORT struct BF16Vec8 : public Vec { constexpr static int VEC_ELEM_NUM = 8; bfloat16x8_t reg; - explicit BF16Vec8(const void *ptr) - : reg(*reinterpret_cast(ptr)) {}; + explicit BF16Vec8(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; explicit BF16Vec8(bfloat16x8_t data) : reg(data) {}; - explicit BF16Vec8(const FP32Vec8 &); + explicit BF16Vec8(const FP32Vec8&); - explicit BF16Vec8(float32x4x2_t v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {}; + explicit BF16Vec8(float32x4x2_t v) + : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {}; - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } }; struct BF16Vec16 : public Vec { @@ -182,19 +181,18 @@ struct BF16Vec16 : public Vec { bfloat16x8x2_t reg; - explicit BF16Vec16(const void *ptr) - : reg(*reinterpret_cast(ptr)) {}; + explicit BF16Vec16(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {}; - explicit BF16Vec16(const FP32Vec16 &); + explicit BF16Vec16(const FP32Vec16&); - explicit BF16Vec16(float32x4x4_t v) : reg({ - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]), - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3]) - }){}; + explicit BF16Vec16(float32x4x4_t v) + : reg({vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3])}) {}; - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; + void save(void* ptr) const { 
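// [Editor's note: illustrative sketch, not part of this patch.] The
// BF16Vec8(float32x4x2_t) constructor above packs eight floats into one
// bfloat16x8_t: vcvtq_low_bf16_f32 fills lanes 0..3, then vcvtq_high_bf16_f32
// fills lanes 4..7 while keeping the low half from its first operand. A
// standalone equivalent with a hypothetical helper name:
#ifdef ARM_BF16_SUPPORT
#include <arm_neon.h>
static inline bfloat16x8_t pack_bf16x8(float32x4_t lo, float32x4_t hi) {
  bfloat16x8_t low_half = vcvtq_low_bf16_f32(lo);  // lanes 0..3
  return vcvtq_high_bf16_f32(low_half, hi);        // lanes 4..7
}
#endif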
*reinterpret_cast(ptr) = reg; }; }; struct BF16Vec32 : public Vec { @@ -202,19 +200,15 @@ struct BF16Vec32 : public Vec { bfloat16x8x4_t reg; - explicit BF16Vec32(const void *ptr) - : reg(*reinterpret_cast(ptr)) {}; + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {}; - explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ - vec8_data.reg, - vec8_data.reg, - vec8_data.reg, - vec8_data.reg - }) {}; + explicit BF16Vec32(const BF16Vec8& vec8_data) + : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {}; - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; }; }; #endif @@ -232,11 +226,11 @@ struct FP32Vec4 : public Vec { explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {}; - explicit FP32Vec4(const float *ptr) : reg(vld1q_f32(ptr)) {}; + explicit FP32Vec4(const float* ptr) : reg(vld1q_f32(ptr)) {}; explicit FP32Vec4(float32x4_t data) : reg(data) {}; - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}; + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {}; }; struct FP32Vec8 : public Vec { @@ -252,32 +246,37 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {}; - explicit FP32Vec8(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {}; + explicit FP32Vec8(const float* ptr) + : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {}; explicit FP32Vec8(float32x4x2_t data) : reg(data) {}; - explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}; + explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {}; - explicit FP32Vec8(const FP16Vec8 &v) { - reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg)); - reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); - }; + explicit FP32Vec8(const FP16Vec8& v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg)); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); + }; - explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {}; + explicit FP32Vec8(float16x8_t v) + : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {}; - #ifdef ARM_BF16_SUPPORT +#ifdef ARM_BF16_SUPPORT - explicit FP32Vec8(bfloat16x8_t v) : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {}; + explicit FP32Vec8(bfloat16x8_t v) + : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {}; - explicit FP32Vec8(const BF16Vec8 &v) : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {}; + explicit FP32Vec8(const BF16Vec8& v) + : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {}; - #endif +#endif float reduce_sum() const { AliasReg ar; ar.reg = reg; float answer = 0; - unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; }); + unroll_loop( + [&answer, &ar](int i) { answer += ar.values[i]; }); return answer; } @@ -324,10 +323,14 @@ struct FP32Vec8 : public Vec { AliasReg ar; ar.reg = reg; - float32x2_t er_vec0 = {static_cast(erf(ar.values[0])), static_cast(erf(ar.values[1]))}; - float32x2_t er_vec1 = {static_cast(erf(ar.values[2])), static_cast(erf(ar.values[3]))}; - float32x2_t er_vec2 = {static_cast(erf(ar.values[4])), static_cast(erf(ar.values[5]))}; - float32x2_t er_vec3 = {static_cast(erf(ar.values[6])), static_cast(erf(ar.values[7]))}; + float32x2_t er_vec0 = {static_cast(erf(ar.values[0])), + static_cast(erf(ar.values[1]))}; + float32x2_t er_vec1 = {static_cast(erf(ar.values[2])), + static_cast(erf(ar.values[3]))}; + float32x2_t er_vec2 = {static_cast(erf(ar.values[4])), + 
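// [Editor's note: illustrative sketch, not part of this patch.] The
// FP16Vec8 <-> FP32Vec8 conversions in this header wrap the NEON widen/narrow
// pairs shown below with hypothetical helper names:
#include <arm_neon.h>
static inline float16x8_t narrow_fp32x8(float32x4_t lo, float32x4_t hi) {
  return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi));  // 8 floats -> 8 halfs
}
static inline void widen_fp16x8(float16x8_t v, float32x4_t* lo, float32x4_t* hi) {
  *lo = vcvt_f32_f16(vget_low_f16(v));   // lanes 0..3
  *hi = vcvt_f32_f16(vget_high_f16(v));  // lanes 4..7
}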
static_cast(erf(ar.values[5]))}; + float32x2_t er_vec3 = {static_cast(erf(ar.values[6])), + static_cast(erf(ar.values[7]))}; float32x4_t result0 = vcombine_f32(er_vec0, er_vec1); float32x4_t result1 = vcombine_f32(er_vec2, er_vec3); @@ -337,25 +340,29 @@ struct FP32Vec8 : public Vec { result.val[1] = result1; return FP32Vec8(result); - } + } - FP32Vec8 operator*(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), vmulq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), + vmulq_f32(reg.val[1], b.reg.val[1])})); } - FP32Vec8 operator+(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), vaddq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), + vaddq_f32(reg.val[1], b.reg.val[1])})); } - FP32Vec8 operator-(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), vsubq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), + vsubq_f32(reg.val[1], b.reg.val[1])})); } - FP32Vec8 operator/(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), vdivq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), + vdivq_f32(reg.val[1], b.reg.val[1])})); } - void save(float *ptr) const { + void save(float* ptr) const { vst1q_f32(ptr, reg.val[0]); vst1q_f32(ptr + 4, reg.val[1]); } @@ -370,103 +377,100 @@ struct FP32Vec16 : public Vec { float32x4x4_t reg; - explicit FP32Vec16(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {} + explicit FP32Vec16(float v) + : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {} - explicit FP32Vec16() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {} + explicit FP32Vec16() + : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), + vmovq_n_f32(0.0)}) {} - explicit FP32Vec16(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), vld1q_f32(ptr + 12)}) {} + explicit FP32Vec16(const float* ptr) + : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), + vld1q_f32(ptr + 12)}) {} explicit FP32Vec16(float32x4x4_t data) : reg(data) {} - explicit FP32Vec16(const FP32Vec8 &data) { - reg.val[0] = data.reg.val[0]; - reg.val[1] = data.reg.val[1]; - reg.val[2] = data.reg.val[0]; - reg.val[3] = data.reg.val[1]; + explicit FP32Vec16(const FP32Vec8& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[0]; + reg.val[3] = data.reg.val[1]; } - explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} + explicit FP32Vec16(const FP32Vec16& data) : reg(data.reg) {} - explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v.reg)) {} + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v.reg)) {} - #ifdef ARM_BF16_SUPPORT - explicit FP32Vec16(bfloat16x8x2_t v) : reg({ - vcvtq_low_f32_bf16(v.val[0]), - vcvtq_high_f32_bf16(v.val[0]), - vcvtq_low_f32_bf16(v.val[1]), - vcvtq_high_f32_bf16(v.val[1]) - }) {}; - #endif +#ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(bfloat16x8x2_t v) + : reg({vcvtq_low_f32_bf16(v.val[0]), vcvtq_high_f32_bf16(v.val[0]), + 
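// [Editor's note: illustrative sketch, not part of this patch.] The per-lane
// erf evaluation above is the building block for erf-based GELU; its scalar
// reference form, shown for comparison with a hypothetical helper name:
#include <cmath>
static inline float gelu_ref(float x) {
  // 0.5 * x * (1 + erf(x / sqrt(2)))
  return 0.5f * x * (1.0f + std::erf(x * 0.7071067811865475f));
}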
vcvtq_low_f32_bf16(v.val[1]), vcvtq_high_f32_bf16(v.val[1])}) {}; +#endif - explicit FP32Vec16(const FP32Vec4 &data) { + explicit FP32Vec16(const FP32Vec4& data) { reg.val[0] = data.reg; reg.val[1] = data.reg; reg.val[2] = data.reg; reg.val[3] = data.reg; }; - #ifdef ARM_BF16_SUPPORT - explicit FP32Vec16(const BF16Vec16 &v) : reg({ - vcvtq_low_f32_bf16(v.reg.val[0]), - vcvtq_high_f32_bf16(v.reg.val[0]), - vcvtq_low_f32_bf16(v.reg.val[1]), - vcvtq_high_f32_bf16(v.reg.val[1]) - }) {}; - - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}; - #endif - - explicit FP32Vec16(const FP16Vec16 &v) { - reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0])); - reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0])); - reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1])); - reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1])); +#ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(const BF16Vec16& v) + : reg({vcvtq_low_f32_bf16(v.reg.val[0]), + vcvtq_high_f32_bf16(v.reg.val[0]), + vcvtq_low_f32_bf16(v.reg.val[1]), + vcvtq_high_f32_bf16(v.reg.val[1])}) {}; + + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}; +#endif + + explicit FP32Vec16(const FP16Vec16& v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0])); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0])); + reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1])); + reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1])); }; - FP32Vec16 operator+(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vaddq_f32(reg.val[0], b.reg.val[0]), - vaddq_f32(reg.val[1], b.reg.val[1]), - vaddq_f32(reg.val[2], b.reg.val[2]), - vaddq_f32(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vaddq_f32(reg.val[0], b.reg.val[0]), + vaddq_f32(reg.val[1], b.reg.val[1]), + vaddq_f32(reg.val[2], b.reg.val[2]), + vaddq_f32(reg.val[3], b.reg.val[3])})); }; - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vmulq_f32(reg.val[0], b.reg.val[0]), - vmulq_f32(reg.val[1], b.reg.val[1]), - vmulq_f32(reg.val[2], b.reg.val[2]), - vmulq_f32(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator*(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vmulq_f32(reg.val[0], b.reg.val[0]), + vmulq_f32(reg.val[1], b.reg.val[1]), + vmulq_f32(reg.val[2], b.reg.val[2]), + vmulq_f32(reg.val[3], b.reg.val[3])})); }; - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vsubq_f32(reg.val[0], b.reg.val[0]), - vsubq_f32(reg.val[1], b.reg.val[1]), - vsubq_f32(reg.val[2], b.reg.val[2]), - vsubq_f32(reg.val[3], b.reg.val[3]) - })); + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vsubq_f32(reg.val[0], b.reg.val[0]), + vsubq_f32(reg.val[1], b.reg.val[1]), + vsubq_f32(reg.val[2], b.reg.val[2]), + vsubq_f32(reg.val[3], b.reg.val[3])})); }; - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vdivq_f32(reg.val[0], b.reg.val[0]), - vdivq_f32(reg.val[1], b.reg.val[1]), - vdivq_f32(reg.val[2], b.reg.val[2]), - vdivq_f32(reg.val[3], b.reg.val[3]) - })); + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vdivq_f32(reg.val[0], b.reg.val[0]), + vdivq_f32(reg.val[1], b.reg.val[1]), + vdivq_f32(reg.val[2], b.reg.val[2]), + vdivq_f32(reg.val[3], b.reg.val[3])})); }; float reduce_sum() const { AliasReg ar; ar.reg = reg; float answer = 0; - unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; }); + unroll_loop( + [&answer, 
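// [Editor's note: illustrative sketch, not part of this patch.] The
// reduce_sum implementations in this header spill the registers through a
// union (AliasReg) and add lanes with unroll_loop. On AArch64 the same
// reduction can also be written with the across-lane add vaddvq_f32;
// hypothetical helper for the 16-wide case:
#include <arm_neon.h>
static inline float reduce_sum_f32x16(float32x4x4_t r) {
  return vaddvq_f32(r.val[0]) + vaddvq_f32(r.val[1]) +
         vaddvq_f32(r.val[2]) + vaddvq_f32(r.val[3]);  // four partial sums
}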
&ar](int i) { answer += ar.values[i]; }); return answer; }; - template float reduce_sub_sum(int idx) { + template + float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); AliasReg ar; @@ -479,7 +483,7 @@ struct FP32Vec16 : public Vec { return answer; }; - void save(float *ptr) const { + void save(float* ptr) const { vst1q_f32(ptr, reg.val[0]); vst1q_f32(ptr + 4, reg.val[1]); vst1q_f32(ptr + 8, reg.val[2]); @@ -487,43 +491,59 @@ struct FP32Vec16 : public Vec { }; }; -template struct VecType { using vec_type = void; }; +template +struct VecType { + using vec_type = void; +}; -template using vec_t = typename VecType::vec_type; +template +using vec_t = typename VecType::vec_type; -template <> struct VecType { using vec_type = FP32Vec8; }; +template <> +struct VecType { + using vec_type = FP32Vec8; +}; -template <> struct VecType { using vec_type = FP16Vec8; }; +template <> +struct VecType { + using vec_type = FP16Vec8; +}; #ifdef ARM_BF16_SUPPORT -template <> struct VecType { using vec_type = BF16Vec8; }; +template <> +struct VecType { + using vec_type = BF16Vec8; +}; #endif -template void storeFP32(float v, T *ptr) { *ptr = v; } +template +void storeFP32(float v, T* ptr) { + *ptr = v; +} -template <> inline void storeFP32(float v, c10::Half *ptr) { - *reinterpret_cast<__fp16 *>(ptr) = v; +template <> +inline void storeFP32(float v, c10::Half* ptr) { + *reinterpret_cast<__fp16*>(ptr) = v; } -inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) { - float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]); - float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]); - float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]); - float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]); +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) { + float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]); + float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]); + float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]); + float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]); - reg.val[0] = vcombine_f16(low_0, high_0); - reg.val[1] = vcombine_f16(low_1, high_1); + reg.val[0] = vcombine_f16(low_0, high_0); + reg.val[1] = vcombine_f16(low_1, high_1); }; -inline FP16Vec8 :: FP16Vec8(const FP32Vec8 &v) { - float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]); - float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]); +inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) { + float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]); + float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]); - reg = vcombine_f16(lower_half, upper_half); + reg = vcombine_f16(lower_half, upper_half); }; -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { - +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a.reg.val[0], b.reg.val[0]); acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a.reg.val[1], b.reg.val[1]); acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a.reg.val[2], b.reg.val[2]); @@ -531,8 +551,7 @@ inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { }; #ifdef ARM_BF16_SUPPORT -inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { - +inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) { float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0])); float32x4_t a0_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[0])); float32x4_t a1_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[1])); @@ -551,22 +570,22 @@ inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { #endif #ifdef ARM_BF16_SUPPORT -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {}; - 
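// [Editor's note: illustrative sketch, not part of this patch.] The fma()
// overloads above rely on vfmaq_f32, whose operand order is
// vfmaq_f32(acc, a, b) == acc + a * b (accumulator first). Minimal standalone
// use with a hypothetical helper name:
#include <arm_neon.h>
static inline float32x4_t fma4(float32x4_t acc, float32x4_t a, float32x4_t b) {
  return vfmaq_f32(acc, a, b);  // per-lane fused multiply-add
}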
-inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg({ - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]), - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), v.reg.val[3]) - }){}; +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) + : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) { + }; + +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) + : reg({vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), + v.reg.val[3])}) {}; #endif -inline void prefetch(const void *addr) { - __builtin_prefetch(addr, 0, 1); -}; +inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 1); }; #ifdef ARM_BF16_SUPPORT template <> -inline void storeFP32(float v, c10::BFloat16 *ptr) { - *reinterpret_cast<__bf16 *>(ptr) = vcvth_bf16_f32(v); +inline void storeFP32(float v, c10::BFloat16* ptr) { + *reinterpret_cast<__bf16*>(ptr) = vcvth_bf16_f32(v); }; #endif -}; \ No newline at end of file +}; // namespace vec_op \ No newline at end of file diff --git a/csrc/cpu/cpu_types_vsx.hpp b/csrc/cpu/cpu_types_vsx.hpp index b50bdadc5713d..a8e1be37eb418 100644 --- a/csrc/cpu/cpu_types_vsx.hpp +++ b/csrc/cpu/cpu_types_vsx.hpp @@ -9,38 +9,40 @@ namespace vec_op { // FIXME: FP16 is not fully supported in Torch-CPU -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) #ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - std::cout << #NAME << " invoked." << std::endl; -#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." 
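// [Editor's note: illustrative sketch, not part of this patch.] The
// VLLM_DISPATCH_FLOATING_TYPES macro above follows the usual ATen dispatch
// pattern: it switches on a runtime dtype and runs the lambda with scalar_t
// bound to the matching C++ type (float or BFloat16 in this header). Hedged
// usage example; the entry point, kernel name and arguments are hypothetical:
#include <torch/all.h>
template <typename scalar_t>
void example_cpu_kernel_impl(const scalar_t* x, int64_t n) { /* ... */ }
void example_entry(const torch::Tensor& input) {
  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "example_cpu_kernel", [&] {
    // scalar_t is provided by AT_DISPATCH_CASE for the dtype seen at runtime.
    example_cpu_kernel_impl<scalar_t>(input.data_ptr<scalar_t>(), input.numel());
  });
}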
<< std::endl; #endif #define FORCE_INLINE __attribute__((always_inline)) inline namespace { template -constexpr void unroll_loop_item(std::integer_sequence, F &&f) { +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { (f(std::integral_constant{}), ...); } -}; // namespace +}; // namespace template >> -constexpr void unroll_loop(F &&f) { +constexpr void unroll_loop(F&& f) { unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); } -template struct Vec { +template +struct Vec { constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } }; @@ -68,12 +70,14 @@ struct BF16Vec8 : public Vec { __vector signed short reg; - explicit BF16Vec8(const void *ptr) - : reg((__vector signed short)vec_xl(0, (__vector signed short *)ptr)) {} + explicit BF16Vec8(const void* ptr) + : reg((__vector signed short)vec_xl(0, (__vector signed short*)ptr)) {} - explicit BF16Vec8(const FP32Vec8 &); + explicit BF16Vec8(const FP32Vec8&); - void save(void *ptr) const { *reinterpret_cast<__vector signed short *>(ptr) = reg; } + void save(void* ptr) const { + *reinterpret_cast<__vector signed short*>(ptr) = reg; + } }; struct BF16Vec16 : public Vec { @@ -81,18 +85,18 @@ struct BF16Vec16 : public Vec { ss16x8x2_t reg; - explicit BF16Vec16(const void *ptr) { + explicit BF16Vec16(const void* ptr) { // Load 256 bits in two parts - reg.val[0] = (__vector signed short)vec_xl(0, (signed short *)ptr); - reg.val[1] = (__vector signed short)vec_xl(16, (signed short *)ptr); + reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)ptr); + reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr); } - explicit BF16Vec16(const FP32Vec16 &); + explicit BF16Vec16(const FP32Vec16&); - void save(void *ptr) const { + void save(void* ptr) const { // Save 256 bits in two parts - vec_xst(reg.val[0], 0, (signed short *)ptr); - vec_xst(reg.val[1], 16, (signed short *)ptr); + vec_xst(reg.val[0], 0, (signed short*)ptr); + vec_xst(reg.val[1], 16, (signed short*)ptr); } }; @@ -102,19 +106,15 @@ struct BF16Vec32 : public Vec { constexpr static int VEC_ELEM_NUM = 32; ss16x8x4_t reg; - explicit BF16Vec32(const void *ptr) - : reg(*reinterpret_cast(ptr)) {} + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast(ptr)) {} explicit BF16Vec32(ss16x8x4_t data) : reg(data) {} - explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ - vec8_data.reg, - vec8_data.reg, - vec8_data.reg, - vec8_data.reg - }) {} + explicit BF16Vec32(const BF16Vec8& vec8_data) + : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {} - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } }; struct FP32Vec4 : public Vec { @@ -130,11 +130,11 @@ struct FP32Vec4 : public Vec { explicit FP32Vec4() : reg(vec_splats(0.0f)) {} - explicit FP32Vec4(const float *ptr) : reg(vec_xl(0, ptr)) {} + explicit FP32Vec4(const float* ptr) : reg(vec_xl(0, ptr)) {} explicit FP32Vec4(__vector float data) : reg(data) {} - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {} }; struct FP32Vec8 : public Vec { @@ -156,19 +156,19 @@ struct FP32Vec8 : public Vec { reg.val[1] = vec_splats(0.0f); } - explicit FP32Vec8(const float *ptr) { + explicit FP32Vec8(const float* ptr) { reg.val[0] = vec_xl(0, ptr); reg.val[1] = vec_xl(16, ptr); } explicit FP32Vec8(f32x4x2_t data) : reg(data) {} - explicit FP32Vec8(const FP32Vec8 &data) { + explicit FP32Vec8(const FP32Vec8& data) { reg.val[0] = data.reg.val[0]; reg.val[1] = 
data.reg.val[1]; } - explicit FP32Vec8(const BF16Vec8 &v) { + explicit FP32Vec8(const BF16Vec8& v) { reg.val[0] = (__vector float)vec_mergeh(zero, v.reg); reg.val[1] = (__vector float)vec_mergel(zero, v.reg); } @@ -177,7 +177,8 @@ struct FP32Vec8 : public Vec { AliasReg ar; ar.reg = reg; float result = 0; - unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); return result; } @@ -230,23 +231,27 @@ struct FP32Vec8 : public Vec { return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); } - FP32Vec8 operator*(const FP32Vec8 &b) const { - return FP32Vec8({vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8( + {vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); } - FP32Vec8 operator+(const FP32Vec8 &b) const { - return FP32Vec8({vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8( + {vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); } - FP32Vec8 operator-(const FP32Vec8 &b) const { - return FP32Vec8({vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8( + {vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); } - FP32Vec8 operator/(const FP32Vec8 &b) const { - return FP32Vec8({vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8( + {vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); } - void save(float *ptr) const { + void save(float* ptr) const { vec_xst(reg.val[0], 0, ptr); vec_xst(reg.val[1], 16, ptr); } @@ -275,7 +280,7 @@ struct FP32Vec16 : public Vec { reg.val[3] = vec_splats(0.0f); } - explicit FP32Vec16(const float *ptr) { + explicit FP32Vec16(const float* ptr) { reg.val[0] = vec_xl(0, ptr); reg.val[1] = vec_xl(16, ptr); reg.val[2] = vec_xl(32, ptr); @@ -284,78 +289,76 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(f32x4x4_t data) : reg(data) {} - explicit FP32Vec16(const FP32Vec16 &data) { + explicit FP32Vec16(const FP32Vec16& data) { reg.val[0] = data.reg.val[0]; reg.val[1] = data.reg.val[1]; reg.val[2] = data.reg.val[2]; reg.val[3] = data.reg.val[3]; } - explicit FP32Vec16(const FP32Vec4 &data) { + explicit FP32Vec16(const FP32Vec4& data) { reg.val[0] = data.reg; reg.val[1] = data.reg; reg.val[2] = data.reg; reg.val[3] = data.reg; } - explicit FP32Vec16(const FP32Vec8 &data) { + explicit FP32Vec16(const FP32Vec8& data) { reg.val[0] = data.reg.val[0]; reg.val[1] = data.reg.val[1]; reg.val[2] = data.reg.val[0]; reg.val[3] = data.reg.val[1]; } - explicit FP32Vec16(const BF16Vec16 &v) { + explicit FP32Vec16(const BF16Vec16& v) { reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]); reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]); reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]); reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]); } - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_mul(reg.val[0], b.reg.val[0]), - vec_mul(reg.val[1], b.reg.val[1]), - vec_mul(reg.val[2], b.reg.val[2]), - vec_mul(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator*(const FP32Vec16& b) const { + return 
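// [Editor's note: illustrative sketch, not part of this patch.] The
// BF16 -> FP32 constructors above work because a bfloat16 is exactly the top
// 16 bits of an IEEE-754 float: merging each 16-bit value with 16 zero bits
// rebuilds the float. Scalar equivalent of that widening:
#include <cstdint>
#include <cstring>
static inline float bf16_bits_to_float(uint16_t bf16) {
  uint32_t bits = static_cast<uint32_t>(bf16) << 16;  // zero-fill the low half
  float f;
  std::memcpy(&f, &bits, sizeof(f));                  // bit-exact reinterpretation
  return f;
}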
FP32Vec16(f32x4x4_t({vec_mul(reg.val[0], b.reg.val[0]), + vec_mul(reg.val[1], b.reg.val[1]), + vec_mul(reg.val[2], b.reg.val[2]), + vec_mul(reg.val[3], b.reg.val[3])})); } - FP32Vec16 operator+(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_add(reg.val[0], b.reg.val[0]), - vec_add(reg.val[1], b.reg.val[1]), - vec_add(reg.val[2], b.reg.val[2]), - vec_add(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_add(reg.val[0], b.reg.val[0]), + vec_add(reg.val[1], b.reg.val[1]), + vec_add(reg.val[2], b.reg.val[2]), + vec_add(reg.val[3], b.reg.val[3])})); } - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_sub(reg.val[0], b.reg.val[0]), - vec_sub(reg.val[1], b.reg.val[1]), - vec_sub(reg.val[2], b.reg.val[2]), - vec_sub(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_sub(reg.val[0], b.reg.val[0]), + vec_sub(reg.val[1], b.reg.val[1]), + vec_sub(reg.val[2], b.reg.val[2]), + vec_sub(reg.val[3], b.reg.val[3])})); } - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_div(reg.val[0], b.reg.val[0]), - vec_div(reg.val[1], b.reg.val[1]), - vec_div(reg.val[2], b.reg.val[2]), - vec_div(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_div(reg.val[0], b.reg.val[0]), + vec_div(reg.val[1], b.reg.val[1]), + vec_div(reg.val[2], b.reg.val[2]), + vec_div(reg.val[3], b.reg.val[3])})); } float reduce_sum() const { AliasReg ar; ar.reg = reg; float result = 0; - unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); return result; } - template float reduce_sub_sum(int idx) { + template + float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); AliasReg ar; @@ -368,7 +371,7 @@ struct FP32Vec16 : public Vec { return result; } - void save(float *ptr) const { + void save(float* ptr) const { vec_xst(reg.val[0], 0, ptr); vec_xst(reg.val[1], 16, ptr); vec_xst(reg.val[2], 32, ptr); @@ -376,43 +379,62 @@ struct FP32Vec16 : public Vec { } }; -template struct VecType { using vec_type = void; }; +template +struct VecType { + using vec_type = void; +}; -template using vec_t = typename VecType::vec_type; +template +using vec_t = typename VecType::vec_type; -template <> struct VecType { using vec_type = FP32Vec8; }; +template <> +struct VecType { + using vec_type = FP32Vec8; +}; -template <> struct VecType { using vec_type = BF16Vec8; }; +template <> +struct VecType { + using vec_type = BF16Vec8; +}; -template void storeFP32(float v, T *ptr) { *ptr = v; } +template +void storeFP32(float v, T* ptr) { + *ptr = v; +} -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { acc = acc + a * b; } -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = - reinterpret_cast(&v); +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast(&v); *ptr = *(v_ptr + 1); } #ifndef __VEC_CLASS_FP_NAN -#define __VEC_CLASS_FP_NAN (1 << 6) + #define __VEC_CLASS_FP_NAN (1 << 6) #endif -const static __vector unsigned char omask = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; +const static __vector unsigned char omask = {0, 1, 4, 5, 8, 9, 12, 13, + 16, 17, 
20, 21, 24, 25, 28, 29}; #ifndef _ARCH_PWR10 -const static __vector unsigned int bias = { 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff }; -const static __vector unsigned int nan = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 }; -const static __vector unsigned int sh16 = { 16, 16, 16, 16 }; -const static __vector unsigned int one = { 1, 1, 1, 1 }; +const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff, + 0x00007fff}; +const static __vector unsigned int nan = {0x7fc00000, 0x7fc00000, 0x7fc00000, + 0x7fc00000}; +const static __vector unsigned int sh16 = {16, 16, 16, 16}; +const static __vector unsigned int one = {1, 1, 1, 1}; #endif -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) { #ifdef _ARCH_PWR10 __vector signed short ret[2]; - ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); - ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[1]); reg = vec_perm(ret[0], ret[1], omask); #elif defined(_ARCH_PWR9) __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); @@ -425,8 +447,10 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { __vector unsigned int rnd1 = vec_add(lsb1, bias); inp0 = vec_add(inp0, rnd0); inp1 = vec_add(inp1, rnd1); - __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); - __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + __vector __bool int sel0 = + vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = + vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); inp0 = vec_sel(inp0, nan, sel0); inp1 = vec_sel(inp1, nan, sel1); inp0 = vec_sr(inp0, sh16); @@ -435,13 +459,17 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { #endif } -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { #ifdef _ARCH_PWR10 __vector signed short ret[4]; - ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); - ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); - ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[2]); - ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[3]); + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[1]); + ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[2]); + ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[3]); reg.val[0] = vec_perm(ret[0], ret[1], omask); reg.val[1] = vec_perm(ret[2], ret[3], omask); #elif defined(_ARCH_PWR9) @@ -465,10 +493,14 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { inp1 = vec_add(inp1, rnd1); inp2 = vec_add(inp2, rnd2); inp3 = vec_add(inp3, rnd3); - __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); - __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); - __vector __bool int sel2 = vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); - __vector __bool int sel3 = 
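// [Editor's note: illustrative sketch, not part of this patch.] The
// _ARCH_PWR9 paths here implement round-to-nearest-even FP32 -> BF16 in
// integer arithmetic: add 0x7FFF plus the lowest kept bit, force NaNs to a
// quiet-NaN pattern, then shift right by 16. Scalar form of the same recipe
// (NaN handled up front instead of with a vector select):
#include <cstdint>
#include <cstring>
static inline uint16_t float_to_bf16_bits(float f) {
  uint32_t x;
  std::memcpy(&x, &f, sizeof(x));
  if ((x & 0x7F800000u) == 0x7F800000u && (x & 0x007FFFFFu) != 0u) {
    return 0x7FC0;                        // quiet NaN, mirrors the vec_sel(nan) step
  }
  uint32_t lsb = (x >> 16) & 1u;          // lowest bit that survives truncation
  x += 0x7FFFu + lsb;                     // round to nearest, ties to even
  return static_cast<uint16_t>(x >> 16);  // keep the top 16 bits
}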
vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); + __vector __bool int sel0 = + vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = + vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + __vector __bool int sel2 = + vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); + __vector __bool int sel3 = + vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); inp0 = vec_sel(inp0, nan, sel0); inp1 = vec_sel(inp1, nan, sel1); inp2 = vec_sel(inp2, nan, sel2); @@ -482,10 +514,10 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { #endif } -inline void prefetch(const void *addr) { +inline void prefetch(const void* addr) { __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory"); } -}; // namespace vec_op +}; // namespace vec_op #endif diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 4bb4eb0f491ac..a4ef2be2a58ca 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -11,39 +11,40 @@ static_assert(false, "AVX2 must be supported for the current implementation."); namespace vec_op { -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) #ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - RECORD_FUNCTION(#NAME, c10::ArrayRef({})); -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) \ + RECORD_FUNCTION(#NAME, c10::ArrayRef({})); + #define CPU_KERNEL_GUARD_OUT(NAME) #endif #define FORCE_INLINE __attribute__((always_inline)) inline namespace { template -constexpr void unroll_loop_item(std::integer_sequence, F &&f) { +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { (f(std::integral_constant{}), ...); } -}; // namespace +}; // namespace template >> -constexpr void unroll_loop(F &&f) { +constexpr void unroll_loop(F&& f) { unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); } -template struct Vec { +template +struct Vec { constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } }; @@ -55,12 +56,12 @@ struct FP16Vec8 : public Vec { __m128i reg; - explicit FP16Vec8(const void *ptr) - : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} + explicit FP16Vec8(const void* ptr) + : reg((__m128i)_mm_loadu_si128((__m128i*)ptr)) {} - explicit FP16Vec8(const FP32Vec8 &); + explicit FP16Vec8(const FP32Vec8&); - void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m128i*>(ptr) = reg; } }; struct FP16Vec16 : public Vec { @@ -68,12 +69,12 @@ struct FP16Vec16 : public Vec { __m256i reg; - explicit FP16Vec16(const void *ptr) - : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} + explicit FP16Vec16(const void* ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} - explicit FP16Vec16(const FP32Vec16 &); + explicit FP16Vec16(const FP32Vec16&); - void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } + void save(void* ptr) 
const { *reinterpret_cast<__m256i*>(ptr) = reg; } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -87,12 +88,12 @@ struct BF16Vec8 : public Vec { __m128i reg; - explicit BF16Vec8(const void *ptr) - : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} + explicit BF16Vec8(const void* ptr) + : reg((__m128i)_mm_loadu_si128((__m128i*)ptr)) {} - explicit BF16Vec8(const FP32Vec8 &); + explicit BF16Vec8(const FP32Vec8&); - void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m128i*>(ptr) = reg; } }; struct BF16Vec16 : public Vec { @@ -100,12 +101,12 @@ struct BF16Vec16 : public Vec { __m256i reg; - explicit BF16Vec16(const void *ptr) - : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} + explicit BF16Vec16(const void* ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} - explicit BF16Vec16(const FP32Vec16 &); + explicit BF16Vec16(const FP32Vec16&); - void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -120,11 +121,11 @@ struct BF16Vec32 : public Vec { __m512i reg; - explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} + explicit BF16Vec32(const void* ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} explicit BF16Vec32(__m512i data) : reg(data) {} - explicit BF16Vec32(BF16Vec8 &vec8_data) + explicit BF16Vec32(BF16Vec8& vec8_data) : reg((__m512i)_mm512_inserti32x4( _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( (__m128i)vec8_data.reg), @@ -132,7 +133,7 @@ struct BF16Vec32 : public Vec { (__m128i)vec8_data.reg, 2), (__m128i)vec8_data.reg, 3)) {} - void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m512i*>(ptr) = reg; } }; #else struct BF16Vec32 : public Vec { @@ -141,24 +142,24 @@ struct BF16Vec32 : public Vec { __m256i reg_low; __m256i reg_high; - explicit BF16Vec32(const void *ptr) - : reg_low(_mm256_loadu_si256((__m256i const *)ptr)), - reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {} + explicit BF16Vec32(const void* ptr) + : reg_low(_mm256_loadu_si256((__m256i const*)ptr)), + reg_high(_mm256_loadu_si256((__m256i const*)ptr + 1)) {} - explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low), - reg_high(high) {} + explicit BF16Vec32(__m256i low, __m256i high) + : reg_low(low), reg_high(high) {} - explicit BF16Vec32(BF16Vec8 &vec8_data) + explicit BF16Vec32(BF16Vec8& vec8_data) : reg_low((__m256i)_mm256_inserti32x4( - _mm256_castsi128_si256((__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1)), + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)), reg_high((__m256i)_mm256_inserti32x4( - _mm256_castsi128_si256((__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1)) {} + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)) {} - void save(void *ptr) const { - *reinterpret_cast<__m256i *>(ptr) = reg_low; - *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high; + void save(void* ptr) const { + *reinterpret_cast<__m256i*>(ptr) = reg_low; + *reinterpret_cast<__m256i*>((__m256i*)ptr + 1) = reg_high; } }; #endif @@ -176,11 +177,11 @@ struct FP32Vec4 : public Vec { explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {} - explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {} + explicit FP32Vec4(const float* ptr) : reg(_mm_loadu_ps(ptr)) 
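// [Editor's note: illustrative sketch, not part of this patch.] The
// save(ptr, elem_num) overloads in this header handle partial vectors with an
// AVX-512 write mask rather than a scalar tail loop: set the low elem_num
// bits of a mask, then issue one masked store. Hedged standalone version for
// 16 fp16/bf16 elements (assumes AVX512BW + AVX512VL; elem_num in 1..16):
#include <cstdint>
#include <immintrin.h>
static inline void store_u16_prefix(void* ptr, __m256i reg, int elem_num) {
  constexpr uint32_t M = 0xFFFFFFFFu;
  __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));  // low elem_num bits set
  _mm256_mask_storeu_epi16(ptr, mask, reg);               // lanes past the mask untouched
}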
{} explicit FP32Vec4(__m128 data) : reg(data) {} - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {} }; struct FP32Vec8 : public Vec { @@ -196,15 +197,15 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {} - explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {} + explicit FP32Vec8(const float* ptr) : reg(_mm256_loadu_ps(ptr)) {} explicit FP32Vec8(__m256 data) : reg(data) {} - explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} + explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {} - explicit FP32Vec8(const FP16Vec8 &v) : reg(_mm256_cvtph_ps(v.reg)) {} + explicit FP32Vec8(const FP16Vec8& v) : reg(_mm256_cvtph_ps(v.reg)) {} - explicit FP32Vec8(const BF16Vec8 &v) + explicit FP32Vec8(const BF16Vec8& v) : reg(_mm256_castsi256_ps( _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {} @@ -212,7 +213,8 @@ struct FP32Vec8 : public Vec { AliasReg ar; ar.reg = reg; float result = 0; - unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); return result; } @@ -244,27 +246,27 @@ struct FP32Vec8 : public Vec { erf(ar.values[1]), erf(ar.values[0]))); } - FP32Vec8 operator*(const FP32Vec8 &b) const { + FP32Vec8 operator*(const FP32Vec8& b) const { return FP32Vec8(_mm256_mul_ps(reg, b.reg)); } - FP32Vec8 operator+(const FP32Vec8 &b) const { + FP32Vec8 operator+(const FP32Vec8& b) const { return FP32Vec8(_mm256_add_ps(reg, b.reg)); } - FP32Vec8 operator-(const FP32Vec8 &b) const { + FP32Vec8 operator-(const FP32Vec8& b) const { return FP32Vec8(_mm256_sub_ps(reg, b.reg)); } - FP32Vec8 operator/(const FP32Vec8 &b) const { + FP32Vec8 operator/(const FP32Vec8& b) const { return FP32Vec8(_mm256_div_ps(reg, b.reg)); } - void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } + void save(float* ptr) const { _mm256_storeu_ps(ptr, reg); } }; #ifdef __AVX512F__ -struct INT32Vec16: public Vec { +struct INT32Vec16 : public Vec { constexpr static int VEC_ELEM_NUM = 16; union AliasReg { __m512i reg; @@ -272,12 +274,11 @@ struct INT32Vec16: public Vec { }; __m512i reg; - - explicit INT32Vec16(const void* data_ptr) : reg(_mm512_loadu_epi32(data_ptr)) {} - void save(int32_t* ptr) const { - _mm512_storeu_epi32(ptr, reg); - } + explicit INT32Vec16(const void* data_ptr) + : reg(_mm512_loadu_epi32(data_ptr)) {} + + void save(int32_t* ptr) const { _mm512_storeu_epi32(ptr, reg); } void save(int32_t* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -301,11 +302,11 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} - explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {} + explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {} explicit FP32Vec16(__m512 data) : reg(data) {} - explicit FP32Vec16(const FP32Vec4 &data) + explicit FP32Vec16(const FP32Vec4& data) : reg((__m512)_mm512_inserti32x4( _mm512_inserti32x4( _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg), @@ -313,36 +314,37 @@ struct FP32Vec16 : public Vec { (__m128i)data.reg, 2), (__m128i)data.reg, 3)) {} - explicit FP32Vec16(const FP32Vec8 &data) + explicit FP32Vec16(const FP32Vec8& data) : reg((__m512)_mm512_inserti32x8( _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {} - explicit FP32Vec16(const BF16Vec16 &v) + explicit FP32Vec16(const BF16Vec16& v) : reg(_mm512_castsi512_ps( _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} - explicit 
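// [Editor's note: illustrative sketch, not part of this patch.] On x86 the
// FP16 <-> FP32 conversions in this header use the F16C pair
// _mm256_cvtph_ps / _mm256_cvtps_ph; only the narrowing direction takes a
// rounding mode. Standalone round trip with hypothetical helper names:
#include <immintrin.h>
static inline __m128i cvt_ps_to_ph8(__m256 v) {
  // Round to nearest even, suppress exceptions (the flags used in this header).
  return _mm256_cvtps_ph(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
static inline __m256 cvt_ph_to_ps8(__m128i v) {
  return _mm256_cvtph_ps(v);  // widening needs no rounding mode
}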
FP32Vec16(const FP16Vec16 &v) : reg(_mm512_cvtph_ps(v.reg)) {} + explicit FP32Vec16(const FP16Vec16& v) : reg(_mm512_cvtph_ps(v.reg)) {} - explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - explicit FP32Vec16(const INT32Vec16 &v) - : reg(_mm512_cvt_roundepi32_ps(v.reg, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)) {} + explicit FP32Vec16(const INT32Vec16& v) + : reg(_mm512_cvt_roundepi32_ps( + v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} - FP32Vec16 operator*(const FP32Vec16 &b) const { + FP32Vec16 operator*(const FP32Vec16& b) const { return FP32Vec16(_mm512_mul_ps(reg, b.reg)); } - FP32Vec16 operator+(const FP32Vec16 &b) const { + FP32Vec16 operator+(const FP32Vec16& b) const { return FP32Vec16(_mm512_add_ps(reg, b.reg)); } - FP32Vec16 operator-(const FP32Vec16 &b) const { + FP32Vec16 operator-(const FP32Vec16& b) const { return FP32Vec16(_mm512_sub_ps(reg, b.reg)); } - FP32Vec16 operator/(const FP32Vec16 &b) const { + FP32Vec16 operator/(const FP32Vec16& b) const { return FP32Vec16(_mm512_div_ps(reg, b.reg)); } @@ -370,9 +372,7 @@ struct FP32Vec16 : public Vec { return FP32Vec16(_mm512_mask_min_ps(reg, mask, reg, b.reg)); } - FP32Vec16 abs() const { - return FP32Vec16(_mm512_abs_ps(reg)); - } + FP32Vec16 abs() const { return FP32Vec16(_mm512_abs_ps(reg)); } float reduce_sum() const { return _mm512_reduce_add_ps(reg); } @@ -380,14 +380,15 @@ struct FP32Vec16 : public Vec { float reduce_min() const { return _mm512_reduce_min_ps(reg); } - template float reduce_sub_sum(int idx) { + template + float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); return _mm512_mask_reduce_add_ps(mask, reg); } - void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); } + void save(float* ptr) const { _mm512_storeu_ps(ptr, reg); } void save(float* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -407,32 +408,30 @@ struct FP32Vec16 : public Vec { __m256 reg_low; __m256 reg_high; - explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)), - reg_high(_mm256_set1_ps(v)) {} + explicit FP32Vec16(float v) + : reg_low(_mm256_set1_ps(v)), reg_high(_mm256_set1_ps(v)) {} - explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)), - reg_high(_mm256_set1_ps(0.0)) {} + explicit FP32Vec16() + : reg_low(_mm256_set1_ps(0.0)), reg_high(_mm256_set1_ps(0.0)) {} - explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)), - reg_high(_mm256_loadu_ps(ptr + 8)) {} + explicit FP32Vec16(const float* ptr) + : reg_low(_mm256_loadu_ps(ptr)), reg_high(_mm256_loadu_ps(ptr + 8)) {} explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {} - explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low), - reg_high(data.reg_high) {} + explicit FP32Vec16(const FP32Vec16& data) + : reg_low(data.reg_low), reg_high(data.reg_high) {} - explicit FP32Vec16(const FP32Vec4 &data) + explicit FP32Vec16(const FP32Vec4& data) : reg_low((__m256)_mm256_inserti128_si256( - _mm256_castsi128_si256((__m128i)data.reg), - (__m128i)data.reg, 1)), + _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)), reg_high((__m256)_mm256_inserti128_si256( - _mm256_castsi128_si256((__m128i)data.reg), - (__m128i)data.reg, 1)) {} + 
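// [Editor's note: illustrative sketch, not part of this patch.] The AVX-512
// reduce_sub_sum above sums one group of group_size consecutive lanes by
// building a contiguous bit mask and handing it to the masked reduction
// intrinsic. Hedged standalone form (AVX-512F; 16 % group_size == 0):
#include <cstdint>
#include <immintrin.h>
static inline float sum_lane_group(__m512 v, int group_size, int idx) {
  uint32_t base = 0xFFFFu >> (16 - group_size);              // group_size ones
  __mmask16 mask = _cvtu32_mask16(base << (idx * group_size));
  return _mm512_mask_reduce_add_ps(mask, v);                 // adds only the masked lanes
}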
_mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)) {} - explicit FP32Vec16(const FP32Vec8 &data) + explicit FP32Vec16(const FP32Vec8& data) : reg_low(data.reg), reg_high(data.reg) {} - explicit FP32Vec16(const FP16Vec16 &v) { + explicit FP32Vec16(const FP16Vec16& v) { __m128i low = _mm256_extractf128_si256(v.reg, 0); __m128i high = _mm256_extractf128_si256(v.reg, 1); @@ -440,9 +439,9 @@ struct FP32Vec16 : public Vec { reg_high = _mm256_cvtph_ps(high); } - explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - explicit FP32Vec16(const BF16Vec16 &v) { + explicit FP32Vec16(const BF16Vec16& v) { __m128i low = _mm256_extractf128_si256(v.reg, 0); __m128i high = _mm256_extractf128_si256(v.reg, 1); @@ -456,24 +455,24 @@ struct FP32Vec16 : public Vec { reg_high = _mm256_castsi256_ps(v_high_shifted); } - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - FP32Vec16 operator*(const FP32Vec16 &b) const { + FP32Vec16 operator*(const FP32Vec16& b) const { return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low), _mm256_mul_ps(reg_high, b.reg_high)); } - FP32Vec16 operator+(const FP32Vec16 &b) const { + FP32Vec16 operator+(const FP32Vec16& b) const { return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low), _mm256_add_ps(reg_high, b.reg_high)); } - FP32Vec16 operator-(const FP32Vec16 &b) const { + FP32Vec16 operator-(const FP32Vec16& b) const { return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low), _mm256_sub_ps(reg_high, b.reg_high)); } - FP32Vec16 operator/(const FP32Vec16 &b) const { + FP32Vec16 operator/(const FP32Vec16& b) const { return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low), _mm256_div_ps(reg_high, b.reg_high)); } @@ -484,7 +483,8 @@ struct FP32Vec16 : public Vec { return low.reduce_sum() + high.reduce_sum(); } - template float reduce_sub_sum(int idx) { + template + float reduce_sub_sum(int idx) { float sum = 0.0; static_assert(VEC_ELEM_NUM % group_size == 0); constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); @@ -507,7 +507,7 @@ struct FP32Vec16 : public Vec { return sum; } - void save(float *ptr) const { + void save(float* ptr) const { _mm256_storeu_ps(ptr, reg_low); _mm256_storeu_ps(ptr + 8, reg_high); } @@ -515,7 +515,7 @@ struct FP32Vec16 : public Vec { #endif #ifdef __AVX512F__ -struct INT8Vec16: public Vec { +struct INT8Vec16 : public Vec { constexpr static int VEC_ELEM_NUM = 16; union AliasReg { __m128i reg; @@ -523,14 +523,12 @@ struct INT8Vec16: public Vec { }; __m128i reg; - - explicit INT8Vec16(const FP32Vec16& vec) : reg( - _mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32(vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) - ) {} - void save(int8_t* ptr) const { - _mm_storeu_epi8(ptr, reg); - } + explicit INT8Vec16(const FP32Vec16& vec) + : reg(_mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32( + vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))) {} + + void save(int8_t* ptr) const { _mm_storeu_epi8(ptr, reg); } void save(int8_t* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -540,71 +538,92 @@ struct INT8Vec16: public Vec { }; #endif -template struct VecType { using vec_type = void; }; +template +struct VecType { + using vec_type = void; +}; -template using vec_t = typename VecType::vec_type; +template +using vec_t = typename VecType::vec_type; -template <> struct VecType { using vec_type = FP32Vec8; }; +template <> +struct VecType { + using vec_type = FP32Vec8; +}; -template 
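// [Editor's note: illustrative sketch, not part of this patch.] INT8Vec16
// above quantizes 16 floats in two steps: a rounded FP32 -> INT32 convert,
// then a 32-bit -> 8-bit narrow. Note that _mm512_cvtepi32_epi8 truncates
// rather than saturates (the saturating variant would be
// _mm512_cvtsepi32_epi8). Hedged standalone equivalent:
#include <immintrin.h>
static inline __m128i cvt_ps_to_i8x16(__m512 v) {
  __m512i i32 = _mm512_cvt_roundps_epi32(
      v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);  // round to nearest
  return _mm512_cvtepi32_epi8(i32);                       // keep the low 8 bits of each lane
}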
<> struct VecType { using vec_type = FP16Vec8; }; +template <> +struct VecType { + using vec_type = FP16Vec8; +}; -template <> struct VecType { using vec_type = BF16Vec8; }; +template <> +struct VecType { + using vec_type = BF16Vec8; +}; -template void storeFP32(float v, T *ptr) { *ptr = v; } +template +void storeFP32(float v, T* ptr) { + *ptr = v; +} -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { acc = acc + a * b; } -template <> inline void storeFP32(float v, c10::Half *ptr) { - *reinterpret_cast(ptr) = +template <> +inline void storeFP32(float v, c10::Half* ptr) { + *reinterpret_cast(ptr) = _cvtss_sh(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } -inline FP16Vec8::FP16Vec8(const FP32Vec8 &v) +inline FP16Vec8::FP16Vec8(const FP32Vec8& v) : reg(_mm256_cvtps_ph(v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} #ifdef __AVX512F__ -inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) : reg(_mm512_cvtps_ph(v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} #else -inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) - : reg(_mm256_insertf128_si256(_mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {} +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) + : reg(_mm256_insertf128_si256( + _mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), + FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {} #endif #ifdef __AVX512BF16__ -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + *reinterpret_cast<__bfloat16*>(ptr) = _mm_cvtness_sbh(v); } -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {} -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {} -inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { +inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) { acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg); } #else -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = - reinterpret_cast(&v); +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast(&v); *ptr = *(v_ptr + 1); } -#ifdef __AVX512F__ -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + #ifdef __AVX512F__ +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg(_mm256_cvtepi32_epi16( _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {} -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) : reg(_mm512_cvtepi32_epi16( _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {} -#else -namespace{ + #else +namespace { __m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { __m256i ai = _mm256_castps_si256(a); ai = _mm256_srli_epi32(ai, 16); @@ -612,21 +631,21 @@ __m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { ai = _mm256_permute4x64_epi64(ai, 0b00111001); return _mm256_extracti128_si256(ai, 0); } -} +} // namespace -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {} -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { BF16Vec8 low 
= BF16Vec8(FP32Vec8(v.reg_low)); BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high)); reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1); } -#endif // __AVX512F__ -#endif // __AVX512BF16__ + #endif // __AVX512F__ +#endif // __AVX512BF16__ -inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); } +inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); } -}; // namespace vec_op +}; // namespace vec_op #endif diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp new file mode 100644 index 0000000000000..e8555d853b7ac --- /dev/null +++ b/csrc/cumem_allocator.cpp @@ -0,0 +1,310 @@ +// A CUDAPluggableAllocator based on cumem* APIs. +// Important: allocation size, CUdeviceptr and CUmemGenericAllocationHandle* +// need to be unsigned long long +#include + +extern "C" { + +#define PY_SSIZE_T_CLEAN +#include + +#include +#include +#include + +#define CUDA_CHECK(condition) \ + do { \ + CUresult error = condition; \ + if (error != 0) { \ + char* error_string; \ + cuGetErrorString(error, (const char**)&error_string); \ + std::cerr << "CUDA Error: " << error_string << " at " << __FILE__ << ":" \ + << __LINE__ << std::endl; \ + } \ + } while (0) + +// Global references to Python callables +// NOTE: this is borrowed reference, so we don't need to DECREF them. +// This brings the limitation that the allocator needs to be singleton. +static PyObject* g_python_malloc_callback = nullptr; +static PyObject* g_python_free_callback = nullptr; + +// --------------------------------------------------------------------------- +// Helper functions: + +void ensure_context(unsigned long long device) { + CUcontext pctx; + CUDA_CHECK(cuCtxGetCurrent(&pctx)); + if (!pctx) { + // Ensure device context. + CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device)); + CUDA_CHECK(cuCtxSetCurrent(pctx)); + } +} + +void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem, + CUmemGenericAllocationHandle* p_memHandle) { + ensure_context(device); + // Define memory allocation properties + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE; + + // Allocate memory using cuMemCreate + CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0)); + CUDA_CHECK(cuMemMap(d_mem, size, 0, *p_memHandle, 0)); + + CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = device; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + + CUDA_CHECK(cuMemSetAccess(d_mem, size, &accessDesc, 1)); + // std::cout << "create_and_map: device=" << device << ", size=" << size << ", + // d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl; +} + +void unmap_and_release(unsigned long long device, ssize_t size, + CUdeviceptr d_mem, + CUmemGenericAllocationHandle* p_memHandle) { + // std::cout << "unmap_and_release: device=" << device << ", size=" << size << + // ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl; + ensure_context(device); + CUDA_CHECK(cuMemUnmap(d_mem, size)); + CUDA_CHECK(cuMemRelease(*p_memHandle)); +} + +PyObject* create_tuple_from_c_integers(unsigned long long a, + unsigned long long b, + unsigned long long c, + unsigned long long d) { + // Create a new tuple of size 4 + PyObject* tuple = PyTuple_New(4); + if (!tuple) { + return NULL; // Return NULL on failure + } + + // Convert integers to Python objects 
and set them in the tuple + PyTuple_SetItem( + tuple, 0, + PyLong_FromUnsignedLongLong(a)); // Steals reference to the PyLong + PyTuple_SetItem(tuple, 1, PyLong_FromUnsignedLongLong(b)); + PyTuple_SetItem(tuple, 2, PyLong_FromUnsignedLongLong(c)); + PyTuple_SetItem(tuple, 3, PyLong_FromUnsignedLongLong(d)); + + // Note: PyTuple_SetItem "steals" a reference to each object, + // so we do not need to Py_DECREF the PyLong objects explicitly. + + return tuple; // Return the created tuple +} + +// --------------------------------------------------------------------------- +// Our exported C functions that call Python: + +// use CUstream instead of cudaStream_t, to avoid including cuda_runtime_api.h +void* my_malloc(ssize_t size, int device, CUstream stream) { + ensure_context(device); + + // first allocation, align the size, and reserve an address, and also allocate + // a CUmemGenericAllocationHandle + + // Define memory allocation properties + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE; + + // Check if the allocation is supported + size_t granularity; + CUDA_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, + CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + + size_t alignedSize = ((size + granularity - 1) / granularity) * granularity; + + CUdeviceptr d_mem; + CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, 0, 0, 0)); + + // allocate the CUmemGenericAllocationHandle + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)malloc( + sizeof(CUmemGenericAllocationHandle)); + + if (!g_python_malloc_callback) { + std::cerr << "ERROR: g_python_malloc_callback not set.\n"; + return nullptr; + } + + // Acquire GIL (not in stable ABI officially, but often works) + PyGILState_STATE gstate = PyGILState_Ensure(); + + PyObject* arg_tuple = create_tuple_from_c_integers( + (unsigned long long)device, (unsigned long long)alignedSize, + (unsigned long long)d_mem, (unsigned long long)p_memHandle); + + // Call g_python_malloc_callback + PyObject* py_result = + PyObject_CallFunctionObjArgs(g_python_malloc_callback, arg_tuple, NULL); + Py_DECREF(arg_tuple); + + if (!py_result) { + PyErr_Print(); + PyGILState_Release(gstate); + return nullptr; + } + + PyGILState_Release(gstate); + + // do the final mapping + create_and_map(device, alignedSize, d_mem, p_memHandle); + + return (void*)d_mem; +} + +// use CUstream instead of cudaStream_t, to avoid including cuda_runtime_api.h +void my_free(void* ptr, ssize_t size, int device, CUstream stream) { + // get memory handle from the pointer + if (!g_python_free_callback) { + std::cerr << "ERROR: g_python_free_callback not set.\n"; + return; + } + + // Acquire GIL (not in stable ABI officially, but often works) + PyGILState_STATE gstate = PyGILState_Ensure(); + + PyObject* py_ptr = + PyLong_FromUnsignedLongLong(reinterpret_cast(ptr)); + + PyObject* py_result = + PyObject_CallFunctionObjArgs(g_python_free_callback, py_ptr, NULL); + + if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) { + PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4"); + return; + } + + unsigned long long recv_device, recv_size; + unsigned long long recv_d_mem, recv_p_memHandle; + // Unpack the tuple into four C integers + if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size, + &recv_d_mem, &recv_p_memHandle)) { + // PyArg_ParseTuple sets an error if it fails + 
return; + } + + PyGILState_Release(gstate); + + // recv_size == size + // recv_device == device + + // Free memory + + CUdeviceptr d_mem = (CUdeviceptr)recv_d_mem; + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)recv_p_memHandle; + unmap_and_release(device, size, d_mem, p_memHandle); + + // free address and the handle + CUDA_CHECK(cuMemAddressFree(d_mem, size)); + free(p_memHandle); +} + +// --------------------------------------------------------------------------- +// Python extension boilerplate: + +// Python-exposed function: init_module(python_malloc, python_free) +static PyObject* py_init_module(PyObject* self, PyObject* args) { + PyObject* malloc_callback = nullptr; + PyObject* free_callback = nullptr; + + if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) { + return nullptr; + } + + if (!PyCallable_Check(malloc_callback) || !PyCallable_Check(free_callback)) { + PyErr_SetString(PyExc_TypeError, "Both arguments must be callables"); + return nullptr; + } + + // Save the Python callables + // This module does not handle GC of these objects, so they must be kept alive + // outside of this module. + g_python_malloc_callback = malloc_callback; + g_python_free_callback = free_callback; + + Py_RETURN_NONE; +} + +static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) { + if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) { + PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4"); + return nullptr; + } + + unsigned long long recv_device, recv_size; + unsigned long long recv_d_mem, recv_p_memHandle; + // Unpack the tuple into four C integers + if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem, + &recv_p_memHandle)) { + // PyArg_ParseTuple sets an error if it fails + return nullptr; + } + + CUdeviceptr d_mem_ptr = (CUdeviceptr)recv_d_mem; + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)recv_p_memHandle; + + unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle); + + Py_RETURN_NONE; +} + +static PyObject* python_create_and_map(PyObject* self, PyObject* args) { + if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) { + PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4"); + return nullptr; + } + + unsigned long long recv_device, recv_size; + unsigned long long recv_d_mem, recv_p_memHandle; + // Unpack the tuple into four C integers + if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem, + &recv_p_memHandle)) { + // PyArg_ParseTuple sets an error if it fails + return nullptr; + } + + CUdeviceptr d_mem_ptr = (CUdeviceptr)recv_d_mem; + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)recv_p_memHandle; + + create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle); + + Py_RETURN_NONE; +} + +static PyMethodDef module_methods[] = { + {"init_module", (PyCFunction)py_init_module, METH_VARARGS, + "Initialize module with python_malloc and python_free callables."}, + {"python_create_and_map", (PyCFunction)python_create_and_map, METH_VARARGS, + "Create and map memory on the device."}, + {"python_unmap_and_release", (PyCFunction)python_unmap_and_release, + METH_VARARGS, "Unmap and release memory on the device."}, + {NULL, NULL, 0, NULL} // sentinel +}; + +static struct PyModuleDef cumem_allocator_module = { + PyModuleDef_HEAD_INIT, "cumem_allocator", + "cumem-based allocator for CUDAPluggableAllocator", -1, module_methods}; + +PyMODINIT_FUNC PyInit_cumem_allocator(void) { + // Initialize 
the module + PyObject* module = PyModule_Create(&cumem_allocator_module); + if (!module) { + return NULL; + } + return module; +} +} // extern "C" diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp index 85e359aa57113..07c9e46c27b06 100644 --- a/csrc/cutlass_extensions/common.hpp +++ b/csrc/cutlass_extensions/common.hpp @@ -27,8 +27,7 @@ inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { int max_shared_mem_per_block_opt_in = 0; cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, - cudaDevAttrMaxSharedMemoryPerBlockOptin, - device); + cudaDevAttrMaxSharedMemoryPerBlockOptin, device); return max_shared_mem_per_block_opt_in; } diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index 24341d63fb1f8..d609ce1697df3 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -21,7 +21,7 @@ __device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, } } // namespace -template +template __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids, int32_t* expert_ids, @@ -32,12 +32,8 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, const size_t start_idx = threadIdx.x * tokens_per_thread; extern __shared__ int32_t shared_mem[]; - - int32_t* tokens_cnts = - shared_mem; // 2d tensor with shape (blockDim.x + 1, num_experts) - int32_t* cumsum = - shared_mem + - (blockDim.x + 1) * num_experts; // 1d tensor with shape (num_experts + 1) + int32_t* cumsum = shared_mem; // 1d tensor with shape (num_experts + 1) + token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + blockDim.x + 1); for (int i = 0; i < num_experts; ++i) { tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; @@ -74,7 +70,7 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, block_size) * block_size; } - *total_tokens_post_pad = cumsum[num_experts]; + *total_tokens_post_pad = static_cast(cumsum[num_experts]); } __syncthreads(); @@ -224,26 +220,46 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, torch::Tensor num_tokens_post_pad) { const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - // If we have very large number of experts, we can no longer use shared - // memory. - // TODO(simon): the right solution should be calculating the exact right - // amount of shared memory and use that. The num_experts >= 256 is just a - // temporary solution to unblock Deepseek V3. - if (num_experts >= 256) { + int device_max_shared_mem; + auto dev = topk_ids.get_device(); + cudaDeviceGetAttribute(&device_max_shared_mem, + cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); + + const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE); + const int32_t shared_mem_i32 = + ((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t); + const int32_t shared_mem_i16 = + ((num_thread + 1) * num_experts) * sizeof(uint16_t) + + (num_experts + 1) * sizeof(int32_t); + + bool use_global_memory = false; + bool use_i16 = false; // Use uint16_t for shared memory token counts + if (shared_mem_i32 < device_max_shared_mem) { + // Do nothing in this case. 
We're all set to use int32_t token counts + } else if (shared_mem_i16 < device_max_shared_mem && + topk_ids.numel() <= 65535) { + // when the number of elements of topk_ids is smaller than 65535 (the max value of uint16), + // each element value of token_cnts will also be smaller than 65535, + // so we can use uint16 as the dtype of token_cnts + use_i16 = true; + } else { + use_global_memory = true; + } + + if (use_global_memory) { VLLM_DISPATCH_INTEGRAL_TYPES( topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] { // calc needed amount of shared mem for `tokens_cnts` and `cumsum` // tensors const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE); - const int32_t mem_tokens_cnts = - ((num_experts + 1) * num_experts) * sizeof(int32_t); - const int32_t mem_cumsum = (num_experts + 1) * sizeof(int32_t); - // allocate global memory - int32_t* tokens_cnts; - int32_t* cumsum; - cudaMalloc(&tokens_cnts, mem_tokens_cnts); - cudaMalloc(&cumsum, mem_cumsum); + auto options_int = torch::TensorOptions() + .dtype(torch::kInt) + .device(topk_ids.device()); + torch::Tensor token_cnts_buffer = + torch::empty({(num_experts + 1) * num_experts}, options_int); + torch::Tensor cumsum_buffer = + torch::empty({num_experts + 1}, options_int); auto kernel = vllm::moe::moe_align_block_size_global_mem_kernel; @@ -252,25 +268,32 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, sorted_token_ids.data_ptr(), experts_ids.data_ptr(), num_tokens_post_pad.data_ptr(), num_experts, block_size, - topk_ids.numel(), tokens_cnts, cumsum); - cudaFree(tokens_cnts); - cudaFree(cumsum); + topk_ids.numel(), token_cnts_buffer.data_ptr(), + cumsum_buffer.data_ptr()); }); - } else { + } else if (use_i16) { VLLM_DISPATCH_INTEGRAL_TYPES( topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { - // calc needed amount of shared mem for `tokens_cnts` and `cumsum` - // tensors - const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE); - const int32_t shared_mem = - ((num_thread + 1) * num_experts + (num_experts + 1)) * - sizeof(int32_t); - // set dynamic shared mem - auto kernel = vllm::moe::moe_align_block_size_kernel; + auto kernel = + vllm::moe::moe_align_block_size_kernel; + AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( + (void*)kernel, shared_mem_i16)); + kernel<<<1, num_thread, shared_mem_i16, stream>>>( + topk_ids.data_ptr(), + sorted_token_ids.data_ptr(), + experts_ids.data_ptr(), + num_tokens_post_pad.data_ptr(), num_experts, block_size, + topk_ids.numel()); + }); + } else { + VLLM_DISPATCH_INTEGRAL_TYPES( + topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { + auto kernel = + vllm::moe::moe_align_block_size_kernel; AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( - (void*)kernel, shared_mem)); - kernel<<<1, num_thread, shared_mem, stream>>>( + (void*)kernel, shared_mem_i32)); + kernel<<<1, num_thread, shared_mem_i32, stream>>>( topk_ids.data_ptr(), sorted_token_ids.data_ptr(), experts_ids.data_ptr(), diff --git a/docs/source/api/multimodal/inputs.md index 76b2fb95a5009..21bd938be9e89 100644 --- a/docs/source/api/multimodal/inputs.md +++ b/docs/source/api/multimodal/inputs.md @@ -43,7 +43,7 @@ ``` ```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2 +.. 
autoclass:: vllm.multimodal.inputs.MultiModalInputs :members: :show-inheritance: ``` diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index e92104399342d..36cf8e7440eca 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -25,10 +25,12 @@ Check out the [building from source](#build-from-source) documentation for detai ```bash pip install -r requirements-dev.txt -# linting and formatting -bash format.sh -# Static type checking -mypy +# Linting, formatting and static type checking +pre-commit install + +# You can manually run pre-commit with +pre-commit run --all-files + # Unit tests pytest tests/ ``` @@ -88,7 +90,8 @@ If the PR spans more than one category, please include all relevant prefixes. The PR needs to meet the following code quality standards: - We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). -- Pass all linter checks. Please use to format your code. +- Pass all linter checks. Please use `pre-commit` to format your code. See + if `pre-commit` is new to you. - The code needs to be well-documented to ensure future contributors can easily understand the code. - Include sufficient tests to ensure the project stays correct and robust. This diff --git a/docs/source/contributing/vulnerability_management.md b/docs/source/contributing/vulnerability_management.md index 422dc13e6a644..a9bbfde2af770 100644 --- a/docs/source/contributing/vulnerability_management.md +++ b/docs/source/contributing/vulnerability_management.md @@ -41,3 +41,20 @@ You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai) to discuss security-related topics. However, please do not disclose any vulnerabilities in this channel. If you need to report a vulnerability, please use the GitHub security advisory system or contact a VMT member privately. + +## Vulnerability Disclosure + +The process for disclosing vulnerabilities is the following: + +- The VMT will work with the project maintainers to develop a fix for the + vulnerability. +- The VMT will coordinate with the reporter and project maintainers to prepare a + security advisory that adequately describes the vulnerability and its impact. +- The VMT will coordinate with the project maintainers to publish a fix and + release an update that includes that fix. +- The VMT will publish the security advisory on GitHub. Release notes will be + updated to include a reference to the security advisory. + +The VMT and project maintainers will work to minimize the amount of time in +between disclosing any public information about the vulnerability and making a +release and advisory available. diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md index 2606e2765c1ae..438be47316f3b 100644 --- a/docs/source/deployment/docker.md +++ b/docs/source/deployment/docker.md @@ -42,6 +42,9 @@ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai By default vLLM will build for all GPU types for widest distribution. If you are just building for the current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""` for vLLM to find the current GPU type and build for that. 
+ +If you are using Podman instead of Docker, you might need to disable SELinux labeling by +adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184). ``` ## Building for Arm64/aarch64 diff --git a/docs/source/features/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md deleted file mode 100644 index 1cd67cb8fd336..0000000000000 --- a/docs/source/features/quantization/fp8_e4m3_kvcache.md +++ /dev/null @@ -1,44 +0,0 @@ -(fp8-e4m3-kvcache)= - -# FP8 E4M3 KV Cache - -Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache, -improving throughput. OCP (Open Compute Project www.opencompute.org) specifies two common 8-bit floating point data formats: E5M2 -(5 exponent bits and 2 mantissa bits) and E4M3FN (4 exponent bits and 3 mantissa bits), often shortened as E4M3. One benefit of -the E4M3 format over E5M2 is that floating point numbers are represented in higher precision. However, the small dynamic range of -FP8 E4M3 (±240.0 can be represented) typically necessitates the use of a higher-precision (typically FP32) scaling factor alongside -each quantized tensor. For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling -factors of a finer granularity (e.g. per-channel). - -These scaling factors can be specified by passing an optional quantization param JSON to the LLM engine at load time. If -this JSON is not specified, scaling factors default to 1.0. These scaling factors are typically obtained when running an -unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO). - -To install AMMO (AlgorithMic Model Optimization): - -```console -pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo -``` - -Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon -offerings e.g. AMD MI300, NVIDIA Hopper or later support native hardware conversion to and from fp32, fp16, bf16, etc. -Thus, LLM inference is greatly accelerated with minimal accuracy loss. - -Here is an example of how to enable this feature: - -```python -# two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to -# https://github.com/vllm-project/vllm/blob/main/examples/other/fp8/README.md to generate kv_cache_scales.json of your own. - -from vllm import LLM, SamplingParams -sampling_params = SamplingParams(temperature=1.3, top_p=0.8) -llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", - kv_cache_dtype="fp8", - quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json") -prompt = "London is the capital of" -out = llm.generate(prompt, sampling_params)[0].outputs[0].text -print(out) - -# output w/ scaling factors: England, the United Kingdom, and one of the world's leading financial, -# output w/o scaling factors: England, located in the southeastern part of the country. 
It is known -``` diff --git a/docs/source/features/quantization/fp8_e5m2_kvcache.md b/docs/source/features/quantization/fp8_e5m2_kvcache.md deleted file mode 100644 index 3a81ab17f332f..0000000000000 --- a/docs/source/features/quantization/fp8_e5m2_kvcache.md +++ /dev/null @@ -1,31 +0,0 @@ -(fp8-kv-cache)= - -# FP8 E5M2 KV Cache - -The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. -The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bfloat16 and fp8 to each other. - -Here is an example of how to enable this feature: - -```python -from vllm import LLM, SamplingParams -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -# Create an LLM. -llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -``` diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md index 861cb165c11c2..56ccdb5f00c34 100644 --- a/docs/source/features/quantization/index.md +++ b/docs/source/features/quantization/index.md @@ -14,6 +14,5 @@ bnb gguf int8 fp8 -fp8_e5m2_kvcache -fp8_e4m3_kvcache +quantized_kvcache ``` diff --git a/docs/source/features/quantization/quantized_kvcache.md b/docs/source/features/quantization/quantized_kvcache.md new file mode 100644 index 0000000000000..95fa5e81e2f74 --- /dev/null +++ b/docs/source/features/quantization/quantized_kvcache.md @@ -0,0 +1,145 @@ +(quantized-kvcache)= + +# Quantized KV Cache + +## FP8 KV Cache + +Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache, improving throughput. + +### FP8 Formats + +[OCP (Open Compute Project)](https://www.opencompute.org) specifies two common 8-bit floating point data formats: + +- E5M2 (5 exponent bits and 2 mantissa bits) +- E4M3FN (4 exponent bits and 3 mantissa bits, often shortened as E4M3) + +The E4M3 format offers higher precision compared to E5M2. However, due to its small dynamic range (±240.0), E4M3 typically requires a higher-precision (FP32) scaling factor alongside each quantized tensor. + +### Current Limitations + +For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling factors of a finer granularity (e.g. per-channel). + +### Performance Impact + +The current FP8 KV cache implementation primarily benefits throughput by allowing approximately double the amount of space for KV cache allocation. This enables either: + +- Processing longer context lengths for individual requests, or +- Handling more concurrent request batches + +However, there are currently no latency improvements as the implementation does not yet include fused dequantization and attention operations. Future releases will support quantized attention with hardware acceleration, which should provide additional performance benefits. While the most recent silicon offerings (e.g. 
AMD MI300, NVIDIA Hopper or later) support native hardware conversion between FP8 and other formats (fp32, fp16, bf16), this benefit is not yet fully realized. + +Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy, making it a practical choice for throughput optimization. + +## Usage Example + +Here is an example of how to enable FP8 quantization: + +```python +from vllm import LLM, SamplingParams + +sampling_params = SamplingParams(temperature=0.7, top_p=0.8) +llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", kv_cache_dtype="fp8") +prompt = "London is the capital of" +out = llm.generate(prompt, sampling_params)[0].outputs[0].text +print(out) + +# output w/ scaling factors: England, the United Kingdom, and one of the world's leading financial, +# output w/o scaling factors: England, located in the southeastern part of the country. It is known +``` + +The `kv_cache_dtype` argument specifies the data type for KV cache storage: +- `"auto"`: Uses the model's default "unquantized" data type +- `"fp8"` or `"fp8_e4m3"`: Supported on CUDA 11.8+ and ROCm (AMD GPU) +- `"fp8_e5m2"`: Supported on CUDA 11.8+ + +## Calibrated Scales for Better Accuracy + +For optimal model quality when using FP8 KV Cache, we recommend using calibrated scales tuned to representative inference data. [LLM Compressor](https://github.com/vllm-project/llm-compressor/) is the recommended tool for this process. + +### Installation + +First, install the required dependencies: + +```console +pip install llmcompressor +``` + +### Example Usage + +Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern): + +```python +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor.transformers import oneshot + +# Select model and load it +MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" + +# Configure calibration parameters +NUM_CALIBRATION_SAMPLES = 512 # 512 samples is a good starting point +MAX_SEQUENCE_LENGTH = 2048 + +# Load and preprocess dataset +ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) +ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + +def process_and_tokenize(example): + text = tokenizer.apply_chat_template(example["messages"], tokenize=False) + return tokenizer( + text, + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + +ds = ds.map(process_and_tokenize, remove_columns=ds.column_names) + +# Configure quantization settings +recipe = """ +quant_stage: + quant_modifiers: + QuantizationModifier: + kv_cache_scheme: + num_bits: 8 + type: float + strategy: tensor + dynamic: false + symmetric: true +""" + +# Apply quantization +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, +) + +# Save quantized model +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) +``` + +The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales. 
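To double-check that the export actually recorded a KV cache quantization scheme before serving it, you can inspect the saved `config.json`. The exact key names depend on your `llm-compressor` / `compressed-tensors` versions, so the minimal sketch below simply prints whatever quantization-related entries it finds:

```python
import json
from pathlib import Path

# Folder written by the calibration script above.
SAVE_DIR = Path("Llama-3.1-8B-Instruct-FP8-KV")

config = json.loads((SAVE_DIR / "config.json").read_text())

# Surface any quantization/compression-related entries for inspection;
# the exact key names vary between llm-compressor releases.
for key, value in config.items():
    if "quant" in key.lower() or "compression" in key.lower():
        print(key)
        print(json.dumps(value, indent=2))
```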
+ +When running the model, you must specify `kv_cache_dtype="fp8"` to enable KV cache quantization and use the calibrated scales. + +```python +from vllm import LLM, SamplingParams + +sampling_params = SamplingParams(temperature=0.7, top_p=0.8) +llm = LLM(model="Llama-3.1-8B-Instruct-FP8-KV", kv_cache_dtype="fp8") +prompt = "London is the capital of" +out = llm.generate(prompt, sampling_params)[0].outputs[0].text +print(out) +``` diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md index f6f9d3c303f89..69238f6e36fb2 100644 --- a/docs/source/getting_started/installation/gpu/rocm.inc.md +++ b/docs/source/getting_started/installation/gpu/rocm.inc.md @@ -13,6 +13,14 @@ vLLM supports AMD GPUs with ROCm 6.2. Currently, there are no pre-built ROCm wheels. +However, the [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized +docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator. + +```{tip} +Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html) +for instructions on how to use this prebuilt docker image. +``` + ### Build wheel from source 0. Install prerequisites (skip if you are already in an environment/docker with the following installed): @@ -70,7 +78,7 @@ Currently, there are no pre-built ROCm wheels. # Install PyTorch $ pip uninstall torch -y - $ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 + $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/rocm6.2 # Build & install AMD SMI $ pip install /opt/rocm/share/amd_smi @@ -123,11 +131,10 @@ It is important that the user kicks off the docker build using buildkit. Either uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. It provides flexibility to customize the build of docker image using the following arguments: -- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image. -- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target. -- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942` -- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c` -- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1. +- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using +- `USE_CYTHON`: An option to run Cython compilation on a subset of Python files during the docker build +- `BUILD_RPD`: Include the RocmProfileData profiling tool in the image +- `ARG_PYTORCH_ROCM_ARCH`: Allows overriding the gfx architecture values from the base docker image Their values can be passed in when running `docker build` with `--build-arg` options. 
@@ -137,10 +144,10 @@ To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default: DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . ``` -To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below: +To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should pick the alternative base image: ```console -DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . +DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f Dockerfile.rocm -t vllm-rocm . ``` To run the above docker image `vllm-rocm`, use the below command: diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md index 1e290d2b4c0bd..ec4b184bf4604 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/source/getting_started/troubleshooting.md @@ -197,6 +197,27 @@ if __name__ == '__main__': llm = vllm.LLM(...) ``` +## `torch.compile` Error + +vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](https://github.com/vllm-project/vllm/pull/10406) in the model. Before running vLLM, you can check if `torch.compile` is working as expected by running the following script: + +```python +import torch + +@torch.compile +def f(x): + # a simple function to test torch.compile + x = x + 1 + x = x * 2 + x = x.sin() + return x + +x = torch.randn(4, 4).cuda() +print(f(x)) +``` + +If it raises errors from `torch/_inductor` directory, usually it means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See [this issue](https://github.com/vllm-project/vllm/issues/12219) for example. + ## Known Issues - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759). diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 85d844f3d3f55..3da5aaf713c1f 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -470,6 +470,11 @@ of the whole prompt are extracted from the normalized hidden state corresponding - `Qwen/Qwen2.5-Math-RM-72B`, etc. - ✅︎ - ✅︎ +* - `Qwen2ForProcessRewardModel` + - Qwen2-based + - `Qwen/Qwen2.5-Math-PRM-7B`, `Qwen/Qwen2.5-Math-PRM-72B`, etc. + - ✅︎ + - ✅︎ ``` If your model is not in the above list, we will try to automatically convert the model using @@ -618,7 +623,7 @@ See [this page](#generative-models) for more information on how to use generativ * - `DeepseekVLV2ForCausalLM` - DeepSeek-VL2 - T + I+ - - `deepseek-ai/deepseek-vl2-tiny`(WIP), `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note) + - `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note) - - ✅︎ - ✅︎ @@ -754,7 +759,7 @@ See [this page](#generative-models) for more information on how to use generativ - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. 
- ✅︎ - ✅︎ - - + - ✅︎ * - `UltravoxModel` - Ultravox - T + AE+ @@ -767,17 +772,10 @@ See [this page](#generative-models) for more information on how to use generativ E Pre-computed embeddings can be inputted for this modality. + Multiple items can be inputted per text prompt for this modality. -````{note} -The `deepseek-ai/deepseek-vl2-tiny` is not supported yet. - -To use `DeepSeek-VL2` series models, you need to install a fork version `deepseek_vl2` package: -```shell -pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git +```{note} +To use `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. ``` -Besides, to run `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. -```` - ```{note} To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. ``` diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md index 94703a1c32ade..1f5a54f755f13 100644 --- a/docs/source/serving/offline_inference.md +++ b/docs/source/serving/offline_inference.md @@ -31,6 +31,59 @@ Please refer to the above pages for more details about each API. This section lists the most common options for running the vLLM engine. For a full list, refer to the [Engine Arguments](#engine-args) page. +### Model resolution + +vLLM loads HuggingFace-compatible models by inspecting the `architectures` field in `config.json` of the model repository +and finding the corresponding implementation that is registered to vLLM. +Nevertheless, our model resolution may fail for the following reasons: + +- The `config.json` of the model repository lacks the `architectures` field. +- Unofficial repositories refer to a model using alternative names which are not recorded in vLLM. +- The same architecture name is used for multiple models, creating ambiguity as to which model should be loaded. + +In those cases, vLLM may throw an error like: + +```text +Traceback (most recent call last): +... + File "vllm/model_executor/models/registry.py", line xxx, in inspect_model_cls + for arch in architectures: +TypeError: 'NoneType' object is not iterable +``` + +or: + +```text + File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported + raise ValueError( +ValueError: Model architectures [''] are not supported for now. Supported architectures: [...] +``` + +:::{note} +The above error is distinct from the following similar but different error: + +```text + File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported + raise ValueError( +ValueError: Model architectures [''] failed to be inspected. Please check the logs for more details. +``` + +This error means that vLLM failed to import the model file. Usually, it is related to missing dependencies or outdated +binaries in the vLLM build. Please read the logs carefully to determine the real cause of the error. +::: + +To fix this, explicitly specify the model architecture by passing `config.json` overrides to the `hf_overrides` option. +For example: + +```python +model = LLM( + model="cerebras/Cerebras-GPT-1.3B", + hf_overrides={"architectures": ["GPT2LMHeadModel"]}, # GPT-2 +) +``` + +Our [list of supported models](#supported-models) shows the model architectures that are recognized by vLLM. 
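To see exactly which architecture names your installed version of vLLM recognizes (and therefore which values are valid in `hf_overrides`), you can query the model registry programmatically. This is a minimal sketch assuming the `ModelRegistry` helper exported at the top level of the `vllm` package:

```python
from vllm import ModelRegistry

# Every architecture name this vLLM installation can resolve.
supported_archs = ModelRegistry.get_supported_archs()
print(f"{len(supported_archs)} architectures registered")

# Confirm that the override used in the example above is recognized.
print("GPT2LMHeadModel" in supported_archs)
```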
+ ### Reducing memory usage Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem. diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py new file mode 100644 index 0000000000000..5c4918008dcb3 --- /dev/null +++ b/examples/offline_inference/rlhf.py @@ -0,0 +1,186 @@ +""" +a simple demonstration of RLHF with vLLM, inspired by +the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF . +It follows the design that, training processes and inference processes +are different, and they live on different GPUs. +Training processes send prompts to inference processes to generate data, +and also synchronize the weights of the model by broadcasting the weights +from the training process to the inference process. +Note that this is a simple demonstration of one training instance and one +inference instance. In practice, there could be multiple training instances +and multiple inference instances. For the full implementation, please refer +to the OpenRLHF framework. +""" +import os + +import ray +import torch +from ray.util.placement_group import placement_group +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +from transformers import AutoModelForCausalLM + +from vllm import LLM, SamplingParams +from vllm.utils import get_ip, get_open_port +from vllm.worker.worker import Worker + + +def stateless_init_process_group(master_address, master_port, rank, world_size, + device): + """ + vLLM provides `StatelessProcessGroup` to create a process group + without considering the global process group in torch.distributed. + It is recommended to create `StatelessProcessGroup`, and then initialize + the data-plane communication (NCCL) between external (train processes) + and vLLM workers. + """ + from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator + from vllm.distributed.utils import StatelessProcessGroup + pg = StatelessProcessGroup.create(host=master_address, + port=master_port, + rank=rank, + world_size=world_size) + pynccl = PyNcclCommunicator(pg, device=device) + return pynccl + + +class MyWorker(Worker): + """ + The `MyWorker` class inherits from `Worker` to provide custom functions. + For simplicity, we define the `MyWorker` class in this self-contained + script. Normally, we should define the `MyWorker` class in a separate + file and pass the qualified name of the class to the `worker_cls` + parameter. + """ + + def init_weight_update_group(self, master_address, master_port, + rank_offset, world_size): + from vllm.distributed.parallel_state import get_world_group + rank = get_world_group().rank + rank_offset + self.model_update_group = stateless_init_process_group( + master_address, + master_port, + rank, + world_size, + self.device, + ) + + def update_weight(self, name, dtype, shape): + weight = torch.empty(shape, dtype=dtype, device="cuda") + self.model_update_group.broadcast(weight, + src=0, + stream=torch.cuda.current_stream()) + + self.model_runner.model.load_weights(weights=[(name, weight)]) + + del weight + + def check_weights_changed(self): + """ + Check if the weights are updated to 0. + """ + weights_updated = True + for name, p in self.model_runner.model.named_parameters(): + weights_updated = weights_updated and torch.allclose( + p, torch.zeros_like(p)) + return weights_updated + + +class MyLLM(LLM): + + def __init__(self, *args, **kwargs): + # a hack to make the script work. 
+ # stop ray from manipulating CUDA_VISIBLE_DEVICES + # at the top-level + del os.environ["CUDA_VISIBLE_DEVICES"] + super().__init__(*args, **kwargs) + + +""" +Start the training process, here we use huggingface transformers +as an example to hold a model on GPU 0. +""" + +train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") +train_model.to("cuda:0") +""" +Start the inference process, here we use vLLM to hold a model on GPU 1 and +GPU 2. For the details on how to use ray, please refer to the ray +documentation https://docs.ray.io/en/latest/ . +""" +os.environ["CUDA_VISIBLE_DEVICES"] = "1,2" +ray.init() + +pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2) +ray.get(pg_inference.ready()) +scheduling_inference = PlacementGroupSchedulingStrategy( + placement_group=pg_inference, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=0, +) +""" +launch the vLLM inference engine. +here we use `enforce_eager` to reduce the start time. +""" +llm = ray.remote( + num_cpus=0, + num_gpus=0, + scheduling_strategy=scheduling_inference, +)(MyLLM).remote( + model="facebook/opt-125m", + enforce_eager=True, + worker_cls=MyWorker, + tensor_parallel_size=2, + distributed_executor_backend="ray", +) + +# Generate texts from the prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +sampling_params = SamplingParams(temperature=0) + +outputs = ray.get(llm.generate.remote(prompts, sampling_params)) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") + +# set up the communication between the training process +# and the inference engine. +master_address = get_ip() +master_port = get_open_port() + +handle = llm.collective_rpc.remote("init_weight_update_group", + args=(master_address, master_port, 1, 3)) +model_update_group = stateless_init_process_group(master_address, master_port, + 0, 3, torch.device("cuda:0")) +ray.get(handle) + +# simulate training, modify the weights of the model. +for name, p in train_model.named_parameters(): + p.data.zero_() + +# sync weight from the training process to the inference engine. +for name, p in train_model.named_parameters(): + handle = llm.collective_rpc.remote("update_weight", + args=(name, p.dtype, p.shape)) + model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream()) + ray.get(handle) + +# check if the weights are updated. +assert all(ray.get(llm.collective_rpc.remote("check_weights_changed"))) + +# use the updated model to generate texts, they will be nonsense +# because the weights are all zeros. +outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params)) +for output in outputs_updated: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py new file mode 100644 index 0000000000000..b6de73eb7266e --- /dev/null +++ b/examples/offline_inference/torchrun_example.py @@ -0,0 +1,64 @@ +""" +experimental support for tensor-parallel inference with torchrun, +see https://github.com/vllm-project/vllm/issues/11400 for +the motivation and use case for this example. +run the script with `torchrun --nproc-per-node=2 torchrun_example.py`, +the argument 2 should match the `tensor_parallel_size` below. 
+see `tests/distributed/test_torchrun_example.py` for the unit test. +""" + +from vllm import LLM, SamplingParams + +# Create prompts, the same across all ranks +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +# Create sampling parameters, the same across all ranks +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Use `distributed_executor_backend="external_launcher"` so that +# this llm engine/instance only creates one worker. +llm = LLM( + model="facebook/opt-125m", + tensor_parallel_size=2, + distributed_executor_backend="external_launcher", +) + +outputs = llm.generate(prompts, sampling_params) + +# all ranks will have the same outputs +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") +""" +Further tips: + +1. to communicate control messages across all ranks, use the cpu group, +a PyTorch ProcessGroup with GLOO backend. + +```python +from vllm.distributed.parallel_state import get_world_group +cpu_group = get_world_group().cpu_group +torch_rank = dist.get_rank(group=cpu_group) +if torch_rank == 0: + # do something for rank 0, e.g. saving the results to disk. +``` + +2. to communicate data across all ranks, use the model's device group, +a PyTorch ProcessGroup with NCCL backend. +```python +from vllm.distributed.parallel_state import get_world_group +device_group = get_world_group().device_group +``` + +3. to access the model directly in every rank, use the following code: +```python +llm.llm_engine.model_executor.driver_worker.worker.model_runner.model +``` +""" diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index ad32b9fe242e9..415439e88ed59 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -26,14 +26,12 @@ def run_aria(question: str, modality: str): # NOTE: Need L40 (or equivalent) to avoid OOM llm = LLM(model=model_name, - tokenizer_mode="slow", - dtype="bfloat16", max_model_len=4096, max_num_seqs=2, - trust_remote_code=True, + dtype="bfloat16", disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - prompt = (f"<|im_start|>user\n<|img|>\n{question}" + prompt = (f"<|im_start|>user\n<|img|>{question}" "<|im_end|>\n<|im_start|>assistant\n") stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] @@ -70,7 +68,7 @@ def run_chameleon(question: str, modality: str): def run_deepseek_vl2(question: str, modality: str): assert modality == "image" - model_name = "deepseek-ai/deepseek-vl2-small" + model_name = "deepseek-ai/deepseek-vl2-tiny" llm = LLM(model=model_name, max_model_len=4096, @@ -325,7 +323,6 @@ def run_mllama(question: str, modality: str): model=model_name, max_model_len=4096, max_num_seqs=16, - enforce_eager=True, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index c6cf3f30c31cb..43c44fa867e0a 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -55,7 +55,7 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData: def load_deepseek_vl2(question: str, image_urls: List[str]): - model_name = "deepseek-ai/deepseek-vl2-small" + model_name = "deepseek-ai/deepseek-vl2-tiny" llm = 
LLM(model=model_name, max_model_len=4096, @@ -186,7 +186,6 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: model=model_name, max_model_len=4096, max_num_seqs=16, - enforce_eager=True, limit_mm_per_prompt={"image": len(image_urls)}, ) @@ -394,7 +393,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: model_example_map = { "aria": load_aria, - "deepseek_vl2": load_deepseek_vl2, + "deepseek_vl_v2": load_deepseek_vl2, "h2ovl_chat": load_h2onvl, "idefics3": load_idefics3, "internvl_chat": load_internvl, diff --git a/examples/online_serving/disaggregated_prefill.sh b/examples/online_serving/disaggregated_prefill.sh index 87155273a81d1..2bb2824c6c86f 100644 --- a/examples/online_serving/disaggregated_prefill.sh +++ b/examples/online_serving/disaggregated_prefill.sh @@ -3,6 +3,8 @@ # We will launch 2 vllm instances (1 for prefill and 1 for decode), # and then transfer the KV cache between them. +set -xe + echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧" sleep 1 @@ -69,7 +71,7 @@ wait_for_server 8200 # instance # NOTE: the usage of this API is subject to change --- in the future we will # introduce "vllm connect" to connect between prefill and decode instances -python3 ../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py & +python3 ../../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py & sleep 1 # serve two example requests diff --git a/examples/template_deepseek_vl2.jinja b/examples/template_deepseek_vl2.jinja new file mode 100644 index 0000000000000..fbf3d320094d5 --- /dev/null +++ b/examples/template_deepseek_vl2.jinja @@ -0,0 +1,23 @@ +{%- if messages[0]['role'] == 'system' -%} + {%- set system_message = messages[0]['content'] -%} + {%- set messages = messages[1:] -%} +{%- else -%} + {% set system_message = '' -%} +{%- endif -%} + +{{ bos_token + system_message }} +{%- for message in messages -%} + {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} + {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} + {%- endif -%} + + {%- if message['role'] == 'user' -%} + {{ '<|User|>: ' + message['content'] + '\n' }} + {%- elif message['role'] == 'assistant' -%} + {{ '<|Assistant|>: ' + message['content'] + eos_token + '\n' }} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {{ '<|Assistant|>: ' }} +{% endif %} diff --git a/format.sh b/format.sh index 2277eef93c745..4bcd0be0c96e5 100755 --- a/format.sh +++ b/format.sh @@ -1,321 +1,5 @@ -#!/usr/bin/env bash -# YAPF formatter, adapted from ray and skypilot. -# -# Usage: -# # Do work and commit your work. +#!/bin/bash -# # Format files that differ from origin/main. -# bash format.sh - -# # Commit changed files with message 'Run yapf and ruff' -# -# -# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase. -# You are encouraged to run this locally before pushing changes for review. - -# Cause the script to exit if a single command fails -set -eo pipefail - -# this stops git rev-parse from failing if we run this from the .git directory -builtin cd "$(dirname "${BASH_SOURCE:-$0}")" -ROOT="$(git rev-parse --show-toplevel)" -builtin cd "$ROOT" || exit 1 - -check_command() { - if ! 
command -v "$1" &> /dev/null; then - echo "❓❓$1 is not installed, please run \`pip install -r requirements-lint.txt\`" - exit 1 - fi -} - -check_command yapf -check_command ruff -check_command mypy -check_command codespell -check_command isort -check_command clang-format - -YAPF_VERSION=$(yapf --version | awk '{print $2}') -RUFF_VERSION=$(ruff --version | awk '{print $2}') -MYPY_VERSION=$(mypy --version | awk '{print $2}') -CODESPELL_VERSION=$(codespell --version) -ISORT_VERSION=$(isort --vn) -CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') -PYMARKDOWNLNT_VERSION=$(pymarkdownlnt version | awk '{print $1}') - -# # params: tool name, tool version, required version -tool_version_check() { - expected=$(grep "$1" requirements-lint.txt | cut -d'=' -f3) - if [[ "$2" != "$expected" ]]; then - echo "❓❓Wrong $1 version installed: $expected is required, not $2." - exit 1 - fi -} - -tool_version_check "yapf" "$YAPF_VERSION" -tool_version_check "ruff" "$RUFF_VERSION" -tool_version_check "mypy" "$MYPY_VERSION" -tool_version_check "isort" "$ISORT_VERSION" -tool_version_check "codespell" "$CODESPELL_VERSION" -tool_version_check "clang-format" "$CLANGFORMAT_VERSION" -tool_version_check "pymarkdownlnt" "$PYMARKDOWNLNT_VERSION" - -YAPF_FLAGS=( - '--recursive' - '--parallel' -) - -YAPF_EXCLUDES=( - '--exclude' 'build/**' -) - -# Format specified files -format() { - yapf --in-place "${YAPF_FLAGS[@]}" "$@" -} - -# Format files that differ from main branch. Ignores dirs that are not slated -# for autoformat yet. -format_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause yapf to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ - yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}" - fi - -} - -# Format all files -format_all() { - yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" . -} - -## This flag formats individual files. --files *must* be the first command line -## arg to use this option. -if [[ "$1" == '--files' ]]; then - format "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is formatted. -elif [[ "$1" == '--all' ]]; then - format_all -else - # Format only the files that changed in last commit. - format_changed -fi -echo 'vLLM yapf: Done' - -# Run mypy -echo 'vLLM mypy:' -tools/mypy.sh -echo 'vLLM mypy: Done' - - -# If git diff returns a file that is in the skip list, the file may be checked anyway: -# https://github.com/codespell-project/codespell/issues/1915 -# Avoiding the "./" prefix and using "/**" globs for directories appears to solve the problem -CODESPELL_EXCLUDES=( - '--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**' -) - -# check spelling of specified files -spell_check() { - codespell "$@" -} - -spell_check_all(){ - codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" -} - -# Spelling check of files that differ from main branch. -spell_check_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. 
- # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - codespell "${CODESPELL_EXCLUDES[@]}" - fi -} - -# Run Codespell -## This flag runs spell check of individual files. --files *must* be the first command line -## arg to use this option. -if [[ "$1" == '--files' ]]; then - spell_check "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - spell_check_all -else - # Check spelling only of the files that changed in last commit. - spell_check_changed -fi -echo 'vLLM codespell: Done' - - -# Lint specified files -lint() { - ruff check "$@" -} - -# Lint files that differ from main branch. Ignores dirs that are not slated -# for autolint yet. -lint_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - ruff check - fi - -} - -# Run Ruff -### This flag lints individual files. --files *must* be the first command line -### arg to use this option. -if [[ "$1" == '--files' ]]; then - lint "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - lint vllm tests -else - # Format only the files that changed in last commit. - lint_changed -fi -echo 'vLLM ruff: Done' - -# check spelling of specified files -isort_check() { - isort "$@" -} - -isort_check_all(){ - isort . -} - -# Spelling check of files that differ from main branch. -isort_check_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - isort - fi -} - -# Run Isort -# This flag runs spell check of individual files. --files *must* be the first command line -# arg to use this option. -if [[ "$1" == '--files' ]]; then - isort_check "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - isort_check_all -else - # Check spelling only of the files that changed in last commit. 
- isort_check_changed -fi -echo 'vLLM isort: Done' - -# Clang-format section -# Exclude some files for formatting because they are vendored -# NOTE: Keep up to date with .github/workflows/clang-format.yml -CLANG_FORMAT_EXCLUDES=( - 'csrc/moe/topk_softmax_kernels.cu' - 'csrc/quantization/gguf/ggml-common.h' - 'csrc/quantization/gguf/dequantize.cuh' - 'csrc/quantization/gguf/vecdotq.cuh' - 'csrc/quantization/gguf/mmq.cuh' - 'csrc/quantization/gguf/mmvq.cuh' -) - -# Format specified files with clang-format -clang_format() { - clang-format -i "$@" -} - -# Format files that differ from main branch with clang-format. -clang_format_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause clang-format to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - # Get the list of changed files, excluding the specified ones - changed_files=$(git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.h' '*.cpp' '*.cu' '*.cuh' | (grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") || echo -e)) - if [ -n "$changed_files" ]; then - echo "$changed_files" | xargs -P 5 clang-format -i - fi -} - -# Format all files with clang-format -clang_format_all() { - find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ - | grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") \ - | xargs clang-format -i -} - -# Run clang-format -if [[ "$1" == '--files' ]]; then - clang_format "${@:2}" -elif [[ "$1" == '--all' ]]; then - clang_format_all -else - clang_format_changed -fi -echo 'vLLM clang-format: Done' - -echo 'vLLM actionlint:' -tools/actionlint.sh -color -echo 'vLLM actionlint: Done' - -echo 'vLLM shellcheck:' -tools/shellcheck.sh -echo 'vLLM shellcheck: Done' - -echo 'excalidraw png check:' -tools/png-lint.sh -echo 'excalidraw png check: Done' - -if ! git diff --quiet &>/dev/null; then - echo - echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:" - git --no-pager diff --name-only - echo "🔍🔍Format checker passed, but please add, commit and push all the files above to include changes made by the format checker." - - exit 1 -else - echo "✨🎉 Format check passed! Congratulations! 🎉✨" -fi - -echo 'vLLM doc-lint:' -tools/doc-lint.sh -echo 'vLLM doc-lint: Done' +echo "vLLM linting system has been moved from format.sh to pre-commit hook." +echo "Please run 'pip install -r requirements-lint.txt' and 'pre-commit install' to install the pre-commit hook." +echo "Then linters will run automatically before each commit." diff --git a/pyproject.toml b/pyproject.toml index 82275ccafb572..8f2e20d0f5800 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,11 @@ build-backend = "setuptools.build_meta" [tool.setuptools_scm] # version_file = "vllm/_version.py" # currently handled by `setup.py:get_version()` +[tool.yapfignore] +ignore_patterns = [ + "build/**", +] + [tool.ruff] # Allow lines to be as long as 80. 
line-length = 80 @@ -52,6 +57,9 @@ ignore = [ "B007", # f-string format "UP032", + # Python 3.8 typing + "UP006", "UP035", + ] [tool.mypy] diff --git a/requirements-common.txt b/requirements-common.txt index 6c390bcfd18e6..777b2bb124323 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -19,7 +19,7 @@ pillow # Required for image processing prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer >= 0.10.9, < 0.11 -outlines == 0.1.11 # Requires pytorch +outlines == 0.1.11 lark == 1.2.2 xgrammar >= 0.1.6; platform_machine == "x86_64" typing_extensions >= 4.10 @@ -34,6 +34,6 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. -compressed-tensors == 0.8.1 # required for compressed-tensors, requires pytorch +compressed-tensors == 0.8.1 # required for compressed-tensors depyf==0.18.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py diff --git a/requirements-lint.txt b/requirements-lint.txt index ffc73f90a0d48..62446f94048df 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -1,15 +1,2 @@ # formatting -yapf==0.32.0 -toml==0.10.2 -tomli==2.0.2 -ruff==0.6.5 -codespell==2.3.0 -isort==5.13.2 -clang-format==18.1.5 -pymarkdownlnt==0.9.26 - -# type checking -mypy==1.11.1 -types-PyYAML -types-requests -types-setuptools +pre-commit==4.0.1 diff --git a/requirements-test.in b/requirements-test.in index 4b4dc376d1fa5..bc76a91ad5356 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -29,4 +29,7 @@ lm-eval[api]==0.4.4 # required for model evaluation test bitsandbytes>=0.45.0 buildkite-test-collector==0.1.9 +genai_perf==0.0.8 +tritonclient==2.51.0 + numpy < 2.0.0 diff --git a/requirements-test.txt b/requirements-test.txt index f576e42afcbbf..09e009c2e21f4 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -37,7 +37,7 @@ audioread==3.0.1 # via librosa awscli==1.35.23 # via -r requirements-test.in -bitsandbytes>=0.45.0 +bitsandbytes==0.45.0 # via -r requirements-test.in black==24.10.0 # via datamodel-code-generator @@ -75,6 +75,8 @@ colorama==0.4.6 # tqdm-multiprocess contourpy==1.3.0 # via matplotlib +cramjam==2.9.0 + # via fastparquet cupy-cuda12x==13.3.0 # via ray cycler==0.12.1 @@ -109,6 +111,8 @@ email-validator==2.2.0 # via pydantic evaluate==0.4.3 # via lm-eval +fastparquet==2024.11.0 + # via genai-perf fastrlock==0.8.2 # via cupy-cuda12x filelock==3.16.1 @@ -130,8 +134,11 @@ fsspec[http]==2024.9.0 # via # datasets # evaluate + # fastparquet # huggingface-hub # torch +genai-perf==0.0.8 + # via -r requirements-test.in genson==1.3.0 # via datamodel-code-generator h11==0.14.0 @@ -186,6 +193,8 @@ jsonschema==4.23.0 # ray jsonschema-specifications==2024.10.1 # via jsonschema +kaleido==0.2.1 + # via genai-perf kiwisolver==1.4.7 # via matplotlib lazy-loader==0.4 @@ -200,6 +209,8 @@ lm-eval[api]==0.4.4 # via -r requirements-test.in lxml==5.3.0 # via sacrebleu +markdown-it-py==3.0.0 + # via rich markupsafe==3.0.2 # via jinja2 matplotlib==3.9.2 @@ -209,6 +220,8 @@ mbstrdecoder==1.1.3 # dataproperty # pytablewriter # typepy +mdurl==0.1.2 + # via markdown-it-py mistral-common[opencv]==1.5.1 # via 
# -r requirements-test.in @@ -249,6 +262,8 @@ numpy==1.26.4 # datasets # decord # evaluate + # fastparquet + # genai-perf # librosa # matplotlib # mistral-common @@ -256,15 +271,18 @@ numpy==1.26.4 # numexpr # opencv-python-headless # pandas + # patsy # peft # rouge-score # sacrebleu # scikit-learn # scipy # soxr + # statsmodels # tensorizer # torchvision # transformers + # tritonclient nvidia-cublas-cu12==12.4.5.8 # via # nvidia-cudnn-cu12 @@ -306,30 +324,39 @@ packaging==24.1 # datamodel-code-generator # datasets # evaluate + # fastparquet # huggingface-hub # lazy-loader # matplotlib # peft + # plotly # pooch # pytest # pytest-rerunfailures # ray + # statsmodels # transformers # typepy pandas==2.2.3 # via # datasets # evaluate + # fastparquet + # genai-perf + # statsmodels pathspec==0.12.1 # via black pathvalidate==3.2.1 # via pytablewriter +patsy==1.0.1 + # via statsmodels peft==0.13.2 # via # -r requirements-test.in # lm-eval pillow==10.4.0 # via + # genai-perf # matplotlib # mistral-common # sentence-transformers @@ -338,6 +365,8 @@ platformdirs==4.3.6 # via # black # pooch +plotly==5.24.1 + # via genai-perf pluggy==1.5.0 # via pytest pooch==1.8.2 @@ -360,7 +389,9 @@ psutil==6.1.0 py==1.11.0 # via pytest-forked pyarrow==18.0.0 - # via datasets + # via + # datasets + # genai-perf pyasn1==0.6.1 # via rsa pybind11==2.13.6 @@ -373,6 +404,8 @@ pydantic[email]==2.9.2 # mistral-common pydantic-core==2.23.4 # via pydantic +pygments==2.18.0 + # via rich pyparsing==3.2.0 # via matplotlib pytablewriter==1.2.0 @@ -381,14 +414,18 @@ pytest==8.3.3 # via # -r requirements-test.in # buildkite-test-collector + # genai-perf # pytest-asyncio # pytest-forked + # pytest-mock # pytest-rerunfailures # pytest-shard pytest-asyncio==0.24.0 # via -r requirements-test.in pytest-forked==1.6.0 # via -r requirements-test.in +pytest-mock==3.14.0 + # via genai-perf pytest-rerunfailures==14.0 # via -r requirements-test.in pytest-shard==0.1.2 @@ -399,6 +436,8 @@ python-dateutil==2.9.0.post0 # matplotlib # pandas # typepy +python-rapidjson==1.20 + # via tritonclient pytz==2024.2 # via # pandas @@ -409,9 +448,11 @@ pyyaml==6.0.2 # awscli # datamodel-code-generator # datasets + # genai-perf # huggingface-hub # peft # ray + # responses # timm # transformers ray[adag]==2.40.0 @@ -438,8 +479,13 @@ requests==2.32.3 # mistral-common # pooch # ray + # responses # tiktoken # transformers +responses==0.25.3 + # via genai-perf +rich==13.9.4 + # via genai-perf rouge-score==0.1.2 # via lm-eval rpds-py==0.20.1 @@ -470,6 +516,7 @@ scipy==1.13.1 # librosa # scikit-learn # sentence-transformers + # statsmodels sentence-transformers==3.2.1 # via -r requirements-test.in sentencepiece==0.2.0 @@ -490,6 +537,8 @@ soxr==0.5.0.post1 # via librosa sqlitedict==2.1.0 # via lm-eval +statsmodels==0.14.4 + # via genai-perf sympy==1.13.1 # via torch tabledata==1.3.3 @@ -499,7 +548,9 @@ tabulate==0.9.0 tcolorpy==0.1.6 # via pytablewriter tenacity==9.0.0 - # via lm-eval + # via + # lm-eval + # plotly tensorizer==2.9.0 # via -r requirements-test.in threadpoolctl==3.5.0 @@ -540,6 +591,7 @@ tqdm-multiprocess==0.0.11 # via lm-eval transformers==4.47.0 # via + # genai-perf # lm-eval # peft # sentence-transformers @@ -548,6 +600,10 @@ transformers-stream-generator==0.0.5 # via -r requirements-test.in triton==3.1.0 # via torch +tritonclient==2.51.0 + # via + # -r requirements-test.in + # genai-perf typepy[datetime]==1.3.2 # via # dataproperty @@ -555,6 +611,7 @@ typepy[datetime]==1.3.2 # tabledata typing-extensions==4.12.2 # via + # bitsandbytes # 
huggingface-hub # librosa # mistral-common @@ -563,10 +620,12 @@ typing-extensions==4.12.2 # torch tzdata==2024.2 # via pandas -urllib3==1.26.20 +urllib3==2.2.3 # via # botocore # requests + # responses + # tritonclient word2number==1.1 # via lm-eval xxhash==3.5.0 diff --git a/setup.py b/setup.py index 7dfcec7f9f0c5..dde5072139660 100644 --- a/setup.py +++ b/setup.py @@ -301,6 +301,7 @@ def run(self) -> None: "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so", "vllm/vllm_flash_attn/flash_attn_interface.py", "vllm/vllm_flash_attn/__init__.py", + "vllm/cumem_allocator.abi3.so", # "vllm/_version.py", # not available in nightly wheels yet ] file_members = filter(lambda x: x.filename in files_to_copy, @@ -472,13 +473,9 @@ def get_gaudi_sw_version(): def get_vllm_version() -> str: - # TODO: Revisit this temporary approach: https://github.com/vllm-project/vllm/issues/9182#issuecomment-2404860236 - try: - version = get_version( - write_to="vllm/_version.py", # TODO: move this to pyproject.toml - ) - except LookupError: - version = "0.0.0" + version = get_version( + write_to="vllm/_version.py", # TODO: move this to pyproject.toml + ) sep = "+" if "+" not in version else "." # dev versions might contain + @@ -553,7 +550,7 @@ def _read_requirements(filename: str) -> List[str]: return resolved_requirements if _no_device(): - requirements = _read_requirements("requirements-cuda.txt") + requirements = _read_requirements("requirements-cpu.txt") elif _is_cuda(): requirements = _read_requirements("requirements-cuda.txt") cuda_major, cuda_minor = torch.version.cuda.split(".") @@ -598,6 +595,7 @@ def _read_requirements(filename: str) -> List[str]: if _is_cuda(): ext_modules.append( CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c")) + ext_modules.append(CMakeExtension(name="vllm.cumem_allocator")) if _build_custom_ops(): ext_modules.append(CMakeExtension(name="vllm._C")) diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py new file mode 100644 index 0000000000000..53f4ef08f36a2 --- /dev/null +++ b/tests/basic_correctness/test_cumem.py @@ -0,0 +1,112 @@ +import torch + +from vllm import LLM, SamplingParams +from vllm.device_allocator.cumem import CuMemAllocator +from vllm.utils import GiB_bytes + +from ..utils import fork_new_process_for_each_test + + +@fork_new_process_for_each_test +def test_basic_cumem(): + # some tensors from default memory pool + shape = (1024, 1024) + x = torch.empty(shape, device='cuda') + x.zero_() + + # some tensors from custom memory pool + allocator = CuMemAllocator.get_instance() + with allocator.use_memory_pool(): + # custom memory pool + y = torch.empty(shape, device='cuda') + y.zero_() + y += 1 + z = torch.empty(shape, device='cuda') + z.zero_() + z += 2 + + # they can be used together + output = x + y + z + assert torch.allclose(output, torch.ones_like(output) * 3) + + free_bytes = torch.cuda.mem_get_info()[0] + allocator.sleep() + free_bytes_after_sleep = torch.cuda.mem_get_info()[0] + assert free_bytes_after_sleep > free_bytes + allocator.wake_up() + + # they can be used together + output = x + y + z + assert torch.allclose(output, torch.ones_like(output) * 3) + + +@fork_new_process_for_each_test +def test_cumem_with_cudagraph(): + allocator = CuMemAllocator.get_instance() + with allocator.use_memory_pool(): + weight = torch.eye(1024, device='cuda') + with allocator.use_memory_pool(tag="discard"): + cache = torch.empty(1024, 1024, device='cuda') + + def model(x): + out = x @ weight + cache[:out.size(0)].copy_(out) + return out + 
1 + + x = torch.empty(128, 1024, device='cuda') + + # warmup + model(x) + + # capture cudagraph + model_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(model_graph): + y = model(x) + + free_bytes = torch.cuda.mem_get_info()[0] + allocator.sleep() + free_bytes_after_sleep = torch.cuda.mem_get_info()[0] + assert free_bytes_after_sleep > free_bytes + allocator.wake_up() + + # after waking up, the content in the weight tensor + # should be restored, but the content in the cache tensor + # should be discarded + + # this operation is also compatible with cudagraph + + x.random_() + model_graph.replay() + + # cache content is as expected + assert torch.allclose(x, cache[:x.size(0)]) + + # output content is as expected + assert torch.allclose(y, x + 1) + + +@fork_new_process_for_each_test +def test_end_to_end(): + free, total = torch.cuda.mem_get_info() + used_bytes_baseline = total - free # in case another process is running + llm = LLM("meta-llama/Llama-3.2-1B", enable_sleep_mode=True) + prompt = "How are you?" + sampling_params = SamplingParams(temperature=0, max_tokens=10) + output = llm.generate(prompt, sampling_params) + + # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, + # which is difficult to measure in the test. Therefore, we only + # test sleep level 1 here. + llm.sleep(level=1) + + free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline + # now the memory usage is mostly cudagraph memory pool, + # and it should be less than the model weights (1B model, 2GiB weights) + assert used_bytes < 2 * GiB_bytes + + llm.wake_up() + output2 = llm.generate(prompt, sampling_params) + + # compare outputs + assert output[0].outputs[0].text == output2[0].outputs[0].text diff --git a/tests/conftest.py b/tests/conftest.py index 95af4ac1eb17b..279c1bf9a3776 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -244,6 +244,7 @@ def video_assets() -> _VideoAssets: _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict) +_R = TypeVar("_R") class HfRunner: @@ -930,6 +931,10 @@ def score( req_outputs = self.model.score(text_1, text_2) return [req_output.outputs.score for req_output in req_outputs] + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + executor = self.model.llm_engine.model_executor + return executor.apply_model(func) + def __enter__(self): return self diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index 29ac3a3c86cb4..6642174c17d8b 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -796,6 +796,44 @@ def test_find_cached_blocks_prefix(): block_hashes=block_hashes_seq1) assert len(cached_blocks) == len(blocks_seq1) - num_evicted_blocks + # Test reset prefix cache + @staticmethod + @pytest.mark.parametrize("num_blocks", [10]) + @pytest.mark.parametrize("block_size", [16]) + def test_reset_prefix_cache(num_blocks: int, block_size: int): + """This test case simulates the case of resetting the prefix cache.""" + + allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, + block_size=block_size) + token_ids = list(range(3 * block_size)) + + first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( + block_size=block_size, + token_ids=token_ids, + allocator=allocator, + ) + second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( + block_size=block_size, + token_ids=token_ids, + allocator=allocator, + ) + + #
Free each block in the first chain. + for block in first_chain: + allocator.free(block) + + # Failed to reset prefix cache because some blocks are not freed yet. + assert not allocator.reset_prefix_cache() + assert allocator.get_prefix_cache_hit_rate() > 0.0 + + # Free each block in the second chain. + for block in second_chain: + allocator.free(block) + + # Reset prefix cache. + assert allocator.reset_prefix_cache() + assert allocator.get_prefix_cache_hit_rate() == 0.0 + @staticmethod def create_immutable_chain( block_size: int, diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py new file mode 100644 index 0000000000000..7aa03d7f0402a --- /dev/null +++ b/tests/distributed/test_torchrun_example.py @@ -0,0 +1,56 @@ +# unit test for `examples/offline_inference/torchrun_example.py` + +import random + +import torch.distributed as dist + +from vllm import LLM, SamplingParams +from vllm.distributed.parallel_state import get_world_group + +# Create prompts +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# set different `gpu_memory_utilization` and `swap_space` for different ranks, +# to test if all ranks agree on the same kv cache configuration. +llm = LLM(model="facebook/opt-125m", + tensor_parallel_size=2, + distributed_executor_backend="external_launcher", + gpu_memory_utilization=random.uniform(0.7, 0.9), + swap_space=random.randint(1, 4)) + +outputs = llm.generate(prompts, sampling_params) + +cpu_group = get_world_group().cpu_group + +torch_rank = dist.get_rank(group=cpu_group) + + +def test_consistent_across_ranks(obj): + if torch_rank == 0: + dist.broadcast_object_list([obj], src=0, group=cpu_group) + else: + container = [None] + dist.broadcast_object_list(container, src=0, group=cpu_group) + assert container[0] == obj + + +test_consistent_across_ranks( + llm.llm_engine.vllm_config.cache_config.num_cpu_blocks) +test_consistent_across_ranks( + llm.llm_engine.vllm_config.cache_config.num_gpu_blocks) + +# all ranks should have the same outputs +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + test_consistent_across_ranks(prompt) + test_consistent_across_ranks(generated_text) + print(f"Rank {torch_rank}, Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_custom_executor.py index 2a057ca488a50..0e33f3662da82 100644 --- a/tests/engine/test_custom_executor.py +++ b/tests/engine/test_custom_executor.py @@ -1,6 +1,6 @@ import asyncio import os -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import pytest @@ -18,7 +18,7 @@ class Mock: class CustomUniExecutor(UniProcExecutor): def collective_rpc(self, - method: str, + method: Union[str, Callable], timeout: Optional[float] = None, args: Tuple = (), kwargs: Optional[Dict] = None) -> List[Any]: @@ -51,7 +51,9 @@ def test_custom_executor(model, tmp_path): assert not os.path.exists(".marker") engine_args = EngineArgs( - model=model, distributed_executor_backend=CustomUniExecutor) + model=model, + distributed_executor_backend=CustomUniExecutor, + ) engine = LLMEngine.from_engine_args(engine_args) sampling_params = SamplingParams(max_tokens=1) diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py index 
db70a808c008b..04505fcaae24b 100644 --- a/tests/engine/test_multiproc_workers.py +++ b/tests/engine/test_multiproc_workers.py @@ -22,7 +22,7 @@ def worker_method(self, worker_input: Any) -> Tuple[int, Any]: # simulate error case raise worker_input - return self.rank, input + return self.rpc_rank, input def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]: diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py new file mode 100644 index 0000000000000..22473ce275295 --- /dev/null +++ b/tests/entrypoints/llm/test_collective_rpc.py @@ -0,0 +1,36 @@ +import pytest + +from vllm import LLM + +from ...utils import fork_new_process_for_each_test + + +@pytest.mark.parametrize("tp_size", [1, 2]) +@pytest.mark.parametrize("backend", ["mp", "ray"]) +@fork_new_process_for_each_test +def test_collective_rpc(tp_size, backend): + if tp_size == 1 and backend == "ray": + pytest.skip("Skip duplicate test case") + if tp_size == 1: + backend = None + + # intentionally define the method and class in the test function, + # to test if they can be serialized and sent to the workers + def echo_rank(self): + return self.rank + + from vllm.worker.worker import Worker + + class MyWorker(Worker): + + def echo_rank(self): + return self.rank + + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + load_format="dummy", + tensor_parallel_size=tp_size, + distributed_executor_backend=backend, + worker_cls=MyWorker) + for method in ["echo_rank", echo_rank]: + assert llm.collective_rpc(method) == list(range(tp_size)) diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index 41163809237e9..3906ad766e0b6 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -105,3 +105,10 @@ def test_multiple_pooling_params(llm: LLM): # pooling_params is None, default params should be applied outputs = llm.encode(PROMPTS, pooling_params=None) assert len(PROMPTS) == len(outputs) + + +@pytest.mark.skip_global_cleanup +def test_right_side_truncation(llm: LLM): + # Embeddings models should truncate the end of the prompt + tokenizer = llm.get_tokenizer() + assert tokenizer.truncation_side == "right" diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index 46a064f6d9e68..6ff99f6faa143 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -17,6 +17,33 @@ # generation quality here LORA_NAME = "typeof/zephyr-7b-beta-lora" +BADREQUEST_CASES = [ + ( + "test_rank", + { + "r": 1024 + }, + "is greater than max_lora_rank", + ), + ( + "test_bias", + { + "bias": "all" + }, + "Adapter bias cannot be used without bias_enabled", + ), + ("test_dora", { + "use_dora": True + }, "does not yet support DoRA"), + ( + "test_modules_to_save", + { + "modules_to_save": ["lm_head"] + }, + "only supports modules_to_save being None", + ), +] + @pytest.fixture(scope="module") def zephyr_lora_files(): @@ -138,32 +165,36 @@ async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_dynamic_lora_invalid_lora_rank(client: openai.AsyncOpenAI, - tmp_path, zephyr_lora_files): - invalid_rank = tmp_path / "invalid_rank" - - # Copy adapter from zephyr_lora_files to invalid_rank - shutil.copytree(zephyr_lora_files, invalid_rank) - - with open(invalid_rank / "adapter_config.json") as f: +@pytest.mark.parametrize("test_name,config_change,expected_error", + 
BADREQUEST_CASES) +async def test_dynamic_lora_badrequests(client: openai.AsyncOpenAI, tmp_path, + zephyr_lora_files, test_name: str, + config_change: dict, + expected_error: str): + # Create test directory + test_dir = tmp_path / test_name + + # Copy adapter files + shutil.copytree(zephyr_lora_files, test_dir) + + # Load and modify configuration + config_path = test_dir / "adapter_config.json" + with open(config_path) as f: adapter_config = json.load(f) + # Apply configuration changes + adapter_config.update(config_change) - print(adapter_config) - - # assert False - - # Change rank to invalid value - adapter_config["r"] = 1024 - with open(invalid_rank / "adapter_config.json", "w") as f: + # Save modified configuration + with open(config_path, "w") as f: json.dump(adapter_config, f) - with pytest.raises(openai.BadRequestError, - match="is greater than max_lora_rank"): + # Test loading the adapter + with pytest.raises(openai.BadRequestError, match=expected_error): await client.post("load_lora_adapter", cast_to=str, body={ - "lora_name": "invalid-json", - "lora_path": str(invalid_rank) + "lora_name": test_name, + "lora_path": str(test_dir) }) diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index a803ea4a8d6ad..06e0f93dbe269 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -12,6 +12,9 @@ def server(): args = [ "--enforce-eager", + # Will be used on tests to compare prompt input length + "--max-model-len", + "100" ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -20,8 +23,7 @@ def server(): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_text_1_str_text_2_list(server: RemoteOpenAIServer, - model_name: str): +def test_text_1_str_text_2_list(server: RemoteOpenAIServer, model_name: str): text_1 = "What is the capital of France?" text_2 = [ "The capital of Brazil is Brasilia.", "The capital of France is Paris." @@ -45,8 +47,7 @@ async def test_text_1_str_text_2_list(server: RemoteOpenAIServer, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_text_1_list_text_2_list(server: RemoteOpenAIServer, - model_name: str): +def test_text_1_list_text_2_list(server: RemoteOpenAIServer, model_name: str): text_1 = [ "What is the capital of the United States?", "What is the capital of France?" @@ -73,8 +74,7 @@ async def test_text_1_list_text_2_list(server: RemoteOpenAIServer, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_text_1_str_text_2_str(server: RemoteOpenAIServer, - model_name: str): +def test_text_1_str_text_2_str(server: RemoteOpenAIServer, model_name: str): text_1 = "What is the capital of France?" text_2 = "The capital of France is Paris." @@ -91,3 +91,36 @@ async def test_text_1_str_text_2_str(server: RemoteOpenAIServer, assert score.data is not None assert len(score.data) == 1 assert score.data[0].score >= 0.9 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_score_max_model_len(server: RemoteOpenAIServer, model_name: str): + + text_1 = "What is the capital of France?" * 20 + text_2 = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." 
+ ] + + score_response = requests.post(server.url_for("score"), + json={ + "model": model_name, + "text_1": text_1, + "text_2": text_2, + }) + assert score_response.status_code == 400 + # Assert just a small fragment of the response + assert "Please reduce the length of the input." in \ + score_response.text + + # Test truncation + score_response = requests.post(server.url_for("score"), + json={ + "model": model_name, + "text_1": text_1, + "text_2": text_2, + "truncate_prompt_tokens": 101 + }) + assert score_response.status_code == 400 + assert "Please, select a smaller truncation size." in \ + score_response.text diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 8f242df4a60e3..513b466c10d60 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -754,6 +754,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): ("template_chatglm.jinja", "string"), ("template_chatglm2.jinja", "string"), ("template_chatml.jinja", "string"), + ("template_deepseek_vl2.jinja", "string"), ("template_falcon_180b.jinja", "string"), ("template_falcon.jinja", "string"), ("template_inkbot.jinja", "string"), diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 3e3c0668198ad..124d5d297a574 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -31,9 +31,9 @@ NUM_PREFILL_SEQS = [3] # Arbitrary values for testing NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing -# FlashAttention forward only supports head dimension at most 128 -# https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62 -HEAD_SIZES = [64, 80, 120, 256] +# This should be kept in sync with get_supported_head_sizes() in +# vllm.attention.ops.paged_attn.PagedAttention +HEAD_SIZES = [32, 64, 80, 96, 112, 120, 128, 192, 256] BLOCK_SIZES = [16, 32] USE_ALIBI = [False, True] diff --git a/tests/kernels/test_triton_scaled_mm.py b/tests/kernels/test_triton_scaled_mm.py index 8e96a2f70d751..a5aab3c2ea4b0 100644 --- a/tests/kernels/test_triton_scaled_mm.py +++ b/tests/kernels/test_triton_scaled_mm.py @@ -39,6 +39,23 @@ def get_8bit_types(): return types +# This test checks for regressions in int8 support on ROCm.
+@pytest.mark.parametrize("model_path", [ + "neuralmagic/Llama-3.2-1B-quantized.w8a8", +]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [10]) +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="Should only run on ROCm") +def test_rocm_compressed_tensors_w8a8(vllm_runner, example_prompts, model_path, + max_tokens, num_logprobs): + dtype = "bfloat16" + + with vllm_runner(model_path, dtype=dtype) as vllm_model: + vllm_model.generate_greedy_logprobs(example_prompts, max_tokens, + num_logprobs) + + @pytest.mark.parametrize("M", [1, 33, 64, 512]) @pytest.mark.parametrize("N", [256, 971, 20486]) @pytest.mark.parametrize("K", [128, 496, 1024]) diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 537d95b025a9d..b907af47d08d7 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -3,6 +3,7 @@ import pytest from vllm.lora.models import LoRAModel +from vllm.lora.peft_helper import PEFTHelper from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM from vllm.model_executor.models.utils import WeightsMapper @@ -30,11 +31,14 @@ def test_load_checkpoints( else: expected_lora_modules.append(module) if lora_name == "baichuan7B": + peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files, + max_position_embeddings=4096) # For the baichuan7B model, load it's LoRA, # and the test should pass. LoRAModel.from_local_checkpoint( baichuan_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, @@ -43,9 +47,12 @@ def test_load_checkpoints( # Test that the target_modules contain prefix # such as "model.layers.0.self_atten.W_pack", and # the test should pass. + peft_helper = PEFTHelper.from_local_dir(baichuan_zero_lora_files, + max_position_embeddings=4096) LoRAModel.from_local_checkpoint( baichuan_zero_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, @@ -53,9 +60,12 @@ def test_load_checkpoints( elif lora_name == "baichuan7B-zero-regex": # Test that the `target_modules` in the form of regular expressions, # such as `model\\..*(W_pack|o_proj)`, and the test should pass. + peft_helper = PEFTHelper.from_local_dir(baichuan_regex_lora_files, + max_position_embeddings=4096) LoRAModel.from_local_checkpoint( baichuan_regex_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, @@ -64,10 +74,13 @@ def test_load_checkpoints( # For the baichuan7B model, load chatglm3-6b's LoRA, # and the test should raise the following error. 
expected_error = "Please verify that the loaded LoRA module is correct" # noqa: E501 + peft_helper = PEFTHelper.from_local_dir(chatglm3_lora_files, + max_position_embeddings=4096) with pytest.raises(ValueError, match=expected_error): LoRAModel.from_local_checkpoint( chatglm3_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, @@ -94,9 +107,12 @@ def test_lora_weights_mapping(baichuan_lora_files): ".layers.": ".baichuan_layers.", }, ) + peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files, + max_position_embeddings=4096) lora_model = LoRAModel.from_local_checkpoint( baichuan_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index e2daf9d135113..1c0ee01c038d0 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -3,6 +3,7 @@ import pytest from vllm.lora.models import LoRAModel +from vllm.lora.peft_helper import PEFTHelper from vllm.lora.utils import get_adapter_absolute_path from vllm.model_executor.models.llama import LlamaForCausalLM @@ -27,9 +28,11 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request): lora_path = get_adapter_absolute_path(lora_name) # lora loading should work for either absolute path and hugggingface id. + peft_helper = PEFTHelper.from_local_dir(lora_path, 4096) lora_model = LoRAModel.from_local_checkpoint( lora_path, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index ca523c66abe42..9a5b9aabf5078 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -1,5 +1,3 @@ -import json -import math import os from typing import Dict, List @@ -34,56 +32,6 @@ ] if current_platform.is_cuda_alike() else ["cpu"]) -def test_peft_helper(sql_lora_files): - lora_config_path = os.path.join(sql_lora_files, "adapter_config.json") - with open(lora_config_path) as f: - config = json.load(f) - peft_helper = PEFTHelper.from_dict(config) - assert peft_helper.r == 8 - assert peft_helper.lora_alpha == 16 - assert peft_helper.target_modules == [ - "q_proj", - "v_proj", - "k_proj", - "o_proj", - "gate_proj", - "up_proj", - "down_proj", - "embed_tokens", - "lm_head", - ] - scaling = peft_helper.lora_alpha / peft_helper.r - assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 - - # test RSLoRA - config = dict(r=8, - lora_alpha=16, - target_modules=["gate_proj"], - use_rslora=True) - peft_helper = PEFTHelper.from_dict(config) - - scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r) - assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 - - expected_error = "vLLM only supports modules_to_save being None." - with pytest.raises(ValueError, match=expected_error): - config = dict( - r=8, - lora_alpha=16, - target_modules=["gate_proj"], - modules_to_save=["lm_head"], - ) - PEFTHelper.from_dict(config) - - expected_error = "vLLM does not yet support DoRA." 
- with pytest.raises(ValueError, match=expected_error): - config = dict(r=8, - lora_alpha=16, - target_modules=["gate_proj"], - use_dora=True) - PEFTHelper.from_dict(config) - - @pytest.mark.parametrize("device", DEVICES) def test_from_lora_tensors(sql_lora_files, device): tensors = load_file( @@ -91,11 +39,8 @@ def test_from_lora_tensors(sql_lora_files, device): new_embeddings = load_file( os.path.join(sql_lora_files, "new_embeddings.safetensors")) - lora_config_path = os.path.join(sql_lora_files, "adapter_config.json") - with open(lora_config_path) as f: - config = json.load(f) - - peft_helper = PEFTHelper.from_dict(config) + peft_helper = PEFTHelper.from_local_dir(sql_lora_files, + max_position_embeddings=4096) lora_model = LoRAModel.from_lora_tensors( 1, tensors, diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py new file mode 100644 index 0000000000000..a524d5ce5f34a --- /dev/null +++ b/tests/lora/test_peft_helper.py @@ -0,0 +1,109 @@ +import json +import math +import shutil + +import pytest + +from vllm.config import LoRAConfig +from vllm.lora.peft_helper import PEFTHelper + +ERROR_CASES = [ + ( + "test_rank", + { + "r": 1024 + }, + "is greater than max_lora_rank", + ), + ( + "test_bias", + { + "bias": "all" + }, + "Adapter bias cannot be used without bias_enabled", + ), + ("test_dora", { + "use_dora": True + }, "does not yet support DoRA"), + ( + "test_modules_to_save", + { + "modules_to_save": ["lm_head"] + }, + "only supports modules_to_save being None", + ), +] + + +def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path): + peft_helper = PEFTHelper.from_local_dir(long_context_lora_files_16k_1, + max_position_embeddings=4096) + lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2) + peft_helper.validate_legal(lora_config) + assert peft_helper.r == 8 + assert peft_helper.lora_alpha == 16 + assert peft_helper.target_modules == [ + "q_proj", + "v_proj", + "k_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "embed_tokens", + "lm_head", + ] + assert peft_helper.context_length == 16384 + assert peft_helper.vllm_max_position_embeddings == 4096 + assert peft_helper.vllm_long_context_scaling_factor == float( + math.ceil(peft_helper.context_length / + peft_helper.vllm_max_position_embeddings)) + # test RSLoRA + rslora_config = dict(use_rslora=True) + test_dir = tmp_path / "test_rslora" + shutil.copytree(long_context_lora_files_16k_1, test_dir) + + # Load and modify configuration + config_path = test_dir / "adapter_config.json" + with open(config_path) as f: + adapter_config = json.load(f) + # Apply configuration changes + adapter_config.update(rslora_config) + + # Save modified configuration + with open(config_path, "w") as f: + json.dump(adapter_config, f) + + peft_helper = PEFTHelper.from_local_dir(test_dir, + max_position_embeddings=4096) + peft_helper.validate_legal(lora_config) + scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r) + assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 + + +@pytest.mark.parametrize("test_name,config_change,expected_error", ERROR_CASES) +def test_peft_helper_error( + sql_lora_files, + tmp_path, + test_name: str, + config_change: dict, + expected_error: str, +): + test_dir = tmp_path / test_name + shutil.copytree(sql_lora_files, test_dir) + + # Load and modify configuration + config_path = test_dir / "adapter_config.json" + with open(config_path) as f: + adapter_config = json.load(f) + # Apply configuration changes + adapter_config.update(config_change) + + # Save 
modified configuration + with open(config_path, "w") as f: + json.dump(adapter_config, f) + lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2) + # Test loading the adapter + with pytest.raises(ValueError, match=expected_error): + PEFTHelper.from_local_dir( + test_dir, max_position_embeddings=4096).validate_legal(lora_config) diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 0609fd96825e3..9c1f784c1c93b 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -25,13 +25,12 @@ def test_model_loading_with_params(vllm_runner): with vllm_runner(model_name=MODEL_NAME, revision=REVISION, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about a robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - model_config = model.model.llm_engine.model_config - - model_tokenizer = model.model.llm_engine.tokenizer + model_config = vllm_model.model.llm_engine.model_config + model_tokenizer = vllm_model.model.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -46,11 +45,13 @@ def test_model_loading_with_params(vllm_runner): assert model_tokenizer.tokenizer_config["do_lower_case"] assert model_tokenizer.tokenizer.model_max_length == 512 - model = model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert isinstance(model, BertEmbeddingModel) - assert model._pooler.pooling_type == PoolingType.CLS - assert model._pooler.normalize + def check_model(model): + assert isinstance(model, BertEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.CLS + assert model._pooler.normalize + + vllm_model.apply_model(check_model) + # assert output assert output @@ -64,13 +65,12 @@ def test_roberta_model_loading_with_params(vllm_runner): with vllm_runner(model_name=MODEL_NAME_ROBERTA, revision=REVISION_ROBERTA, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about a robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - model_config = model.model.llm_engine.model_config - - model_tokenizer = model.model.llm_engine.tokenizer + model_config = vllm_model.model.llm_engine.model_config + model_tokenizer = vllm_model.model.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -84,11 +84,12 @@ def test_roberta_model_loading_with_params(vllm_runner): assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-large" assert not model_tokenizer.tokenizer_config["do_lower_case"] - model = model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert isinstance(model, RobertaEmbeddingModel) - assert model._pooler.pooling_type == PoolingType.MEAN - assert model._pooler.normalize + def check_model(model): + assert isinstance(model, RobertaEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.MEAN + assert model._pooler.normalize + + vllm_model.apply_model(check_model) # assert output assert output @@ -103,17 +104,18 @@ def 
test_facebook_roberta_model_loading_with_params(vllm_runner): model_name = "FacebookAI/roberta-base" with vllm_runner(model_name=model_name, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about a robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - model_tokenizer = model.model.llm_engine.tokenizer + model_tokenizer = vllm_model.model.llm_engine.tokenizer assert model_tokenizer.tokenizer_id == model_name - model = model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert not hasattr(model, "lm_head") - assert isinstance(model, RobertaEmbeddingModel) - assert isinstance(model._pooler, CLSPool) + def check_model(model): + assert isinstance(model, RobertaEmbeddingModel) + assert not hasattr(model, "lm_head") + assert isinstance(model._pooler, CLSPool) + + vllm_model.apply_model(check_model) assert output diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index 81b93ebdf0fc0..ad8f8a0c320e9 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -66,12 +66,16 @@ def gguf_model(self): gguf_filename="starcoder2-3b.Q6_K.gguf", ) +DOLPHIN_CONFIG = GGUFTestConfig( + # Test VocabParallelEmbedding sharding issue. + original_model="cognitivecomputations/TinyDolphin-2.8-1.1b", + gguf_repo="tsunemoto/TinyDolphin-2.8-1.1b-GGUF", + gguf_filename="tinydolphin-2.8-1.1b.Q6_K.gguf", +) + MODELS = [ - LLAMA_CONFIG, - QWEN2_CONFIG, - PHI3_CONFIG, - GPT2_CONFIG, - STABLELM_CONFIG, + LLAMA_CONFIG, QWEN2_CONFIG, PHI3_CONFIG, GPT2_CONFIG, STABLELM_CONFIG, + DOLPHIN_CONFIG # STARCODER_CONFIG, # broken ] @@ -106,15 +110,18 @@ def test_models( messages, tokenize=False, add_generation_prompt=True) # Run unquantized model. - with vllm_runner(model_name=model.original_model, - dtype=dtype, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=tp_size) as original_model: + with vllm_runner( + model_name=model.original_model, + enforce_eager=True, # faster tests + dtype=dtype, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=tp_size) as original_model: original_outputs = original_model.generate_greedy_logprobs( example_prompts[:-1], max_tokens, num_logprobs) # Run gguf model. with vllm_runner(model_name=model.gguf_model, + enforce_eager=True, tokenizer_name=model.original_model, dtype=dtype, max_model_len=MAX_MODEL_LEN, diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py index 057b04349e8b7..2e06b10fbb827 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -33,10 +33,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. 
- model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index 06739e8f02253..1ad4f5aae8f5b 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -51,10 +51,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 4e110366a09f3..c7efa4edbbc0a 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -73,10 +73,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) check_logprobs_close( outputs_0_lst=hf_outputs, diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 7620ed1107e8f..14d9a739be318 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -9,7 +9,7 @@ import pytest from transformers import AutoModelForVision2Seq -from transformers.utils import is_flash_attn_2_available +from transformers import __version__ as TRANSFORMERS_VERSION from vllm.platforms import current_platform from vllm.utils import identity @@ -139,9 +139,7 @@ #### Extended model tests "aria": VLMTestInfo( models=["rhymes-ai/Aria"], - tokenizer_mode="slow", test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - dtype="bfloat16", prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 img_idx_to_prompt=lambda idx: "<|img|>\n", max_model_len=4096, @@ -157,8 +155,8 @@ max_tokens=64, marks=[ pytest.mark.skipif( - not is_flash_attn_2_available(), - reason="Model needs flash-attn for numeric convergence.", + TRANSFORMERS_VERSION < "4.48.0", + reason="HF model requires transformers>=4.48.0", ), large_gpu_mark(min_gb=64), ], @@ -189,30 +187,27 @@ dtype="bfloat16", ), "deepseek_vl_v2": VLMTestInfo( - models=["deepseek-ai/deepseek-vl2-small"], + models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - dtype="bfloat16", prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501 max_model_len=4096, max_num_seqs=2, single_image_prompts=IMAGE_ASSETS.prompts({ - "stop_sign": "\nWhat's the color of the stop sign and car?", - "cherry_blossom": "\nWhat's the color of the tower?", + "stop_sign": "\nWhat's the content in the center 
of the image?", # noqa: E501 + "cherry_blossom": "\nPlease infer the season with reason in details.", # noqa: E501 }), - multi_image_prompt="image_1:\nimage_2:\nDescribe the two images shortly.", # noqa: E501 + multi_image_prompt="image_1:\nimage_2:\nWhich image can we see the car and the tower?", # noqa: E501 vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}}, # noqa: E501 - image_size_factors=[(0.10, 0.15)], patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner, postprocess_inputs=model_utils.cast_dtype_post_processor("images"), hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output, stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501 - num_logprobs=5, + image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)], marks=[ pytest.mark.skipif( - not is_flash_attn_2_available(), - reason="Model needs flash-attn for numeric convergence.", - ), - large_gpu_mark(min_gb=48), + TRANSFORMERS_VERSION >= "4.48.0", + reason="HF model is not compatible with transformers>=4.48.0", + ) ], ), "fuyu": VLMTestInfo( diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index 16e256e040a74..5a485f3d81747 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -5,7 +5,6 @@ import torch from PIL import Image -from vllm.entrypoints.llm import LLM from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import rescale_video_size, sample_frames_from_video @@ -69,7 +68,7 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict): def batch_make_image_embeddings( image_batches: List[Union[Image.Image, List[Image.Image]]], processor, - llm: LLM) -> List[Qwen2VLPromptImageEmbeddingInput]: + llm: VllmRunner) -> List[Qwen2VLPromptImageEmbeddingInput]: """batched image embeddings for Qwen2-VL This will infer all images' embeddings in a single batch, @@ -105,17 +104,19 @@ def batch_make_image_embeddings( pixel_values = preprocess_result["pixel_values"] image_grid_thw = preprocess_result["image_grid_thw"] - # pixel values to embeddinds & grid_thws - with torch.no_grad(): - visual = llm.llm_engine.model_executor.driver_worker. 
\ - model_runner.model.visual + # pixel values to embeddings & grid_thws + def get_image_embeds(model): + with torch.no_grad(): + visual = model.visual - pixel_values_on_device = pixel_values.to(visual.device, - dtype=visual.dtype) - image_grid_thw_on_device = image_grid_thw.to(visual.device, - dtype=torch.int64) - image_embeds = visual(pixel_values_on_device, - grid_thw=image_grid_thw_on_device) + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + image_grid_thw_on_device = image_grid_thw.to(visual.device, + dtype=torch.int64) + return visual(pixel_values_on_device, + grid_thw=image_grid_thw_on_device) + + image_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches result: List[Qwen2VLPromptImageEmbeddingInput] = [] @@ -124,11 +125,10 @@ def batch_make_image_embeddings( for image_batch in image_batches_: cur_batch_image_count = len(image_batch) merge_size = image_processor.merge_size - cur_batch_embed_len = sum([ - grid_thw.prod() // merge_size // merge_size + cur_batch_embed_len = sum( + grid_thw.prod(-1) // merge_size // merge_size for grid_thw in image_grid_thw[image_counter:image_counter + - cur_batch_image_count] - ]) + cur_batch_image_count]) result.append({ "image_embeds": @@ -151,7 +151,7 @@ def batch_make_image_embeddings( def batch_make_video_embeddings( video_batches: PromptVideoInput, processor, - llm: LLM) -> List[Qwen2VLPromptVideoEmbeddingInput]: + llm: VllmRunner) -> List[Qwen2VLPromptVideoEmbeddingInput]: """batched video embeddings for Qwen2-VL A NDArray represents a single video's all frames. @@ -187,17 +187,19 @@ def batch_make_video_embeddings( pixel_values = preprocess_result["pixel_values_videos"] video_grid_thw = preprocess_result["video_grid_thw"] - # pixel values to embeddinds & grid_thws - with torch.no_grad(): - visual = llm.llm_engine.model_executor.driver_worker.\ - model_runner.model.visual + # pixel values to embeddings & grid_thws + def get_image_embeds(model): + with torch.no_grad(): + visual = model.visual + + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + video_grid_thw_on_device = video_grid_thw.to(visual.device, + dtype=torch.int64) + return visual(pixel_values_on_device, + grid_thw=video_grid_thw_on_device) - pixel_values_on_device = pixel_values.to(visual.device, - dtype=visual.dtype) - video_grid_thw_on_device = video_grid_thw.to(visual.device, - dtype=torch.int64) - video_embeds = visual(pixel_values_on_device, - grid_thw=video_grid_thw_on_device) + video_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches result: List[Qwen2VLPromptVideoEmbeddingInput] = [] @@ -206,11 +208,10 @@ def batch_make_video_embeddings( for video_batch in video_batches_: cur_batch_video_count = len(video_batch) merge_size = image_processor.merge_size - cur_batch_embed_len = sum([ - grid_thw.prod() // merge_size // merge_size + cur_batch_embed_len = sum( + grid_thw.prod(-1) // merge_size // merge_size for grid_thw in video_grid_thw[video_counter:video_counter + - cur_batch_video_count] - ]) + cur_batch_video_count]) result.append({ "video_embeds": @@ -280,9 +281,9 @@ def run_embedding_input_test( max_tokens, num_logprobs=num_logprobs, images=batch_make_image_embeddings( - images, processor, vllm_model.model) if images else None, + images, processor, vllm_model) if images else None, videos=batch_make_video_embeddings( - videos, processor, vllm_model.model) if videos else None) + videos, processor, vllm_model) if videos else None) for prompts, 
images, videos in inputs ] diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index 6673a9fc22f69..0cbe4afe96c0a 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -24,10 +24,13 @@ def test_classification_models( ) -> None: with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.classify(example_prompts) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) with hf_runner(model, dtype=dtype, diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 04ab4dd7371a3..e17198e385475 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -17,14 +17,15 @@ marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("sentence-transformers/all-MiniLM-L12-v2"), pytest.param("intfloat/multilingual-e5-large"), - # [Encoder-decoder] - pytest.param("intfloat/e5-mistral-7b-instruct", - marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + # [Decoder-only] pytest.param("BAAI/bge-multilingual-gemma2", marks=[pytest.mark.core_model]), - pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + pytest.param("intfloat/e5-mistral-7b-instruct", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"), + pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + # [Encoder-decoder] pytest.param("sentence-transformers/stsb-roberta-base-v2"), ], ) @@ -61,10 +62,13 @@ def test_models( max_model_len=None, **vllm_extra_kwargs) as vllm_model: vllm_outputs = vllm_model.encode(example_prompts) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. 
- model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) check_embeddings_close( embeddings_0_lst=hf_outputs, diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 0a38779e0e4f0..fe5b733c750a8 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -11,6 +11,7 @@ from vllm.multimodal.utils import cached_get_tokenizer from ....multimodal.utils import random_audio, random_image, random_video +from ...registry import HF_EXAMPLE_MODELS def _test_processing_correctness( @@ -20,10 +21,9 @@ def _test_processing_correctness( num_batches: int, simplify_rate: float, ): - if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3": - hf_overrides = {"architectures": ["MantisForConditionalGeneration"]} - else: - hf_overrides = {} + model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") limit_mm_per_prompt = { modality: 3 if supports_multi else 1 @@ -35,11 +35,11 @@ def _test_processing_correctness( task="auto", tokenizer=model_id, tokenizer_mode="auto", - trust_remote_code=True, + trust_remote_code=model_info.trust_remote_code, seed=0, dtype="float16", revision=None, - hf_overrides=hf_overrides, + hf_overrides=model_info.hf_overrides, limit_mm_per_prompt=limit_mm_per_prompt, ) @@ -139,6 +139,7 @@ def _test_processing_correctness( ("rhymes-ai/Aria", {"image": True}), ("Salesforce/blip2-opt-2.7b", {"image": False}), ("facebook/chameleon-7b", {"image": False}), + ("deepseek-ai/deepseek-vl2-tiny", {"image": True}), ("adept/fuyu-8b", {"image": False}), ("llava-hf/llava-1.5-7b-hf", {"image": True}), ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}), diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py index 1eec35d9c3c72..6de649f87204d 100644 --- a/tests/models/multimodal/processing/test_llava_next.py +++ b/tests/models/multimodal/processing/test_llava_next.py @@ -13,6 +13,67 @@ from ...utils import build_model_context +def _validate_image_max_tokens_one( + processor: BaseMultiModalProcessor, + max_tokens: int, + failed_size_excs: list[tuple[ImageSize, Exception]], + image_size: ImageSize, +) -> None: + info = processor.info + feature_size = info.get_num_image_tokens(image_width=image_size.width, + image_height=image_size.height) + + try: + assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}" + except Exception as exc: + failed_size_excs.append((image_size, exc)) + + +@pytest.mark.skip("This test takes around 5 minutes to run. 
" + "Comment this out to run it manually.") +@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) +def test_processor_max_tokens(model_id): + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": 1}, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) + info = processor.info + + seen_aspect_ratios = set[float]() + image_sizes = list[ImageSize]() + + # The aspect ratio of the grid layout is between 1 and 2 + # NOTE: Assumes that feature size calculation is the same if we + # swap the width and height of the image + for w, h in itertools.product(range(32, 4096), repeat=2): + aspect_ratio = w / h + if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios: + image_sizes.append(ImageSize(w, h)) + seen_aspect_ratios.add(aspect_ratio) + + failed_size_excs = list[tuple[ImageSize, Exception]]() + + validate_one = partial( + _validate_image_max_tokens_one, + processor, + info.get_max_image_tokens(), # type: ignore + failed_size_excs, + ) + pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes") + + if failed_size_excs: + msg = "Found failing image sizes:" \ + + "\n========\n".join(f"[{size}]\n{exc}" + for size, exc in failed_size_excs) + raise AssertionError(msg) + + def _validate_image_prompt_replacements_one( processor: BaseMultiModalProcessor, num_imgs: int, diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py index 94ea604c58b43..806437d35ec87 100644 --- a/tests/models/multimodal/processing/test_llava_onevision.py +++ b/tests/models/multimodal/processing/test_llava_onevision.py @@ -13,6 +13,68 @@ from ...utils import build_model_context +def _validate_image_max_tokens_one( + processor: BaseMultiModalProcessor, + max_tokens: int, + failed_size_excs: list[tuple[ImageSize, Exception]], + image_size: ImageSize, +) -> None: + info = processor.info + feature_size = info.get_num_image_tokens(image_width=image_size.width, + image_height=image_size.height) + + try: + assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}" + except Exception as exc: + failed_size_excs.append((image_size, exc)) + + +@pytest.mark.skip("This test takes around 5 minutes to run. 
" + "Comment this out to run it manually.") +@pytest.mark.parametrize("model_id", + ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) +def test_processor_max_tokens(model_id): + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": 1}, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) + info = processor.info + + seen_aspect_ratios = set[float]() + image_sizes = list[ImageSize]() + + # The aspect ratio of the grid layout is between 1 and 6 + # NOTE: Assumes that feature size calculation is the same if we + # swap the width and height of the image + for w, h in itertools.product(range(32, 4096), repeat=2): + aspect_ratio = w / h + if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios: + image_sizes.append(ImageSize(w, h)) + seen_aspect_ratios.add(aspect_ratio) + + failed_size_excs = list[tuple[ImageSize, Exception]]() + + validate_one = partial( + _validate_image_max_tokens_one, + processor, + info.get_max_image_tokens(), # type: ignore + failed_size_excs, + ) + pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes") + + if failed_size_excs: + msg = "Found failing image sizes:" \ + + "\n========\n".join(f"[{size}]\n{exc}" + for size, exc in failed_size_excs) + raise AssertionError(msg) + + def _validate_image_prompt_replacements_one( processor: BaseMultiModalProcessor, num_imgs: int, diff --git a/tests/models/registry.py b/tests/models/registry.py index b0f0f9767a90f..0bd06dea0ec7f 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -1,5 +1,9 @@ from dataclasses import dataclass, field -from typing import AbstractSet, Mapping, Optional +from typing import AbstractSet, Any, Literal, Mapping, Optional + +import pytest +from packaging.version import Version +from transformers import __version__ as TRANSFORMERS_VERSION @dataclass(frozen=True) @@ -38,6 +42,50 @@ class _HfExamplesInfo: trust_remote_code: bool = False """The ``trust_remote_code`` level required to load the model.""" + hf_overrides: dict[str, Any] = field(default_factory=dict) + """The ``hf_overrides`` required to load the model.""" + + def check_transformers_version( + self, + *, + on_fail: Literal["error", "skip"], + ) -> None: + """ + If the installed transformers version does not meet the requirements, + perform the given action. + """ + if self.min_transformers_version is None: + return + + current_version = TRANSFORMERS_VERSION + required_version = self.min_transformers_version + if Version(current_version) < Version(required_version): + msg = ( + f"You have `transformers=={current_version}` installed, but " + f"`transformers>={required_version}` is required to run this " + "model") + + if on_fail == "error": + raise RuntimeError(msg) + else: + pytest.skip(msg) + + def check_available_online( + self, + *, + on_fail: Literal["error", "skip"], + ) -> None: + """ + If the model is not available online, perform the given action. 
+ """ + if not self.is_available_online: + msg = "Model is not available online" + + if on_fail == "error": + raise RuntimeError(msg) + else: + pytest.skip(msg) + # yapf: disable _TEXT_GENERATION_EXAMPLE_MODELS = { @@ -48,8 +96,6 @@ class _HfExamplesInfo: trust_remote_code=True), "ArcticForCausalLM": _HfExamplesInfo("Snowflake/snowflake-arctic-instruct", trust_remote_code=True), - "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria", - trust_remote_code=True), "BaiChuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan-7B", trust_remote_code=True), "BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat", @@ -69,6 +115,7 @@ class _HfExamplesInfo: "DeepseekV3ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V3", # noqa: E501 trust_remote_code=True), "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501 + "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), "GemmaForCausalLM": _HfExamplesInfo("google/gemma-2b"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), @@ -154,6 +201,7 @@ class _HfExamplesInfo: "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"), "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), + "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B"), "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501 "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"), # noqa: E501 "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"), # noqa: E501 @@ -174,6 +222,8 @@ class _HfExamplesInfo: _MULTIMODAL_EXAMPLE_MODELS = { # [Decoder-only] + "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria", + min_transformers_version="4.48"), "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501 "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501 "ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b", @@ -181,8 +231,8 @@ class _HfExamplesInfo: trust_remote_code=True), "ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b", is_available_online=False), - # TODO(Isotr0py): Use deepseek-vl2-tiny for test after it's supported - "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-small"), # noqa: E501 + "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501 + hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501 "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"), "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", @@ -193,7 +243,8 @@ class _HfExamplesInfo: "LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"), # noqa: E501 "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 - "MantisForConditionalGeneration": _HfExamplesInfo("TIGER-Lab/Mantis-8B-siglip-llama3"), # noqa: E501 + "MantisForConditionalGeneration": _HfExamplesInfo("TIGER-Lab/Mantis-8B-siglip-llama3", # noqa: E501 + hf_overrides={"architectures": ["MantisForConditionalGeneration"]}), # noqa: E501 "MiniCPMV": 
_HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", trust_remote_code=True), "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924", @@ -210,7 +261,8 @@ class _HfExamplesInfo: trust_remote_code=True), "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 - "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3"), + "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3", + trust_remote_code=True), # [Encoder-decoder] "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501 @@ -246,5 +298,17 @@ def get_supported_archs(self) -> AbstractSet[str]: def get_hf_info(self, model_arch: str) -> _HfExamplesInfo: return self.hf_models[model_arch] + def find_hf_info(self, model_id: str) -> _HfExamplesInfo: + for info in self.hf_models.values(): + if info.default == model_id: + return info + + # Fallback to extras + for info in self.hf_models.values(): + if any(extra == model_id for extra in info.extras.values()): + return info + + raise ValueError(f"No example model defined for {model_id}") + HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index daece7c93c0ef..d3a3aaf670c23 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,9 +1,7 @@ from unittest.mock import patch import pytest -from packaging.version import Version from transformers import PretrainedConfig -from transformers import __version__ as TRANSFORMERS_VERSION from vllm import LLM @@ -13,16 +11,8 @@ @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) def test_can_initialize(model_arch): model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) - if not model_info.is_available_online: - pytest.skip("Model is not available online") - if model_info.min_transformers_version is not None: - current_version = TRANSFORMERS_VERSION - required_version = model_info.min_transformers_version - if Version(current_version) < Version(required_version): - pytest.skip( - f"You have `transformers=={current_version}` installed, but " - f"`transformers>={required_version}` is required to run this " - "model") + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") # Avoid OOM def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 73b70d65e8e0b..ac0366847e334 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -21,6 +21,9 @@ @pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs()) def test_registry_imports(model_arch): + model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) + model_info.check_transformers_version(on_fail="skip") + # Ensure all model classes can be imported successfully model_cls, _ = ModelRegistry.resolve_model_cls(model_arch) diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 9e58ed4cfde93..13f820d013e2a 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -7,12 +7,16 @@ from vllm.config import ModelConfig from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.processing import (PlaceholderInfo, PromptReplacement, 
+# yapf conflicts with isort for this block +# yapf: disable +from vllm.multimodal.processing import (PlaceholderFeaturesInfo, + PromptReplacement, find_mm_placeholders, find_text_matches, find_token_matches, iter_token_matches, replace_text_matches, replace_token_matches) +# yapf: enable from vllm.multimodal.profiling import MultiModalProfiler from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -433,19 +437,19 @@ def test_find_replace_tokens( [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918], { "pattern_1": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_1", item_idx=0, start_idx=6, - replacement=[32000, 32000], + tokens=[32000, 32000], ), ], "pattern_4": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_4", item_idx=0, start_idx=3, - replacement=[32000], + tokens=[32000], ), ], } @@ -455,25 +459,25 @@ def test_find_replace_tokens( [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550], { "pattern_1": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_1", item_idx=0, start_idx=1, - replacement=[32000, 32000], + tokens=[32000, 32000], ), - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_1", item_idx=1, start_idx=5, - replacement=[32000, 32000], + tokens=[32000, 32000], ), ], "pattern_3": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_3", item_idx=0, start_idx=7, - replacement=[1550, 918, 1550], + tokens=[1550, 918, 1550], ), ], # No match for pattern_4 as it has lower priority than pattern_1 @@ -483,33 +487,33 @@ def test_find_replace_tokens( [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550], { "pattern_1": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_1", item_idx=0, start_idx=1, - replacement=[32000, 32000], + tokens=[32000, 32000], ), - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_1", item_idx=1, start_idx=3, - replacement=[32000, 32000], + tokens=[32000, 32000], ), ], "pattern_4": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_4", item_idx=0, start_idx=5, - replacement=[32000], + tokens=[32000], ), ], "pattern_3": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_3", item_idx=0, start_idx=6, - replacement=[1550, 918, 1550], + tokens=[1550, 918, 1550], ), ], } diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 92436889ecffe..0cd86cef0a475 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -30,50 +30,55 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): model_path, strategy, quant_type, shape_0, is_symmetric = model_args with vllm_runner(model_path, enforce_eager=True) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - - qkv_proj = layer.self_attn.qkv_proj - o_proj = layer.self_attn.o_proj - gate_up_proj = layer.mlp.gate_up_proj - down_proj = layer.mlp.down_proj - - # assert zp for symmetric and asymmetric cases - def zp_valid(zp: Optional[torch.Tensor]): - if is_symmetric: - return zp is None - - return zp is not None and zp.dtype is torch.int32 - - assert zp_valid(qkv_proj.input_zero_point) - assert zp_valid(o_proj.input_zero_point) - assert zp_valid(gate_up_proj.input_zero_point) - assert zp_valid(down_proj.input_zero_point) - - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert 
isinstance(o_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(gate_up_proj.quant_method, - CompressedTensorsLinearMethod) - assert isinstance(down_proj.quant_method, - CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) - - assert qkv_proj.scheme.strategy == strategy - assert qkv_proj.scheme.is_static_input_scheme - expected_type = torch.int8 - - assert qkv_proj.weight.dtype is expected_type - assert o_proj.weight.dtype is expected_type - assert gate_up_proj.weight.dtype is expected_type - - if qkv_proj.scheme.strategy == "tensor": - # Make sure it is a channelwise buffer - # After running process_weights_after_loading - assert len(qkv_proj.weight_scale.shape) == 2 - assert qkv_proj.weight_scale.shape[0] == shape_0 - assert qkv_proj.weight_scale.shape[1] == 1 - assert qkv_proj.weight_scale.dtype is torch.float32 - assert qkv_proj.input_scale.dtype is torch.float32 + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj + + # assert zp for symmetric and asymmetric cases + def zp_valid(zp: Optional[torch.Tensor]): + if is_symmetric: + return zp is None + + return zp is not None and zp.dtype is torch.int32 + + assert zp_valid(qkv_proj.input_zero_point) + assert zp_valid(o_proj.input_zero_point) + assert zp_valid(gate_up_proj.input_zero_point) + assert zp_valid(down_proj.input_zero_point) + + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(o_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(gate_up_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(down_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) + + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.scheme.is_static_input_scheme + expected_type = torch.int8 + + assert qkv_proj.weight.dtype is expected_type + assert o_proj.weight.dtype is expected_type + assert gate_up_proj.weight.dtype is expected_type + + if qkv_proj.scheme.strategy == "tensor": + # Make sure it is a channelwise buffer + # After running process_weights_after_loading + assert len(qkv_proj.weight_scale.shape) == 2 + assert qkv_proj.weight_scale.shape[0] == shape_0 + assert qkv_proj.weight_scale.shape[1] == 1 + assert qkv_proj.weight_scale.dtype is torch.float32 + assert qkv_proj.input_scale.dtype is torch.float32 + + llm.apply_model(check_model) output = llm.generate_greedy(["Hello my name is"], max_tokens=20) assert output @@ -129,16 +134,20 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner): def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): model_path, strategy = model_args with vllm_runner(model_path, dtype=torch.float16) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) + assert not qkv_proj.scheme.is_static_input_scheme + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.weight.dtype is torch.int8 - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert 
isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) - assert not qkv_proj.scheme.is_static_input_scheme - assert qkv_proj.scheme.strategy == strategy - assert qkv_proj.weight.dtype is torch.int8 + llm.apply_model(check_model) output = llm.generate_greedy(["Hello my name is"], max_tokens=20) assert output @@ -152,19 +161,24 @@ def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): model, strategy, group, pack_factor = wNa16_args with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16) + def check_model(model): + layer = model.model.layers[0] - assert qkv_proj.scheme.strategy == strategy - assert qkv_proj.scheme.group_size == (-1 if group is None else group) + qkv_proj = layer.self_attn.qkv_proj + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16) - assert qkv_proj.weight_packed.dtype is torch.int32 - assert qkv_proj.weight_scale.dtype is torch.float16 - assert qkv_proj.scheme.pack_factor == pack_factor + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.scheme.group_size == (-1 + if group is None else group) + + assert qkv_proj.weight_packed.dtype is torch.int32 + assert qkv_proj.weight_scale.dtype is torch.float16 + assert qkv_proj.scheme.pack_factor == pack_factor + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -173,14 +187,18 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): def test_compressed_tensors_w4a16_marlin24(vllm_runner): model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t" with vllm_runner(model_path) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24) - assert qkv_proj.weight_packed.dtype is torch.int32 + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24) + assert qkv_proj.weight_packed.dtype is torch.int32 + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -189,23 +207,27 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner): def test_compressed_tensors_fp8(vllm_runner): model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" with vllm_runner(model_path) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance( - qkv_proj.scheme, - (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8)) + qkv_proj = layer.self_attn.qkv_proj - assert qkv_proj.input_scale.dtype is torch.float32 + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert 
isinstance( + qkv_proj.scheme, + (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8)) - if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8): - assert len(qkv_proj.input_scale.shape) == 0 - assert qkv_proj.weight.dtype is torch.float8_e4m3fn - assert qkv_proj.weight_scale.dtype is torch.float32 - assert len(qkv_proj.weight_scale.shape) == 0 + assert qkv_proj.input_scale.dtype is torch.float32 + + if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8): + assert len(qkv_proj.input_scale.shape) == 0 + assert qkv_proj.weight.dtype is torch.float8_e4m3fn + assert qkv_proj.weight_scale.dtype is torch.float32 + assert len(qkv_proj.weight_scale.shape) == 0 + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -248,12 +270,15 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy): def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): model, weight_strategy, input_strategy = args_2of4 with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn - _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn + _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) @@ -273,12 +298,15 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4): model, weight_strategy, input_strategy = args_2of4 with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert qkv_proj.scheme.weights_dtype == torch.int8 - _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert qkv_proj.scheme.weights_dtype == torch.int8 + _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) @@ -293,20 +321,24 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4): def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4): model = args_2of4 with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - - qkv_proj = layer.self_attn.qkv_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensors24) - - assert qkv_proj.scheme.weight_quant is None - assert qkv_proj.scheme.input_quant is None - assert not qkv_proj.scheme.quantized - assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map - sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501 - assert sparsity_map.get("Linear").format == "dense" - assert sparsity_map.get("Linear").sparsity_structure == "2:4" + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert 
isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensors24) + + assert qkv_proj.scheme.weight_quant is None + assert qkv_proj.scheme.input_quant is None + assert not qkv_proj.scheme.quantized + assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map + sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501 + assert sparsity_map.get("Linear").format == "dense" + assert sparsity_map.get("Linear").sparsity_structure == "2:4" + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index a0c1d7e24c503..4bff734746297 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -49,13 +49,17 @@ def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool, def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - attn = model.model.layers[0].self_attn.attn - assert isinstance(attn.quant_method, Fp8KVCacheMethod) - # NOTE: it is valid for scales to be 1.0 (default value), but we know - # these checkpoints have scales < 1.0 - assert 0.0 < attn._k_scale < 1.0 - assert 0.0 < attn._v_scale < 1.0 + def check_model(model): + attn = model.model.layers[0].self_attn.attn + + assert isinstance(attn.quant_method, Fp8KVCacheMethod) + + # NOTE: it is valid for scales to be 1.0 (default value), but + # we know these checkpoints have scales < 1.0 + assert 0.0 < attn._k_scale < 1.0 + assert 0.0 < attn._v_scale < 1.0 + + llm.apply_model(check_model) # note: this does not test accuracy, just that we can run through # see lm-eval tests for accuracy @@ -77,22 +81,24 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, quantization="fp8", kv_cache_dtype=kv_cache_dtype) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - fc1 = model.model.decoder.layers[0].fc1 - assert isinstance(fc1.quant_method, Fp8LinearMethod) - if kv_cache_dtype == "fp8": - attn = model.model.decoder.layers[0].self_attn.attn - assert isinstance(attn.quant_method, Fp8KVCacheMethod) - assert attn._k_scale == 1.0 - assert attn._v_scale == 1.0 - - if current_platform.has_device_capability(89) and not force_marlin: - # For GPUs with hardware support, we keep weights in fp8 - assert fc1.weight.dtype == torch.float8_e4m3fn - else: - # For GPUs without hardware support, we pack the fp8 weights - # for weight-only quantization using Marlin kernels - assert fc1.weight.dtype == torch.int32 + def check_model(model): + fc1 = model.model.decoder.layers[0].fc1 + assert isinstance(fc1.quant_method, Fp8LinearMethod) + if kv_cache_dtype == "fp8": + attn = model.model.decoder.layers[0].self_attn.attn + assert isinstance(attn.quant_method, Fp8KVCacheMethod) + assert attn._k_scale == 1.0 + assert attn._v_scale == 1.0 + + if current_platform.has_device_capability(89) and not force_marlin: + # For GPUs with hardware support, we keep weights in fp8 + assert fc1.weight.dtype == torch.float8_e4m3fn + else: + # For GPUs without hardware support, we pack the fp8 weights + # for weight-only quantization using Marlin kernels + assert fc1.weight.dtype == torch.int32 + + llm.apply_model(check_model) @pytest.mark.skipif(not is_quant_method_supported("fp8"), diff --git 
a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index ad526a4065101..fa2d9645ea47f 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -28,20 +28,23 @@ def test_lm_head( model_lm_head_quant: Tuple[str, bool], ) -> None: model, lm_head_quantized = model_lm_head_quant - vllm_model = vllm_runner(model, dtype=torch.float16, max_model_len=2048) - - lm_head_layer = (vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model.lm_head) - - if lm_head_quantized: - assert isinstance( - lm_head_layer.linear_method, - (GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod)) - else: - assert isinstance(lm_head_layer.linear_method, - UnquantizedEmbeddingMethod) - - print( - vllm_model.generate_greedy(prompts=["Hello my name is"], - max_tokens=10)[0][1]) - del vllm_model + + with vllm_runner(model, dtype=torch.float16, + max_model_len=2048) as vllm_model: + + def check_model(model): + lm_head_layer = model.lm_head + + if lm_head_quantized: + assert isinstance(lm_head_layer.linear_method, + (GPTQLinearMethod, GPTQMarlinLinearMethod, + MarlinLinearMethod)) + else: + assert isinstance(lm_head_layer.linear_method, + UnquantizedEmbeddingMethod) + + vllm_model.apply_model(check_model) + + print( + vllm_model.generate_greedy(prompts=["Hello my name is"], + max_tokens=10)[0][1]) diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py new file mode 100644 index 0000000000000..11382ad708faa --- /dev/null +++ b/tests/quantization/test_quark.py @@ -0,0 +1,33 @@ +"""Test model set-up and weight loading for quark-quantized models. + +Run `pytest tests/quantization/test_quark.py`. +""" + +import torch + +from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501 + QuarkLinearMethod, QuarkW8A8Fp8) + + +def test_quark_fp8(vllm_runner): + model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test" + with vllm_runner(model_path) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + + assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) + assert isinstance(qkv_proj.scheme, QuarkW8A8Fp8) + + if isinstance(qkv_proj.scheme, QuarkW8A8Fp8): + assert len(qkv_proj.input_scale.shape) == 0 + assert qkv_proj.weight.dtype is torch.float8_e4m3fn + #assert qkv_proj.weight.dtype is torch.float8_e4m3fnuz + assert len(qkv_proj.weight_scale.shape) == 0 + + llm.apply_model(check_model) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + assert output diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py new file mode 100644 index 0000000000000..8e7f44a399ddf --- /dev/null +++ b/tests/quantization/test_register_quantization_config.py @@ -0,0 +1,117 @@ +"""Tests register custom quantization config. + +See https://github.com/vllm-project/vllm/issues/11926 for more details. + +Run `pytest tests/quantization/test_register_quantization_config.py`. 
+""" +from typing import Any, Dict, List, Optional + +import pytest +import torch +import torch.nn.functional as F + +from vllm.model_executor.layers.linear import LinearBase # noqa: E501 +from vllm.model_executor.layers.linear import UnquantizedLinearMethod +from vllm.model_executor.layers.quantization import ( + get_quantization_config, register_quantization_config) +from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 + QuantizationConfig) + + +class FakeQuantLinearMethod(UnquantizedLinearMethod): + """Fake quantization linear method for per-token dynamic quantization.""" + + def __init__(self, num_bits: int = 8) -> None: + """Initialize the quantization method.""" + super().__init__() + self.num_bits = num_bits + + def apply(self, + layer: "torch.nn.Module", + x: "torch.Tensor", + bias: Optional["torch.Tensor"] = None) -> "torch.Tensor": + """Perform fake quantization before the linear layer.""" + + # Calculate the scales dynamically + max_val = torch.amax(x, dim=(0, -1), keepdims=True) + min_val = torch.amin(x, dim=(0, -1), keepdims=True) + scales = (max_val - min_val) / (2**self.num_bits - 1) + + # Fake quantize the input + quant_x = torch.clamp(torch.round(x / scales), -2**(self.num_bits - 1), + 2**(self.num_bits - 1) - 1) + dequant_x = quant_x * scales + + return F.linear(dequant_x, layer.weight, bias) + + +@register_quantization_config("custom_quant") +class CustomQuantConfig(QuantizationConfig): + """Custom quantization config for per-token dynamic fake quantization.""" + + def __init__(self, num_bits: int = 8) -> None: + """Initialize the quantization config.""" + self.num_bits = num_bits + + def get_name(self) -> str: + """Name of the quantization method.""" + return "custom_quant" + + def get_supported_act_dtypes(self) -> List["torch.dtype"]: + """List of supported activation dtypes.""" + return [torch.float16, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + """Minimum GPU capability to support the quantization method.""" + return -1 + + @staticmethod + def get_config_filenames() -> List[str]: + """List of filenames to search for in the model directory.""" + return [] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "CustomQuantConfig": + """Create a config class from the model's quantization config.""" + return CustomQuantConfig(num_bits=config.get("num_bits", 8)) + + def get_quant_method(self, layer: "torch.nn.Module", + prefix: str) -> Optional["FakeQuantLinearMethod"]: + """Get the quantize method to use for the quantized layer.""" + if isinstance(layer, LinearBase): + return FakeQuantLinearMethod(num_bits=self.num_bits) + return None + + +def test_register_quantization_config(): + """Test register custom quantization config.""" + + # The quantization method `custom_quant` should be registered. + assert get_quantization_config("custom_quant") == CustomQuantConfig + + # The quantization method `custom_quant` is already exists, + # should raise an error. 
+ with pytest.raises(ValueError): + register_quantization_config("custom_quant")(CustomQuantConfig) + + +@pytest.mark.parametrize(argnames="model", + argvalues=[ + "meta-llama/Meta-Llama-3-8B-Instruct", + ]) +def test_custom_quant(vllm_runner, model): + """Test infer with the custom quantization method.""" + with vllm_runner(model_name=model, + quantization="custom_quant", + enforce_eager=True) as llm: + + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + layer = model.model.layers[0] + qkv_proj = layer.self_attn.qkv_proj + + # Check the quantization method is FakeQuantLinearMethod + assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + assert output diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index 88067f19c8f07..bf1ee6c397838 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -31,7 +31,7 @@ def test_random_sample_with_seed( sampling_params = SamplingParams( # Parameters to ensure sufficient randomness - temperature=2.0, + temperature=3.0, top_p=min(random.random() + 0.3, 1), top_k=random.randint(5, 20), n=random.randint(1, 10), @@ -75,3 +75,8 @@ def test_random_sample_with_seed( # verify requests with the same seed match assert outputs[1] == outputs[4] assert outputs[2] == outputs[5] + + # verify generations within the same parallel sampling group differ + for output in outputs: + for sub_output_a, sub_output_b in combinations(output, 2): + assert sub_output_a != sub_output_b diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index bf409d2d97aa1..6e7eec1c6ab34 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -3,6 +3,7 @@ import os import pathlib import subprocess +from functools import partial from unittest.mock import MagicMock, patch import openai @@ -24,7 +25,6 @@ # yapf: enable from vllm.utils import PlaceholderModule, import_from_path -from ..conftest import VllmRunner from ..utils import VLLM_PATH, RemoteOpenAIServer from .conftest import retry_until_skip @@ -58,16 +58,6 @@ def is_curl_installed(): return False -def get_torch_model(vllm_runner: VllmRunner): - return vllm_runner \ - .model \ - .llm_engine \ - .model_executor \ - .driver_worker \ - .model_runner \ - .model - - def write_keyfile(keyfile_path: str): encryption_params = EncryptionParams.random() pathlib.Path(keyfile_path).parent.mkdir(parents=True, exist_ok=True) @@ -121,8 +111,10 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( config_for_serializing = TensorizerConfig(tensorizer_uri=model_path, encryption_keyfile=key_path) - serialize_vllm_model(get_torch_model(vllm_model), - config_for_serializing) + + vllm_model.apply_model( + partial(serialize_vllm_model, + tensorizer_config=config_for_serializing)) config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path, encryption_keyfile=key_path) @@ -175,8 +167,10 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): with vllm_runner(model_ref, ) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") - serialize_vllm_model(get_torch_model(vllm_model), - TensorizerConfig(tensorizer_uri=model_path)) + vllm_model.apply_model( + partial( + serialize_vllm_model, + tensorizer_config=TensorizerConfig(tensorizer_uri=model_path))) with vllm_runner( model_ref, @@ -215,8 +209,10 @@ def 
test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): with vllm_runner(model_ref, ) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") - serialize_vllm_model(get_torch_model(vllm_model), - TensorizerConfig(tensorizer_uri=model_path)) + vllm_model.apply_model( + partial( + serialize_vllm_model, + tensorizer_config=TensorizerConfig(tensorizer_uri=model_path))) model_loader_extra_config = { "tensorizer_uri": str(model_path), @@ -337,7 +333,9 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): with vllm_runner(model_ref) as vllm_model: outputs = vllm_model.generate(prompts, sampling_params) - serialize_vllm_model(get_torch_model(vllm_model), config) + + vllm_model.apply_model( + partial(serialize_vllm_model, tensorizer_config=config)) assert is_vllm_tensorized(config) diff --git a/tests/test_utils.py b/tests/test_utils.py index c68d730af7f8a..d5dc4464e634d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -9,10 +9,10 @@ from vllm_test_utils import monitor from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config -from vllm.utils import (FlexibleArgumentParser, PlaceholderModule, - StoreBoolean, bind_kv_cache, deprecate_kwargs, - get_open_port, memory_profiling, merge_async_iterators, - supports_kw) +from vllm.utils import (FlexibleArgumentParser, MemorySnapshot, + PlaceholderModule, StoreBoolean, bind_kv_cache, + deprecate_kwargs, get_open_port, memory_profiling, + merge_async_iterators, supports_kw) from .utils import error_on_warning, fork_new_process_for_each_test @@ -284,14 +284,13 @@ def test_memory_profiling(): # 512 MiB allocation outside of this instance handle1 = lib.cudaMalloc(512 * 1024 * 1024) - baseline_memory_in_bytes = \ - torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0] + baseline_snapshot = MemorySnapshot() # load weights weights = torch.randn(128, 1024, 1024, device='cuda', dtype=torch.float32) - weights_memory_in_bytes = 128 * 1024 * 1024 * 4 # 512 MiB + weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB def measure_current_non_torch(): free, total = torch.cuda.mem_get_info() @@ -300,8 +299,8 @@ def measure_current_non_torch(): current_non_torch = current_used - current_torch return current_non_torch - with memory_profiling(baseline_memory_in_bytes=baseline_memory_in_bytes, - weights_memory_in_bytes=weights_memory_in_bytes) as result, \ + with memory_profiling(baseline_snapshot=baseline_snapshot, + weights_memory=weights_memory) as result, \ monitor(measure_current_non_torch) as monitored_values: # make a memory spike, 1 GiB spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32) @@ -316,13 +315,12 @@ def measure_current_non_torch(): assert measured_diff == 256 * 1024 * 1024 # Check that the memory usage is within 5% of the expected values - # 5% tolerance is caused by PyTorch caching allocator, - # we cannot control PyTorch's behavior of its internal buffers, + # 5% tolerance is caused by cuda runtime. 
+ # we cannot control cuda runtime in the granularity of bytes, # which causes a small error (<10 MiB in practice) - non_torch_ratio = result.non_torch_increase_in_bytes / (256 * 1024 * 1024) # noqa - torch_peak_ratio = result.torch_peak_increase_in_bytes / (1024 * 1024 * 1024) # noqa + non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa assert abs(non_torch_ratio - 1) <= 0.05 - assert abs(torch_peak_ratio - 1) <= 0.05 + assert result.torch_peak_increase == 1024 * 1024 * 1024 del weights lib.cudaFree(handle1) lib.cudaFree(handle2) diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index fe5fc979c66a3..49a16d16eb840 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -100,32 +100,32 @@ def test_traces(trace_service): attributes = decode_attributes( request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model + assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model assert attributes.get( - SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id + SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE + ) == sampling_params.temperature assert attributes.get( - SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature + SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p assert attributes.get( - SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get( - SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens - assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n - assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len( + SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n + assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( outputs[0].prompt_token_ids) completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) assert attributes.get( - SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens + SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens metrics = outputs[0].metrics assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue + SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue ttft = metrics.first_token_time - metrics.arrival_time assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft e2e_time = metrics.finished_time - metrics.arrival_time - assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time assert metrics.scheduler_time > 0 - assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER + ) == metrics.scheduler_time # Model forward and model execute should be none, since detailed traces is # not enabled. 
assert metrics.model_forward_time is None @@ -166,37 +166,37 @@ def test_traces_with_detailed_steps(trace_service): attributes = decode_attributes( request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model + assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model assert attributes.get( - SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id + SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE + ) == sampling_params.temperature assert attributes.get( - SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature + SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p assert attributes.get( - SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get( - SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens - assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n - assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len( + SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n + assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( outputs[0].prompt_token_ids) completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) assert attributes.get( - SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens + SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens metrics = outputs[0].metrics assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue + SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue ttft = metrics.first_token_time - metrics.arrival_time assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft e2e_time = metrics.finished_time - metrics.arrival_time - assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time assert metrics.scheduler_time > 0 - assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER + ) == metrics.scheduler_time assert metrics.model_forward_time > 0 assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx( + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx( metrics.model_forward_time / 1000) assert metrics.model_execute_time > 0 - assert attributes.get(SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE ) == metrics.model_execute_time assert metrics.model_forward_time < 1000 * metrics.model_execute_time diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index fafd9d0ce4455..f434fa8c61a80 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -587,3 +587,72 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): assert {block.ref_cnt for block in block_part1[:3]} == {1} # Block 3-5 are free. 
assert {block.ref_cnt for block in block_part1[3:]} == {0} + + +def test_reset_prefix_cache(): + manager = KVCacheManager( + block_size=16, + num_gpu_blocks=10, + max_model_len=8192, + sliding_window=None, + enable_caching=True, + num_preallocate_tokens=0, + ) + + full_block_token_ids = [i for i in range(3) for _ in range(16)] + unique_token_ids = [3] * 7 + all_token_ids = full_block_token_ids + unique_token_ids + req0 = make_request("0", all_token_ids) + blocks = manager.allocate_slots(req0, 55, []) + assert [b.block_id for b in blocks] == [0, 1, 2, 3] + + unique_token_ids = [4] * 7 + all_token_ids = full_block_token_ids + unique_token_ids + req1 = make_request("1", all_token_ids) + computed_blocks, _ = manager.get_computed_blocks(req1) + assert len(req1.kv_block_hashes) == 3 + assert len(computed_blocks) == 3 + blocks = manager.allocate_slots(req1, 7, computed_blocks) + assert [b.block_id for b in blocks] == [4] + + # Failed to reset prefix cache because some blocks are not freed yet. + assert not manager.reset_prefix_cache() + assert manager.cached_block_hash_to_block + + # Free the blocks. + manager.free(req0) + manager.free(req1) + + assert manager.reset_prefix_cache() + assert not manager.cached_block_hash_to_block + assert all([blk.block_hash is None for blk in manager.block_pool]) + + +def test_uncache_blocks(): + manager = KVCacheManager( + block_size=16, + num_gpu_blocks=10, + max_model_len=8192, + sliding_window=None, + enable_caching=True, + num_preallocate_tokens=0, + ) + + req0 = make_request("0", list(range(30))) + blocks = manager.allocate_slots(req0, 30, []) + assert [b.block_id for b in blocks] == [0, 1] + assert len(manager.cached_block_hash_to_block) == 1 + + req0.num_computed_tokens = 30 + + # Simulate speculative tokens. + for _ in range(5): + req0.append_output_token_ids(8) + manager.append_slots(req0, 5) + assert len(manager.cached_block_hash_to_block) == 2 + + # After sampling, assuming only 1 token is accepted. 
+ req0.num_computed_tokens = 31 + num_uncached_blocks = manager.uncache_blocks(req0) + assert num_uncached_blocks == 1 + assert len(manager.cached_block_hash_to_block) == 1 diff --git a/tests/v1/test_stats.py b/tests/v1/test_stats.py new file mode 100644 index 0000000000000..580392ac5f446 --- /dev/null +++ b/tests/v1/test_stats.py @@ -0,0 +1,300 @@ +import pytest + +from vllm.sampling_params import SamplingParams +from vllm.v1.stats.common import RequestStats, RequestStatsUpdate + + +def make_update( + request_id: str, + update_type: RequestStatsUpdate.Type, + monotonic_ts_s: float, + **kwargs, +): + if update_type == RequestStatsUpdate.Type.INPUT_PROCESSED: + kwargs.setdefault("sampling_params", SamplingParams(n=1)) + kwargs.setdefault("num_prompt_tokens", 10) + elif update_type == RequestStatsUpdate.Type.PREFILLING: + kwargs.setdefault("num_computed_tokens", 10) + kwargs.setdefault("num_cached_tokens", 10) + elif update_type == RequestStatsUpdate.Type.DETOKENIZED: + kwargs.setdefault("num_new_tokens", 10) + elif update_type == RequestStatsUpdate.Type.FINISHED: + kwargs.setdefault("finish_reason", "test_reason") + + return RequestStatsUpdate( + request_id=request_id, + type=update_type, + monotonic_ts_s=monotonic_ts_s, + **kwargs, + ) + + +def test_invalid_request_update(): + request_id = "test_request" + update_specific_required_fields = { + RequestStatsUpdate.Type.INPUT_PROCESSED: [ + "sampling_params", + "num_prompt_tokens", + ], + RequestStatsUpdate.Type.PREFILLING: [ + "num_computed_tokens", + "num_cached_tokens", + ], + RequestStatsUpdate.Type.DETOKENIZED: ["num_new_tokens"], + RequestStatsUpdate.Type.FINISHED: ["finish_reason"], + } + + # Missing a required field should raise an assertion error. + for update_type in RequestStatsUpdate.Type: + required_fields = update_specific_required_fields.get(update_type, []) + + # Try to miss one of the required fields. + kwargs = {field: object() for field in required_fields} + for field in required_fields: + copy_kwargs = kwargs.copy() + copy_kwargs.pop(field) + with pytest.raises(ValueError): + RequestStatsUpdate( + request_id=request_id, + type=update_type, + **copy_kwargs, + ) + + +def test_invalid_request_update_transition(): + # Test invalid transition type. + for src in RequestStatsUpdate.Type: + for dst in RequestStatsUpdate.Type: + if dst not in RequestStatsUpdate._VALID_TRANSITIONS[src]: + with pytest.raises(AssertionError): + RequestStatsUpdate.check_valid_update( + make_update( + update_type=dst, + request_id="test_request", + monotonic_ts_s=1, + ), + last_update_type=src, + last_updated_ts_s=0, + ) + else: + RequestStatsUpdate.check_valid_update( + make_update( + request_id="test_request", + update_type=dst, + monotonic_ts_s=1, + ), + last_update_type=src, + last_updated_ts_s=0, + ) + + # Test invalid timestamp. 
+ with pytest.raises(AssertionError): + RequestStatsUpdate.check_valid_update( + make_update( + request_id="test_request", + update_type=RequestStatsUpdate.Type.ARRIVED, + monotonic_ts_s=1, + ), + last_update_type=None, + last_updated_ts_s=2, + ) + + +def test_lifecycle_updates(): + request_id = "test_request" + stats = RequestStats(request_id=request_id) + + # Test the below scenario: + arrived_ts = 0 + input_processed_ts = 1 + queued_ts = 2 + prefilling_ts = 3 + decoded_ts = 5 + detokenized_ts = 6 + decoded_2_ts = 7 + detokenized_2_ts = 8 + preempted_ts = 9 + resumed_ts = 10 + decoded_3_ts = 11 + detokenized_3_ts = 12 + finished_ts = 13 + + # Test ARRIVED + arrived_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.ARRIVED, + monotonic_ts_s=arrived_ts, + ) + stats.update_from(arrived_update) + assert stats.arrival_ts_s == arrived_ts + assert stats.last_updated_ts_s == arrived_ts + + # Test INPUT_PROCESSED + sampling_params = SamplingParams(n=1) + input_processed_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.INPUT_PROCESSED, + monotonic_ts_s=input_processed_ts, + sampling_params=sampling_params, + num_prompt_tokens=6, + ) + stats.update_from(input_processed_update) + assert stats.input_processor_end_ts_s == input_processed_ts + assert stats.last_updated_ts_s == input_processed_ts + assert stats.num_prompt_tokens == 6 + assert stats.sampling_params == sampling_params + + assert stats.first_token_ts_s is None + assert stats.prefill_ts_s is None + + # Test QUEUED + queued_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.QUEUED, + monotonic_ts_s=queued_ts, + ) + stats.update_from(queued_update) + assert stats.queued_ts_s == queued_ts + assert stats.last_updated_ts_s == queued_ts + + # Test PREFILLING + prefilling_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.PREFILLING, + monotonic_ts_s=prefilling_ts, + num_computed_tokens=3, + num_cached_tokens=1, + ) + stats.update_from(prefilling_update) + assert stats.prefill_ts_s == prefilling_ts + assert stats.num_computed_tokens == 3 + assert stats.num_cached_tokens == 1 + assert stats.queue_duration_s == prefilling_ts - queued_ts + + # Test DECODING + decoded_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.DECODING, + monotonic_ts_s=decoded_ts, + ) + stats.update_from(decoded_update) + assert stats.last_updated_ts_s == decoded_ts + + # Test DETOKENIZED + detokenized_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.DETOKENIZED, + monotonic_ts_s=detokenized_ts, + num_new_tokens=1, + ) + stats.update_from(detokenized_update) + assert stats.last_updated_ts_s == detokenized_ts + assert stats.num_output_tokens == 1 + # Since arrival + assert stats.first_token_latency_s == detokenized_ts - arrived_ts + # Since first scheduled + assert stats.prefill_latency_s == detokenized_ts - prefilling_ts + + # Test another DECODING and DETOKENIZED should + # yield correct inter token latency + decoded_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.DECODING, + monotonic_ts_s=decoded_2_ts, + ) + stats.update_from(decoded_update) + + detokenized_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.DETOKENIZED, + monotonic_ts_s=detokenized_2_ts, + num_new_tokens=1, + ) + stats.update_from(detokenized_update) + assert stats.output_token_latency_s_lst == [ + detokenized_2_ts - 
detokenized_ts, + ] + assert stats.num_output_tokens == 2 + + # Test PREEMPTED + preempted_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.PREEMPTED, + monotonic_ts_s=preempted_ts, + ) + stats.update_from(preempted_update) + assert stats.last_updated_ts_s == preempted_ts + assert stats.preempted_ts_s_lst == [preempted_ts] + # States should be reset + assert stats.num_computed_tokens == 0 + assert stats.num_cached_tokens == 0 + # These states should not be reset + assert stats.num_output_tokens == 2 + assert stats.output_token_latency_s_lst == [ + detokenized_2_ts - detokenized_ts, + ] + assert stats.prefill_latency_s == prefilling_ts - arrived_ts + assert stats.num_prompt_tokens == 6 + assert stats.prefill_start_ts_s_lst == [prefilling_ts] + + # Test resumed + resumed_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.PREFILLING, + monotonic_ts_s=resumed_ts, + num_computed_tokens=6, + num_cached_tokens=2, + ) + stats.update_from(resumed_update) + # prefill timestamp should not be updated since it's a resumed prefill + assert stats.prefill_ts_s == prefilling_ts + assert stats.num_computed_tokens == 6 + assert stats.num_cached_tokens == 2 + assert stats.prefill_start_ts_s_lst == [ + prefilling_ts, + resumed_ts, + ] + assert stats.last_updated_ts_s == resumed_ts + + # Test another DECODED/DETOKENIZED should yield correct first token latency. + decoded_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.DECODING, + monotonic_ts_s=decoded_3_ts, + ) + detokenized_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.DETOKENIZED, + monotonic_ts_s=detokenized_3_ts, + num_new_tokens=1, + ) + stats.update_from(decoded_update) + stats.update_from(detokenized_update) + assert stats.first_token_ts_s == detokenized_ts - arrived_ts + assert stats.num_output_tokens == 3 + assert stats.output_token_latency_s_lst == [ + detokenized_2_ts - detokenized_ts, + detokenized_3_ts - detokenized_2_ts, + ] + + # Test FINISHED + finished_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.FINISHED, + monotonic_ts_s=finished_ts, + finish_reason="test_reason", + ) + stats.update_from(finished_update) + assert stats.last_updated_ts_s == finished_ts + assert stats.e2e_latency_s == finished_ts - arrived_ts + assert stats.inference_latency_s == finished_ts - prefilling_ts + assert stats.prefill_latency_s == detokenized_ts - prefilling_ts + assert stats.decode_latency_s == finished_ts - detokenized_ts + assert stats.first_token_latency_s == detokenized_ts - arrived_ts + assert stats.queue_duration_s == prefilling_ts - queued_ts + assert stats.is_finished + assert stats.finish_reason == "test_reason" + + # TODO(rickyx): Add model forward/execute time. 
+ assert stats.model_forward_duration_s == 0.0 + assert stats.model_execute_duration_s == 0.0 diff --git a/tests/v1/test_utils.py b/tests/v1/test_utils.py new file mode 100644 index 0000000000000..ac773b611f406 --- /dev/null +++ b/tests/v1/test_utils.py @@ -0,0 +1,62 @@ +from typing import List + +import torch + +from vllm.v1.utils import bind_kv_cache + + +def test_bind_kv_cache(): + from vllm.attention import Attention + + ctx = { + 'layers.0.self_attn': Attention(32, 128, 0.1), + 'layers.1.self_attn': Attention(32, 128, 0.1), + 'layers.2.self_attn': Attention(32, 128, 0.1), + 'layers.3.self_attn': Attention(32, 128, 0.1), + } + kv_cache = { + 'layers.0.self_attn': torch.zeros((1, )), + 'layers.1.self_attn': torch.zeros((1, )), + 'layers.2.self_attn': torch.zeros((1, )), + 'layers.3.self_attn': torch.zeros((1, )), + } + runner_kv_caches: List[torch.Tensor] = [] + bind_kv_cache(kv_cache, ctx, runner_kv_caches) + assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[ + 'layers.0.self_attn'] + assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[ + 'layers.1.self_attn'] + assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[ + 'layers.2.self_attn'] + assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[ + 'layers.3.self_attn'] + + assert runner_kv_caches[0] is kv_cache['layers.0.self_attn'] + assert runner_kv_caches[1] is kv_cache['layers.1.self_attn'] + assert runner_kv_caches[2] is kv_cache['layers.2.self_attn'] + assert runner_kv_caches[3] is kv_cache['layers.3.self_attn'] + + +def test_bind_kv_cache_non_attention(): + from vllm.attention import Attention + + # example from Jamba PP=2 + ctx = { + 'model.layers.20.attn': Attention(32, 128, 0.1), + 'model.layers.28.attn': Attention(32, 128, 0.1), + } + kv_cache = { + 'model.layers.20.attn': torch.zeros((1, )), + 'model.layers.28.attn': torch.zeros((1, )), + } + + runner_kv_caches: List[torch.Tensor] = [] + bind_kv_cache(kv_cache, ctx, runner_kv_caches) + + assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[ + 'model.layers.20.attn'] + assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[ + 'model.layers.28.attn'] + + assert runner_kv_caches[0] is kv_cache['model.layers.20.attn'] + assert runner_kv_caches[1] is kv_cache['model.layers.28.attn'] diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt index a06956ce18a93..272206d4502e9 100644 --- a/tests/weight_loading/models.txt +++ b/tests/weight_loading/models.txt @@ -30,4 +30,5 @@ marlin, nm-testing/zephyr-beta-7b-marlin-g128, main marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main qqq, HandH1998/QQQ-Llama-3-8b-g128, main qqq, HandH1998/QQQ-Llama-3-8b, main -hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main \ No newline at end of file +hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main +None, mgleize/fairseq2-dummy-Llama-3.2-1B, main \ No newline at end of file diff --git a/tests/weight_loading/test_weight_loading.py b/tests/weight_loading/test_weight_loading.py index 199731bdc21fe..7a3786456d0d6 100644 --- a/tests/weight_loading/test_weight_loading.py +++ b/tests/weight_loading/test_weight_loading.py @@ -20,12 +20,13 @@ def test_weight_loading(vllm_runner): """ Test parameter weight loading with tp>1. 
""" - with vllm_runner(model_name=MODEL_NAME, - revision=REVISION, - dtype=torch.half if QUANTIZATION == "gptq" else "auto", - quantization=QUANTIZATION, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=2) as model: + with vllm_runner( + model_name=MODEL_NAME, + revision=REVISION, + dtype=torch.half if QUANTIZATION == "gptq" else "auto", + quantization=None if QUANTIZATION == "None" else QUANTIZATION, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=2) as model: output = model.generate_greedy("Hello world!", max_tokens=20) print(output) diff --git a/tools/actionlint.sh b/tools/actionlint.sh deleted file mode 100755 index f6a8b5e83a2de..0000000000000 --- a/tools/actionlint.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -if command -v actionlint &> /dev/null; then - actionlint "$@" - exit 0 -elif [ -x ./actionlint ]; then - ./actionlint "$@" - exit 0 -fi - -# download a binary to the current directory - v1.7.3 -bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash) -./actionlint "$@" diff --git a/tools/doc-lint.sh b/tools/doc-lint.sh deleted file mode 100755 index 19a55ddfa91c4..0000000000000 --- a/tools/doc-lint.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -pymarkdownlnt scan docs -r diff --git a/tools/mypy.sh b/tools/mypy.sh index bf95e4c526fd1..77d342da1ec82 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -1,12 +1,16 @@ #!/bin/bash CI=${1:-0} -PYTHON_VERSION=${2:-3.9} +PYTHON_VERSION=${2:-local} if [ "$CI" -eq 1 ]; then set -e fi +if [ $PYTHON_VERSION == "local" ]; then + PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') +fi + run_mypy() { echo "Running mypy on $1" if [ "$CI" -eq 1 ] && [ -z "$1" ]; then diff --git a/tools/shellcheck.sh b/tools/shellcheck.sh index d99fa77b96351..7efb3cabc64fe 100755 --- a/tools/shellcheck.sh +++ b/tools/shellcheck.sh @@ -19,4 +19,4 @@ if ! [ -x "$(command -v shellcheck)" ]; then fi # TODO - fix warnings in .buildkite/run-amd-test.sh -find . -name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"' +find . -name "*.sh" ".git" -prune -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck -s bash "{}"' diff --git a/vllm/__init__.py b/vllm/__init__.py index 45252b93e3d54..2aabe820d9a84 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,4 +1,7 @@ """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" +import os + +import torch from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -17,6 +20,19 @@ from .version import __version__, __version_tuple__ +# set some common config/environment variables that should be set +# for all processes created by vllm and all processes +# that interact with vllm workers. +# they are executed whenever `import vllm` is called. 
+ +# see https://github.com/NVIDIA/nccl/issues/1234 +os.environ['NCCL_CUMEM_ENABLE'] = '0' + +# see https://github.com/vllm-project/vllm/issues/10480 +os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' +# see https://github.com/vllm-project/vllm/issues/10619 +torch._inductor.config.compile_threads = 1 + __all__ = [ "__version__", "__version_tuple__", diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 737559bfe70ca..2efe142a17b69 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -1,8 +1,8 @@ from abc import ABC, abstractmethod from contextlib import contextmanager from dataclasses import dataclass, fields -from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Set, - Tuple, Type, TypeVar) +from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, + Protocol, Set, Tuple, Type, TypeVar) import torch @@ -65,11 +65,6 @@ def make_metadata(cls, *args, **kwargs) -> "AttentionMetadata": def get_builder_cls() -> Type["AttentionMetadataBuilder"]: raise NotImplementedError - @classmethod - def make_metadata_builder(cls, *args, - **kwargs) -> "AttentionMetadataBuilder": - return cls.get_builder_cls()(*args, **kwargs) - @staticmethod @abstractmethod def get_kv_cache_shape( @@ -214,6 +209,12 @@ class AttentionMetadataBuilder(ABC, Generic[T]): @abstractmethod def __init__(self, input_builder: "ModelRunnerInputBuilderBase") -> None: + """Create the builder, remember some configuration and parameters.""" + raise NotImplementedError + + @abstractmethod + def prepare(self) -> None: + """Prepare for one batch.""" raise NotImplementedError @abstractmethod @@ -223,6 +224,22 @@ def build(self, seq_lens: List[int], query_lens: List[int], raise NotImplementedError +class AttentionLayer(Protocol): + + _k_scale: float + _v_scale: float + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + ... + + class AttentionImpl(ABC, Generic[T]): @abstractmethod @@ -244,13 +261,12 @@ def __init__( @abstractmethod def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: T, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: raise NotImplementedError diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index 77cfa8490172b..9089db1126c94 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -4,6 +4,7 @@ import torch from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import (CommonAttentionState, CommonMetadataBuilder) @@ -358,13 +359,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: BlocksparseFlashAttentionMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. 
@@ -401,8 +401,8 @@ def forward( value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) if prefill_meta := attn_metadata.prefill_metadata: @@ -439,8 +439,8 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, tp_rank=self.tp_rank, blocksparse_local_blocks=self.local_blocks, blocksparse_vert_stride=self.vert_stride, diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 48b3e8d177ec9..60ed09d0cc44f 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -8,6 +8,7 @@ from vllm import _custom_ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionMetadataBuilder, AttentionType) @@ -374,6 +375,12 @@ class FlashAttentionMetadataBuilder( AttentionMetadataBuilder[FlashAttentionMetadata]): def __init__(self, input_builder: "ModelInputForGPUBuilder"): + self.input_builder = input_builder + self.runner = input_builder.runner + self.sliding_window = input_builder.sliding_window + self.block_size = input_builder.block_size + + def prepare(self): self.slot_mapping: List[int] = [] self.prefill_seq_lens: List[int] = [] self.context_lens: List[int] = [] @@ -387,11 +394,6 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.num_decode_tokens = 0 self.has_prefix_cache_hit = False - self.input_builder = input_builder - self.runner = input_builder.runner - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size - def _add_seq_group( self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", chunked_prefill_enabled: bool, prefix_cache_hit: bool): @@ -634,13 +636,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: FlashAttentionMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. @@ -657,7 +658,7 @@ def forward( NOTE: It in-place updates the output tensor. """ # NOTE(woosuk): FlashAttention does not support FP8 KV cache. - assert k_scale == 1.0 and v_scale == 1.0, ( + assert layer._k_scale == 1.0 and layer._v_scale == 1.0, ( "key/v_scale is not supported in FlashAttention.") assert output is not None, "Output tensor must be provided." 
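The guard above works because the abstract backend now exposes AttentionLayer as a typing.Protocol: any object that carries float _k_scale/_v_scale attributes and a forward method is accepted structurally, so backend impls never have to import the concrete Attention class. A minimal standalone sketch of that idea (names below are illustrative, not vLLM code):

from typing import Protocol

class ScaledLayer(Protocol):  # illustrative stand-in for AttentionLayer
    _k_scale: float
    _v_scale: float

class DummyLayer:
    """Satisfies ScaledLayer structurally, without inheriting from it."""
    def __init__(self) -> None:
        self._k_scale = 1.0
        self._v_scale = 1.0

def forward(layer: ScaledLayer) -> str:
    # Mirrors the FlashAttention guard above: an FP8 KV cache is rejected by
    # checking the per-layer scales instead of per-call float arguments.
    assert layer._k_scale == 1.0 and layer._v_scale == 1.0
    return "ok"

print(forward(DummyLayer()))  # "ok" -- no nominal subtyping required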
@@ -709,8 +710,8 @@ def forward( kv_cache[1], updated_slot_mapping.flatten(), # type: ignore[union-attr] kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) (num_prefill_query_tokens, num_prefill_kv_tokens, diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 6ca75fabdfc38..b8ffbe6dd64dd 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -23,6 +23,7 @@ import vllm.envs as envs from vllm import _custom_ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionMetadataBuilder, AttentionState, AttentionType) @@ -487,6 +488,14 @@ def advance_step(self, class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): def __init__(self, input_builder: "ModelInputForGPUBuilder"): + + self.input_builder = input_builder + self.runner = input_builder.runner + + self.sliding_window = input_builder.sliding_window + self.block_size = input_builder.block_size + + def prepare(self): self.slot_mapping: List[int] = [] self.prefill_seq_lens: List[int] = [] self.context_lens: List[int] = [] @@ -499,12 +508,6 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.num_prefill_tokens = 0 self.num_decode_tokens = 0 - self.input_builder = input_builder - self.runner = input_builder.runner - - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size - # Please follow https://docs.flashinfer.ai/tutorials/kv_layout.html#page-layout # for the precise definition of the following fields. # An example: @@ -792,13 +795,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: FlashInferMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -826,8 +828,8 @@ def forward( kv_cache[:, 1], attn_metadata.slot_mapping.flatten(), kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) # The FlashInfer api requires data to be in fp8_e4m3 or fp8_e5m2 # to process the cache when the kv_cache_dtype is fp8 @@ -886,8 +888,8 @@ def forward( kv_cache, logits_soft_cap=logits_soft_cap, causal=True, - k_scale=k_scale, - v_scale=v_scale, + k_scale=layer._k_scale, + v_scale=layer._v_scale, window_left=window_left) if decode_meta := attn_metadata.decode_metadata: assert decode_meta is not None @@ -897,8 +899,8 @@ def forward( kv_cache, sm_scale=softmax_scale, logits_soft_cap=logits_soft_cap, - k_scale=k_scale, - v_scale=v_scale, + k_scale=layer._k_scale, + v_scale=layer._v_scale, window_left=window_left) if prefill_output is None and decode_output is not None: diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 93c28bcf88fa4..6a5fd7fb870e5 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -11,6 +11,7 @@ from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention, @@ -148,13 +149,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: 
HPUAttentionMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index da1d307daa517..cd729a1c8b274 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -7,6 +7,7 @@ from vllm._ipex_ops import ipex_ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.ops.paged_attn import (PagedAttention, @@ -171,13 +172,12 @@ def split_kv_cache( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: IpexAttnMetadata, # type: ignore - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with IPEX varlen_attention and PagedAttention. @@ -193,7 +193,7 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - assert k_scale == 1.0 and v_scale == 1.0 + assert layer._k_scale == 1.0 and layer._v_scale == 1.0 num_tokens, hidden_size = query.shape # Reshape the query, key, and value tensors. query = query.view(-1, self.num_heads, self.head_size) @@ -210,8 +210,8 @@ def forward( value_cache, attn_metadata.slot_mapping.flatten(), self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) if attn_metadata.is_prompt: @@ -296,8 +296,8 @@ def forward( max_seq_len, self.alibi_slopes, self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) else: # Run PagedAttention V2. @@ -329,8 +329,8 @@ def forward( max_seq_len, self.alibi_slopes, self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) # Reshape the output tensor. diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 2ac492dd8ae54..f5bf390df6afb 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -5,6 +5,7 @@ import torch_xla.experimental.custom_kernel # Required to register custom ops. from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import CommonAttentionState @@ -150,13 +151,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: Tuple[torch.Tensor, torch.Tensor], attn_metadata: PallasMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with Pallas attention. 
@@ -173,7 +173,7 @@ def forward( Returns: shape = [batch_size, seq_len, num_heads * head_size] """ - assert k_scale == 1.0 and v_scale == 1.0 + assert layer._k_scale == 1.0 and layer._v_scale == 1.0 batch_size, seq_len, hidden_size = query.shape query = query.view(batch_size, seq_len, self.num_heads, self.head_size) key = key.view(batch_size, seq_len, self.num_kv_heads, self.head_size) diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index 534f79b3a60bf..37860494702cf 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -253,6 +253,11 @@ class PlaceholderAttentionMetadataBuilder( AttentionMetadataBuilder[PlaceholderAttentionMetadata]): def __init__(self, input_builder: "ModelInputForGPUBuilder"): + + self.input_builder = input_builder + self.runner = input_builder.runner + + def prepare(self): self.prefill_seq_lens: List[int] = [] self.context_lens: List[int] = [] self.curr_seq_lens: List[int] = [] @@ -263,9 +268,6 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.num_prefill_tokens = 0 self.num_decode_tokens = 0 - self.input_builder = input_builder - self.runner = input_builder.runner - def _add_seq_group( self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", chunked_prefill_enabled: bool): diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index a91a5af5c3d58..e9f2808ff1674 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -7,6 +7,7 @@ import vllm.envs as envs from vllm import _custom_ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import (CommonAttentionState, CommonMetadataBuilder) @@ -414,13 +415,12 @@ def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor: def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: ROCmFlashAttentionMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. @@ -458,8 +458,8 @@ def forward( value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) num_prefill_tokens = attn_metadata.num_prefill_tokens @@ -567,8 +567,8 @@ def forward( prefill_meta.max_query_len, self.alibi_slopes, self.sliding_window[0], - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) if decode_meta := attn_metadata.decode_metadata: @@ -613,8 +613,8 @@ def forward( max_seq_len, self.alibi_slopes, self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) else: output[num_prefill_tokens:] = PagedAttention.forward_decode( @@ -628,8 +628,8 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) # Reshape the output tensor. 
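All of the backend hunks above make the same move: k_scale/v_scale stop being per-call arguments and become attributes read from the layer object. A hedged sketch of the motivation, assuming (hypothetically) that the scales are calibrated per layer at weight-loading time; ToyAttentionLayer and load_kv_scales below are made-up names, not vLLM APIs:

from dataclasses import dataclass
from typing import Dict

@dataclass
class ToyAttentionLayer:
    _k_scale: float = 1.0
    _v_scale: float = 1.0

def load_kv_scales(layers: Dict[str, ToyAttentionLayer],
                   checkpoint_scales: Dict[str, float]) -> None:
    # Assign calibrated scales once, at weight-loading time.
    for name, layer in layers.items():
        layer._k_scale = checkpoint_scales.get(f"{name}.k_scale", 1.0)
        layer._v_scale = checkpoint_scales.get(f"{name}.v_scale", 1.0)

def write_to_cache(layer: ToyAttentionLayer) -> None:
    # Call sites no longer thread k_scale/v_scale through their signatures;
    # they read the values from the layer instead.
    print(layer._k_scale, layer._v_scale)

layers = {"layers.0.self_attn": ToyAttentionLayer()}
load_kv_scales(layers, {"layers.0.self_attn.k_scale": 0.5})
write_to_cache(layers["layers.0.self_attn"])  # prints: 0.5 1.0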
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index ca1c4618615de..8722d7376795a 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -7,6 +7,7 @@ from torch.nn.functional import scaled_dot_product_attention from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionMetadataBuilder, AttentionType) @@ -281,7 +282,10 @@ class TorchSDPAMetadataBuilder(AttentionMetadataBuilder[TorchSDPAMetadata]): def __init__(self, input_builder: ModelInputForCPUBuilder) -> None: self.chunked_prefill = input_builder.chunked_prefill - self.input_data = input_builder.input_data + self.input_builder = input_builder + + def prepare(self): + self.input_data = self.input_builder.input_data def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int) -> TorchSDPAMetadata: @@ -429,13 +433,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: TorchSDPAMetadata, # type: ignore - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with torch SDPA and PagedAttention. @@ -451,7 +454,7 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - assert k_scale == 1.0 and v_scale == 1.0 + assert layer._k_scale == 1.0 and layer._v_scale == 1.0 attn_type = self.attn_type if (attn_type == AttentionType.ENCODER and (not attn_metadata.is_all_encoder_attn_metadata_set)): @@ -493,11 +496,9 @@ def forward( # Update self-attention KV cache (prefill/decode) updated_slot_mapping = attn_metadata.slot_mapping - PagedAttention.write_to_paged_cache(key, value, key_cache, - value_cache, - updated_slot_mapping, - self.kv_cache_dtype, - k_scale, v_scale) + PagedAttention.write_to_paged_cache( + key, value, key_cache, value_cache, updated_slot_mapping, + self.kv_cache_dtype, layer._k_scale, layer._v_scale) if attn_type != AttentionType.ENCODER: # Decoder self-attention supports chunked prefill. @@ -571,8 +572,8 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) # Reshape the output tensor. 
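The TorchSDPA builder change just above follows the same pattern as the FlashAttention, FlashInfer, and placeholder builders earlier in this diff: long-lived configuration stays in __init__, while the new prepare() resets per-batch state so one builder instance can be reused across batches. A toy sketch of that lifecycle (illustrative only, not a vLLM class):

from typing import List

class ToyMetadataBuilder:
    def __init__(self, block_size: int) -> None:
        # Long-lived configuration, captured once at construction.
        self.block_size = block_size

    def prepare(self) -> None:
        # Per-batch accumulators, reset before every batch.
        self.slot_mapping: List[int] = []
        self.num_prefill_tokens = 0

    def add_seq(self, num_tokens: int) -> None:
        self.slot_mapping.extend(range(num_tokens))
        self.num_prefill_tokens += num_tokens

builder = ToyMetadataBuilder(block_size=16)  # once, at model-runner init
for batch in ([4, 8], [2]):                  # the same object handles every batch
    builder.prepare()
    for n in batch:
        builder.add_seq(n)
    print(builder.num_prefill_tokens)        # prints 12, then 2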
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 56cc43430301f..3df7f54cbd8d2 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -122,6 +122,13 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]): _metadata_cls: Type[TAttentionMetadata] def __init__(self, input_builder: "ModelInputForGPUBuilder"): + self.input_builder = input_builder + self.runner = input_builder.runner + + self.sliding_window = input_builder.sliding_window + self.block_size = input_builder.block_size + + def prepare(self): self.slot_mapping: List[int] = [] self.prefill_seq_lens: List[int] = [] self.context_lens: List[int] = [] @@ -134,12 +141,6 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.num_prefill_tokens = 0 self.num_decode_tokens = 0 - self.input_builder = input_builder - self.runner = input_builder.runner - - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size - def _add_seq_group( self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", chunked_prefill_enabled: bool): diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 8c8ca8520a9db..38e27434dab2c 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -10,6 +10,7 @@ LowerTriangularMaskWithTensorBias) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import ( CommonAttentionState, CommonMetadataBuilder, @@ -412,13 +413,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: Optional[torch.Tensor], value: Optional[torch.Tensor], kv_cache: torch.Tensor, attn_metadata: "XFormersMetadata", - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. @@ -524,11 +524,9 @@ def forward( # If kv_cache is not provided, the new key and value tensors are # not cached. This happens during the initial memory # profiling run. - PagedAttention.write_to_paged_cache(key, value, key_cache, - value_cache, - updated_slot_mapping, - self.kv_cache_dtype, - k_scale, v_scale) + PagedAttention.write_to_paged_cache( + key, value, key_cache, value_cache, updated_slot_mapping, + self.kv_cache_dtype, layer._k_scale, layer._v_scale) (num_prefill_query_tokens, num_prefill_kv_tokens, num_decode_query_tokens) = \ get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type) @@ -580,8 +578,8 @@ def forward( prefill_meta.max_query_len, self.alibi_slopes, self.sliding_window, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) assert output[:num_prefill_query_tokens].shape == out.shape output[:num_prefill_query_tokens] = out @@ -607,8 +605,8 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) # Reshape the output tensor. 
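Taken together, the backend changes above converge on a new call shape: the impl receives the layer object and pulls per-layer state from it, rather than the caller forwarding k_scale/v_scale as positional floats; the real call-site update follows in vllm/attention/layer.py below. A rough before/after sketch with toy stand-in classes:

class ToyImpl:
    # Old shape, for contrast:
    #   def forward(self, query, key, value, kv_cache, attn_metadata,
    #               k_scale=1.0, v_scale=1.0, output=None): ...
    def forward(self, layer, query, key, value, kv_cache, attn_metadata,
                output=None):
        return query * layer._k_scale  # scales now come from the layer

class ToyLayer:
    def __init__(self):
        self._k_scale = 1.0
        self._v_scale = 1.0
        self.impl = ToyImpl()

    def __call__(self, query, key, value, kv_cache=None, attn_metadata=None):
        # Mirrors the unified_attention change: the layer passes itself through.
        return self.impl.forward(self, query, key, value, kv_cache,
                                 attn_metadata)

print(ToyLayer()(2.0, None, None))  # 2.0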
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 9b03fd73fe690..c36f8d08eb4a7 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -101,7 +101,9 @@ def __init__( self.num_heads = num_heads self.head_size = head_size self.num_kv_heads = num_kv_heads + self.sliding_window = sliding_window self.backend = backend_name_to_enum(attn_backend.get_name()) + self.dtype = dtype # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how # torch.compile works by registering the attention as one giant @@ -241,8 +243,7 @@ def unified_attention( attn_metadata = forward_context.attn_metadata self = forward_context.attn_layers[layer_name] kv_cache = self.kv_cache[forward_context.virtual_engine] - return self.impl.forward(query, key, value, kv_cache, attn_metadata, - self._k_scale, self._v_scale) + return self.impl.forward(self, query, key, value, kv_cache, attn_metadata) def unified_attention_fake( @@ -274,13 +275,12 @@ def unified_attention_with_output( attn_metadata = forward_context.attn_metadata self = forward_context.attn_layers[layer_name] kv_cache = self.kv_cache[forward_context.virtual_engine] - self.impl.forward(query, + self.impl.forward(self, + query, key, value, kv_cache, attn_metadata, - self._k_scale, - self._v_scale, output=output) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 87655530cead4..b9f96c00284b9 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -25,23 +25,30 @@ logger = init_logger(__name__) +@dataclasses.dataclass +class InductorArtifact: + hash_str: str = "" + file_path: str = "" + + class InductorHashCache: """ Disk format: a Python list of tuples, each tuple is - (runtime_shape, graph_index, hash_str) + (runtime_shape, graph_index, hash_str, file_path) We use list of tuple for readability. In-memory format: a defaultdict of dict, where the key is runtime_shape, and the value is a dict of graph_index to hash_str. - The data is essentially `Dict[Optional[int], Dict[int, str]]`, + The data is essentially `Dict[Optional[int], Dict[int, InductorArtifact]]`, we don't use json here because json doesn't support int as key. TODO: better off-the-shelf solution to serialize the data? """ def __init__(self, cache_dir: str, disabled: bool = False): - self.cache: defaultdict = defaultdict(dict) + self.cache: Dict[Optional[int], + Dict[int, InductorArtifact]] = defaultdict(dict) self.disabled = disabled self.cache_dir = cache_dir self.cache_file_path = os.path.join(cache_dir, @@ -66,14 +73,25 @@ def deserialize(self, data: str): # because it is a safe way to parse Python literals. # do not use eval(), it is unsafe. list_data = ast.literal_eval(data) - for runtime_shape, graph_index, hash_str in list_data: - self.cache[runtime_shape][graph_index] = hash_str + for item in list_data: + runtime_shape = item[0] + graph_index = item[1] + hash_str = item[2] + # for compatibility of old version, + # where we don't have file_path. + # NOTE: after running the new code, the file_path + # will be updated. 
+ file_path = "" if len(item) == 3 else item[3] + self.cache[runtime_shape][graph_index] = InductorArtifact( + hash_str=hash_str, file_path=file_path) def serialize(self) -> str: data = [] - for runtime_shape, graph_index_to_hash_str in self.cache.items(): - for graph_index, hash_str in graph_index_to_hash_str.items(): - data.append((runtime_shape, graph_index, hash_str)) + for runtime_shape, value in self.cache.items(): + for graph_index, inductor_artifact in value.items(): + data.append( + (runtime_shape, graph_index, inductor_artifact.hash_str, + inductor_artifact.file_path)) printer = pprint.PrettyPrinter(indent=4) return printer.pformat(data) @@ -90,13 +108,14 @@ def __contains__(self, key: Tuple[Optional[int], int]) -> bool: return runtime_shape in self.cache and graph_index in self.cache[ runtime_shape] - def __getitem__(self, key: Tuple[Optional[int], int]) -> str: + def __getitem__(self, key: Tuple[Optional[int], int]) -> InductorArtifact: if self.disabled: raise KeyError("cannot read from disabled cache") runtime_shape, graph_index = key return self.cache[runtime_shape][graph_index] - def __setitem__(self, key: Tuple[Optional[int], int], value: str): + def __setitem__(self, key: Tuple[Optional[int], int], + value: InductorArtifact): # setitem for disabled cache is fine, because we # don't actually write to the disk runtime_shape, graph_index = key @@ -181,7 +200,8 @@ def wrap_inductor(graph: fx.GraphModule, if (runtime_shape, graph_index) in cache_data: # we compiled this graph before # so we can directly lookup the compiled graph via hash - hash_str = cache_data[(runtime_shape, graph_index)] + inductor_artifact = cache_data[(runtime_shape, graph_index)] + hash_str = inductor_artifact.hash_str if graph_index == 0: # adds some info logging for the first graph logger.info( @@ -199,6 +219,7 @@ def wrap_inductor(graph: fx.GraphModule, "Inductor cache lookup failed. Please remove" f"the cache file {cache_data.cache_file_path} and try again." # noqa ) + inductor_artifact.file_path = inductor_compiled_graph.current_callable.__code__.co_filename # noqa # Inductor calling convention (function signature): # f(list) -> tuple @@ -224,19 +245,20 @@ def compiled_graph(*args): # the assumption is that we don't have nested Inductor compilation. # compiled_fx_graph_hash will only be called once, and we can hook # it to get the hash of the compiled graph directly. 
- from torch._inductor.codecache import compiled_fx_graph_hash + + inductor_artifact = InductorArtifact() + from torch._inductor.codecache import (FxGraphCache, + compiled_fx_graph_hash) + original_load = FxGraphCache.load + + def hijack_load(*args, **kwargs): + inductor_compiled_graph = original_load(*args, **kwargs) + inductor_artifact.file_path = inductor_compiled_graph.current_callable.__code__.co_filename # noqa + return inductor_compiled_graph def hijack_compiled_fx_graph_hash(*args, **kwargs): out = compiled_fx_graph_hash(*args, **kwargs) - # store the hash in the cache - nonlocal cache_data - cache_data[(runtime_shape, graph_index)] = out[0] - if graph_index == 0: - # adds some info logging for the first graph - logger.info("Cache the graph of shape %s for later use", - str(runtime_shape)) - logger.debug("store the %s-th graph for shape %s via hash %s", - graph_index, str(runtime_shape), out[0]) + inductor_artifact.hash_str = out[0] return out def _check_can_cache(*args, **kwargs): @@ -251,19 +273,45 @@ def _check_can_cache(*args, **kwargs): def _get_shape_env() -> AlwaysHitShapeEnv: return AlwaysHitShapeEnv() - with patch(# for hijacking the hash of the compiled graph - "torch._inductor.codecache.compiled_fx_graph_hash", - hijack_compiled_fx_graph_hash), \ - patch(# for providing a dummy shape environment - "torch._inductor.codecache.FxGraphCache._get_shape_env", - _get_shape_env), \ - patch(# for forcing the graph to be cached - "torch._inductor.codecache.FxGraphCache._check_can_cache", - _check_can_cache): + with ExitStack() as stack: + if not cache_data.disabled: + # compilation cache is enabled, patch several functions + + # hijack to get the compiled graph itself + stack.enter_context( + patch("torch._inductor.codecache.FxGraphCache.load", + hijack_load)) + + # for hijacking the hash of the compiled graph + stack.enter_context( + patch("torch._inductor.codecache.compiled_fx_graph_hash", + hijack_compiled_fx_graph_hash)) + + # for providing a dummy shape environment + stack.enter_context( + patch( + "torch._inductor.codecache.FxGraphCache._get_shape_env", + _get_shape_env)) + + # for forcing the graph to be cached + stack.enter_context( + patch( + "torch._inductor.codecache.FxGraphCache._check_can_cache", + _check_can_cache)) + compiled_graph = compile_fx(graph, example_inputs, config_patches=current_config) - + # store the inductor_artifact in the cache + cache_data[(runtime_shape, graph_index)] = inductor_artifact + if graph_index == 0: + # adds some info logging for the first graph + logger.info("Cache the graph of shape %s for later use", + str(runtime_shape)) + logger.debug( + "store the %s-th graph for shape %s via hash %s from file %s", + graph_index, str(runtime_shape), inductor_artifact.hash_str, + inductor_artifact.file_path) # after compiling the last graph, record the end time if graph_index == num_graphs - 1: now = time.time() @@ -476,6 +524,7 @@ def configure_post_pass(self): def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: + vllm_config = self.vllm_config if not self.compilation_config.cache_dir: # no provided cache dir, generate one based on the known factors # that affects the compilation. if none of the factors change, @@ -484,7 +533,6 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: # 1. factors come from the vllm_config (it mainly summarizes how the # model is created) - vllm_config = self.vllm_config config_hash = vllm_config.compute_hash() # 2. 
factors come from the code files that are traced by Dynamo ( @@ -508,20 +556,26 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: hash_key = hashlib.md5( f"{config_hash}_{code_hash}".encode()).hexdigest()[:10] cache_dir = os.path.join( - envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key, - f"rank_{vllm_config.parallel_config.rank}") - else: - cache_dir = self.compilation_config.cache_dir + envs.VLLM_CACHE_ROOT, + "torch_compile_cache", + hash_key, + ) + self.compilation_config.cache_dir = cache_dir + + cache_dir = self.compilation_config.cache_dir os.makedirs(cache_dir, exist_ok=True) + local_cache_dir = os.path.join( + cache_dir, f"rank_{vllm_config.parallel_config.rank}") + self.compilation_config.local_cache_dir = local_cache_dir disabled = envs.VLLM_DISABLE_COMPILE_CACHE self.inductor_hash_cache: InductorHashCache = InductorHashCache( - cache_dir, disabled=disabled) + local_cache_dir, disabled=disabled) if disabled: logger.info("vLLM's torch.compile cache is disabled.") else: logger.info("Using cache directory: %s for vLLM's torch.compile", - cache_dir) + local_cache_dir) # when dynamo calls the backend, it means the bytecode # transform and analysis are done @@ -561,6 +615,18 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: self.vllm_config, self.graph_pool, self).run(*example_inputs) + graph_path = os.path.join(local_cache_dir, "computation_graph.py") + if not os.path.exists(graph_path): + # code adapted from https://github.com/thuml/depyf/blob/dab831108a752d1facc00acdd6d4243891845c37/depyf/explain/patched_lazy_format_graph_code.py#L30 # noqa + # use `print_readable` because it can include submodules + src = "from __future__ import annotations\nimport torch\n" + \ + self.split_gm.print_readable(print_output=False) + src = src.replace("", "GraphModule") + with open(graph_path, "w") as f: + f.write(src) + + logger.debug("Computation graph saved to %s", graph_path) + self._called = True if not self.compilation_config.use_cudagraph or \ @@ -576,9 +642,13 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: ] # index of tensors that have symbolic shapes (batch size) + # for weights and static buffers, they will have concrete shapes. + # symbolic shape only happens for input tensors. + from torch.fx.experimental.symbolic_shapes import is_symbolic self.sym_tensor_indices = [ i for i, x in enumerate(fake_args) - if isinstance(x, torch._subclasses.fake_tensor.FakeTensor) + if isinstance(x, torch._subclasses.fake_tensor.FakeTensor) and \ + any(is_symbolic(d) for d in x.size()) ] # compiler managed cudagraph input buffers diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 10513111ea7f1..17eb0592ced6d 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -76,8 +76,8 @@ def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): During runtime, when we actually mark dimensions of tensors, it depends on the value of arguments: - - if it is a single integer, the corresponding dimension of the argument - will be marked as dynamic. + - if it is a single integer (can be negative), the corresponding dimension + of the argument will be marked as dynamic. - if it is `None`, ignored. - if it is `IntermediateTensors`, all the tensors in the intermediate tensors will be marked as dynamic. 
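A quick sketch of what "can be negative" means for the dynamic-dimension markers documented above: negative indices are normalized against the tensor's rank before torch._dynamo.mark_dynamic is called, so -1 always refers to the last dimension (the actual normalization appears in the decorators.py hunk below). Standalone toy example, assuming only that mark_dynamic exists:

import torch
import torch._dynamo

def mark_dims_dynamic(t: torch.Tensor, dims) -> None:
    dims = [dims] if isinstance(dims, int) else dims
    # Normalize negative indexing, e.g. -1 -> t.ndim - 1.
    dims = [t.ndim + d if d < 0 else d for d in dims]
    torch._dynamo.mark_dynamic(t, dims)

x = torch.zeros(2, 3, 4)
mark_dims_dynamic(x, -1)       # marks dim 2 (size 4) as dynamic
mark_dims_dynamic(x, [0, -2])  # marks dims 0 and 1 as dynamic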
@@ -177,10 +177,20 @@ def __call__(self, *args, **kwargs): for k, dims in dynamic_arg_dims.items(): arg = bound_args.arguments.get(k) if arg is not None: + dims = [dims] if isinstance(dims, int) else dims if isinstance(arg, torch.Tensor): + # In case dims is specified with negative indexing + dims = [ + arg.ndim + dim if dim < 0 else dim for dim in dims + ] torch._dynamo.mark_dynamic(arg, dims) elif isinstance(arg, IntermediateTensors): for tensor in arg.tensors.values(): + # In case dims is specified with negative indexing + dims = [ + tensor.ndim + dim if dim < 0 else dim + for dim in dims + ] torch._dynamo.mark_dynamic(tensor, dims) else: raise ValueError( @@ -188,6 +198,8 @@ def __call__(self, *args, **kwargs): f" {dims} for argument {k} with type {type(arg)}.") # here, it is the starting point of the `torch.compile` process start_monitoring_torch_compile(self.vllm_config) + logger.debug("Start compiling function %s", + self.original_code_object) # if we don't use custom dispatcher, we can directly call the # compiled function and let torch.compile handle the dispatching, diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index e3260a10c02ae..58a8fa76f6ce2 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -9,6 +9,9 @@ import vllm.envs as envs from vllm.config import CompilationLevel, get_current_vllm_config +from vllm.logger import init_logger + +logger = init_logger(__name__) class TorchCompileWrapperWithCustomDispatcher: @@ -82,6 +85,25 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType): return self.compiled_codes.append(new_code) + local_cache_dir = self.vllm_config.compilation_config.local_cache_dir + if isinstance(local_cache_dir, str): + decompiled_file = os.path.join(local_cache_dir, + "transformed_code.py") + if not os.path.exists(decompiled_file): + try: + # usually the decompilation will succeed for most models, + # as we guarantee a full-graph compilation in Dynamo. + # but there's no 100% guarantee, since decompilation is + # not a reversible process. 
+ import depyf + src = depyf.decompile(new_code) + with open(decompiled_file, "w") as f: + f.write(src) + + logger.debug("Dynamo transformed code saved to %s", + decompiled_file) + except Exception: + pass if self.vllm_config.compilation_config.use_cudagraph and \ "update" in new_code.co_names: diff --git a/vllm/config.py b/vllm/config.py index 4a42aefb75026..f4548c4466e48 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -195,40 +195,43 @@ def compute_hash(self) -> str: factors.append(self.rope_theta) return hashlib.sha256(str(factors).encode()).hexdigest() - def __init__(self, - model: str, - task: Union[TaskOption, Literal["draft"]], - tokenizer: str, - tokenizer_mode: str, - trust_remote_code: bool, - dtype: Union[str, torch.dtype], - seed: int, - allowed_local_media_path: str = "", - revision: Optional[str] = None, - code_revision: Optional[str] = None, - rope_scaling: Optional[Dict[str, Any]] = None, - rope_theta: Optional[float] = None, - tokenizer_revision: Optional[str] = None, - max_model_len: Optional[int] = None, - spec_target_max_model_len: Optional[int] = None, - quantization: Optional[str] = None, - quantization_param_path: Optional[str] = None, - enforce_eager: Optional[bool] = None, - max_seq_len_to_capture: Optional[int] = None, - max_logprobs: int = 20, - disable_sliding_window: bool = False, - skip_tokenizer_init: bool = False, - served_model_name: Optional[Union[str, List[str]]] = None, - limit_mm_per_prompt: Optional[Mapping[str, int]] = None, - use_async_output_proc: bool = True, - config_format: ConfigFormat = ConfigFormat.AUTO, - hf_overrides: Optional[HfOverrides] = None, - mm_processor_kwargs: Optional[Dict[str, Any]] = None, - disable_mm_preprocessor_cache: bool = False, - override_neuron_config: Optional[Dict[str, Any]] = None, - override_pooler_config: Optional["PoolerConfig"] = None, - logits_processor_pattern: Optional[str] = None, - generation_config: Optional[str] = None) -> None: + def __init__( + self, + model: str, + task: Union[TaskOption, Literal["draft"]], + tokenizer: str, + tokenizer_mode: str, + trust_remote_code: bool, + dtype: Union[str, torch.dtype], + seed: int, + allowed_local_media_path: str = "", + revision: Optional[str] = None, + code_revision: Optional[str] = None, + rope_scaling: Optional[Dict[str, Any]] = None, + rope_theta: Optional[float] = None, + tokenizer_revision: Optional[str] = None, + max_model_len: Optional[int] = None, + spec_target_max_model_len: Optional[int] = None, + quantization: Optional[str] = None, + quantization_param_path: Optional[str] = None, + enforce_eager: Optional[bool] = None, + max_seq_len_to_capture: Optional[int] = None, + max_logprobs: int = 20, + disable_sliding_window: bool = False, + skip_tokenizer_init: bool = False, + served_model_name: Optional[Union[str, List[str]]] = None, + limit_mm_per_prompt: Optional[Mapping[str, int]] = None, + use_async_output_proc: bool = True, + config_format: ConfigFormat = ConfigFormat.AUTO, + hf_overrides: Optional[HfOverrides] = None, + mm_processor_kwargs: Optional[Dict[str, Any]] = None, + disable_mm_preprocessor_cache: bool = False, + override_neuron_config: Optional[Dict[str, Any]] = None, + override_pooler_config: Optional["PoolerConfig"] = None, + logits_processor_pattern: Optional[str] = None, + generation_config: Optional[str] = None, + enable_sleep_mode: bool = False, + ) -> None: self.model = model self.tokenizer = tokenizer self.tokenizer_mode = tokenizer_mode @@ -277,6 +280,12 @@ def __init__(self, self.max_logprobs = max_logprobs 
self.disable_sliding_window = disable_sliding_window self.skip_tokenizer_init = skip_tokenizer_init + self.enable_sleep_mode = enable_sleep_mode + + from vllm.platforms import current_platform + + if self.enable_sleep_mode and not current_platform.is_cuda(): + raise ValueError("Sleep mode is only supported on CUDA devices.") hf_config = get_config(self.model, trust_remote_code, revision, code_revision, config_format) @@ -348,7 +357,6 @@ def __init__(self, self.is_hybrid = self._init_is_hybrid() self.has_inner_state = self._init_has_inner_state() - from vllm.platforms import current_platform if current_platform.is_neuron(): self.override_neuron_config = override_neuron_config else: @@ -357,6 +365,10 @@ def __init__(self, supported_tasks, task = self._resolve_task(task, self.hf_config) self.supported_tasks = supported_tasks self.task: Final = task + if self.task in ("draft", "generate"): + self.truncation_side = "left" + else: + self.truncation_side = "right" self.pooler_config = self._init_pooler_config(override_pooler_config) self.logits_processor_pattern = logits_processor_pattern @@ -553,7 +565,7 @@ def _verify_quantization(self) -> None: optimized_quantization_methods = [ "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin", "awq_marlin", "fbgemm_fp8", "compressed_tensors", - "compressed-tensors", "experts_int8" + "compressed-tensors", "experts_int8", "quark" ] if self.quantization is not None: self.quantization = self.quantization.lower() @@ -603,10 +615,12 @@ def _verify_cuda_graph(self) -> None: self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, self.max_model_len) - if (self.hf_config.model_type == 'deepseek_v3' + MODEL_NOT_SUPPORT_CUDA_GRAPH = ['mllama'] + if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH and not self.enforce_eager): - logger.warning("CUDA graph is not supported for Deepseek V3 yet, " - "fallback to the eager mode.") + logger.warning( + "CUDA graph is not supported for %s yet, fallback to the eager " + "mode.", self.hf_config.model_type) self.enforce_eager = True def _verify_bnb_config(self) -> None: @@ -729,9 +743,12 @@ def get_head_size(self) -> int: if hasattr(self.hf_text_config, "model_type") and (self.hf_text_config.model_type in ('deepseek_v2', 'deepseek_v3')): - # FlashAttention supports only head_size 32, 64, 128, 256, - # we need to pad head_size 192 to 256 - return 256 + qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim", + 0) + qk_nope_head_dim = getattr(self.hf_text_config, "qk_nope_head_dim", + 0) + if qk_rope_head_dim and qk_nope_head_dim: + return qk_rope_head_dim + qk_nope_head_dim if self.is_attention_free: return 0 @@ -1276,7 +1293,7 @@ def __post_init__(self) -> None: raise ValueError(f"worker-use-ray can't be used with " f"distributed executor backend " f"'{self.distributed_executor_backend}'.") - ray_only_devices = ["tpu", "hpu"] + ray_only_devices = ["tpu"] from vllm.platforms import current_platform if (current_platform.device_type in ray_only_devices and self.world_size > 1): @@ -1331,14 +1348,15 @@ def _verify_args(self) -> None: from vllm.executor.executor_base import ExecutorBase from vllm.platforms import current_platform if self.distributed_executor_backend not in ( - "ray", "mp", "uni", None) and not (isinstance( + "ray", "mp", "uni", + "external_launcher", None) and not (isinstance( self.distributed_executor_backend, type) and issubclass( self.distributed_executor_backend, ExecutorBase)): raise ValueError( "Unrecognized distributed executor backend " 
f"{self.distributed_executor_backend}. Supported " - "values are 'ray', 'mp' 'uni', or custom ExecutorBase" - " subclass.") + "values are 'ray', 'mp' 'uni', 'external_launcher' or" + " custom ExecutorBase subclass.") if self.use_ray: from vllm.executor import ray_utils ray_utils.assert_ray_available() @@ -1383,13 +1401,15 @@ class SchedulerConfig: is_multimodal_model: bool = False - # FIXME(woosuk & ywang96): Below are placeholder values. We need to - # calculate the actual values from the configurations. - # Multimodal encoder run compute budget, only used in V1 - max_num_encoder_input_tokens = 16384 + # NOTE: The following multimodal encoder budget will be initialized to + # max_num_batched_tokens and overridden in case max multimodal embedding + # size is larger. + # TODO (ywang96): Make these configurable. + # Multimodal encoder compute budget, only used in V1 + max_num_encoder_input_tokens: int = field(default=None) # type: ignore # Multimodal encoder cache size, only used in V1 - encoder_cache_size = 16384 + encoder_cache_size: int = field(default=None) # type: ignore # Whether to perform preemption by swapping or # recomputation. If not specified, we determine the mode as follows: @@ -1463,6 +1483,9 @@ def __post_init__(self) -> None: _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, ) + self.max_num_encoder_input_tokens = self.max_num_batched_tokens + self.encoder_cache_size = self.max_num_batched_tokens + if self.enable_chunked_prefill: logger.info( "Chunked prefill is enabled with max_num_batched_tokens=%d.", @@ -2770,6 +2793,7 @@ def model_post_init(self, __context: Any) -> None: compile_sizes: List[int] = PrivateAttr capture_sizes: List[int] = PrivateAttr max_capture_size: int = PrivateAttr + local_cache_dir: str = PrivateAttr # local cache dir for each rank # optimization: # Intuitively, bs_to_padded_graph_size should be Dict[int, int]. # since we know all keys are in a range [0, max_capture_size], @@ -2847,17 +2871,8 @@ def model_post_init(self, __context: Any) -> None: "vllm.unified_attention_with_output", ] else: - # v0 can use full graph compilation without splitting, - # splitting is optional. - # right now we still need it. kv cache shape - # will be included in the graph if we don't split - # the graph. - # TODO: hide kv cache in static forward context - # so that inductor does not see it. - self.splitting_ops = [ - "vllm.unified_attention", - "vllm.unified_attention_with_output", - ] + # v0 uses full graph compilation + self.splitting_ops = [] for k, v in self.inductor_passes.items(): if not isinstance(v, str): @@ -3159,7 +3174,8 @@ def __post_init__(self): if self.compilation_config is None: self.compilation_config = CompilationConfig() - if envs.VLLM_USE_V1 and not self.model_config.enforce_eager: + if envs.VLLM_USE_V1 and self.model_config is not None and \ + not self.model_config.enforce_eager: # NOTE(woosuk): Currently, we use inductor because the piecewise # CUDA graphs do not work properly with the custom CUDA kernels. 
# FIXME(woosuk): Disable inductor to reduce the compilation time diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 3a57487a6cd8a..c3e1665b4464e 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -339,6 +339,13 @@ def get_prefix_cache_hit_rate(self, device: Device) -> float: assert device in self._allocators return self._allocators[device].get_prefix_cache_hit_rate() + def reset_prefix_cache(self) -> bool: + """Reset prefix cache for all devices.""" + success = True + for allocator in self._allocators.values(): + success = success and allocator.reset_prefix_cache() + return success + def get_and_reset_swaps(self) -> List[Tuple[int, int]]: """Returns and clears the mapping of source to destination block IDs. Will be called after every swapping operations for now, and after every diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index 985a1098b6cd1..cb432db919c73 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -192,6 +192,11 @@ def get_prefix_cache_hit_rate(self) -> float: """Prefix cache hit rate. -1 means not supported or disabled.""" pass + @abstractmethod + def reset_prefix_cache(self) -> bool: + """Reset prefix cache.""" + pass + class NoFreeBlocksError(ValueError): pass @@ -297,6 +302,11 @@ def get_prefix_cache_hit_rate(self, device: Device) -> float: """Prefix cache hit rate. -1 means not supported or disabled.""" pass + @abstractmethod + def reset_prefix_cache(self) -> bool: + """Reset prefix cache.""" + pass + @abstractmethod def find_cached_blocks_prefix( self, diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 2e30813ddd3d6..d7842ee3243d0 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -137,16 +137,18 @@ def _allocate_block_id(self) -> BlockId: self._refcounter.incr(block_id) return block_id - def _free_block_id(self, block: Block) -> None: - block_id = block.block_id + def _free_block_id(self, block: Union[Block, BlockId]) -> None: + if isinstance(block, Block): + block_id = block.block_id + block.block_id = None + else: + block_id = block assert block_id is not None refcount = self._refcounter.decr(block_id) if refcount == 0: heapq.heappush(self._free_block_indices, block_id) - block.block_id = None - def free(self, block: Block, keep_block_object: bool = False) -> None: # Release the physical block id self._free_block_id(block) @@ -155,6 +157,9 @@ def free(self, block: Block, keep_block_object: bool = False) -> None: if not keep_block_object: self._block_pool.free_block(block) + def free_block_id(self, block_id: BlockId) -> None: + self._free_block_id(block_id) + def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. @@ -326,6 +331,10 @@ def swap_in(self, blocks: List[Block]) -> None: def get_prefix_cache_hit_rate(self) -> float: return -1 + def reset_prefix_cache(self) -> bool: + """No prefix cache for naive block allocator.""" + return True + def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]: # Not applicable for naive block allocator. 
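The new reset_prefix_cache() on the CPU/GPU block allocator AND-reduces the per-device results, so one failing device marks the whole reset as failed. A stripped-down sketch of that pattern with stand-in allocator objects:

class FakeAllocator:
    def __init__(self, ok: bool) -> None:
        self.ok = ok

    def reset_prefix_cache(self) -> bool:
        return self.ok

allocators = {"gpu": FakeAllocator(True), "cpu": FakeAllocator(False)}

success = True
for allocator in allocators.values():
    # Any device that cannot reset flips the aggregate result to False.
    success = success and allocator.reset_prefix_cache()

assert success is False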
return [] diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 1238303234deb..ccdc5daa9595c 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -12,6 +12,7 @@ from vllm.core.block.naive_block import (BlockPool, NaiveBlock, NaiveBlockAllocator) from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor +from vllm.logger import init_logger from vllm.sequence import Sequence PrefixHash = int @@ -21,6 +22,8 @@ # then we know this block hasn't been accessed yet. _DEFAULT_LAST_ACCESSED_TIME = -1 +logger = init_logger(__name__) + class BlockTracker: """Used to track the status of a block inside the prefix caching allocator @@ -105,7 +108,8 @@ def __init__( # Evitor used to maintain how we want to handle those computed blocks # if we find memory pressure is high. - self.evictor: Evictor = make_evictor(eviction_policy) + self.eviction_policy = eviction_policy + self.evictor: Evictor = make_evictor(self.eviction_policy) # We share the refcounter between allocators. This allows us to promote # blocks originally allocated in the hashless allocator to immutable @@ -428,6 +432,44 @@ def all_block_ids(self) -> FrozenSet[int]: def get_prefix_cache_hit_rate(self) -> float: return self.metric_data.get_hit_rate() + def reset_prefix_cache(self) -> bool: + """Reset prefix cache. This function may be used in RLHF + flows to invalid prefix caching after the weights are updated, + or used for resetting prefix caching status for benchmarking. + + Returns: + bool: True if the prefix cache is successfully reset, + False otherwise. + """ + num_used_blocks = (self.get_num_total_blocks() - + self.get_num_free_blocks()) + if num_used_blocks > 0: + logger.warning( + "Failed to reset prefix cache because some " + "blocks (%d) are not freed yet", num_used_blocks) + return False + + # Free all blocks in the evictor. + while (block_id := + self._maybe_allocate_evicted_block_id()) is not None: + self._hashless_allocator.free_block_id(block_id) + + # Should not have any cached blocks because all blocks are evicted. + assert not self._cached_blocks + + # Reset the evictor. + self.evictor = make_evictor(self.eviction_policy) + + # Reset the block tracker. + for block_id in self._block_tracker: + self._block_tracker[block_id] = BlockTracker() + + # Reset the metrics. + self.metric_data = CacheMetricData() + + logger.info("Successfully reset prefix cache") + return True + def is_block_cached(self, block: Block) -> bool: assert block.content_hash is not None return block.content_hash in self._cached_blocks diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index b41e848221882..62a5f0bda061a 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -455,6 +455,9 @@ def get_num_free_cpu_blocks(self) -> int: def get_prefix_cache_hit_rate(self, device: Device) -> float: return self.block_allocator.get_prefix_cache_hit_rate(device) + def reset_prefix_cache(self) -> bool: + return self.block_allocator.reset_prefix_cache() + def _can_swap(self, seq_group: SequenceGroup, device: Device, diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index b10b8d3f4a5bf..9c7e246e3c4ed 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -122,6 +122,11 @@ def get_prefix_cache_hit_rate(self, device: Device) -> float: """Prefix cache hit rate. 
-1 means not supported or disabled.""" pass + @abstractmethod + def reset_prefix_cache(self) -> bool: + """Reset prefix cache for all devices.""" + pass + @abstractmethod def get_num_cached_tokens(self, seq: Sequence) -> int: pass diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py index a47e594518534..f9924be4a3835 100644 --- a/vllm/core/placeholder_block_space_manager.py +++ b/vllm/core/placeholder_block_space_manager.py @@ -90,5 +90,8 @@ def mark_blocks_as_computed(self, seq_group: SequenceGroup, def get_prefix_cache_hit_rate(self, device: Device) -> float: return -1 + def reset_prefix_cache(self) -> bool: + return True + def get_num_cached_tokens(self, seq: Sequence) -> int: return 0 diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index b3d396f9cedda..b1630b34947bd 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -504,6 +504,9 @@ def has_unfinished_seqs(self) -> bool: def get_prefix_cache_hit_rate(self, device: Device) -> float: return self.block_manager.get_prefix_cache_hit_rate(device) + def reset_prefix_cache(self) -> bool: + return self.block_manager.reset_prefix_cache() + def get_num_unfinished_seq_groups(self) -> int: return len(self.waiting) + len(self.running) + len(self.swapped) diff --git a/vllm/device_allocator/__init__.py b/vllm/device_allocator/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py new file mode 100644 index 0000000000000..a43418dbb3b46 --- /dev/null +++ b/vllm/device_allocator/cumem.py @@ -0,0 +1,254 @@ +# cumem-based pytorch pluggable allocator to implement sleep mode. +# other approaches tried but failed: +# - cuda-python package binding +# - custom libcuda driver ctypes wrapper +# both of them failed because of cuda context mismatch. +# not sure why, they are created from a different context. +# the only successful approach is to call cuda driver API in C. +import dataclasses +from contextlib import contextmanager +from typing import Callable, Dict, Optional, Tuple, Union + +import torch + +from vllm.utils import is_pin_memory_available + + +def find_loaded_library(lib_name) -> Optional[str]: + """ + According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html, + the file `/proc/self/maps` contains the memory maps of the process, which includes the + shared libraries loaded by the process. We can use this file to find the path of the + a loaded library. 
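A short usage sketch for the find_loaded_library helper defined here. It assumes the CUDA runtime has already been loaded into the current process; the returned path is machine-specific:

from vllm.device_allocator.cumem import find_loaded_library

path = find_loaded_library("libcudart")
if path is None:
    print("libcudart is not mapped into this process yet")
else:
    print(f"libcudart found at {path}")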
+ """ # noqa + found_line = None + with open("/proc/self/maps") as f: + for line in f: + if lib_name in line: + found_line = line + break + if found_line is None: + # the library is not loaded in the current process + return None + # if lib_name is libcudart, we need to match a line with: + # address /path/to/libcudart-hash.so.11.0 + start = found_line.index("/") + path = found_line[start:].strip() + filename = path.split("/")[-1] + assert filename.rpartition(".so")[0].startswith(lib_name), \ + f"Unexpected filename: {filename} for library {lib_name}" + return path + + +cumem_available = False +try: + from vllm.cumem_allocator import (init_module, python_create_and_map, + python_unmap_and_release) + from vllm.distributed.device_communicators.cuda_wrapper import ( + CudaRTLibrary) + lib_name = find_loaded_library("cumem_allocator") + libcudart = CudaRTLibrary() + cumem_available = True +except ModuleNotFoundError: + # rocm platform does not support cumem allocator + init_module = None + python_create_and_map = None + python_unmap_and_release = None + CudaRTLibrary = None + lib_name = None + libcudart = None + +# py_device, py_alignedSize, py_d_mem, py_p_memHandle +HandleType = Tuple[int, int, int, int] + + +@dataclasses.dataclass +class AllocationData: + handle: HandleType + tag: str + cpu_backup_tensor: Optional[torch.Tensor] = None + + +def create_and_map(allocation_handle: HandleType) -> None: + python_create_and_map(*allocation_handle) + + +def unmap_and_release(allocation_handle: HandleType) -> None: + python_unmap_and_release(*allocation_handle) + + +def get_pluggable_allocator( + python_malloc_fn: Callable[[int], + int], python_free_func: Callable[[int, int], + None] +) -> torch.cuda.memory.CUDAPluggableAllocator: + init_module(python_malloc_fn, python_free_func) + new_alloc = torch.cuda.memory.CUDAPluggableAllocator( + lib_name, 'my_malloc', 'my_free') + return new_alloc + + +@contextmanager +def use_memory_pool_with_allocator( + python_malloc_fn: Callable[[int], int], + python_free_func: Callable[[int, int], None]) -> None: + new_alloc = get_pluggable_allocator(python_malloc_fn, python_free_func) + mem_pool = torch.cuda.memory.MemPool(new_alloc._allocator) + with torch.cuda.memory.use_mem_pool(mem_pool): + yield mem_pool + + +class CuMemAllocator: + """ + A singleton class that manages a memory pool for CUDA tensors. + The memory in this pool can be offloaded or discarded when the + allocator sleeps. + + Inside the `use_memory_pool(tag)` context, all tensors created will + be allocated in the memory pool, and has the same tag as the + tag passed to the context. + + When we call `sleep`, all tensors with the specified tag will be + offloaded to CPU memory, and the rest of the tensors will be discarded. + When we call `wake_up`, all tensors that are previously offloaded + will be loaded back to GPU memory, and the rest of the tensors will + have empty memory. + + Why it needs to be a singleton? + When allocated tensors are garbage collected, PyTorch will call + the free callback, which will call the `python_free_callback` method. + The C-extension uses a global variable to store the function of an + instance of this class. If we create multiple instances of this class, + the global variable will be overwritten and the free callback will + not work as expected. + """ + instance: "CuMemAllocator" = None + default_tag: str = "default" + + @staticmethod + def get_instance() -> "CuMemAllocator": + """ + CuMemAllocator is a singleton class. + We cannot call the constructor directly. 
+ Call this method to get the instance. + """ + assert cumem_available, "cumem allocator is not available" + if CuMemAllocator.instance is None: + CuMemAllocator.instance = CuMemAllocator() + return CuMemAllocator.instance + + def __init__(self): + self.pointer_to_data: Dict[int, AllocationData] = {} + self.current_tag: str = CuMemAllocator.default_tag + + def python_malloc_callback(self, allocation_handle: HandleType) -> None: + """ + Internal method to store the allocation data + when memory is allocated in the memory pool.""" + py_d_mem = allocation_handle[2] + self.pointer_to_data[py_d_mem] = AllocationData( + allocation_handle, self.current_tag) + return + + def python_free_callback(self, ptr: int) -> HandleType: + """ + Internal method to look up the allocation data + when memory is freed in the memory pool.""" + data = self.pointer_to_data.pop(ptr) + if data.cpu_backup_tensor is not None: + data.cpu_backup_tensor = None + return data.handle + + def sleep( + self, + offload_tags: Optional[Union[Tuple[str, ...], + str]] = None) -> None: + """ + Put the allocator in sleep mode. + All data in the memory allocation with the specified tag will be + offloaded to CPU memory, and others will be discarded. + + :param offload_tags: The tags of the memory allocation that will be + offloaded. The rest of the memory allocation will be discarded. + """ + if offload_tags is None: + # by default, allocated tensors are offloaded + # when the allocator sleeps + offload_tags = (CuMemAllocator.default_tag, ) + elif isinstance(offload_tags, str): + offload_tags = (offload_tags, ) + + assert isinstance(offload_tags, tuple) + + for ptr, data in self.pointer_to_data.items(): + handle = data.handle + if data.tag in offload_tags: + size_in_bytes = handle[1] + cpu_backup_tensor = torch.empty( + size_in_bytes, + dtype=torch.uint8, + device='cpu', + pin_memory=is_pin_memory_available()) + cpu_ptr = cpu_backup_tensor.data_ptr() + libcudart.cudaMemcpy(cpu_ptr, ptr, size_in_bytes) + data.cpu_backup_tensor = cpu_backup_tensor + unmap_and_release(handle) + + def wake_up(self): + """ + Wake up the allocator from sleep mode. + All data that is previously offloaded will be loaded back to GPU + memory, and the rest of the data will have empty memory.""" + for ptr, data in self.pointer_to_data.items(): + handle = data.handle + create_and_map(handle) + if data.cpu_backup_tensor is not None: + cpu_backup_tensor = data.cpu_backup_tensor + if cpu_backup_tensor is not None: + size_in_bytes = cpu_backup_tensor.numel( + ) * cpu_backup_tensor.element_size() + cpu_ptr = cpu_backup_tensor.data_ptr() + libcudart.cudaMemcpy(ptr, cpu_ptr, size_in_bytes) + data.cpu_backup_tensor = None + + @contextmanager + def use_memory_pool(self, tag: Optional[str] = None): + """ + A context manager to use the memory pool. + All memory allocation created inside the context will be allocated + in the memory pool, and has the specified tag. + + :param tag: The tag of the memory allocation. If None, the default tag + will be used. + """ + if tag is None: + tag = CuMemAllocator.default_tag + + assert isinstance(tag, str) + + old_tag = self.current_tag + self.current_tag = tag + with use_memory_pool_with_allocator(self.python_malloc_callback, + self.python_free_callback): + yield + # PyTorch's bug, calling torch.cuda.empty_cache() will error + # when using pluggable allocator, see + # https://github.com/pytorch/pytorch/issues/145168 . + # if we have some memory allocated and then freed, + # the memory will not be released. 
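Putting the CuMemAllocator pieces together, a hedged end-to-end sketch of the tag-based flow (CUDA only, and it assumes the cumem_allocator C extension shipped with vLLM is importable; the "weights" tag is illustrative):

import torch
from vllm.device_allocator.cumem import CuMemAllocator

allocator = CuMemAllocator.get_instance()

# Tensors created inside the pool are tracked under the active tag.
with allocator.use_memory_pool(tag="weights"):
    weights = torch.empty(1 << 20, device="cuda")

# Offload "weights"-tagged allocations to pinned CPU memory and discard
# everything else in the pool.
allocator.sleep(offload_tags="weights")

# Re-map the pool and copy the offloaded contents back to the GPU.
allocator.wake_up()
print(allocator.get_current_usage(), "bytes tracked by the pool")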
+ # right now it is fine, because we only use this allocator + # during weight loading and kv cache creation, where we only + # allocate memory. + # TODO: we need to find a way to release the memory, + # i.e. calling torch.cuda.empty_cache() + self.current_tag = old_tag + + def get_current_usage(self) -> int: + """ + Get the total number of bytes allocated in the memory pool. + """ + sum_bytes: int = 0 + for ptr, data in self.pointer_to_data.items(): + handle = data.handle + sum_bytes += handle[1] + return sum_bytes diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py index 4ace03ff1184e..7780e2dfa317d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py @@ -35,6 +35,7 @@ def __init__( ): self.config = config.kv_transfer_config + self.tp_size = config.parallel_config.tensor_parallel_size if self.config.kv_connector == "PyNcclConnector": from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import ( @@ -161,7 +162,7 @@ def send_kv_caches_and_hidden_states( end_layer = model_executable.model.end_layer model_config = model_executable.model.config - num_heads = model_config.num_key_value_heads + num_heads = int(model_config.num_key_value_heads / self.tp_size) hidden_size = model_config.hidden_size num_attention_heads = model_config.num_attention_heads head_size = int(hidden_size / num_attention_heads) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index bf8b30cccd5f6..ffdf8b0f48087 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1183,6 +1183,11 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False): from vllm.platforms import current_platform if not current_platform.is_cpu(): torch.cuda.empty_cache() + try: + torch._C._host_emptyCache() + except AttributeError: + logger.warning( + "torch._C._host_emptyCache() only available in Pytorch >=2.5") def in_the_same_node_as(pg: Union[ProcessGroup, StatelessProcessGroup], diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c31b206d6f60e..f58c1b55e0c70 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -197,6 +197,7 @@ class EngineArgs: kv_transfer_config: Optional[KVTransferConfig] = None generation_config: Optional[str] = None + enable_sleep_mode: bool = False def __post_init__(self): if not self.tokenizer: @@ -238,7 +239,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: choices=get_args(TaskOption), help='The task to use the model for. Each vLLM instance only ' 'supports one task, even if the same model can be used for ' - 'multiple tasks. When the model only supports one task, "auto" ' + 'multiple tasks. 
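The simple_connector fix above divides the model-level KV-head count by the tensor-parallel size, since each rank only transfers its own shard of the KV cache. A tiny illustrative calculation (numbers are made up):

num_key_value_heads = 8   # whole-model KV heads (illustrative)
tp_size = 4               # tensor-parallel degree

# Per-rank KV heads, matching the corrected computation above.
num_heads = int(num_key_value_heads / tp_size)
assert num_heads == 2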
When the model only supports one task, ``"auto"`` ' 'can be used to select it; otherwise, you must specify explicitly ' 'which task to use.') parser.add_argument( @@ -250,7 +251,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument( '--skip-tokenizer-init', action='store_true', - help='Skip initialization of tokenizer and detokenizer') + help='Skip initialization of tokenizer and detokenizer.') parser.add_argument( '--revision', type=nullable_str, @@ -388,7 +389,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: # Parallel arguments parser.add_argument( '--distributed-executor-backend', - choices=['ray', 'mp'], + choices=['ray', 'mp', 'uni', 'external_launcher'], default=EngineArgs.distributed_executor_backend, help='Backend to use for distributed model ' 'workers, either "ray" or "mp" (multiprocessing). If the product ' @@ -396,12 +397,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'or equal to the number of GPUs available, "mp" will be used to ' 'keep processing on a single host. Otherwise, this will default ' 'to "ray" if Ray is installed and fail otherwise. Note that tpu ' - 'and hpu only support Ray for distributed inference.') + 'only supports Ray for distributed inference.') parser.add_argument( '--worker-use-ray', action='store_true', - help='Deprecated, use --distributed-executor-backend=ray.') + help='Deprecated, use ``--distributed-executor-backend=ray``.') parser.add_argument('--pipeline-parallel-size', '-pp', type=int, @@ -430,7 +431,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: choices=[8, 16, 32, 64, 128], help='Token block size for contiguous chunks of ' 'tokens. This is ignored on neuron devices and ' - 'set to max-model-len. On CUDA devices, ' + 'set to ``--max-model-len``. On CUDA devices, ' 'only block sizes up to 32 are supported. ' 'On HPU devices, block size defaults to 128.') @@ -439,12 +440,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: action=argparse.BooleanOptionalAction, default=EngineArgs.enable_prefix_caching, help="Enables automatic prefix caching. " - "Use --no-enable-prefix-caching to disable explicitly.", + "Use ``--no-enable-prefix-caching`` to disable explicitly.", ) parser.add_argument('--disable-sliding-window', action='store_true', help='Disables sliding window, ' - 'capping to sliding window size') + 'capping to sliding window size.') parser.add_argument('--use-v2-block-manager', action='store_true', default=True, @@ -861,7 +862,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "of the provided names. The model name in the model " "field of a response will be the first name in this " "list. If not specified, the model name will be the " - "same as the `--model` argument. Noted that this name(s) " + "same as the ``--model`` argument. Noted that this name(s) " "will also be used in `model_name` tag content of " "prometheus metrics, if multiple names provided, metrics " "tag will take the first one.") @@ -881,7 +882,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=None, help="Valid choices are " + ",".join(ALLOWED_DETAILED_TRACE_MODULES) + - ". It makes sense to set this only if --otlp-traces-endpoint is" + ". It makes sense to set this only if ``--otlp-traces-endpoint`` is" " set. If set, it will collect detailed traces for the specified " "modules. 
This involves use of possibly costly and or blocking " "operations and hence might have a performance impact.") @@ -955,6 +956,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "loaded from model. If set to a folder path, the generation config " "will be loaded from the specified folder path.") + parser.add_argument("--enable-sleep-mode", + action="store_true", + default=False, + help="Enable sleep mode for the engine. " + "(only cuda platform is supported)") + return parser @classmethod @@ -999,7 +1006,9 @@ def create_model_config(self) -> ModelConfig: override_neuron_config=self.override_neuron_config, override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, - generation_config=self.generation_config) + generation_config=self.generation_config, + enable_sleep_mode=self.enable_sleep_mode, + ) def create_load_config(self) -> LoadConfig: return LoadConfig( diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 08fef8250d483..739ea06ae3818 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1182,6 +1182,9 @@ async def start_profile(self) -> None: async def stop_profile(self) -> None: self.engine.stop_profile() + async def reset_prefix_cache(self) -> None: + self.engine.reset_prefix_cache() + async def add_lora(self, lora_request: LoRARequest) -> None: self.engine.add_lora(lora_request) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 49a1e9f505d9f..7da18d5f7d2eb 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -457,6 +457,11 @@ def _get_executor_cls(cls, # JAX-style, single-process, multi-device executor. from vllm.executor.uniproc_executor import UniProcExecutor executor_class = UniProcExecutor + elif distributed_executor_backend == "external_launcher": + # executor with external launcher + from vllm.executor.uniproc_executor import ( # noqa + ExecutorWithExternalLauncher) + executor_class = ExecutorWithExternalLauncher else: from vllm.executor.uniproc_executor import UniProcExecutor executor_class = UniProcExecutor @@ -684,7 +689,9 @@ def add_request( :class:`~vllm.PoolingParams` for pooling. arrival_time: The arrival time of the request. If None, we use the current monotonic time. + lora_request: The LoRA request to add. trace_headers: OpenTelemetry trace headers. + prompt_adapter_request: The prompt adapter request to add. priority: The priority of the request. Only applicable with priority scheduling. 
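With the new --enable-sleep-mode flag (and the matching enable_sleep_mode engine arg), the engine-level sleep()/wake_up() added elsewhere in this diff can be driven from the offline API. A hedged usage sketch; the model name is only an example:

from vllm import LLM

llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct", enable_sleep_mode=True)
outputs = llm.generate("Hello, my name is")

llm.sleep(level=1)   # offload weights to CPU memory, drop the KV cache
# ... the freed GPU memory can be used by another process here ...
llm.wake_up()        # restore weights and reallocate the KV cache

outputs = llm.generate("The capital of France is")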
@@ -907,6 +914,14 @@ def has_unfinished_requests_for_virtual_engine( """ return self.scheduler[virtual_engine].has_unfinished_seqs() + def reset_prefix_cache(self) -> bool: + """Reset prefix cache for all devices.""" + + success = True + for scheduler in self.scheduler: + success = success and scheduler.reset_prefix_cache() + return success + @staticmethod def _process_sequence_group_outputs( seq_group: SequenceGroup, @@ -1811,6 +1826,16 @@ def start_profile(self) -> None: def stop_profile(self) -> None: self.model_executor.stop_profile() + def sleep(self, level: int = 1) -> None: + assert self.vllm_config.model_config.enable_sleep_mode, ( + "Sleep mode is not enabled in the model config") + self.model_executor.sleep(level=level) + + def wake_up(self) -> None: + assert self.vllm_config.model_config.enable_sleep_mode, ( + "Sleep mode is not enabled in the model config") + self.model_executor.wake_up() + def check_health(self) -> None: if self.tokenizer: self.tokenizer.check_health() @@ -1850,46 +1875,44 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None: metrics = seq_group.metrics ttft = metrics.first_token_time - metrics.arrival_time e2e_time = metrics.finished_time - metrics.arrival_time - # attribute names are based on - # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/llm-spans.md - seq_span.set_attribute(SpanAttributes.LLM_RESPONSE_MODEL, + seq_span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL, self.model_config.model) - seq_span.set_attribute(SpanAttributes.LLM_REQUEST_ID, + seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, seq_group.request_id) - seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TEMPERATURE, + seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE, seq_group.sampling_params.temperature) - seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TOP_P, + seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, seq_group.sampling_params.top_p) - seq_span.set_attribute(SpanAttributes.LLM_REQUEST_MAX_TOKENS, + seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS, seq_group.sampling_params.max_tokens) - seq_span.set_attribute(SpanAttributes.LLM_REQUEST_N, + seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N, seq_group.sampling_params.n) - seq_span.set_attribute(SpanAttributes.LLM_USAGE_NUM_SEQUENCES, + seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_NUM_SEQUENCES, seq_group.num_seqs()) - seq_span.set_attribute(SpanAttributes.LLM_USAGE_PROMPT_TOKENS, + seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, len(seq_group.prompt_token_ids)) seq_span.set_attribute( - SpanAttributes.LLM_USAGE_COMPLETION_TOKENS, + SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS, sum([ seq.get_output_len() for seq in seq_group.get_finished_seqs() ])) - seq_span.set_attribute(SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE, + seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE, metrics.time_in_queue) seq_span.set_attribute( - SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN, ttft) - seq_span.set_attribute(SpanAttributes.LLM_LATENCY_E2E, e2e_time) + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, ttft) + seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time) if metrics.scheduler_time is not None: seq_span.set_attribute( - SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER, + SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER, metrics.scheduler_time) if metrics.model_forward_time is not None: seq_span.set_attribute( - SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_FORWARD, + 
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD, metrics.model_forward_time / 1000.0) if metrics.model_execute_time is not None: seq_span.set_attribute( - SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE, + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE, metrics.model_execute_time) def _validate_model_inputs(self, inputs: ProcessorInputs, diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py index 7132f9840001a..d9703b820a779 100644 --- a/vllm/engine/multiprocessing/__init__.py +++ b/vllm/engine/multiprocessing/__init__.py @@ -121,6 +121,10 @@ class RPCUProfileRequest(Enum): STOP_PROFILE = 2 +class RPCResetPrefixCacheRequest(Enum): + RESET_PREFIX_CACHE = 1 + + @dataclass class RPCLoadAdapterRequest: lora_request: LoRARequest @@ -134,7 +138,8 @@ class RPCAdapterLoadedResponse: RPC_REQUEST_T = Union[RPCProcessRequest, RPCAbortRequest, RPCStartupRequest, - RPCUProfileRequest, RPCLoadAdapterRequest] + RPCUProfileRequest, RPCLoadAdapterRequest, + RPCResetPrefixCacheRequest] REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse, RPCError] diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index a9ab899535180..5237f63c34c01 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -27,8 +27,9 @@ VLLM_RPC_SUCCESS_STR, RPCAbortRequest, RPCAdapterLoadedResponse, RPCError, RPCLoadAdapterRequest, - RPCProcessRequest, RPCStartupRequest, - RPCStartupResponse, + RPCProcessRequest, + RPCResetPrefixCacheRequest, + RPCStartupRequest, RPCStartupResponse, RPCUProfileRequest) from vllm.engine.protocol import EngineClient # yapf: enable @@ -262,7 +263,14 @@ async def setup(self): """Setup the client before it starts sending server requests.""" # Start output_loop - self.output_loop = asyncio.create_task(self.run_output_handler_loop()) + if self.output_loop is None: + # only generate once to avoid multiple concurrent output_loops + # this will lead to race conditions and wrong orders of tokens + # returned by the engine + # setup will be called multiple times during the startup of + # the engine + self.output_loop = asyncio.create_task( + self.run_output_handler_loop()) with self.get_data_socket() as socket: # Wait until server is ready. @@ -271,8 +279,9 @@ async def setup(self): self.tracing_flag = response.tracing_enabled # Start health_loop. 
- self.health_loop = asyncio.create_task( - self.run_heartbeat_loop(timeout=VLLM_RPC_TIMEOUT)) + if self.health_loop is None: + self.health_loop = asyncio.create_task( + self.run_heartbeat_loop(timeout=VLLM_RPC_TIMEOUT)) def close(self): """Destroy the ZeroMQ Context.""" @@ -667,6 +676,13 @@ async def stop_profile(self) -> None: await self._send_one_way_rpc_request( request=RPCUProfileRequest.STOP_PROFILE, socket=self.input_socket) + async def reset_prefix_cache(self) -> None: + """Reset the prefix cache""" + + await self._send_one_way_rpc_request( + request=RPCResetPrefixCacheRequest.RESET_PREFIX_CACHE, + socket=self.input_socket) + async def add_lora(self, lora_request: LoRARequest) -> None: """Load a new LoRA adapter into the engine for future requests.""" # Uses the same I/O as generate requests diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 8f231de912c95..166f89743b3cd 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -16,8 +16,9 @@ VLLM_RPC_SUCCESS_STR, RPCAbortRequest, RPCAdapterLoadedResponse, RPCError, RPCLoadAdapterRequest, - RPCProcessRequest, RPCStartupRequest, - RPCStartupResponse, + RPCProcessRequest, + RPCResetPrefixCacheRequest, + RPCStartupRequest, RPCStartupResponse, RPCUProfileRequest) # yapf: enable from vllm.logger import init_logger @@ -237,6 +238,8 @@ def handle_new_input(self): self.stop_profile() elif isinstance(request, RPCLoadAdapterRequest): self._handle_load_adapter_request(request) + elif isinstance(request, RPCResetPrefixCacheRequest): + self.reset_prefix_cache() else: raise ValueError("Unknown RPCRequest Type: " f"{type(request)}") @@ -296,6 +299,7 @@ def _handle_load_adapter_request(self, request: RPCLoadAdapterRequest): is_engine_errored=False, exception=e) self._send_outputs(rpc_err) + return # Otherwise, send back the successful load message self._send_outputs( RPCAdapterLoadedResponse(request_id=request.request_id)) @@ -360,6 +364,9 @@ def start_profile(self) -> None: def stop_profile(self) -> None: self.engine.stop_profile() + def reset_prefix_cache(self) -> bool: + return self.engine.reset_prefix_cache() + def signal_handler(*_) -> None: raise KeyboardInterrupt("MQLLMEngine terminated") diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index c8b282b1a7676..99c2baf3f4df4 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -144,7 +144,7 @@ def process_outputs(self, def _process_decode_and_stop(self, seq: Sequence, sampling_params: SamplingParams) -> None: new_char_count = 0 - if sampling_params.detokenize: + if sampling_params.detokenize and self.detokenizer: new_char_count = self.detokenizer.decode_sequence_inplace( seq, sampling_params) diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index da3185f33dbe9..55c56abea0da3 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -102,9 +102,9 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, Args: seq_group: the output is associated with this :class:`SequenceGroup` - output: the :class:`SequenceGroupOutput` for a single scheduler step + outputs: the :class:`SequenceGroupOutput` for a single scheduler step """ - assert len(outputs) == 1, ("Single step should only has 1 output.") + assert len(outputs) == 1, "Single step should only have 1 output." 
output = outputs[0] assert isinstance(output, CompletionSequenceGroupOutput) single_step_process_prompt_logprob(self, seq_group, output) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index f05ff62c4766b..de7b2c1b91f50 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -271,6 +271,11 @@ async def stop_profile(self) -> None: """Start profiling the engine""" ... + @abstractmethod + async def reset_prefix_cache(self) -> None: + """Reset the prefix cache""" + ... + @abstractmethod async def add_lora(self, lora_request: LoRARequest) -> None: """Load a new LoRA adapter into the engine for future requests.""" diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index acb4db85632a8..563031cfadc4c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1,11 +1,13 @@ import itertools import warnings from contextlib import contextmanager -from typing import (Any, ClassVar, Dict, List, Optional, Sequence, Tuple, Type, - Union, cast, overload) +from typing import (Any, Callable, ClassVar, Dict, List, Optional, Sequence, + Tuple, Type, Union, cast, overload) +import cloudpickle +import torch.nn as nn from tqdm import tqdm -from typing_extensions import deprecated +from typing_extensions import TypeVar, deprecated from vllm import envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, @@ -41,6 +43,8 @@ logger = init_logger(__name__) +_R = TypeVar("_R", default=Any) + class LLM: """An LLM for generating texts from given prompts and sampling parameters. @@ -186,6 +190,13 @@ def __init__( if "disable_log_stats" not in kwargs: kwargs["disable_log_stats"] = True + if "worker_cls" in kwargs: + worker_cls = kwargs["worker_cls"] + # if the worker_cls is not qualified string name, + # we serialize it using cloudpickle to avoid pickling issues + if isinstance(worker_cls, type): + kwargs["worker_cls"] = cloudpickle.dumps(worker_cls) + if compilation_config is not None: if isinstance(compilation_config, (int, dict)): compilation_config_instance = CompilationConfig.from_cli( @@ -455,6 +466,44 @@ def generate( outputs = self._run_engine(use_tqdm=use_tqdm) return self.engine_class.validate_outputs(outputs, RequestOutput) + def collective_rpc(self, + method: Union[str, Callable[..., _R]], + timeout: Optional[float] = None, + args: Tuple = (), + kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: + """ + Execute an RPC call on all workers. + + Args: + method: Name of the worker method to execute, or a callable that + is serialized and sent to all workers to execute. + + If the method is a callable, it should accept an additional + `self` argument, in addition to the arguments passed in `args` + and `kwargs`. The `self` argument will be the worker object. + timeout: Maximum time in seconds to wait for execution. Raises a + :exc:`TimeoutError` on timeout. `None` means wait indefinitely. + args: Positional arguments to pass to the worker method. + kwargs: Keyword arguments to pass to the worker method. + + Returns: + A list containing the results from each worker. + + Note: + It is recommended to use this API to only pass control messages, + and set up data-plane communication to pass data. + """ + executor = self.llm_engine.model_executor + return executor.collective_rpc(method, timeout, args, kwargs) + + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + """ + Run a function directly on the model inside each worker, + returning the result for each of them. 
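A hedged usage sketch of the new LLM.collective_rpc() API: a callable is serialized, sent to every worker, and invoked with the worker object as its first argument. The model choice and TP degree are illustrative:

from vllm import LLM

def gpu_name(worker):
    # Runs inside every worker process; `worker` is the vLLM worker object.
    import torch
    return torch.cuda.get_device_name()

llm = LLM(model="facebook/opt-125m", tensor_parallel_size=2)
print(llm.collective_rpc(gpu_name))  # one entry per worker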
+ """ + executor = self.llm_engine.model_executor + return executor.apply_model(func) + def beam_search( self, prompts: List[Union[TokensPrompt, TextPrompt]], @@ -1083,6 +1132,33 @@ def start_profile(self) -> None: def stop_profile(self) -> None: self.llm_engine.stop_profile() + def reset_prefix_cache(self) -> bool: + return self.llm_engine.reset_prefix_cache() + + def sleep(self, level: int = 1): + """ + Put the engine to sleep. The engine should not process any requests. + The caller should guarantee that no requests are being processed + during the sleep period, before `wake_up` is called. + + :param level: The sleep level. Level 1 sleep will offload the model + weights and discard the kv cache. The content of kv cache is + forgotten. Level 1 sleep is good for sleeping and waking up the + engine to run the same model again. The model weights are backed + up in CPU memory. Please make sure there's enough CPU memory to + store the model weights. Level 2 sleep will discard both the model + weights and the kv cache. The content of both the model weights + and kv cache is forgotten. Level 2 sleep is good for sleeping and + waking up the engine to run a different model or update the model, + where previous model weights are not needed. It reduces CPU memory + pressure. + """ + self.reset_prefix_cache() + self.llm_engine.sleep(level=level) + + def wake_up(self): + self.llm_engine.wake_up() + # LEGACY def _convert_v1_inputs( self, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1aeefe86cd05e..f510c41503011 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1,5 +1,6 @@ import asyncio import atexit +import gc import importlib import inspect import multiprocessing @@ -104,6 +105,11 @@ async def _force_log(): task.add_done_callback(_running_tasks.remove) else: task = None + + # Mark the startup heap as static so that it's ignored by GC. + # Reduces pause times of oldest generation collections. + gc.collect() + gc.freeze() try: yield finally: @@ -518,6 +524,18 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): }, } +if envs.VLLM_SERVER_DEV_MODE: + + @router.post("/reset_prefix_cache") + async def reset_prefix_cache(raw_request: Request): + """ + Reset the prefix cache. Note that we currently do not check if the + prefix cache is successfully reset in the API server. 
+ """ + logger.info("Resetting prefix cache...") + await engine_client(raw_request).reset_prefix_cache() + return Response(status_code=200) + @router.post("/invocations") async def invocations(raw_request: Request): diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 22206ef8dbfe6..4df75a665bab9 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -79,29 +79,29 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument("--host", type=nullable_str, default=None, - help="host name") - parser.add_argument("--port", type=int, default=8000, help="port number") + help="Host name.") + parser.add_argument("--port", type=int, default=8000, help="Port number.") parser.add_argument( "--uvicorn-log-level", type=str, default="info", choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'], - help="log level for uvicorn") + help="Log level for uvicorn.") parser.add_argument("--allow-credentials", action="store_true", - help="allow credentials") + help="Allow credentials.") parser.add_argument("--allowed-origins", type=json.loads, default=["*"], - help="allowed origins") + help="Allowed origins.") parser.add_argument("--allowed-methods", type=json.loads, default=["*"], - help="allowed methods") + help="Allowed methods.") parser.add_argument("--allowed-headers", type=json.loads, default=["*"], - help="allowed headers") + help="Allowed headers.") parser.add_argument("--api-key", type=nullable_str, default=None, @@ -115,10 +115,10 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: action=LoRAParserAction, help="LoRA module configurations in either 'name=path' format" "or JSON format. " - "Example (old format): 'name=path' " + "Example (old format): ``'name=path'`` " "Example (new format): " - "'{\"name\": \"name\", \"local_path\": \"path\", " - "\"base_model_name\": \"id\"}'") + "``{\"name\": \"name\", \"path\": \"lora_path\", " + "\"base_model_name\": \"id\"}``") parser.add_argument( "--prompt-adapters", type=nullable_str, @@ -132,7 +132,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=None, help="The file path to the chat template, " "or the template in single-line form " - "for the specified model") + "for the specified model.") parser.add_argument( '--chat-template-content-format', type=str, @@ -141,38 +141,39 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help='The format to render message content within a chat template.' '\n\n' '* "string" will render the content as a string. ' - 'Example: "Hello World"\n' + 'Example: ``"Hello World"``\n' '* "openai" will render the content as a list of dictionaries, ' 'similar to OpenAI schema. 
' - 'Example: [{"type": "text", "text": "Hello world!"}]') + 'Example: ``[{"type": "text", "text": "Hello world!"}]``') parser.add_argument("--response-role", type=nullable_str, default="assistant", help="The role name to return if " - "`request.add_generation_prompt=true`.") + "``request.add_generation_prompt=true``.") parser.add_argument("--ssl-keyfile", type=nullable_str, default=None, - help="The file path to the SSL key file") + help="The file path to the SSL key file.") parser.add_argument("--ssl-certfile", type=nullable_str, default=None, - help="The file path to the SSL cert file") + help="The file path to the SSL cert file.") parser.add_argument("--ssl-ca-certs", type=nullable_str, default=None, - help="The CA certificates file") + help="The CA certificates file.") parser.add_argument( "--ssl-cert-reqs", type=int, default=int(ssl.CERT_NONE), - help="Whether client certificate is required (see stdlib ssl module's)" + help="Whether client certificate is required (see stdlib ssl module's)." ) parser.add_argument( "--root-path", type=nullable_str, default=None, - help="FastAPI root_path when app is behind a path based routing proxy") + help="FastAPI root_path when app is behind a path based routing proxy." + ) parser.add_argument( "--middleware", type=nullable_str, @@ -182,15 +183,15 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "We accept multiple --middleware arguments. " "The value should be an import path. " "If a function is provided, vLLM will add it to the server " - "using @app.middleware('http'). " + "using ``@app.middleware('http')``. " "If a class is provided, vLLM will add it to the server " - "using app.add_middleware(). ") + "using ``app.add_middleware()``. ") parser.add_argument( "--return-tokens-as-token-ids", action="store_true", - help="When --max-logprobs is specified, represents single tokens as " - "strings of the form 'token_id:{token_id}' so that tokens that " - "are not JSON-encodable can be identified.") + help="When ``--max-logprobs`` is specified, represents single tokens " + " as strings of the form 'token_id:{token_id}' so that tokens " + "that are not JSON-encodable can be identified.") parser.add_argument( "--disable-frontend-multiprocessing", action="store_true", @@ -205,9 +206,8 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--enable-auto-tool-choice", action="store_true", default=False, - help= - "Enable auto tool choice for supported models. Use --tool-call-parser" - " to specify which parser to use") + help="Enable auto tool choice for supported models. Use " + "``--tool-call-parser`` to specify which parser to use.") valid_tool_parsers = ToolParserManager.tool_parsers.keys() parser.add_argument( @@ -219,7 +219,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help= "Select the tool call parser depending on the model that you're using." " This is used to parse the model-generated tool call into OpenAI API " - "format. Required for --enable-auto-tool-choice.") + "format. 
Required for ``--enable-auto-tool-choice``.") parser.add_argument( "--tool-parser-plugin", @@ -228,7 +228,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help= "Special the tool parser plugin write to parse the model-generated tool" " into OpenAI API format, the name register in this plugin can be used " - "in --tool-call-parser.") + "in ``--tool-call-parser``.") parser = AsyncEngineArgs.add_cli_args(parser) @@ -243,7 +243,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--disable-fastapi-docs", action='store_true', default=False, - help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint" + help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint." ) parser.add_argument( "--enable-prompt-tokens-details", diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 14e41346df775..80403f77d5375 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -3,7 +3,7 @@ import re import time from argparse import Namespace -from typing import Any, Dict, List, Literal, Optional, Union +from typing import Any, ClassVar, Dict, List, Literal, Optional, Set, Union import torch from pydantic import BaseModel, ConfigDict, Field, model_validator @@ -42,23 +42,31 @@ class OpenAIBaseModel(BaseModel): # OpenAI API does allow extra fields model_config = ConfigDict(extra="allow") + # Cache class field names + field_names: ClassVar[Optional[Set[str]]] = None + @model_validator(mode="before") @classmethod def __log_extra_fields__(cls, data): - if isinstance(data, dict): + + field_names = cls.field_names + if field_names is None: + if not isinstance(data, dict): + return data # Get all class field names and their potential aliases field_names = set() for field_name, field in cls.model_fields.items(): field_names.add(field_name) - if hasattr(field, 'alias') and field.alias: - field_names.add(field.alias) - - # Compare against both field names and aliases - extra_fields = data.keys() - field_names - if extra_fields: - logger.warning( - "The following fields were present in the request " - "but ignored: %s", extra_fields) + if alias := getattr(field, 'alias', None): + field_names.add(alias) + cls.field_names = field_names + + # Compare against both field names and aliases + if any(k not in field_names for k in data): + logger.warning( + "The following fields were present in the request " + "but ignored: %s", + data.keys() - field_names) return data diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 88859255f202a..3da447be06430 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -203,15 +203,19 @@ def _validate_input( ) -> TextTokensPrompt: token_num = len(input_ids) - # Note: EmbeddingRequest doesn't have max_tokens - if isinstance(request, - (EmbeddingChatRequest, EmbeddingCompletionRequest)): + # Note: EmbeddingRequest and ScoreRequest doesn't have max_tokens + if isinstance( + request, + (EmbeddingChatRequest, EmbeddingCompletionRequest, ScoreRequest)): + + operation = "score" if isinstance(request, ScoreRequest) \ + else "embedding generation" if token_num > self.max_model_len: raise ValueError( f"This model's maximum context length is " f"{self.max_model_len} tokens. However, you requested " - f"{token_num} tokens in the input for embedding " - f"generation. 
Please reduce the length of the input.") + f"{token_num} tokens in the input for {operation}. " + f"Please reduce the length of the input.") return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids) diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index a222eafadcb68..fc422f0917bd5 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -157,24 +157,16 @@ async def load_lora_adapter( # This will also pre-load it for incoming requests try: await self.engine_client.add_lora(lora_request) - except ValueError as e: - # Adapter not found or lora configuration errors - if "No adapter found" in str(e): - return create_error_response(message=str(e), - err_type="NotFoundError", - status_code=HTTPStatus.NOT_FOUND) - else: - return create_error_response( - message=str(e), - err_type="BadRequestError", - status_code=HTTPStatus.BAD_REQUEST) except BaseException as e: - # Some other unexpected problem loading the adapter, e.g. malformed - # input files. - # More detailed error messages for the user would be nicer here + error_type = "BadRequestError" + status_code = HTTPStatus.BAD_REQUEST + if isinstance(e, ValueError) and "No adapter found" in str(e): + error_type = "NotFoundError" + status_code = HTTPStatus.NOT_FOUND + return create_error_response(message=str(e), - err_type="BadRequestError", - status_code=HTTPStatus.BAD_REQUEST) + err_type=error_type, + status_code=status_code) self.lora_requests.append(lora_request) logger.info("Loaded new LoRA adapter: name '%s', path '%s'", lora_name, diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 5d3e7139d7a17..381edf8fac49e 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -101,6 +101,38 @@ async def create_score( if not self.model_config.is_cross_encoder: raise ValueError("Model is not cross encoder.") + if truncate_prompt_tokens is not None and \ + truncate_prompt_tokens > self.max_model_len: + raise ValueError( + f"truncate_prompt_tokens value ({truncate_prompt_tokens}) " + f"is greater than max_model_len ({self.max_model_len})." + f" Please, select a smaller truncation size.") + + input_pairs = make_pairs(request.text_1, request.text_2) + for q, t in input_pairs: + request_prompt = f"{q}{tokenizer.sep_token}{t}" + + tokenization_kwargs: Dict[str, Any] = {} + if truncate_prompt_tokens is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens + + tokenize_async = make_async(tokenizer.__call__, + executor=self._tokenizer_executor) + prompt_inputs = await tokenize_async(text=q, + text_pair=t, + **tokenization_kwargs) + + input_ids = prompt_inputs["input_ids"] + text_token_prompt = \ + self._validate_input(request, input_ids, request_prompt) + engine_prompt = TokensPrompt( + prompt_token_ids=text_token_prompt["prompt_token_ids"], + token_type_ids=prompt_inputs.get("token_type_ids")) + + request_prompts.append(request_prompt) + engine_prompts.append(engine_prompt) + except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -108,28 +140,6 @@ async def create_score( # Schedule the request and get the result generator. 
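The scoring path above now validates truncate_prompt_tokens against max_model_len and tokenizes each (text_1, text_2) pair jointly before building the engine prompt. A hedged sketch of the underlying Hugging Face tokenizer call, using an arbitrary cross-encoder checkpoint:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "cross-encoder/ms-marco-MiniLM-L-6-v2")

q, t = "what is vllm?", "vLLM is a high-throughput LLM inference engine."
prompt_inputs = tokenizer(text=q, text_pair=t,
                          truncation=True, max_length=128)

input_ids = prompt_inputs["input_ids"]            # pair encoded as one sequence
token_type_ids = prompt_inputs.get("token_type_ids")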
generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] - input_pairs = make_pairs(request.text_1, request.text_2) - - for q, t in input_pairs: - request_prompt = f"{q}{tokenizer.sep_token}{t}" - - tokenization_kwargs: Dict[str, Any] = {} - if truncate_prompt_tokens is not None: - tokenization_kwargs["truncation"] = True - tokenization_kwargs["max_length"] = truncate_prompt_tokens - - tokenize_async = make_async(tokenizer.__call__, - executor=self._tokenizer_executor) - prompt_inputs = await tokenize_async(text=q, - text_pair=t, - **tokenization_kwargs) - engine_prompt = TokensPrompt( - prompt_token_ids=prompt_inputs["input_ids"], - token_type_ids=prompt_inputs.get("token_type_ids")) - - request_prompts.append(request_prompt) - engine_prompts.append(engine_prompt) - try: pooling_params = request.to_pooling_params() diff --git a/vllm/envs.py b/vllm/envs.py index b7b597ea15af3..3a15e00e7b50a 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -72,6 +72,8 @@ VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 VLLM_DISABLE_COMPILE_CACHE: bool = False + VLLM_SERVER_DEV_MODE: bool = False + VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128 def get_default_cache_root(): @@ -467,6 +469,22 @@ def get_default_config_root(): lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")), "VLLM_DISABLE_COMPILE_CACHE": lambda: bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))), + + # If set, vllm will run in development mode, which will enable + # some additional endpoints for developing and debugging, + # e.g. `/reset_prefix_cache` + "VLLM_SERVER_DEV_MODE": + lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))), + + # Controls the maximum number of requests to handle in a + # single asyncio task when processing per-token outputs in the + # V1 AsyncLLM interface. It is applicable when handling a high + # concurrency of streaming requests. + # Setting this too high can result in a higher variance of + # inter-message latencies. Setting it too low can negatively impact + # TTFT and overall throughput. + "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE": + lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")), } # end-env-vars-definition diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 00ecadcf92667..6be62d4068572 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -1,6 +1,10 @@ import asyncio from abc import ABC, abstractmethod -from typing import Any, Awaitable, Dict, List, Optional, Set, Tuple, Union +from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple, + Union) + +import torch.nn as nn +from typing_extensions import TypeVar from vllm.config import VllmConfig from vllm.logger import init_logger @@ -10,9 +14,12 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest, PoolerOutput from vllm.utils import make_async +from vllm.worker.worker_base import WorkerBase logger = init_logger(__name__) +_R = TypeVar("_R", default=Any) + class ExecutorBase(ABC): """Base class for all executors. 
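Both new environment variables follow the existing envs.py pattern of lazily reading os.getenv with a default. A tiny consumption sketch (values shown are the defaults):

import os

# Gates dev-only endpoints such as /reset_prefix_cache.
server_dev_mode = bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0")))

# Streaming requests handled per asyncio task in V1 output processing.
output_proc_chunk_size = int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128"))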
@@ -43,22 +50,37 @@ def __init__( @abstractmethod def _init_executor(self) -> None: - pass + raise NotImplementedError @abstractmethod def collective_rpc(self, - method: str, + method: Union[str, Callable[..., _R]], timeout: Optional[float] = None, args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: """ - The main interface of the executor to run a method on all workers, - with homogeneous arguments. - If the args are heterogeneous, then we can pack them into a list, - and unpack them in the method of every worker, because every worker - knows their own rank. + Execute an RPC call on all workers. + + Args: + method: Name of the worker method to execute, or a callable that + is serialized and sent to all workers to execute. + + If the method is a callable, it should accept an additional + `self` argument, in addition to the arguments passed in `args` + and `kwargs`. The `self` argument will be the worker object. + timeout: Maximum time in seconds to wait for execution. Raises a + :exc:`TimeoutError` on timeout. `None` means wait indefinitely. + args: Positional arguments to pass to the worker method. + kwargs: Keyword arguments to pass to the worker method. + + Returns: + A list containing the results from each worker. + + Note: + It is recommended to use this API to only pass control messages, + and set up data-plane communication to pass data. """ - pass + raise NotImplementedError def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of available blocks for the GPU KV cache and @@ -78,16 +100,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: b = min([r[1] for r in results]) return a, b - def initialize(self, num_gpu_blocks: int) -> None: - """ - Initialize the KV caches and begin the model execution loop of the - underlying workers. - For V1 compatibility. - """ - logger.info("# GPU blocks: %d", num_gpu_blocks) - self.collective_rpc("initialize_cache", args=(num_gpu_blocks, )) - self.collective_rpc("compile_or_warm_up_model") - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: """Initialize the KV cache by invoking the underlying worker. """ @@ -106,6 +118,17 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks)) + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + """ + Run a function directly on the model inside each worker, + returning the result for each of them. 
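Editor's note: a hedged sketch of the two collective_rpc call forms documented above. `executor` is assumed to be a concrete ExecutorBase subclass, and `report_dtype` is a hypothetical probe written for illustration, not an API introduced by this diff.

# String form: run an existing worker method on every worker.
results = executor.collective_rpc("start_profile")

# Callable form: the function is shipped to each worker and receives the
# worker object as its first argument.
def report_dtype(worker):
    return str(next(worker.get_model().parameters()).dtype)

dtypes = executor.collective_rpc(report_dtype)   # one entry per worker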
+ """ + + def rpc_func(worker: WorkerBase) -> _R: + return func(worker.get_model()) + + return self.collective_rpc(rpc_func) + def execute_model( self, execute_model_req: ExecuteModelRequest ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]: @@ -170,6 +193,12 @@ def start_profile(self) -> None: def stop_profile(self) -> None: self.collective_rpc("stop_profile") + def sleep(self, level: int = 1): + self.collective_rpc("sleep", kwargs=dict(level=level)) + + def wake_up(self): + self.collective_rpc("wake_up") + def save_sharded_state( self, path: str, @@ -260,7 +289,7 @@ def _driver_execute_model( raise NotImplementedError def collective_rpc(self, - method: str, + method: Union[str, Callable], timeout: Optional[float] = None, args: Tuple = (), kwargs: Optional[Dict] = None) -> List[Any]: @@ -269,7 +298,7 @@ def collective_rpc(self, @abstractmethod def _run_workers( self, - method: str, + method: Union[str, Callable], *args, async_run_tensor_parallel_workers_only: bool = False, max_concurrent_workers: Optional[int] = None, diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py index d9dde949b844a..78c86321d861d 100644 --- a/vllm/executor/mp_distributed_executor.py +++ b/vllm/executor/mp_distributed_executor.py @@ -1,5 +1,8 @@ import asyncio -from typing import Any, List, Optional +import os +from typing import Any, Callable, List, Optional, Union + +import cloudpickle from vllm.executor.executor_base import DistributedExecutorBase from vllm.executor.multiproc_worker_utils import ( @@ -8,8 +11,9 @@ from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest -from vllm.utils import (_run_task_with_lock, get_distributed_init_method, - get_ip, get_open_port, make_async) +from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless, + get_distributed_init_method, get_ip, get_open_port, + make_async, run_method, update_environment_variables) from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -20,7 +24,39 @@ class MultiprocessingDistributedExecutor(DistributedExecutorBase): uses_ray: bool = False + def _check_cuda(self) -> None: + """Check that the number of GPUs is sufficient for the parallel + configuration. Separate from _init_executor to reduce the number of + indented blocks. + """ + parallel_config = self.parallel_config + world_size = parallel_config.world_size + tensor_parallel_size = parallel_config.tensor_parallel_size + + cuda_device_count = cuda_device_count_stateless() + # Use confusing message for more common TP-only case. + if tensor_parallel_size > cuda_device_count: + raise RuntimeError( + f"please set tensor_parallel_size ({tensor_parallel_size}) " + f"to less than max local gpu count ({cuda_device_count})") + + if world_size > cuda_device_count: + raise RuntimeError( + f"please ensure that world_size ({world_size}) " + f"is less than than max local gpu count ({cuda_device_count})") + + # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers + if "CUDA_VISIBLE_DEVICES" not in os.environ: + update_environment_variables({ + "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size)))) + }) + def _init_executor(self) -> None: + + from vllm.platforms import current_platform + if current_platform.is_cuda_alike(): + self._check_cuda() + # Create the parallel GPU workers. 
world_size = self.parallel_config.world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size @@ -107,12 +143,12 @@ def _driver_execute_model( def _run_workers( self, - method: str, + method: Union[str, Callable], *args, async_run_tensor_parallel_workers_only: bool = False, max_concurrent_workers: Optional[int] = None, **kwargs, - ) -> Any: + ) -> List[Any]: """Runs the given method on all workers. Args: @@ -121,6 +157,11 @@ def _run_workers( It will also be run asynchronously and return a list of futures rather than blocking on the results. """ + if isinstance(method, str): + sent_method = method + else: + sent_method = cloudpickle.dumps(method) + del method if max_concurrent_workers: raise NotImplementedError( @@ -129,18 +170,18 @@ def _run_workers( if async_run_tensor_parallel_workers_only: # Run only non-driver workers and just return futures. return [ - worker.execute_method(method, *args, **kwargs) + worker.execute_method(sent_method, *args, **kwargs) for worker in self.non_driver_workers ] # Start all remote workers first. worker_outputs = [ - worker.execute_method(method, *args, **kwargs) + worker.execute_method(sent_method, *args, **kwargs) for worker in self.workers ] - driver_worker_method = getattr(self.driver_worker, method) - driver_worker_output = driver_worker_method(*args, **kwargs) + driver_worker_output = run_method(self.driver_worker, sent_method, + args, kwargs) # Get the results of the workers. return [driver_worker_output diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index c9fb3c664c575..539b6ae2d3572 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -15,7 +15,7 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.triton_utils.importing import HAS_TRITON -from vllm.utils import _check_multiproc_method, get_mp_context +from vllm.utils import _check_multiproc_method, get_mp_context, run_method if HAS_TRITON: from vllm.triton_utils import maybe_set_triton_cache_manager @@ -169,7 +169,7 @@ def __init__(self, result_handler: ResultHandler, self.process.start() def _enqueue_task(self, future: Union[ResultFuture, asyncio.Future], - method: str, args, kwargs): + method: Union[str, bytes], args, kwargs): task_id = uuid.uuid4() self.tasks[task_id] = future try: @@ -180,12 +180,13 @@ def _enqueue_task(self, future: Union[ResultFuture, asyncio.Future], del self.tasks[task_id] raise ChildProcessError("worker died") from e - def execute_method(self, method: str, *args, **kwargs): + def execute_method(self, method: Union[str, bytes], *args, **kwargs): future: ResultFuture = ResultFuture() self._enqueue_task(future, method, args, kwargs) return future - async def execute_method_async(self, method: str, *args, **kwargs): + async def execute_method_async(self, method: Union[str, bytes], *args, + **kwargs): future = asyncio.get_running_loop().create_future() self._enqueue_task(future, method, args, kwargs) return await future @@ -230,8 +231,7 @@ def _run_worker_process( exception = None task_id, method, args, kwargs = items try: - executor = getattr(worker, method) - output = executor(*args, **kwargs) + output = run_method(worker, method, args, kwargs) except SystemExit: raise except KeyboardInterrupt: diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index edceece4b68dc..2afd99f99b353 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ 
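Editor's note: the _run_workers change above serializes callables with cloudpickle before dispatch. A minimal round-trip showing why cloudpickle (rather than the plain method-name path) is needed for ad-hoc functions; the probe is a made-up example.

import cloudpickle

# Closures and lambdas are not reliably picklable with stdlib pickle,
# but cloudpickle can ship them to worker processes as bytes.
threshold = 3
probe = lambda worker: type(worker).__name__ if threshold > 0 else None

payload = cloudpickle.dumps(probe)       # what _run_workers sends
restored = cloudpickle.loads(payload)    # what run_method executes on a worker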
-2,8 +2,9 @@ import os from collections import defaultdict from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +import cloudpickle import msgspec import vllm.envs as envs @@ -172,7 +173,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", scheduling_strategy=scheduling_strategy, **ray_remote_kwargs, )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, - rank=rank) + rpc_rank=rank) else: worker = ray.remote( num_cpus=0, @@ -181,7 +182,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", scheduling_strategy=scheduling_strategy, **ray_remote_kwargs, )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, - rank=rank) + rpc_rank=rank) worker_metadata.append( RayWorkerMetaData(worker=worker, created_rank=rank)) rank += 1 @@ -204,7 +205,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # as the resource holder for the driver process. self.driver_dummy_worker = worker self.driver_worker = RayWorkerWrapper( - vllm_config=self.vllm_config, rank=0) + vllm_config=self.vllm_config, rpc_rank=0) worker_metadata.pop(i) break @@ -410,7 +411,7 @@ def execute_model( def _run_workers( self, - method: str, + method: Union[str, Callable], *args, async_run_tensor_parallel_workers_only: bool = False, max_concurrent_workers: Optional[int] = None, @@ -426,6 +427,11 @@ def _run_workers( rather than blocking on the results. - args/kwargs: All workers share the same args/kwargs """ + if isinstance(method, str): + sent_method = method + else: + sent_method = cloudpickle.dumps(method) + del method if self.use_ray_spmd_worker: assert not async_run_tensor_parallel_workers_only, ( "async_run_tensor_parallel_workers_only is not supported for " @@ -440,7 +446,7 @@ def _run_workers( if async_run_tensor_parallel_workers_only: ray_workers = self.non_driver_workers ray_worker_outputs = [ - worker.execute_method.remote(method, *args, **kwargs) + worker.execute_method.remote(sent_method, *args, **kwargs) for worker in ray_workers ] @@ -455,7 +461,7 @@ def _run_workers( if not self.use_ray_spmd_worker: # Start the driver worker after all the ray workers. driver_worker_output = [ - self.driver_worker.execute_method(method, *args, **kwargs) + self.driver_worker.execute_method(sent_method, *args, **kwargs) ] # Get the results of the ray workers. diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index da1d77343cf3b..a5c4dcf0ec7f9 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -1,8 +1,14 @@ -from typing import Any, Dict, List, Optional, Tuple +import os +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import torch +import torch.distributed as dist + +import vllm.envs as envs from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger -from vllm.utils import get_distributed_init_method, get_ip, get_open_port +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + run_method) from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -16,7 +22,7 @@ def _init_executor(self) -> None: """Initialize the worker and load the model. 
""" self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, - rank=0) + rpc_rank=0) distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) local_rank = 0 @@ -34,18 +40,13 @@ def _init_executor(self) -> None: self.collective_rpc("load_model") def collective_rpc(self, - method: str, + method: Union[str, Callable], timeout: Optional[float] = None, args: Tuple = (), kwargs: Optional[Dict] = None) -> List[Any]: if kwargs is None: kwargs = {} - try: - func = getattr(self.driver_worker, method) - except AttributeError: - raise NotImplementedError(f"Method {method} is not implemented.") \ - from None - answer = func(*args, **kwargs) + answer = run_method(self.driver_worker, method, args, kwargs) return [answer] def check_health(self) -> None: @@ -55,3 +56,77 @@ def check_health(self) -> None: UniProcExecutorAsync = UniProcExecutor + + +class ExecutorWithExternalLauncher(UniProcExecutor): + """An executor that uses external launchers to launch engines, + specially designed for torchrun-compatible launchers, for + offline inference with tensor parallelism. + + see https://github.com/vllm-project/vllm/issues/11400 for + the motivation, and examples/offline_inference/torchrun_example.py + for the usage example. + + The key idea: although it is tensor-parallel inference, we only + create one worker per executor, users will launch multiple + engines with torchrun-compatible launchers, and all these engines + work together to process the same prompts. When scheduling is + deterministic, all the engines will generate the same outputs, + and they don't need to synchronize the states with each other. + """ + uses_ray: bool = False + + def _init_executor(self) -> None: + """Initialize the worker and load the model. + """ + assert self.vllm_config.parallel_config.pipeline_parallel_size == 1, \ + ("ExecutorWithExternalLauncher does not " + "support pipeline parallelism.") + assert self.vllm_config.scheduler_config.delay_factor == 0.0, \ + ("ExecutorWithExternalLauncher needs deterministic " + "execution, so it" + "does not support delay_factor in scheduling") + assert not envs.VLLM_USE_V1, \ + ("V1 architecture cannot guarantee deterministic execution, " + "so it is not supported in ExecutorWithExternalLauncher.") + self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, + rpc_rank=0) + # engines are launched in torchrun-compatible launchers + # so we can use the env:// method. + # required env vars: + # - RANK + # - MASTER_ADDR + # - MASTER_PORT + distributed_init_method = "env://" + rank = int(os.environ["RANK"]) + local_rank = rank + is_driver_worker = True + kwargs = dict( + vllm_config=self.vllm_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + is_driver_worker=is_driver_worker, + ) + self.collective_rpc("init_worker", args=([kwargs], )) + self.collective_rpc("init_device") + self.collective_rpc("load_model") + + def determine_num_available_blocks(self) -> Tuple[int, int]: + """ + Determine the number of available KV blocks. + Add an additional all_reduce to get the min across all ranks. + Note that even if we have the same `gpu_memory_utilization` and + `swap_space`, the available memory in every rank might still + differ because NCCL can take different amounts of memory in + different ranks. Therefore, it is necessary to test if all ranks + agree on the same KV cache configuration. 
+ """ + a, b = super().determine_num_available_blocks() + from vllm.distributed.parallel_state import get_world_group + cpu_group = get_world_group().cpu_group + a_tensor = torch.tensor([a], device="cpu", dtype=torch.int64) + b_tensor = torch.tensor([b], device="cpu", dtype=torch.int64) + dist.all_reduce(a_tensor, group=cpu_group, op=dist.ReduceOp.MIN) + dist.all_reduce(b_tensor, group=cpu_group, op=dist.ReduceOp.MIN) + return a_tensor.item(), b_tensor.item() diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index b8163a7acde1d..57e85779dd587 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: from vllm.multimodal import (MultiModalDataDict, MultiModalKwargs, MultiModalPlaceholderDict) - from vllm.multimodal.inputs import MultiModalInputsV2 + from vllm.multimodal.inputs import MultiModalInputs class TextPrompt(TypedDict): @@ -207,7 +207,7 @@ def token_inputs( return inputs -DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputsV2"] +DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputs"] """ The inputs in :class:`~vllm.LLMEngine` before they are passed to the model executor. @@ -222,14 +222,14 @@ class EncoderDecoderInputs(TypedDict): This specifies the required data for encoder-decoder models. """ - encoder: Union[TokenInputs, "MultiModalInputsV2"] + encoder: Union[TokenInputs, "MultiModalInputs"] """The inputs for the encoder portion.""" - decoder: Union[TokenInputs, "MultiModalInputsV2"] + decoder: Union[TokenInputs, "MultiModalInputs"] """The inputs for the decoder portion.""" -SingletonInputs = Union[TokenInputs, "MultiModalInputsV2"] +SingletonInputs = Union[TokenInputs, "MultiModalInputs"] """ A processed :class:`SingletonPrompt` which can be passed to :class:`vllm.sequence.Sequence`. @@ -311,7 +311,7 @@ def multi_modal_hashes(self) -> List[str]: return inputs.get("multi_modal_hashes", []) if inputs["type"] == "multimodal": - # only the case when we use MultiModalInputsV2 + # only the case when we use MultiModalInputs return inputs.get("mm_hashes", []) # type: ignore[return-value] assert_never(inputs) # type: ignore[arg-type] diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 0890883cc984f..70372e0cad22d 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -7,7 +7,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry -from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputsV2 +from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputs from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup @@ -247,7 +247,7 @@ def _process_multimodal( mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], lora_request: Optional[LoRARequest], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: """ Apply the model's multi-modal processor to a multi-modal prompt, returning the corresponding token IDs and metadata. 
@@ -271,7 +271,7 @@ async def _process_multimodal_async( mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], lora_request: Optional[LoRARequest], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: """Async version of :meth:`_process_multimodal`.""" tokenizer_group = self.get_tokenizer_group() tokenizer = await tokenizer_group.get_lora_tokenizer_async(lora_request diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index a933ccaecf15e..e6f26d2b74b2f 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -51,6 +51,9 @@ def _get_lora_device(base_layer: nn.Module) -> torch.device: # marlin elif hasattr(base_layer, "B"): return base_layer.B.device + # HQQ marlin + elif hasattr(base_layer, "W_q"): + return base_layer.W_q.device else: raise ValueError(f"Unsupported base layer: {base_layer}") @@ -937,8 +940,8 @@ def soft_cap(self): return self.base_layer.soft_cap @property - def use_gather(self): - return self.base_layer.use_gather + def use_all_gather(self): + return self.base_layer.use_all_gather @property def org_vocab_size(self): diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 5b7225bdc8f37..9809405ca9a61 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -1,5 +1,4 @@ import copy -import json import math import os import re @@ -180,8 +179,8 @@ def from_local_checkpoint( cls, lora_dir: str, expected_lora_modules: List[str], + peft_helper: PEFTHelper, *, - max_position_embeddings: Optional[int] = None, lora_model_id: Optional[int] = None, device: str = "cuda", dtype: Optional[torch.dtype] = None, @@ -196,9 +195,7 @@ def from_local_checkpoint( lora_dir: The local path that has lora data. expected_lora_modules: Name of modules that are expected to be replaced by lora. - max_position_embeddings: Max position embedding length. Used to - scaling the largest context length. If None, the lora model's - context length is not scaled. + peft_helper: Loaded lora configuration information. lora_model_id: Lora model id. If not given, automatically set by a global counter. device: Device where the lora model is loaded. @@ -207,18 +204,13 @@ def from_local_checkpoint( Returns: Loaded LoRA Model. """ - lora_config_path = os.path.join(lora_dir, "adapter_config.json") lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin") new_embeddings_tensor_path = os.path.join( lora_dir, "new_embeddings.safetensors") new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin") - with open(lora_config_path) as f: - config = json.load(f) - config["vllm_max_position_embeddings"] = max_position_embeddings - peft_helper = PEFTHelper.from_dict(config) unexpected_modules: List[Union[list[str], str]] if os.path.isfile(lora_tensor_path): tensors: Dict[str, torch.Tensor] = {} diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index dacfb9ebd1480..b9c506f6e0bfd 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -1,9 +1,12 @@ # Adapted from: https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/config.py +import json import math +import os from dataclasses import MISSING, dataclass, field, fields -from typing import Literal, Optional, Union +from typing import List, Literal, Optional, Union +from vllm.config import LoRAConfig from vllm.logger import init_logger logger = init_logger(__name__) @@ -11,6 +14,12 @@ @dataclass class PEFTHelper: + """ + A helper class for PEFT configurations, specifically designed for LoRA. 
+ This class handles configuration validation, compatibility checks for + various LoRA implementations. + """ + # Required fields r: int lora_alpha: int @@ -29,20 +38,18 @@ class PEFTHelper: vllm_max_position_embeddings: Optional[int] = field(default=False) vllm_long_context_scaling_factor: Optional[float] = field(default=None) - def _validate_features(self): + def _validate_features(self) -> List[str]: + """ + Check if there are any unsupported Lora features. + """ error_msg = [] - if self.modules_to_save: error_msg.append("vLLM only supports modules_to_save being None.") - if self.use_dora: error_msg.append("vLLM does not yet support DoRA.") - - if error_msg: - raise ValueError(f"{', '.join(error_msg)}") + return error_msg def __post_init__(self): - self._validate_features() if self.use_rslora: logger.info_once("Loading LoRA weights trained with rsLoRA.") self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r) @@ -78,3 +85,29 @@ def from_dict(cls, config_dict: dict) -> "PEFTHelper": for k, v in config_dict.items() if k in class_fields } return cls(**filtered_dict) + + @classmethod + def from_local_dir(cls, lora_path: str, + max_position_embeddings: Optional[int]) -> "PEFTHelper": + lora_config_path = os.path.join(lora_path, "adapter_config.json") + + with open(lora_config_path) as f: + config = json.load(f) + config["vllm_max_position_embeddings"] = max_position_embeddings + return cls.from_dict(config) + + def validate_legal(self, lora_config: LoRAConfig) -> None: + """ + Validates the LoRA configuration settings against application + constraints and requirements. + """ + error_msg = self._validate_features() + if self.r > lora_config.max_lora_rank: + error_msg.append( + f"LoRA rank {self.r} is greater than max_lora_rank" + f" {lora_config.max_lora_rank}.") + if self.bias != "none" and not lora_config.bias_enabled: + error_msg.append( + "Adapter bias cannot be used without bias_enabled.") + if error_msg: + raise ValueError(f"{' '.join(error_msg)}") diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index eec462743fe9d..a64296f7fd902 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -12,6 +12,7 @@ from vllm.logger import init_logger from vllm.lora.models import (LoRAModel, LoRAModelManager, LRUCacheLoRAModelManager, create_lora_manager) +from vllm.lora.peft_helper import PEFTHelper from vllm.lora.request import LoRARequest from vllm.lora.utils import get_adapter_absolute_path @@ -95,6 +96,13 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: expected_lora_modules = list(set(expected_lora_modules)) lora_path = get_adapter_absolute_path(lora_request.lora_path) + peft_helper = PEFTHelper.from_local_dir( + lora_path, self.max_position_embeddings) + + # Validates the LoRA configuration against requirements before + # loading weights, throwing an exception if validation fails. + peft_helper.validate_legal(self.lora_config) + # For some models like Qwen2VL, we need to use hf_to_vllm_mapper # to ensure correct loading of lora weights. 
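Editor's note: a sketch of the refactored LoRA loading flow above, where the adapter config is now read and validated once via PEFTHelper before any weights are touched. The adapter path and the lora_config object are assumptions for illustration.

from vllm.lora.peft_helper import PEFTHelper

peft_helper = PEFTHelper.from_local_dir("/path/to/adapter",
                                        max_position_embeddings=4096)

# Raises ValueError early, e.g. when the adapter rank exceeds max_lora_rank
# or a bias adapter is used while bias support is disabled.
peft_helper.validate_legal(lora_config)

# Only after validation are the weights loaded (see from_local_checkpoint above).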
hf_to_vllm_mapper = None @@ -105,7 +113,7 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: lora = self._lora_model_cls.from_local_checkpoint( lora_path, expected_lora_modules, - max_position_embeddings=self.max_position_embeddings, + peft_helper=peft_helper, lora_model_id=lora_request.lora_int_id, device="cpu", dtype=self.lora_config.lora_dtype, @@ -120,15 +128,14 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: # - No adapter found to download from huggingface (or in # offline mode) # - No local adapter files found at `lora_request.lora_path` + # For NotFoundError raise ValueError( f"Loading lora {lora_request.lora_name} failed: No adapter " f"found for {lora_path}") from e except Exception as e: - raise RuntimeError(f"Loading lora {lora_path} failed") from e - if lora.rank > self.lora_config.max_lora_rank: - raise ValueError( - f"LoRA rank {lora.rank} is greater than max_lora_rank " - f"{self.lora_config.max_lora_rank}.") + # For BadRequestError + raise e + if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size: raise ValueError(f"LoRA added vocab size {lora.extra_vocab_size} " f"is greater than lora_extra_vocab_size " diff --git a/vllm/model_executor/guided_decoding/utils.py b/vllm/model_executor/guided_decoding/utils.py index 20abaefbacc51..90dfa62ec4670 100644 --- a/vllm/model_executor/guided_decoding/utils.py +++ b/vllm/model_executor/guided_decoding/utils.py @@ -20,6 +20,13 @@ def check_object(obj: dict) -> bool: ]): return True + # Check for array unsupported keywords + if obj.get("type") == "array" and any(key in obj for key in [ + "uniqueItems", "contains", "minContains", "maxContains", + "minItems", "maxItems" + ]): + return True + # Recursively check all nested objects and arrays for value in obj.values(): if isinstance(value, dict): diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index f10a8fb8e03cf..2d8594cb8aafa 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -298,8 +298,11 @@ def __call__(self, input_ids: list[int], # token_bitmask is a CPU tensor for use with accept_token and # fill_next_token_bitmask so we move it to the device of scores device_type = scores.device.type + dtype = scores.dtype if device_type != "cuda": - scores = scores.to("cpu").unsqueeze(0) + # xgrammar on cpu only supports float32 scores + # see: https://github.com/mlc-ai/xgrammar/blob/c1b64920cad24f44f235778c1c00bb52d57da01a/python/xgrammar/kernels/apply_token_bitmask_inplace_cpu.py#L22 + scores = scores.to("cpu").float().unsqueeze(0) # Note: In this method, if the tensors have different dimensions # on CPU device fails, but on GPU it runs without error. 
Hence the @@ -307,7 +310,7 @@ def __call__(self, input_ids: list[int], xgr.apply_token_bitmask_inplace(scores, self.token_bitmask.to(scores.device)) if device_type != "cuda": - scores = scores.to(device_type).squeeze() + scores = scores.to(dtype).to(device_type).squeeze() return scores diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index af7894b42c560..fb9684ac1c184 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -30,8 +30,10 @@ class FatreluAndMul(CustomOp): def __init__(self, threshold: float = 0.): super().__init__() self.threshold = threshold - if current_platform.is_cuda_alike() or current_platform.is_cpu(): + if current_platform.is_cuda_alike(): self.op = torch.ops._C.fatrelu_and_mul + elif current_platform.is_cpu(): + self._forward_method = self.forward_native def forward_native(self, x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 @@ -100,11 +102,13 @@ class MulAndSilu(CustomOp): def __init__(self): super().__init__() - if current_platform.is_cuda_alike() or current_platform.is_cpu(): + if current_platform.is_cuda_alike(): self.op = torch.ops._C.mul_and_silu elif current_platform.is_xpu(): from vllm._ipex_ops import ipex_ops self.op = ipex_ops.silu_and_mul + elif current_platform.is_cpu(): + self._forward_method = self.forward_native def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json index 6a976788f9b10..4d4b752fa5d64 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 1, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 1, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - 
"num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 32, "kpack": 2 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json index 0a46390b2e31b..a218fc40642c1 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, 
"waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json index 91011e64c7de4..3682cc548f352 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, 
"kpack": 1 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 32, "kpack": 2 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json index f807d4a5abaed..21742854c613f 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -115,7 +115,7 @@ 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 32, "kpack": 2 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 8876ca72792cf..52263e96fb9f9 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -32,7 +32,7 @@ "MarlinLinearMethod", "QQQLinearMethod", "GPTQMarlin24LinearMethod", "TPUInt8LinearMethod", "GPTQLinearMethod", "FBGEMMFp8LinearMethod", "ModelOptFp8LinearMethod", "IPEXAWQLinearMethod", "IPEXGPTQLinearMethod", - "HQQMarlinMethod" + "HQQMarlinMethod", "QuarkLinearMethod" ] @@ -344,11 +344,13 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype) use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit param_data = param.data - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if output_dim is not None and not use_bitsandbytes_4bit: + if output_dim is not None and not is_sharded_weight: shard_size = param_data.shape[output_dim] start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, @@ -546,6 +548,11 @@ def weight_loader(self, use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + if use_bitsandbytes_4bit: shard_size = loaded_weight.shape[output_dim] shard_offset = loaded_weight.shape[output_dim] * \ @@ -554,9 +561,7 @@ def weight_loader(self, param_data = param_data.narrow(output_dim, shard_offset, shard_size) start_idx = tp_rank * shard_size - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if not use_bitsandbytes_4bit: + if not is_sharded_weight: loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) # Special case for AQLM codebooks. 
@@ -941,6 +946,11 @@ def weight_loader(self, use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + if use_bitsandbytes_4bit: orig_qkv_offsets = { "q": (0, self.num_heads * self.head_size), @@ -964,9 +974,7 @@ def weight_loader(self, shard_id = tp_rank // self.num_kv_head_replicas start_idx = shard_id * shard_size - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if not use_bitsandbytes_4bit: + if not is_sharded_weight: loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) @@ -1070,6 +1078,10 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): tp_size = get_tensor_model_parallel_world_size() input_dim = getattr(param, "input_dim", None) use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit # Special case for GGUF is_gguf_weight = getattr(param, "is_gguf_weight", False) @@ -1085,9 +1097,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype) param_data = param.data - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if input_dim is not None and not use_bitsandbytes_4bit: + if input_dim is not None and not is_sharded_weight: shard_size = param_data.shape[input_dim] start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(input_dim, start_idx, diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 2bc7e458494f7..42decde1d0f79 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -6,6 +6,7 @@ import torch.nn as nn import vllm.envs as envs +from vllm.config import get_current_vllm_config from vllm.distributed import (tensor_model_parallel_all_gather, tensor_model_parallel_gather) from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -44,8 +45,10 @@ def __init__(self, self.soft_cap = soft_cap # Whether to use gather or all-gather to gather the logits. - self.use_gather = not current_platform.is_tpu( - ) and not envs.VLLM_USE_V1 + parallel_config = get_current_vllm_config().parallel_config + self.use_all_gather = current_platform.is_tpu() \ + or envs.VLLM_USE_V1 \ + or parallel_config.distributed_executor_backend == "external_launcher" # noqa def forward( self, @@ -88,16 +91,17 @@ def _get_logits( logits = lm_head.linear_method.apply(lm_head, hidden_states, bias=embedding_bias) - if self.use_gather: - # None may be returned for rank > 0 - logits = tensor_model_parallel_gather(logits) - else: + + if self.use_all_gather: # Gather is not supported for some devices such as TPUs. # Use all-gather instead. # NOTE(woosuk): Here, the outputs of every device should not be None # because XLA requires strict SPMD among all devices. Every device # should execute the same operations after gathering the logits. logits = tensor_model_parallel_all_gather(logits) + else: + # None may be returned for rank > 0 + logits = tensor_model_parallel_gather(logits) # Remove paddings in vocab (if any). 
if logits is not None: logits = logits[..., :self.org_vocab_size] diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index dd10c434f0752..d2bde13fcf546 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -26,14 +26,56 @@ "experts_int8", "neuron_quant", "ipex", + "quark" ] +# The customized quantization methods which will be added to this dict. +_CUSTOMIZED_METHOD_TO_QUANT_CONFIG = {} + + +def register_quantization_config(quantization: str): + """Register a customized vllm quantization config. + + When a quantization method is not supported by vllm, you can register a customized + quantization config to support it. + + Args: + quantization (str): The quantization method name. + + Examples: + >>> from vllm.model_executor.layers.quantization import register_quantization_config + >>> from vllm.model_executor.layers.quantization import get_quantization_config + >>> from vllm.model_executor.layers.quantization.base_config import QuantizationConfig + >>> + >>> @register_quantization_config("my_quant") + ... class MyQuantConfig(QuantizationConfig): + ... pass + >>> + >>> get_quantization_config("my_quant") + + """ # noqa: E501 + + def _wrapper(quant_config_cls): + if quantization in QUANTIZATION_METHODS: + raise ValueError( + f"The quantization method `{quantization}` is already exists.") + if not issubclass(quant_config_cls, QuantizationConfig): + raise ValueError("The quantization config must be a subclass of " + "`QuantizationConfig`.") + _CUSTOMIZED_METHOD_TO_QUANT_CONFIG[quantization] = quant_config_cls + QUANTIZATION_METHODS.append(quantization) + return quant_config_cls + + return _wrapper + def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: if quantization not in QUANTIZATION_METHODS: raise ValueError(f"Invalid quantization method: {quantization}") # lazy import to avoid triggering `torch.compile` too early + from vllm.model_executor.layers.quantization.quark.quark import QuarkConfig + from .aqlm import AQLMConfig from .awq import AWQConfig from .awq_marlin import AWQMarlinConfig @@ -79,7 +121,10 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: "experts_int8": ExpertsInt8Config, "neuron_quant": NeuronQuantConfig, "ipex": IPEXConfig, + "quark": QuarkConfig } + # Update the `method_to_config` with customized quantization methods. + method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG) return method_to_config[quantization] diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 6dfac8aad5358..2fb2642dd5156 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -133,3 +133,6 @@ def get_quant_method(self, layer: torch.nn.Module, method. 
""" raise NotImplementedError + + def get_cache_scale(self, name: str) -> Optional[str]: + return None diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index f0e4eda76734b..b2fc2360f47f1 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -412,6 +412,22 @@ def get_scheme( self._check_scheme_supported(scheme.get_min_capability()) return scheme + def get_cache_scale(self, name: str) -> Optional[str]: + """ + Check whether the param name matches the format for k/v cache scales + in compressed-tensors. If this is the case, return its equivalent + param name expected by vLLM + + :param name: param name + :return: matching param name for KV cache scale in vLLM + """ + if name.endswith(".output_scale") and ".k_proj" in name: + return name.replace(".k_proj.output_scale", ".attn.k_scale") + if name.endswith(".output_scale") and ".v_proj" in name: + return name.replace(".v_proj.output_scale", ".attn.v_scale") + # If no matches, return None + return None + @staticmethod def supports_cutlass_24( weight_quant: Optional[QuantizationArgs], diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py index bc697ef93b34b..21e6fe7a22616 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py @@ -42,7 +42,7 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, if not sparse_cutlass_supported(): raise ValueError( - "Sparse CUTLASS not supported. vLLM must be built with" + "Sparse CUTLASS not supported. vLLM must be built with " "CUDA 12.2 or later to use this feature") self.output_dtype = params_dtype diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py index 2659afcdc74a9..f4c1dbc0361c6 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py @@ -136,6 +136,10 @@ def triton_scaled_mm(input: torch.Tensor, assert N > 0 and K > 0 and M > 0 assert weight.shape[0] == K assert input.dtype == weight.dtype + + scale_a = scale_a.reshape(-1, 1) if scale_a.dim() <= 1 else scale_a + scale_b = scale_b.reshape(-1, 1) if scale_b.dim() <= 1 else scale_b + assert scale_a.dtype == scale_b.dtype and scale_a.is_floating_point() assert scale_a.shape == torch.Size([1, 1]) or scale_a.shape == torch.Size( [M, 1]) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index dfae4db71e546..8fcbda377428e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -133,23 +133,6 @@ def _find_first_match(value: str, return None -def get_compressed_tensors_cache_scale(name: str) -> Optional[str]: - """ - Check whether the param name matches the format for k/v cache scales - in compressed-tensors. 
If this is the case, return its equivalent - param name expected by vLLM - - :param name: param name - :return: matching param name for KV cache scale in vLLM - """ - if name.endswith(".output_scale") and ".k_proj" in name: - return name.replace(".k_proj.output_scale", ".attn.k_scale") - if name.endswith(".output_scale") and ".v_proj" in name: - return name.replace(".v_proj.output_scale", ".attn.v_scale") - # If no matches, return None - return None - - def _is_equal_or_regex_match(value: str, target: str, check_contains: bool = False) -> bool: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index a1be45a49e94a..26dd5df4e55b2 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -247,6 +247,15 @@ def create_weights( def process_weights_after_loading(self, layer: Module) -> None: # Block quant doesn't need to process weights after loading if self.block_quant: + if current_platform.is_rocm(): + weight, weight_scale, _ = \ + normalize_e4m3fn_to_e4m3fnuz( + weight=layer.weight, + weight_scale=layer.weight_scale_inv, + input_scale=layer.input_scale) + layer.weight = Parameter(weight, requires_grad=False) + layer.weight_scale_inv = Parameter(weight_scale, + requires_grad=False) return layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False) @@ -355,7 +364,8 @@ def apply(self, input_scale=layer.input_scale, bias=bias, cutlass_fp8_supported=self.cutlass_fp8_supported, - use_per_token_if_dynamic=False) + # Default to using per_token quantization if cutlass is supported + use_per_token_if_dynamic=self.cutlass_fp8_supported) class Fp8MoEMethod(FusedMoEMethodBase): @@ -494,6 +504,30 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int, def process_weights_after_loading(self, layer: Module) -> None: # Block quant doesn't need to process weights after loading if self.block_quant: + if current_platform.is_rocm(): + w13_weight, w13_weight_scale_inv, w13_input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + layer.w13_weight, layer.w13_weight_scale_inv, + layer.w13_input_scale) + w2_weight, w2_weight_scale_inv, w2_input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + layer.w2_weight, layer.w2_weight_scale_inv, + layer.w2_input_scale) + # Reset the parameter + layer.w13_weight = torch.nn.Parameter(w13_weight, + requires_grad=False) + layer.w13_weight_scale_inv = torch.nn.Parameter( + w13_weight_scale_inv, requires_grad=False) + if w13_input_scale is not None: + layer.w13_input_scale = torch.nn.Parameter( + w13_input_scale, requires_grad=False) + layer.w2_weight = torch.nn.Parameter(w2_weight, + requires_grad=False) + layer.w2_weight_scale_inv = torch.nn.Parameter( + w2_weight_scale_inv, requires_grad=False) + if w2_input_scale is not None: + layer.w2_input_scale = torch.nn.Parameter( + w2_input_scale, requires_grad=False) return # If checkpoint is fp16, quantize in place. 
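Editor's note: background for the ROCm branch above. The checkpoint format float8_e4m3fn and AMD's float8_e4m3fnuz have different dynamic ranges, which is why block-quantized weights and their scales are normalized on MI300-class GPUs. A quick check of the two dtypes, assuming a recent PyTorch that exposes both:

import torch

print(torch.finfo(torch.float8_e4m3fn).max)     # 448.0
print(torch.finfo(torch.float8_e4m3fnuz).max)   # 240.0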
if not self.quant_config.is_checkpoint_fp8_serialized: diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index 586752d3d34e3..4824a11804163 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -5,8 +5,8 @@ CutlassScaledMMLinearKernel) from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501 ScaledMMLinearKernel, ScaledMMLinearLayerConfig) -# from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import ( -# TritonScaledMMLinear) +from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import ( + TritonScaledMMLinearKernel) from vllm.model_executor.layers.quantization.kernels.scaled_mm.xla import ( XLAScaledMMLinearKernel) from vllm.platforms import PlatformEnum, current_platform @@ -15,9 +15,7 @@ _POSSIBLE_KERNELS: Dict[PlatformEnum, List[Type[ScaledMMLinearKernel]]] = { PlatformEnum.CPU: [CutlassScaledMMLinearKernel], PlatformEnum.CUDA: [CutlassScaledMMLinearKernel], - # TODO(rob): Create TritonScaledMMLinear kernel. ROCM will - # incorrectly attempt to run AZP models if prompted to. - PlatformEnum.ROCM: [CutlassScaledMMLinearKernel], + PlatformEnum.ROCM: [TritonScaledMMLinearKernel], PlatformEnum.TPU: [XLAScaledMMLinearKernel], } diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py new file mode 100644 index 0000000000000..97ec8cb0500d7 --- /dev/null +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py @@ -0,0 +1,38 @@ +from typing import Optional, Tuple + +import torch + +from vllm.platforms import current_platform + +from .cutlass import CutlassScaledMMLinearKernel +from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig + + +class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel): + + @classmethod + def get_min_capability(cls) -> int: + return 75 + + @classmethod + def can_implement( + cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]: + if current_platform.is_cpu(): + return ( + False, + "TritonScaledMMLinearKernel requires Triton which is not " + + "currently supported on CPU.") + if not c.input_symmetric: + return (False, + "TritonScaledMMLinearKernel only supports symmetric " + + "quantization.") + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + super().process_weights_after_loading(layer) + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + return super().apply_weights(layer, x, bias) diff --git a/vllm/model_executor/layers/quantization/quark/__init__.py b/vllm/model_executor/layers/quantization/quark/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py new file mode 100644 index 0000000000000..fc214255eca71 --- /dev/null +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -0,0 +1,387 @@ +import fnmatch +import re +from typing import Any, Dict, List, Optional, cast + +import torch + +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + UnquantizedLinearMethod) +from 
vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 + QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod +from vllm.model_executor.layers.quantization.quark.quark_moe import ( # noqa: E501 + QuarkMoEMethod) +from vllm.model_executor.layers.quantization.quark.schemes import ( + QuarkScheme, QuarkW8A8Fp8, QuarkW8A8Int8) +from vllm.model_executor.layers.quantization.quark.utils import ( + deep_compare, should_ignore_layer) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + FUSED_LAYER_NAME_MAPPING) +from vllm.platforms import current_platform + +__all__ = ["QuarkLinearMethod"] + + +class QuarkConfig(QuantizationConfig): + + def __init__(self, + quant_config: Dict[str, Any], + kv_cache_group: Optional[List[str]] = None, + kv_cache_config: Optional[Dict[str, Any]] = None, + pack_method: str = "reorder"): + if kv_cache_group is None: + kv_cache_group = [] + self.quant_config = quant_config + self.kv_cache_group = kv_cache_group + self.kv_cache_config = kv_cache_config + self.pack_method = pack_method + + def get_linear_method(self) -> "QuarkLinearMethod": + return QuarkLinearMethod(self) + + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.float16, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return 70 + + def get_name(self) -> str: + return "quark" + + def get_quant_method(self, layer: torch.nn.Module, + prefix: str) -> Optional["QuantizeMethodBase"]: + from vllm.attention.layer import Attention # Avoid circular import + + # Check if the layer is skipped for quantization. + exclude_layers = cast(List[str], self.quant_config.get("exclude")) + if should_ignore_layer(prefix, ignore=exclude_layers): + return UnquantizedLinearMethod() + if isinstance(layer, LinearBase): + scheme = self.get_scheme(layer=layer, layer_name=prefix) + layer.scheme = scheme + return QuarkLinearMethod(self) + if isinstance(layer, Attention): + return QuarkKVCacheMethod(self) + if isinstance(layer, FusedMoE): + return QuarkMoEMethod.get_moe_method(self, + module=layer, + layer_name=prefix) + return None + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "QuarkConfig": + export_config = config.get("export") + if export_config is None: + raise ValueError("The export key should be included in " + "the configurations of Quark quantized model") + kv_cache_group = cast(List[str], export_config.get("kv_cache_group")) + pack_method = cast(str, export_config.get("pack_method")) + + # In the export model of quark, the quantization configuration + # of kv_cache is stored in layer_quant_config. First, it is + # judged whether kv_cache_group exists, and then it is judged + # whether layer_quant_config has a quantization configuration + # that matches kv_cache. 
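# Illustrative (hypothetical) sketch of the config layout QuarkConfig.from_config
# above expects. The key names are taken from what the parser reads below; a
# real Quark export will typically carry more fields and different patterns.
example_quark_config = {
    "export": {
        "kv_cache_group": ["*k_proj", "*v_proj"],
        "pack_method": "reorder",
    },
    "global_quant_config": {
        "weight": {"dtype": "fp8_e4m3", "qscheme": "per_tensor",
                   "is_dynamic": False, "symmetric": True},
        "input_tensors": {"dtype": "fp8_e4m3", "qscheme": "per_tensor",
                          "is_dynamic": True, "symmetric": True},
        "output_tensors": None,
        "bias": None,
    },
    "layer_quant_config": {
        # Entries matched by kv_cache_group must be identical to each other and
        # carry an "output_tensors" section; from_config copies that section
        # into kv_cache_config and then clears it on the layer entries.
        "*k_proj": {"output_tensors": {"dtype": "fp8_e4m3",
                                       "qscheme": "per_tensor"}},
        "*v_proj": {"output_tensors": {"dtype": "fp8_e4m3",
                                       "qscheme": "per_tensor"}},
    },
    "layer_type_quant_config": {},
}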
+ if len(kv_cache_group) == 0: + kv_cache_config = None + else: + kv_cache_set = set(kv_cache_group) + layer_quant_config = cast(Dict[str, Any], + config.get("layer_quant_config")) + layer_quant_names = list(layer_quant_config.keys()) + layer_quant_set = set(layer_quant_names) + + if not kv_cache_set.issubset(layer_quant_set): + raise ValueError("The Quark quantized model has the " + "kv_cache_group parameter setting, " + "but no kv_cache quantization settings " + "were found in the quantization " + "configuration.") + + q_configs = [ + cast(Dict[str, Any], layer_quant_config.get(name)) + for name in kv_cache_group + ] + if not all( + deep_compare(q_config, q_configs[0]) + for q_config in q_configs): + raise ValueError( + "The quantization method used for kv_cache should " + "be the same, but the quantization method for the " + "kv_cache layer in the config is different.") + kv_cache_config = q_configs[0].get("output_tensors") + if kv_cache_config is None: + raise ValueError( + "The kv_cache quantization configuration is empty.") + + # Since we have already set kv_cache quantization configurations, + # we will remove the quantization configuration for the + # output_tensors corresponding to the kv_cache layer. + for q_config in q_configs: + q_config["output_tensors"] = None + + return cls(quant_config=config, + kv_cache_group=kv_cache_group, + kv_cache_config=kv_cache_config, + pack_method=pack_method) + + @classmethod + def get_config_filenames(cls) -> List[str]: + return [] + + def _check_scheme_supported(self, + min_capability: int, + error: bool = True) -> bool: + capability_tuple = current_platform.get_device_capability() + + if capability_tuple is not None: + capability = capability_tuple.to_int() + supported = capability >= min_capability + if error and not supported: + raise RuntimeError( + "Quantization scheme is not supported for ", + f"the current GPU. Min capability: {min_capability}. ", + f"Current capability: {capability}.") + return supported + else: + return False + + def _is_fp8_w8a8(self, weight_quant: Optional[Dict[str, Any]], + input_quant: Optional[Dict[str, Any]]) -> bool: + # Confirm weights and input quantized. + if weight_quant is None or input_quant is None: + return False + + # Confirm weight scheme is supported + is_fp8_dtype = (weight_quant.get("dtype") == "fp8_e4m3" + and input_quant.get("dtype") == "fp8_e4m3") + is_static_weight = not weight_quant.get("is_dynamic") + is_per_tensor_or_channel_weight = (weight_quant.get("qscheme") + in ["per_tensor", "per_channel"]) + + if not (is_fp8_dtype and is_static_weight + and is_per_tensor_or_channel_weight): + return False + + # Dynamic quantization is always supported if weights supported. + if input_quant.get("is_dynamic"): + return True + + # Confirm activation scheme is supported. + is_per_tensor_activation = (input_quant.get("qscheme") == "per_tensor") + return is_per_tensor_activation + + def _is_static_tensor_w8a8(self, weight_quant: Optional[Dict[str, Any]], + input_quant: Optional[Dict[str, Any]]) -> bool: + # Confirm weights and input quantized. 
+ if weight_quant is None or input_quant is None: + return False + + is_int8_dtype = (weight_quant.get("dtype") == "int8" + and input_quant.get("dtype") == "int8") + + is_tensor = (weight_quant.get("qscheme") + in ["per_tensor", "per_channel"] + and input_quant.get("qscheme") == "per_tensor") + + is_static = (not weight_quant.get("is_dynamic") + and not input_quant.get("is_dynamic")) + + is_weight_symmetric = (weight_quant.get("symmetric") is True) + + # Both symmetric and asymmetric input quantization supported. + # Only symmetric weight quantization supported. + return is_int8_dtype and is_tensor and is_weight_symmetric and is_static + + def _find_matched_config(self, layer_name: str, + module: torch.nn.Module) -> Dict[str, Any]: + + proj_name = layer_name.split(".")[-1] + if proj_name in FUSED_LAYER_NAME_MAPPING: + shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name] + + # Convert fused_name --> [shard_names] + shard_names = [ + layer_name.replace(proj_name, shard_proj_name) + for shard_proj_name in shard_proj_names + ] + shard_configs = [ + self._find_matched_config(shard_name, module) + for shard_name in shard_names + ] + if not all( + deep_compare(q_config, shard_configs[0]) + for q_config in shard_configs): + raise ValueError( + f"Found a different quantization configuration for " + f"{shard_proj_names} in {layer_name}. vLLM " + "requires all to use the same scheme.") + return shard_configs[0] + else: + layer_quant_config = cast( + Dict[str, Any], self.quant_config.get("layer_quant_config")) + for name_pattern in layer_quant_config: + if fnmatch.fnmatch(layer_name, name_pattern): + return layer_quant_config[name_pattern] + + layer_type = cast(str, type(module)) + layer_type_quant_config = cast( + Dict[str, Any], + self.quant_config.get("layer_type_quant_config")) + if layer_type in layer_type_quant_config: + return layer_type_quant_config[layer_type] + + global_quant_config = cast( + Dict[str, Any], self.quant_config.get("global_quant_config")) + return global_quant_config + + def _get_scheme_from_config(self, config: Dict[str, Any]) -> "QuarkScheme": + if config.get("output_tensors") or config.get("bias"): + raise NotImplementedError( + "Currently, Quark models with output_tensors " + "and bias quantized are not supported") + weight_config = cast(Dict[str, Any], config.get("weight")) + input_config = cast(Dict[str, Any], config.get("input_tensors")) + + if self._is_fp8_w8a8(weight_config, input_config): + is_fp8_w8a8_supported = self._check_scheme_supported( + QuarkW8A8Fp8.get_min_capability(), error=False) + if is_fp8_w8a8_supported: + weight_qscheme = cast(str, weight_config.get("qscheme")) + input_static = (input_config is not None and + not cast(bool, input_config.get("is_dynamic"))) + return QuarkW8A8Fp8(qscheme=weight_qscheme, + is_static_input_scheme=input_static) + elif self._is_static_tensor_w8a8(weight_config, input_config): + weight_qscheme = cast(str, weight_config.get("qscheme")) + return QuarkW8A8Int8(qscheme=weight_qscheme, + is_static_input_scheme=True, + input_symmetric=input_config.get("symmetric")) + + raise NotImplementedError("No quark compatible scheme was found. " + f"Weight config: {weight_config}, " + f"Input config: {input_config}") + + def get_scheme(self, layer: torch.nn.Module, + layer_name: str) -> "QuarkScheme": + + layer_quant_config = self._find_matched_config(layer_name, layer) + + # Find the quant_scheme + scheme = self._get_scheme_from_config(layer_quant_config) + # Raise error if device does not support the scheme + # (e.g. 
fp8 needs ada lovelace) + self._check_scheme_supported(scheme.get_min_capability()) + + return scheme + + def get_cache_scale(self, name: str) -> Optional[str]: + """ + Check whether the param name matches the format for k/v cache scales + in quark. If this is the case, return its equivalent param name + expected by vLLM + + :param name: param name + :return: matching param name for KV cache scale in vLLM + """ + if self.kv_cache_group is None or len(self.kv_cache_group) == 0: + return None + + kv_proj_names = [ + re.split(r"[*.]", kv_cache)[-1] for kv_cache in self.kv_cache_group + ] + if name.endswith(".output_scale"): + if len(kv_proj_names) == 1 and kv_proj_names[0] in name: + kv_output_scale_name = "." + kv_proj_names[0] + ".output_scale" + return name.replace(kv_output_scale_name, ".attn.k_scale") + + elif len(kv_proj_names) == 2: + for kv_proj_name in kv_proj_names: + if kv_proj_name in name and kv_proj_name == "k_proj": + return name.replace(".k_proj.output_scale", + ".attn.k_scale") + elif kv_proj_name in name and kv_proj_name == "v_proj": + return name.replace(".v_proj.output_scale", + ".attn.v_scale") + + # If no matches, return None + return None + + +class QuarkLinearMethod(LinearMethodBase): + + def __init__(self, quantization_config: QuarkConfig): + self.quantization_config = quantization_config + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + layer.scheme.process_weights_after_loading(layer) + + def create_weights(self, layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], input_size: int, + output_size: int, params_dtype: torch.dtype, + **extra_weight_attrs): + """ + Use the CompressedTensorsScheme associated with each layer to create + the necessary parameters for the layer. See LinearMethodBase for param + details + """ + weight_loader = extra_weight_attrs.get("weight_loader") + layer.scheme.create_weights( + layer=layer, + input_size=input_size, + input_size_per_partition=input_size_per_partition, + output_partition_sizes=output_partition_sizes, + output_size=output_size, + params_dtype=params_dtype, + weight_loader=weight_loader) + + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None): + """ + Use the output of create_weights and the CompressedTensorsScheme + associated with the layer to apply the forward pass with the + layer input. See LinearMethodBase for param details + + """ + scheme = layer.scheme + if scheme is None: + raise ValueError("A scheme must be defined for each layer") + return scheme.apply_weights(layer, x, bias=bias) + + +class QuarkKVCacheMethod(BaseKVCacheMethod): + """ + Supports loading kv-cache scaling factors from quark checkpoints. + """ + + def __init__(self, quant_config: QuarkConfig): + self.validate_kv_cache_config(quant_config.kv_cache_config) + super().__init__(quant_config) + + @staticmethod + def validate_kv_cache_config(kv_cache_config: Optional[Dict[str, Any]]): + """ + Validator for the kv cache configuration. 
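# Standalone illustration of the checkpoint-name rewriting performed by
# QuarkConfig.get_cache_scale above for the separate k_proj/v_proj case (the
# fused single-projection case maps its one output_scale to ".attn.k_scale").
# The replace rules are lifted out of the class so they can be tried on sample
# parameter names.
from typing import Optional

def rename_quark_kv_scale(name: str) -> Optional[str]:
    if not name.endswith(".output_scale"):
        return None
    if ".k_proj" in name:
        return name.replace(".k_proj.output_scale", ".attn.k_scale")
    if ".v_proj" in name:
        return name.replace(".v_proj.output_scale", ".attn.v_scale")
    return None

assert (rename_quark_kv_scale("model.layers.0.self_attn.k_proj.output_scale")
        == "model.layers.0.self_attn.attn.k_scale")
assert (rename_quark_kv_scale("model.layers.0.self_attn.v_proj.output_scale")
        == "model.layers.0.self_attn.attn.v_scale")
assert rename_quark_kv_scale("model.layers.0.mlp.down_proj.weight") is None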
Useful for controlling the + kv cache quantization schemes, that are being supported in vLLM + :param kv_cache_config: the quark kv cache scheme + """ + if kv_cache_config is None: + return + + dtype = kv_cache_config.get("dtype") + if dtype != "fp8_e4m3": + raise NotImplementedError( + "Currently supported kv cache quantization is " + f"dtype=fp8_e4m3, however received {dtype}") + + qscheme = kv_cache_config.get("qscheme") + if qscheme != "per_tensor": + raise NotImplementedError( + "Only support per-tensor scaling factor " + "for quark KV cache. " + f"Expected qscheme: per_tensor, found qscheme: {qscheme}") diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py new file mode 100644 index 0000000000000..3e19247300808 --- /dev/null +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -0,0 +1,225 @@ +from typing import Any, Callable, Dict, Optional + +import torch + +import vllm.model_executor.layers.fused_moe # noqa +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase, + FusedMoeWeightScaleSupported) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize) +from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform + +logger = init_logger(__name__) + +__all__ = ["QuarkMoEMethod", "QuarkW8A8Fp8MoEMethod"] + + +class QuarkMoEMethod(FusedMoEMethodBase): + + @staticmethod + def get_moe_method( + quant_config: "QuarkConfig", # type: ignore # noqa E501 # noqa F821 + module: torch.nn.Module, + layer_name: str) -> "QuarkMoEMethod": + layer_quant_config = quant_config._find_matched_config( + layer_name, module) + + if (layer_quant_config.get("output_tensors") + or layer_quant_config.get("bias")): + raise NotImplementedError("Currently, Quark models with " + "output_tensors and bias " + "quantized are not supported") + weight_config = layer_quant_config.get("weight") + input_config = layer_quant_config.get("input_tensors") + + if quant_config._is_fp8_w8a8(weight_config, input_config): + return QuarkW8A8Fp8MoEMethod(weight_config, input_config) + else: + raise RuntimeError("Unsupported FusedMoe scheme") + + +class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): + + def __init__(self, weight_config: Dict[str, Any], input_config: Dict[str, + Any]): + self.weight_quant = weight_config + self.input_quant = input_config + + weight_qscheme = self.weight_quant.get("qscheme") + input_qscheme = self.input_quant.get("qscheme") + if not (weight_qscheme == "per_tensor" + and input_qscheme == "per_tensor"): + raise ValueError( + "For FP8 Fused MoE layers, only per-tensor scales" + "for weights and activations are supported. 
Found " + f"{weight_qscheme}, {input_qscheme}") # noqa E501 + + self.static_input_scales = not self.input_quant.get("is_dynamic") + + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size: int, + params_dtype: torch.dtype, **extra_weight_attrs): + + params_dtype = torch.float8_e4m3fn + + # WEIGHTS + w13_weight = torch.nn.Parameter(torch.empty(num_experts, + 2 * intermediate_size, + hidden_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + w2_weight = torch.nn.Parameter(torch.empty(num_experts, + hidden_size, + intermediate_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + # WEIGHT_SCALES + # Allocate 2 scales for w1 and w3 respectively. + # They will be combined to a single scale after weight loading. + w13_weight_scale = torch.nn.Parameter(torch.ones(num_experts, + 2, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + + w2_weight_scale = torch.nn.Parameter(torch.ones(num_experts, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + # Add the quantization method used (per tensor/grouped/channel) + # to ensure the weight scales are loaded in properly + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + # INPUT_SCALES + if self.static_input_scales: + w13_input_scale = torch.nn.Parameter(torch.ones( + num_experts, dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_input_scale", w13_input_scale) + set_weight_attrs(w13_input_scale, extra_weight_attrs) + + w2_input_scale = torch.nn.Parameter(torch.ones( + num_experts, dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w2_input_scale", w2_input_scale) + set_weight_attrs(w2_input_scale, extra_weight_attrs) + else: + layer.w13_input_scale = None + layer.w2_input_scale = None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + # Fp8 moe kernels require a single activation scale. + # We take the max of all the scales in case they differ. + if self.static_input_scales: + if (layer.w13_input_scale is None or layer.w2_input_scale is None): + raise ValueError( + "QuantConfig has static quantization, but found " + "activation scales are None.") + if (not all_close_1d(layer.w13_input_scale) + or not all_close_1d(layer.w2_input_scale)): + logger.warning_once( + "Found input_scales that are not equal for " + "fp8 MoE layer. Using the maximum across experts " + "for each layer. 
") + layer.w13_input_scale = torch.nn.Parameter( + layer.w13_input_scale.max(), requires_grad=False) + layer.w2_input_scale = torch.nn.Parameter( + layer.w2_input_scale.max(), requires_grad=False) + + # If rocm, normalize the weights and scales to e4m3fnuz + if current_platform.is_rocm(): + # Normalize the weights and scales + w13_weight, w13_weight_scale, w13_input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + layer.w13_weight, layer.w13_weight_scale, + layer.w13_input_scale) + w2_weight, w2_weight_scale, w2_input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + layer.w2_weight, layer.w2_weight_scale, + layer.w2_input_scale) + # Reset the parameter + layer.w13_weight = torch.nn.Parameter(w13_weight, + requires_grad=False) + layer.w13_weight_scale = torch.nn.Parameter(w13_weight_scale, + requires_grad=False) + if w13_input_scale is not None: + layer.w13_input_scale = torch.nn.Parameter(w13_input_scale, + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(w2_weight, + requires_grad=False) + layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale, + requires_grad=False) + if w2_input_scale is not None: + layer.w2_input_scale = torch.nn.Parameter(w2_input_scale, + requires_grad=False) + + # Fp8 moe kernel needs single weight scale for w13 per expert. + # We take the max then dequant and requant each expert. + assert layer.w13_weight_scale is not None + shard_size = layer.intermediate_size_per_partition + max_w13_scales = layer.w13_weight_scale.max(dim=1).values + for expert_id in range(layer.num_experts): + start = 0 + for shard_id in range(2): + dq_weight = per_tensor_dequantize( + layer.w13_weight[expert_id][start:start + shard_size, :], + layer.w13_weight_scale[expert_id][shard_id]) + layer.w13_weight[expert_id][ + start:start + shard_size, :], _ = ops.scaled_fp8_quant( + dq_weight, max_w13_scales[expert_id]) + start += shard_size + + layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales, + requires_grad=False) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + from vllm.model_executor.layers.fused_moe import fused_experts + + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias) + + return fused_experts(x, + layer.w13_weight, + layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + use_fp8_w8a8=True, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale) diff --git a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py new file mode 100644 index 0000000000000..fb0ba9bd5220c --- /dev/null +++ b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py @@ -0,0 +1,5 @@ +from .quark_scheme import QuarkScheme +from .quark_w8a8_fp8 import QuarkW8A8Fp8 +from .quark_w8a8_int8 import QuarkW8A8Int8 + +__all__ = 
["QuarkScheme", "QuarkW8A8Fp8", "QuarkW8A8Int8"] diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py new file mode 100644 index 0000000000000..239597fa4be0e --- /dev/null +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py @@ -0,0 +1,52 @@ +from abc import ABC, abstractmethod +from typing import Optional + +import torch + +__all__ = ["QuarkScheme"] + + +class QuarkScheme(ABC): + """ + Abstract class used to describe the weight creation and forward pass + of different quantization schemes supported by Quark. + """ + + @classmethod + @abstractmethod + def get_min_capability(cls) -> int: + """ + Get minimum device capability. + """ + raise NotImplementedError + + @abstractmethod + def create_weights(self, *args, **kwargs): + """ + Weight creation for the particular scheme. Inputs to this function + + """ + raise NotImplementedError + + @abstractmethod + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, + bias: Optional[torch.Tensor]): + """ + Run the forward pass for the particular scheme. This is where + scheme-specific dequant/quant steps/kernels should be applied. + + :param layer: torch.nn.Module with the registered weights and + other parameters relevant to the particular scheme. + :param x: input to the layer + :param bias: bias parameter + + """ + raise NotImplementedError + + @abstractmethod + def process_weights_after_loading(self, layer: torch.nn.Module): + """ + Called after weight loading is complete for any cleanup that + needs to occur. + """ + raise NotImplementedError diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py new file mode 100644 index 0000000000000..206931ea2ffc0 --- /dev/null +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py @@ -0,0 +1,140 @@ +from typing import Callable, List, Optional + +import torch +from torch.nn import Parameter + +from vllm.model_executor.layers.quantization.quark.schemes import QuarkScheme +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + apply_fp8_linear, cutlass_fp8_supported, normalize_e4m3fn_to_e4m3fnuz, + requantize_with_max_scale) +from vllm.model_executor.parameter import (ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter) +from vllm.platforms import current_platform + +__all__ = ["QuarkW8A8Fp8"] + + +class QuarkW8A8Fp8(QuarkScheme): + + def __init__(self, qscheme: str, is_static_input_scheme: Optional[bool]): + self.qscheme = qscheme + self.is_static_input_scheme = is_static_input_scheme + self.cutlass_fp8_supported = cutlass_fp8_supported() + + @classmethod + def get_min_capability(cls) -> int: + # lovelace and up + return 89 + + def process_weights_after_loading(self, layer) -> None: + # If per tensor, when we have a fused module (e.g. 
QKV) with per + # tensor scales (thus N scales being passed to the kernel), + # requantize so we can always run per tensor + if self.qscheme == "per_tensor": + max_w_scale, weight = requantize_with_max_scale( + weight=layer.weight, + weight_scale=layer.weight_scale, + logical_widths=layer.logical_widths, + ) + + if current_platform.is_rocm(): + weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( + weight=weight, + weight_scale=max_w_scale, + input_scale=layer.input_scale) + if input_scale is not None: + layer.input_scale = Parameter(input_scale, + requires_grad=False) + + layer.weight = Parameter(weight.t(), requires_grad=False) + layer.weight_scale = Parameter(max_w_scale, requires_grad=False) + + # If channelwise, scales are already lined up, so just transpose. + elif self.qscheme == "per_channel": + weight = layer.weight + + if current_platform.is_rocm(): + weight, weight_scale, input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + weight=weight, + weight_scale=layer.weight_scale, + input_scale=layer.input_scale) + if input_scale is not None: + layer.input_scale = Parameter(input_scale, + requires_grad=False) + else: + weight_scale = layer.weight_scale.data + + layer.weight = Parameter(weight.t(), requires_grad=False) + # required by torch.compile to be torch.nn.Parameter + layer.weight_scale = Parameter(weight_scale, requires_grad=False) + + else: + raise ValueError(f"Unknown quantization scheme {self.qscheme}") + + # INPUT SCALE + if self.is_static_input_scheme: + layer.input_scale = Parameter(layer.input_scale.max(), + requires_grad=False) + else: + layer.input_scale = None + + def create_weights(self, layer: torch.nn.Module, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + output_size_per_partition = sum(output_partition_sizes) + layer.logical_widths = output_partition_sizes + + # WEIGHT + weight = ModelWeightParameter(data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) + layer.register_parameter("weight", weight) + + # WEIGHT SCALE + # TODO: update create_xxx_parameter functions to return + # the newly added parameters + if self.qscheme == "per_channel": + weight_scale = ChannelQuantScaleParameter( + data=torch.empty((sum(output_partition_sizes), 1), + dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader) + else: + assert self.qscheme == "per_tensor" + weight_scale = PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) + + # min requirement for fp8 kernels + weight_scale[:] = torch.finfo(torch.float32).min + layer.register_parameter("weight_scale", weight_scale) + + # INPUT SCALE + if self.is_static_input_scheme: + input_scale = PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) + input_scale[:] = torch.finfo(torch.float32).min + layer.register_parameter("input_scale", input_scale) + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + + return apply_fp8_linear( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=layer.input_scale, + bias=bias, + cutlass_fp8_supported=self.cutlass_fp8_supported, + use_per_token_if_dynamic=True) diff --git 
a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py new file mode 100644 index 0000000000000..8cb47e9c37e56 --- /dev/null +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py @@ -0,0 +1,105 @@ +from typing import Callable, List, Optional, Set + +import torch + +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.kernels.scaled_mm import ( + ScaledMMLinearLayerConfig, choose_scaled_mm_linear_kernel) +from vllm.model_executor.layers.quantization.quark.schemes import QuarkScheme +from vllm.model_executor.parameter import (BasevLLMParameter, + ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter) + +logger = init_logger(__name__) + + +class QuarkW8A8Int8(QuarkScheme): + _kernel_backends_being_used: Set[str] = set() + + def __init__(self, qscheme: str, is_static_input_scheme: Optional[bool], + input_symmetric: Optional[bool]): + self.qscheme = qscheme + self.is_static_input_scheme = is_static_input_scheme + self.input_symmetric = input_symmetric + + @classmethod + def get_min_capability(cls) -> int: + # turing and up + return 75 + + def create_weights(self, layer: torch.nn.Module, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + self.logical_widths = output_partition_sizes + + scaled_mm_linear_kernel_config = ScaledMMLinearLayerConfig( + is_channelwise=(self.qscheme == "per_channel"), + is_static_input_scheme=(self.is_static_input_scheme is True), + input_symmetric=(self.input_symmetric is True)) + + kernel_type = choose_scaled_mm_linear_kernel( + scaled_mm_linear_kernel_config) + + if kernel_type.__name__ not in self._kernel_backends_being_used: + logger.info("Using %s for QuarkW8A8Int8", kernel_type.__name__) + self._kernel_backends_being_used.add(kernel_type.__name__) + + # WEIGHT + weight = ModelWeightParameter(data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=torch.int8), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) + + layer.register_parameter("weight", weight) + + # WEIGHT SCALE + if self.qscheme == "per_channel": + weight_scale = ChannelQuantScaleParameter( + data=torch.empty((sum(output_partition_sizes), 1), + dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader) + else: + assert self.qscheme == "per_tensor" + weight_scale = PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) + layer.register_parameter("weight_scale", weight_scale) + + # INPUT SCALE + if self.is_static_input_scheme: + input_scale = BasevLLMParameter(data=torch.empty( + 1, dtype=torch.float32), + weight_loader=weight_loader) + layer.register_parameter("input_scale", input_scale) + + if not self.input_symmetric: + # Note: quark stores the zp using the same dtype + # as the weights + # AZP loaded as int8 but used as int32 + input_zero_point = BasevLLMParameter( + data=torch.empty(1, dtype=torch.int8), + weight_loader=weight_loader) + layer.register_parameter("input_zero_point", input_zero_point) + + self.kernel = kernel_type(c=scaled_mm_linear_kernel_config, + w_q_param_name="weight", + w_s_param_name="weight_scale", + i_s_param_name="input_scale", + i_zp_param_name="input_zero_point", + azp_adj_param_name="azp_adj") + + # Checkpoints are serialized in quark format, which is + # different from the format the kernel may 
want. Handle repacking here. + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + self.kernel.process_weights_after_loading(layer) + + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, + bias: Optional[torch.Tensor]) -> torch.Tensor: + return self.kernel.apply_weights(layer, x, bias) diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py new file mode 100644 index 0000000000000..742a629bdb1c5 --- /dev/null +++ b/vllm/model_executor/layers/quantization/quark/utils.py @@ -0,0 +1,99 @@ +import re +from typing import Any, Iterable, Optional + +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + FUSED_LAYER_NAME_MAPPING) + + +def deep_compare(dict1: Any, dict2: Any) -> bool: + if type(dict1) is not type(dict2): + return False + if isinstance(dict1, dict): + if dict1.keys() != dict2.keys(): + return False + return all(deep_compare(dict1[k], dict2[k]) for k in dict1) + elif isinstance(dict1, list): + return set(dict1) == set(dict2) + else: + return dict1 == dict2 + + +def should_ignore_layer(layer_name: Optional[str], + ignore: Iterable[str]) -> bool: + if layer_name is None: + return False + + # layer_name = model.layers.0.self_attn.qkv_proj + # proj_name = qkv_proj + proj_name = layer_name.split(".")[-1] + + # Fused layers like gate_up_proj or qkv_proj will not be fused + # in the safetensors checkpoint. So, we convert the name + # from the fused version to unfused + check to make sure that + # each shard of the fused layer has the same scheme. + if proj_name in FUSED_LAYER_NAME_MAPPING: + shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name] + + # Convert fused_name --> [shard_names] + shard_names = [ + layer_name.replace(proj_name, shard_proj_name) + for shard_proj_name in shard_proj_names + ] + + # Layer should be ignored if shards are ignored. + should_ignore_layer = None + for shard_name in shard_names: + should_ignore_shard = check_equal_or_regex_match( + layer_name=shard_name, targets=ignore) + + # If shard_idx=0, set layer ignore to match shard. + if should_ignore_layer is None: + should_ignore_layer = should_ignore_shard + + # If shard_idx=1+ confirm scheme matches prior shards. + elif should_ignore_shard != should_ignore_layer: + raise ValueError(f"Found a different quantization schemes for " + f"{shard_proj_names} in {layer_name}. vLLM " + "requires all to use the same scheme.") + + # Unfused layers like down_proj and o_proj will match + # the safetensors checkpoint already. + else: + should_ignore_layer = check_equal_or_regex_match(layer_name=layer_name, + targets=ignore) + + assert should_ignore_layer is not None + return should_ignore_layer + + +def check_equal_or_regex_match(layer_name: str, + targets: Iterable[str]) -> bool: + """ + Checks whether a layer_name is exactly equal or a regex match for + if target starts with 're:' to any target in list. + """ + for target in targets: + if _is_equal_or_regex_match(layer_name, target): + return True + return False + + +def _is_equal_or_regex_match(value: str, + target: str, + check_contains: bool = False) -> bool: + """ + Checks whether a value is exactly equal or a regex match for target + if target starts with 're:'. If check_contains is set to True, + additionally checks if the target string is contained within the value. 
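# Small usage sketch of the "re:"-prefixed matching convention implemented by
# check_equal_or_regex_match / _is_equal_or_regex_match above: plain targets
# must match the layer name exactly, while targets starting with "re:" are
# treated as regular expressions anchored at the start of the name (re.match).
import re

def matches(layer_name: str, target: str) -> bool:
    if target.startswith("re:"):
        return re.match(target[3:], layer_name) is not None
    return target == layer_name

ignore = ["lm_head", r"re:.*\.gate$"]
assert any(matches("lm_head", t) for t in ignore)
assert any(matches("model.layers.3.mlp.gate", t) for t in ignore)
assert not any(matches("model.layers.3.mlp.down_proj", t) for t in ignore)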
+ """ + + if target.startswith("re:"): + pattern = target[3:] + if re.match(pattern, value): + return True + elif check_contains: + if target.lower() in value.lower(): + return True + elif target == value: + return True + return False diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index f3c3e130e4161..b6882cc7c837c 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -5,6 +5,8 @@ import triton import triton.language as tl +from vllm.platforms import current_platform + def apply_w8a8_block_fp8_linear( input: torch.Tensor, @@ -33,11 +35,14 @@ def apply_w8a8_block_fp8_linear( def input_to_float8( - x: torch.Tensor, - dtype: torch.dtype = torch.float8_e4m3fn + x: torch.Tensor, + dtype: Optional[torch.dtype] = None ) -> Tuple[torch.Tensor, torch.Tensor]: """This function quantizes input values to float8 values " "with tensor-wise quantization.""" + if dtype is None: + dtype = (torch.float8_e4m3fnuz + if current_platform.is_rocm() else torch.float8_e4m3fn) finfo = torch.finfo(dtype) min_val, max_val = x.aminmax() amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12) @@ -125,7 +130,7 @@ def per_token_group_quant_fp8( x: torch.Tensor, group_size: int, eps: float = 1e-10, - dtype: torch.dtype = torch.float8_e4m3fn, + dtype: Optional[torch.dtype] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: """Function to perform per-token-group quantization on an input tensor `x`. It converts the tensor values into signed float8 values and returns the @@ -140,6 +145,9 @@ def per_token_group_quant_fp8( Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization. 
""" + if dtype is None: + dtype = (torch.float8_e4m3fnuz + if current_platform.is_rocm() else torch.float8_e4m3fn) assert (x.shape[-1] % group_size == 0), ( f"the last dimension of `x` {x.shape[-1]} must be divisible " f"by `group_size` {group_size}") diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 3fcd81a3c4213..d071cfe888f05 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -841,6 +841,37 @@ def get_input_positions( ) -> Tuple[List[List[int]], int]: """Get mrope input positions and delta value.""" + llm_positions, mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + input_tokens, + image_grid_thw, + video_grid_thw, + image_token_id, + video_token_id, + vision_start_token_id, + vision_end_token_id, + spatial_merge_size, + context_len, + seq_len, + ) + + return llm_positions.tolist(), mrope_position_delta + + @staticmethod + def get_input_positions_tensor( + input_tokens: List[int], + image_grid_thw: Union[List[List[int]], torch.Tensor], + video_grid_thw: Union[List[List[int]], torch.Tensor], + image_token_id: int, + video_token_id: int, + vision_start_token_id: int, + vision_end_token_id: int, + spatial_merge_size: int, + context_len: int = 0, + seq_len: Optional[int] = None, + ) -> Tuple[torch.Tensor, int]: + """Get mrope input positions and delta value.""" + if isinstance(image_grid_thw, torch.Tensor): image_grid_thw = image_grid_thw.tolist() if isinstance(video_grid_thw, torch.Tensor): @@ -916,7 +947,7 @@ def get_input_positions( len(input_tokens)).item() llm_positions = llm_positions[:, context_len:seq_len] - return llm_positions.tolist(), mrope_position_delta + return llm_positions, mrope_position_delta @staticmethod def get_next_input_positions( @@ -930,6 +961,17 @@ def get_next_input_positions( seq_len + mrope_position_delta)) for _ in range(3) ] + @staticmethod + def get_next_input_positions_tensor( + mrope_position_delta: int, + context_len: int, + seq_len: int, + ) -> torch.Tensor: + return torch.arange( + mrope_position_delta + context_len, + mrope_position_delta + seq_len, + ).expand(3, -1) + _ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {} diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 65920aa61ba15..3eb5c39ccf580 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -355,7 +355,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): elif isinstance(param, UninitializedParameter): shape = list(loaded_weight.shape) if output_dim is not None: - shape[output_dim] = shape[output_dim] // self.tp_size + shape[output_dim] = self.num_embeddings_per_partition param.materialize(tuple(shape), dtype=loaded_weight.dtype) # If parameter does not have output dim, then it should @@ -381,7 +381,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): else: assert loaded_weight.shape[output_dim] == self.org_vocab_size - # Copy the data. + # Copy the data. Select chunk corresponding to current shard. 
loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) if current_platform.is_hpu(): diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 9fe0db62435a0..e9779878710ee 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -182,6 +182,9 @@ class Source: fall_back_to_pt: bool = True """Whether .pt weights can be used.""" + allow_patterns_overrides: Optional[list[str]] = None + """If defined, weights will load exclusively using these patterns.""" + def __init__(self, load_config: LoadConfig): super().__init__(load_config) if load_config.model_loader_extra_config: @@ -218,6 +221,7 @@ def _prepare_weights( model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool, + allow_patterns_overrides: Optional[list[str]], ) -> Tuple[str, List[str], bool]: """Prepare weights for the model. @@ -249,6 +253,9 @@ def _prepare_weights( if fall_back_to_pt: allow_patterns += ["*.pt"] + if allow_patterns_overrides is not None: + allow_patterns = allow_patterns_overrides + if not is_local: hf_folder = download_weights_from_hf( model_name_or_path, @@ -298,7 +305,8 @@ def _get_weights_iterator( ) -> Generator[Tuple[str, torch.Tensor], None, None]: """Get an iterator for the model weights based on the load format.""" hf_folder, hf_weights_files, use_safetensors = self._prepare_weights( - source.model_or_path, source.revision, source.fall_back_to_pt) + source.model_or_path, source.revision, source.fall_back_to_pt, + source.allow_patterns_overrides) if self.load_config.load_format == LoadFormat.NPCACHE: # Currently np_cache only support *.bin checkpoints assert use_safetensors is False @@ -340,6 +348,8 @@ def _get_all_weights( prefix="", fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True), + allow_patterns_overrides=getattr(model, "allow_patterns_overrides", + None), ) yield from self._get_weights_iterator(primary_weights) @@ -353,7 +363,8 @@ def _get_all_weights( def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model, model_config.revision, - fall_back_to_pt=True) + fall_back_to_pt=True, + allow_patterns_overrides=None) def load_model(self, vllm_config: VllmConfig) -> nn.Module: device_config = vllm_config.device_config @@ -1065,8 +1076,8 @@ def _load_weights(self, model_config: ModelConfig, # weight tensor. So TP does not work with pre_quantized bnb models. if pre_quant and get_tensor_model_parallel_world_size() > 1: raise ValueError( - "Prequant BitsAndBytes models with TP is not supported." - "Please try with PP.") + "Prequant BitsAndBytes models with tensor parallelism is not " + "supported. Please try with pipeline parallelism.") load_8bit = False if pre_quant: @@ -1105,15 +1116,22 @@ def _load_weights(self, model_config: ModelConfig, weight_name, index, ) in self.modules_mapping.inverse_packed_mapping.items(): - shard_pos = quant_param_name.find(shard_name) # Some models, such as MiniCPM V2.5/2.6, contain both # module names 'kv_proj' and 'qkv_proj'. To prevent 'kv_proj' # from being incorrectly identified as being present in # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight - if shard_pos > 0 and quant_param_name[shard_pos - 1] == ".": + shard_pos = quant_param_name.find(shard_name) + can_correct_rename = (shard_pos > 0) and ( + quant_param_name[shard_pos - 1] == ".") + # If the quant_param_name is packed, it won't occur in the + # param_dict before renaming. 
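# Hedged sketch of the pattern-selection precedence added to _prepare_weights
# above: the default glob patterns (optionally extended with "*.pt") are used
# unless the model class exposes allow_patterns_overrides, in which case only
# those patterns are considered. The default list here is illustrative, not an
# exact copy of the loader's.
from typing import List, Optional

def select_allow_patterns(
        fall_back_to_pt: bool,
        allow_patterns_overrides: Optional[List[str]]) -> List[str]:
    allow_patterns = ["*.safetensors", "*.bin"]  # illustrative defaults
    if fall_back_to_pt:
        allow_patterns += ["*.pt"]
    if allow_patterns_overrides is not None:
        allow_patterns = allow_patterns_overrides
    return allow_patterns

# A model that ships consolidated checkpoints could, for example, set
# allow_patterns_overrides = ["consolidated*.safetensors"] as a class attribute
# (hypothetical value) so only those files are downloaded and loaded.
assert select_allow_patterns(True, None) == ["*.safetensors", "*.bin", "*.pt"]
assert select_allow_patterns(True, ["consolidated*.safetensors"]) == [
    "consolidated*.safetensors"
]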
+ new_quant_param_name = quant_param_name.replace( + shard_name, weight_name) + need_rename = (quant_param_name not in param_dict) \ + and (new_quant_param_name in param_dict) + if can_correct_rename and need_rename: shard_index = index - quant_param_name = quant_param_name.replace( - shard_name, weight_name) + quant_param_name = new_quant_param_name break # Models like Clip/Siglip may skip some layers in initialization, diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index fbd4937112e11..5b4757072353f 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -459,16 +459,7 @@ def tensorize_vllm_model(engine_args: EngineArgs, stream.write(encryption_params.key) engine = LLMEngine.from_engine_args(engine_args) - if tensorizer_config._is_sharded: - # if the engine is a distributed engine (for tensor parallel) then each - # worker shard needs to serialize its part of the model. - engine.model_executor._run_workers( - "save_tensorized_model", - tensorizer_config=tensorizer_config, - ) - else: - # with a single worker, we can get to the underlying model directly - serialize_vllm_model( - engine.model_executor.driver_worker.model_runner.model, - tensorizer_config, - ) + engine.model_executor.collective_rpc( + "save_tensorized_model", + kwargs=dict(tensorizer_config=tensorizer_config), + ) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 089062ab53fc3..8c6873de13627 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,9 +1,11 @@ -from typing import (Callable, Iterable, List, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union) import torch import torch.nn as nn -from transformers import BatchFeature, PretrainedConfig +from transformers import AriaConfig, AriaTextConfig, BatchFeature +from transformers.models.aria.modeling_aria import AriaCrossAttention +from transformers.models.aria.processing_aria import AriaProcessor from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, QuantizationConfig, VllmConfig @@ -13,8 +15,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) from vllm.model_executor.layers.sampler import (SamplerOutput, SamplingMetadata, get_sampler) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead @@ -28,10 +28,12 @@ BaseProcessingInfo, PromptReplacement) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.aria import (AriaMoELMConfig, - AriaVisionConfig) -from .idefics2_vision_model import Idefics2VisionTransformer +# yapf: disable +from .idefics2_vision_model import Idefics2VisionConfig +from .idefics2_vision_model import ( + Idefics2VisionTransformer as Idefics3VisionTransformer) +# yapf: enable from .interfaces import SupportsMultiModal from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, @@ -49,87 +51,69 @@ class AriaImagePixelInputs(TypedDict): """ -class AriaVisionTransformer(Idefics2VisionTransformer): - """ - AriaVisionTransformer is 
a modified version of Idefics2VisionTransformer - that replaces the post-layernorm with an identity layer. - """ +class AriaVisionTransformer(Idefics3VisionTransformer): def __init__( self, - config: AriaVisionConfig, + config: Idefics2VisionConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: super().__init__(config, quant_config, prefix) + # Unlike Idefics3VisionTransformer which uses LayerNorm after the + # final layer, Aria omits this normalization, so we replace it with an + # Identity layer self.post_layernorm = nn.Identity() + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + for name, loaded_weight in weights: + + # NOTE: post_layernorm is not used in Aria + if "post_layernorm" in name: + continue + + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + -class AriaVisionModel(nn.Module): - config_class = AriaVisionConfig +class AriaProjectorMLP(nn.Module): def __init__( self, - config: AriaVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - *, - prefix: str = "", + in_features: int, + hidden_features: int, + output_dim: int, ) -> None: super().__init__() - self.vision_model = AriaVisionTransformer( - config, - quant_config, - prefix=f"{prefix}.vision_model", - ) - - def forward( - self, - pixel_values: torch.Tensor, - pixel_mask: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - patch_attention_mask = self._create_patch_attention_mask(pixel_mask) - - vit_oup = self.vision_model( - pixel_values=pixel_values, - patch_attention_mask=patch_attention_mask, - ) - - image_atts = self._create_image_attention_mask(patch_attention_mask) - - return vit_oup, image_atts - - def _create_patch_attention_mask( - self, pixel_mask: Optional[torch.Tensor]) -> torch.Tensor: - if pixel_mask is None: - return None - - patches_subgrid = pixel_mask.unfold( - dimension=1, - size=self.vision_model.config.patch_size, - step=self.vision_model.config.patch_size, - ).unfold( - dimension=2, - size=self.vision_model.config.patch_size, - step=self.vision_model.config.patch_size, - ) - return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() - - def _create_image_attention_mask( - self, patch_attention_mask: torch.Tensor) -> torch.Tensor: - if patch_attention_mask is None: - return None - - flattened_mask = patch_attention_mask.flatten(1) - return torch.logical_not(flattened_mask) - - -class FFN(nn.Module): - - def __init__(self, embed_dim: int, ff_dim: int, output_dim: int) -> None: - super().__init__() - self.linear_in = ColumnParallelLinear(embed_dim, ff_dim, bias=False) - self.linear_out = RowParallelLinear(ff_dim, output_dim, bias=False) + self.linear_in = ColumnParallelLinear(in_features, + hidden_features, + bias=False) + self.linear_out = RowParallelLinear(hidden_features, + output_dim, + bias=False) self.act = get_act_fn("gelu_new") 
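# Standalone illustration of how the stacked_params_mapping in
# AriaVisionTransformer.load_weights above redirects per-projection checkpoint
# names onto the fused qkv_proj parameter, together with the shard id handed to
# the weight loader.
from typing import Optional, Tuple

STACKED_PARAMS_MAPPING = [
    # (param_name, shard_name, shard_id)
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
]

def map_checkpoint_name(name: str) -> Tuple[str, Optional[str]]:
    for param_name, weight_name, shard_id in STACKED_PARAMS_MAPPING:
        if weight_name in name:
            return name.replace(weight_name, param_name), shard_id
    return name, None  # falls through to the default weight loader

assert map_checkpoint_name("encoder.layers.0.self_attn.k_proj.weight") == (
    "encoder.layers.0.self_attn.qkv_proj.weight", "k")
assert map_checkpoint_name("encoder.layers.0.mlp.fc1.weight") == (
    "encoder.layers.0.mlp.fc1.weight", None)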
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -139,46 +123,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -class CrossAttention(nn.Module): - - def __init__(self, kv_dim: int, embed_dim: int, num_heads: int) -> None: - super().__init__() - self.num_heads = num_heads - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False) - self.k_proj = nn.Linear(kv_dim, embed_dim, bias=False) - self.v_proj = nn.Linear(kv_dim, embed_dim, bias=False) - - self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) - self.linear = nn.Linear(embed_dim, embed_dim) - - self.layer_norm = nn.LayerNorm(embed_dim) - self.ln_kv = nn.LayerNorm(kv_dim) - - def forward( - self, - x: torch.Tensor, - hidden_states: torch.Tensor, - attn_mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - normed_hidden_states = self.layer_norm(hidden_states) - query = self.q_proj(normed_hidden_states).permute(1, 0, 2) - - x = self.ln_kv(x) - key = self.k_proj(x).permute(1, 0, 2) - value = self.v_proj(x).permute(1, 0, 2) - - attn_output, _ = self.multihead_attn(query, - key, - value, - attn_mask=attn_mask) - - attn_output = attn_output.permute(1, 0, 2) - - attn_output = self.linear(attn_output) - - return attn_output - - class AriaProjector(nn.Module): """ A projection module with one cross attention layer and one FFN layer, which @@ -200,42 +144,42 @@ class AriaProjector(nn.Module): A tensor with the shape of (batch_size, query_number, output_dim) """ - def __init__( - self, - patch_to_query_dict: dict[int, int], - embed_dim: int, - num_heads: int, - kv_dim: int, - ff_dim: int, - output_dim: int, - norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, - ) -> None: + def __init__(self, config: AriaConfig) -> None: super().__init__() - self.patch_to_query_dict = patch_to_query_dict - self.embed_dim = embed_dim - self.num_heads = num_heads + + self.patch_to_query_dict = config.projector_patch_to_query_dict + self.in_features = config.vision_config.hidden_size + self.num_heads = config.vision_config.num_attention_heads + self.kv_dim = config.vision_config.hidden_size + self.hidden_features = config.text_config.hidden_size + self.output_dim = config.text_config.hidden_size self.query = nn.Parameter( - torch.empty(max(patch_to_query_dict.values()), self.embed_dim)) + torch.empty(config.max_value_projector_patch_to_query_dict, + self.in_features)) - self.cross_attn = CrossAttention(kv_dim, embed_dim, num_heads) + self.cross_attn = AriaCrossAttention(config) - self.ln_ffn = norm_layer(embed_dim) - self.ffn = FFN(embed_dim, ff_dim, output_dim) + self.layer_norm = nn.LayerNorm(self.in_features) + self.feed_forward = AriaProjectorMLP(self.in_features, + self.hidden_features, + self.output_dim) def forward( self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: - bs = x.shape[0] - queries = self.query.unsqueeze(0).repeat(bs, 1, 1) + batch_size, num_patches = x.shape[0], x.shape[1] + + if num_patches not in self.patch_to_query_dict: + raise KeyError(f"Number of patches {num_patches} not found in " + "patch_to_query_dict amongst possible values " + f"{self.patch_to_query_dict.keys()}.") - query_num = self.patch_to_query_dict.get(x.shape[1], None) - assert (query_num is not None - ), f"Query number for {x.shape[1]} patches is not provided" + query_num = self.patch_to_query_dict[num_patches] - queries = queries[:, :query_num, :] + queries = self.query[:query_num].unsqueeze(0).repeat(batch_size, 1, 1) if attn_mask is not None: attn_mask = 
attn_mask.repeat_interleave(self.num_heads, 0) @@ -243,7 +187,7 @@ def forward( attention_out = self.cross_attn(x, queries, attn_mask=attn_mask) - out = self.ffn(self.ln_ffn(attention_out)) + out = self.feed_forward(self.layer_norm(attention_out)) return out @@ -280,7 +224,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, param.data.copy_(loaded_weight.transpose(1, 2)) -class MoELayer(nn.Module): +class AriaTextMoELayer(nn.Module): """ Mixture of Experts (MoE) Layer for the AriaMoE model. @@ -291,7 +235,7 @@ class MoELayer(nn.Module): def __init__( self, - config: AriaMoELMConfig, + config: AriaTextConfig, quant_config: Optional[QuantizationConfig], ) -> None: super().__init__() @@ -305,15 +249,16 @@ def __init__( num_experts=config.moe_num_experts, top_k=config.moe_topk, hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, + intermediate_size=config.intermediate_size, quant_config=quant_config, reduce_results=True, ) self.shared_experts = LlamaMLP( config.hidden_size, - config.moe_intermediate_size * config.moe_num_shared_experts, + config.intermediate_size * config.moe_num_shared_experts, "silu", quant_config=quant_config, + bias=config.mlp_bias, ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -331,13 +276,15 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: router_output = torch.nn.functional.linear(hidden_states, self.router_weight) - shared_expert_output = self.shared_experts(hidden_states) + hidden_states_copy = hidden_states.clone() + # NOTE: hidden_states will be modified inplace by `FusedMoE` sparse_expert_output = self.experts(hidden_states, router_output) + shared_expert_output = self.shared_experts(hidden_states_copy) return sparse_expert_output + shared_expert_output -class MoEDecoderLayer(LlamaDecoderLayer): +class AriaTextDecoderLayer(LlamaDecoderLayer): """ Custom Decoder Layer for the AriaMoE model which modifies the standard `LlamaDecoderLayer` by replacing the traditional MLP with a Mixture of @@ -346,16 +293,16 @@ class MoEDecoderLayer(LlamaDecoderLayer): def __init__( self, - config: AriaMoELMConfig, + config: AriaTextConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: super().__init__(config, cache_config, quant_config, prefix) - self.mlp = MoELayer(config, quant_config=quant_config) + self.mlp = AriaTextMoELayer(config, quant_config=quant_config) -class AriaMoELMModel(LlamaModel): +class AriaTextModel(LlamaModel): """ Custom LlamaModel for the AriaMoE model which modifies the standard LlamaModel by replacing the `LlamaDecoderLayer` with `MoEDecoderLayer`. @@ -364,7 +311,7 @@ class AriaMoELMModel(LlamaModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix, - layer_type=MoEDecoderLayer) + layer_type=AriaTextDecoderLayer) # Adapted from LlamaModel.load_weights with the modification of adding # the expert weights mapping to `stacked_params_mapping` @@ -390,12 +337,14 @@ def load_weights(self, weights: Iterable[Tuple[str, # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. 
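The hidden_states.clone() added in AriaTextMoELayer.forward above exists because, per the NOTE, the fused MoE path reuses its input buffer in place; the shared-expert branch must therefore read from a copy taken beforehand. A toy illustration of the hazard, using a stand-in function rather than the real FusedMoE kernel:

import torch

def fused_moe_like(x: torch.Tensor) -> torch.Tensor:
    # stand-in for a kernel that writes its result into the input buffer
    x.mul_(2)
    return x

hidden = torch.ones(2, 4)
shared_input = hidden.clone()        # copy taken before the in-place path runs
sparse_out = fused_moe_like(hidden)  # mutates `hidden`
shared_out = shared_input * 3        # still sees the original activations
print(sparse_out[0, 0].item(), shared_out[0, 0].item())  # 2.0 3.0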
continue - if scale_name := get_compressed_tensors_cache_scale(name): - # Loading kv cache scales for compressed-tensors quantization + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales param = params_dict[scale_name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue @@ -434,25 +383,17 @@ def load_weights(self, weights: Iterable[Tuple[str, return loaded_params -def build_mm_projector(config: PretrainedConfig): - return AriaProjector( - patch_to_query_dict=config.projector_patch_to_query_dict, - embed_dim=config.vision_config.hidden_size, - num_heads=config.vision_config.num_attention_heads, - kv_dim=config.vision_config.hidden_size, - ff_dim=config.text_config.hidden_size, - output_dim=config.text_config.hidden_size, - ) - - class AriaProcessingInfo(BaseProcessingInfo): def get_hf_config(self): - return self.ctx.get_hf_config() + return self.ctx.get_hf_config(AriaConfig) - def get_vision_config(self) -> AriaVisionConfig: + def get_vision_config(self): return self.get_hf_config().vision_config + def get_hf_processor(self): + return self.ctx.get_hf_processor(AriaProcessor) + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} @@ -484,7 +425,7 @@ def get_dummy_processor_inputs( } hf_processor = self.info.get_hf_processor() - image_token: str = hf_processor.image_token # type: ignore + image_token: str = hf_processor.tokenizer.image_token # type: ignore return ProcessorInputs( prompt_text=image_token * num_images, @@ -554,10 +495,14 @@ def __init__( quant_config = vllm_config.quant_config self.config = config - self.vision_tower = AriaVisionModel(config.vision_config) - self.multi_modal_projector = build_mm_projector(config) + self.vision_tower = AriaVisionTransformer( + config.vision_config, + quant_config, + prefix=f"{prefix}.vision_tower", + ) + self.multi_modal_projector = AriaProjector(config) self.vocab_size = config.text_config.vocab_size - self.language_model = AriaMoELMModel( + self.language_model = AriaTextModel( vllm_config=vllm_config.with_hf_config(config.text_config), prefix=maybe_prefix(prefix, "language_model.model"), ) @@ -608,6 +553,22 @@ def _parse_and_validate_image_input( pixel_mask=pixel_mask, ) + def _create_patch_attention_mask( + self, pixel_mask: Optional[torch.Tensor]) -> torch.Tensor: + if pixel_mask is None: + return None + + patches_subgrid = pixel_mask.unfold( + dimension=1, + size=self.vision_tower.config.patch_size, + step=self.vision_tower.config.patch_size, + ).unfold( + dimension=2, + size=self.vision_tower.config.patch_size, + step=self.vision_tower.config.patch_size, + ) + return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + def _process_image_input( self, image_input: AriaImagePixelInputs ) -> Tuple[torch.Tensor, torch.Tensor]: @@ -616,9 +577,18 @@ def _process_image_input( pixel_values = image_input['pixel_values'] pixel_mask = image_input['pixel_mask'] - image_feature, image_attn_mask = self.vision_tower( - pixel_values, pixel_mask=pixel_mask) - return self.multi_modal_projector(image_feature, image_attn_mask) + patch_attention_mask = self._create_patch_attention_mask(pixel_mask) + + image_outputs = self.vision_tower( + pixel_values=pixel_values, + patch_attention_mask=patch_attention_mask, + ) + image_attn_mask = 
None + if patch_attention_mask is not None: + flattened_mask = patch_attention_mask.flatten(1) + image_attn_mask = torch.logical_not(flattened_mask) + + return self.multi_modal_projector(image_outputs, image_attn_mask) def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) @@ -683,6 +653,5 @@ def sample( return next_tokens def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - loader = AutoWeightsLoader(self) loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 917b88e802071..09c5087c2dc36 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -14,12 +14,12 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - NestedTensors, PlaceholderRange) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement) + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors @@ -475,36 +475,27 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + + bos_token_id = tokenizer.bos_token_id + assert isinstance(bos_token_id, int) + + image_token_id = vocab["image"] num_image_tokens = self.info.get_num_image_tokens() + image_tokens = [image_token_id] * num_image_tokens return [ PromptReplacement( modality="image", target="", - replacement="" * num_image_tokens + "", + replacement=PromptReplacementDetails( + full=image_tokens + [bos_token_id], + features=image_tokens, + ), ) ] - def apply( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) - - # Only tokens should be considered as placeholders, - # so we ignore the trailing bos_token - result["mm_placeholders"] = { - modality: [ - PlaceholderRange(offset=p["offset"], length=p["length"] - 1) - for p in ps - ] - for modality, ps in result["mm_placeholders"].items() - } - - return result - @MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor, info=Blip2ProcessingInfo, diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index a6634204699c9..e834c9004f140 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -28,12 +28,12 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - NestedTensors, PlaceholderRange) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) from 
vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement) + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors @@ -122,8 +122,9 @@ def _apply_hf_processor_tokens_only( ) -> list[int]: # HF processor adds sep token for chat mode tokenizer = self.info.get_tokenizer() - sep_token_id: int = \ - tokenizer.vocab[tokenizer.sep_token] # type: ignore + vocab = tokenizer.get_vocab() + + sep_token_id = vocab[tokenizer.sep_token] # type: ignore return prompt_tokens + [sep_token_id] @@ -141,39 +142,27 @@ def _get_prompt_replacements( out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + + image_start_id = vocab[processor.image_start_token] + image_token_id = vocab[processor.image_token] + image_end_id = vocab[processor.image_end_token] + + num_image_tokens = self.info.get_num_image_tokens() + image_tokens = [image_token_id] * num_image_tokens return [ PromptReplacement( modality="image", - target="", - replacement="".join([ - processor.image_start_token, - processor.image_token * self.info.get_num_image_tokens(), - processor.image_end_token, - ]), + target=[image_token_id], + replacement=PromptReplacementDetails( + full=([image_start_id] + image_tokens + [image_end_id]), + features=image_tokens, + ), ) ] - def apply( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) - - # Only tokens should be considered as placeholders, - # so we ignore the image_start_token and image_end_token - result["mm_placeholders"] = { - modality: [ - PlaceholderRange(offset=p["offset"] + 1, - length=p["length"] - 2) for p in ps - ] - for modality, ps in result["mm_placeholders"].items() - } - - return result - class ChameleonLayerNorm(nn.LayerNorm): diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 7e37ce3086e6b..d5f9b4d19e5ca 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -41,7 +41,7 @@ from vllm.transformers_utils.configs import ChatGLMConfig from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP -from .utils import (is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -605,9 +605,50 @@ def forward( return IntermediateTensors({"hidden_states": hidden_states}) return hidden_states + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("linear_proj.merged_proj", "linear_proj.gate_proj", 0), + ("linear_proj.merged_proj", "linear_proj.dense_h_to_4h", 1), + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
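The chameleon replacement above switches from string concatenation to token-id lists wrapped in PromptReplacementDetails, so that only the repeated image tokens (`features`) count as placeholder positions while the start/end markers remain part of the inserted sequence (`full`). A toy offset computation under that convention, with fabricated token ids:

# fabricated ids: 5 = image_start, 7 = image token, 6 = image_end
image_start_id, image_token_id, image_end_id = 5, 7, 6
num_image_tokens = 4

image_tokens = [image_token_id] * num_image_tokens
full = [image_start_id] + image_tokens + [image_end_id]
features = image_tokens

# the placeholder span is the slice of `full` occupied by `features`
offset = full.index(features[0])
print(full, offset, len(features))  # [5, 7, 7, 7, 7, 6] 1 4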
+ if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if "rotary_pos_emb.inv_freq" in name: + continue + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={".word_embeddings": ""}, ) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -660,52 +701,9 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: - # Merge two ColumnParallelLinear into one MergedColumnParallelLinear - merged_weights_dict: Dict[str, Dict[str, Optional[torch.Tensor]]] = { - "transformer.vision.linear_proj.merged_proj.weight": { - "transformer.vision.linear_proj.gate_proj.weight": None, - "transformer.vision.linear_proj.dense_h_to_4h.weight": None, - } - } - - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() - for name, loaded_weight in weights: - is_weight_to_be_merge = False - for _, merged_weight_dict in merged_weights_dict.items(): - if name in merged_weight_dict: - assert merged_weight_dict[name] is None - merged_weight_dict[name] = loaded_weight - is_weight_to_be_merge = True - if is_weight_to_be_merge: - continue - if "rotary_pos_emb.inv_freq" in name: - continue - if "word_embeddings" in name: - name = name.replace(".word_embeddings", "") - # Skip loading extra bias for GPTQ models. 
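The hf_to_vllm_mapper added to ChatGLMBaseModel above replaces the hand-written ".word_embeddings" renaming that the deleted load_weights used to do: checkpoint names are rewritten by substring before AutoWeightsLoader resolves them. A simplified stand-in for that substring rewrite (not the actual WeightsMapper class):

def rename(name: str, orig_to_new_substr: dict[str, str]) -> str:
    # simplified version of the substring mapping shown in the diff
    for old, new in orig_to_new_substr.items():
        name = name.replace(old, new)
    return name

print(rename("transformer.embedding.word_embeddings.weight",
             {".word_embeddings": ""}))
# -> transformer.embedding.weight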
- if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - - for combined_name, merged_weight_dict in merged_weights_dict.items(): - if combined_name in params_dict: - param = params_dict[combined_name] - combined_weight = torch.cat(list(merged_weight_dict.values()), - dim=0) - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, combined_weight) - loaded_params.add(combined_name) - return loaded_params + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) class ChatGLM(ChatGLMBaseModel): @@ -726,6 +724,7 @@ class ChatGLM(ChatGLMBaseModel): class ChatGLMV(ChatGLMBaseModel, SupportsMultiModal): + packed_modules_mapping = { "query_key_value": ["query_key_value"], "dense_h_to_4h": ["dense_h_to_4h"], @@ -777,7 +776,7 @@ def __new__( ) -> None: config = vllm_config.model_config.hf_config # Initialize VL - if hasattr(config, "visual"): + if hasattr(config, "vision_config"): return ChatGLMV(vllm_config=vllm_config, prefix=prefix) # Initialize LLM else: diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 8d61ece289412..989056bf5c155 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -437,6 +437,19 @@ def load_weights(self, weights: Iterable[Tuple[str, params_dict = dict(self.named_parameters()) loaded_params: Set[str] = set() for name, loaded_weight in weights: + + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for param_name, shard_name, shard_id in stacked_params_mapping: if shard_name not in name: continue diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 3932d8b52a9d1..b2aa3c0709bd4 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -83,7 +83,7 @@ def __init__( # Define custom weight loader for dbrx model def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, - weight_name: str): + weight_name: str, param_name: str): tp_rank = get_tensor_model_parallel_rank() param_data = param.data shard_size = self.intermediate_size @@ -91,25 +91,37 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, # DBRX uses GLU for each experts. # GLU has 3 linear layers: w1, v1 and w2. 
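The kv-cache scale branch that this diff adds to several load_weights methods (commandr above, and later dbrx, exaone, gpt_j, granite, llama, mixtral, mllama, nemotron) has to accept checkpoints that store a scale either as a 0-d tensor or as a 1-element vector. A small sketch of that normalization with made-up tensors:

import torch

def normalize_scale(loaded_weight: torch.Tensor) -> torch.Tensor:
    # same pattern as the loaders above: keep 0-d scales as-is,
    # otherwise take the first element
    return loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]

print(normalize_scale(torch.tensor(0.5)))    # tensor(0.5000)
print(normalize_scale(torch.tensor([0.5])))  # tensor(0.5000)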
if weight_name.endswith("w1"): - loaded_weight = torch.reshape( - loaded_weight, - [-1, self.intermediate_size * self.tp_size, self.d_model], - ) - param_data[:, 0:shard_size, :] = loaded_weight[:, shard, :] + if param_name.endswith("weight"): + loaded_weight = torch.reshape( + loaded_weight, + [-1, self.intermediate_size * self.tp_size, self.d_model], + ) + param_data[:, 0:shard_size, :] = loaded_weight[:, shard, :] + elif param_name.endswith("weight_scale"): + param_data[:, 0] = loaded_weight + else: + param_data = loaded_weight if weight_name.endswith("v1"): - loaded_weight = torch.reshape( - loaded_weight, - [-1, self.intermediate_size * self.tp_size, self.d_model], - ) - param_data[:, - shard_size:2 * shard_size, :] = loaded_weight[:, - shard, :] + if param_name.endswith("weight"): + loaded_weight = torch.reshape( + loaded_weight, + [-1, self.intermediate_size * self.tp_size, self.d_model], + ) + param_data[:, shard_size:2 * + shard_size, :] = loaded_weight[:, shard, :] + elif param_name.endswith("weight_scale"): + param_data[:, 1] = loaded_weight + else: + param_data[:] = loaded_weight if weight_name.endswith("w2"): - loaded_weight = torch.reshape( - loaded_weight, - [-1, self.intermediate_size * self.tp_size, self.d_model], - ).transpose(1, 2) - param_data[:] = loaded_weight[:, :, shard] + if param_name.endswith("weight"): + loaded_weight = torch.reshape( + loaded_weight, + [-1, self.intermediate_size * self.tp_size, self.d_model], + ).transpose(1, 2) + param_data[:] = loaded_weight[:, :, shard] + else: + param_data[:] = loaded_weight class DbrxMoE(nn.Module): @@ -430,14 +442,28 @@ def sample( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: - expert_params_mapping = [( - "w13_weight" if weight_name in ["w1", "v1"] else "w2_weight", + "w13" if weight_name in ["w1", "v1"] else "w2", f"mlp.{weight_name}", ) for weight_name in ["w1", "v1", "w2"]] params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: Set[str] = set() + for name, loaded_weight in weights: + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + + if name.endswith(("w1", "w2", "v1")): + name = name + "_weight" for param_name, weight_name in expert_params_mapping: if weight_name not in name: continue @@ -446,8 +472,9 @@ def load_weights(self, weights: Iterable[Tuple[str, continue param = params_dict[name] weight_loader = param.weight_loader - weight_loader(param, loaded_weight, weight_name) + weight_loader(param, loaded_weight, weight_name, name) break + else: # Remapping the name of FP8 kv-scale. 
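The w1/v1 branches in the dbrx weight_loader above first reshape the flat checkpoint tensor into (num_experts, intermediate_size * tp_size, d_model) and then copy only this rank's slice of the intermediate dimension into param_data. A toy run of the same reshape-and-slice with small fabricated sizes (the contiguous per-rank slice stands in for the `shard` computed elsewhere in the file):

import torch

num_experts, intermediate, tp_size, d_model = 2, 4, 2, 3
tp_rank = 0
flat = torch.arange(num_experts * intermediate * tp_size * d_model,
                    dtype=torch.float32)

# same reshape as the "w1"/"v1" weight branch above
w = flat.reshape(-1, intermediate * tp_size, d_model)               # (2, 8, 3)
shard = slice(tp_rank * intermediate, (tp_rank + 1) * intermediate)
local = w[:, shard, :]                                              # this rank's slice
print(local.shape)  # torch.Size([2, 4, 3])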
name = maybe_remap_kv_scale_name(name, params_dict) @@ -456,6 +483,9 @@ def load_weights(self, weights: Iterable[Tuple[str, if is_pp_missing_parameter(name, self): continue + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index d83cafaf998ab..af6810a140b43 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -262,14 +262,8 @@ def __init__( mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale - # self.attn = Attention(self.num_heads, - # self.qk_head_dim, - # self.scaling, - # num_kv_heads=self.num_heads) - - # TODO, support head_size 192 self.attn = Attention(self.num_local_heads, - 256, + self.qk_head_dim, self.scaling, num_kv_heads=self.num_local_heads, cache_config=cache_config, @@ -319,18 +313,14 @@ def forward( k = torch.empty_like(q) k[..., :self.qk_nope_head_dim] = k_nope k[..., self.qk_nope_head_dim:] = k_pe - q = torch.nn.functional.pad(q, [0, 256 - self.qk_head_dim], - value=0).view(-1, - self.num_local_heads * 256) - k = torch.nn.functional.pad(k, [0, 256 - self.qk_head_dim], - value=0).view(-1, - self.num_local_heads * 256) - v = torch.nn.functional.pad(v, [0, 256 - self.v_head_dim], - value=0).view(-1, - self.num_local_heads * 256) + # padding value to qk_head_dim for alignment + v = torch.nn.functional.pad( + v, [0, self.qk_head_dim - self.v_head_dim], + value=0).view(-1, self.num_local_heads * self.qk_head_dim) attn_output = self.attn(q, k, v, kv_cache, attn_metadata) attn_output = attn_output.view( - -1, self.num_local_heads, 256)[..., :self.v_head_dim].reshape( + -1, self.num_local_heads, + self.qk_head_dim)[..., :self.v_head_dim].reshape( -1, self.num_local_heads * self.v_head_dim) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py index ca79b14c55fea..0b44f0d062c40 100644 --- a/vllm/model_executor/models/deepseek_v3.py +++ b/vllm/model_executor/models/deepseek_v3.py @@ -269,14 +269,8 @@ def __init__( mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale - # self.attn = Attention(self.num_heads, - # self.qk_head_dim, - # self.scaling, - # num_kv_heads=self.num_heads) - - # TODO, support head_size 192 self.attn = Attention(self.num_local_heads, - 256, + self.qk_head_dim, self.scaling, num_kv_heads=self.num_local_heads, cache_config=cache_config, @@ -326,18 +320,14 @@ def forward( k = torch.empty_like(q) k[..., :self.qk_nope_head_dim] = k_nope k[..., self.qk_nope_head_dim:] = k_pe - q = torch.nn.functional.pad(q, [0, 256 - self.qk_head_dim], - value=0).view(-1, - self.num_local_heads * 256) - k = torch.nn.functional.pad(k, [0, 256 - self.qk_head_dim], - value=0).view(-1, - self.num_local_heads * 256) - v = torch.nn.functional.pad(v, [0, 256 - self.v_head_dim], - value=0).view(-1, - self.num_local_heads * 256) + # padding value to qk_head_dim for alignment + v = torch.nn.functional.pad( + v, [0, self.qk_head_dim - self.v_head_dim], + value=0).view(-1, self.num_local_heads * self.qk_head_dim) attn_output = self.attn(q, k, v, kv_cache, attn_metadata) attn_output = attn_output.view( - -1, self.num_local_heads, 256)[..., :self.v_head_dim].reshape( + -1, self.num_local_heads, + 
self.qk_head_dim)[..., :self.v_head_dim].reshape( -1, self.num_local_heads * self.v_head_dim) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 99fa941c055d2..344832d8b33e6 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -1,7 +1,7 @@ # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py """Inference-only Deepseek-VL2 model compatible with HuggingFace weights.""" import math -from functools import cached_property, partial +from functools import cached_property from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -9,7 +9,7 @@ import torch.nn as nn import torch.nn.functional as F from einops import rearrange, repeat -from transformers import AutoProcessor, BatchFeature, ProcessorMixin +from transformers import BatchFeature from vllm.attention import AttentionMetadata from vllm.config import VllmConfig @@ -31,6 +31,8 @@ from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config, MlpProjectorConfig, VisionEncoderConfig) +from vllm.transformers_utils.processors.deepseek_vl2 import ( + DeepseekVLV2Processor) from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP @@ -129,25 +131,8 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(DeepseekVLV2Config) - def get_hf_processor(self) -> ProcessorMixin: - # TODO(Isotr0py): we should get rid of dependency on deepseek_vl2 - # in the future, because it's flasky and lack of maintenance. - try: - from deepseek_vl2.models.processing_deepseek_vl_v2 import ( - DeepseekVLV2Processor, select_best_resolution) - AutoProcessor.register("DeepseekVLV2Processor", - DeepseekVLV2Processor) - except ModuleNotFoundError as exc: - raise ModuleNotFoundError( - "You need to `pip install " - "git+https://github.com/deepseek-ai/DeepSeek-VL2.git` " - "to use this model") from exc - - processor = self.ctx.get_hf_processor(DeepseekVLV2Processor) - processor.select_best_resolution = partial( - select_best_resolution, - candidate_resolutions=processor.candidate_resolutions) - return processor + def get_hf_processor(self) -> DeepseekVLV2Processor: + return self.ctx.get_hf_processor(DeepseekVLV2Processor) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} @@ -224,31 +209,21 @@ def _call_hf_processor( mm_kwargs: Mapping[str, object], ) -> BatchFeature: if mm_data: - outputs = self.info.ctx.call_hf_processor( + processed_outputs = self.info.ctx.call_hf_processor( self.info.get_hf_processor(**mm_kwargs), dict(prompt=prompt, **mm_data), mm_kwargs, ) - - # Deepseek-vl2 processor don't return BatchFeature, - # we need to manually create it - processed_outputs = dict(input_ids=outputs["input_ids"]) - processed_outputs = BatchFeature(data=dict(processed_outputs), - tensor_type="pt") - - # Remove batch dimension from processor outputs, - # because we will try batch to create NestedTensors target_dtype = self.info.ctx.model_config.dtype - pixel_values = outputs["images"].to(target_dtype).squeeze(0) - images_spatial_crop = outputs["images_spatial_crop"].squeeze(0) + pixel_values = processed_outputs.pop("pixel_values").to( + target_dtype) + # split pixel values into patches corresponding to each image + images_spatial_crop = 
processed_outputs["images_spatial_crop"] patches_per_image = [ x.prod().item() + 1 for x in images_spatial_crop ] - - # Rename `images` -> `pixel_values` to avoid confusion - processed_outputs["pixel_values"] = list( - pixel_values.split(patches_per_image)) - processed_outputs["images_spatial_crop"] = images_spatial_crop + pixel_values = pixel_values.split(patches_per_image) + processed_outputs["pixel_values"] = pixel_values else: tokenizer = self.info.get_tokenizer() processed_outputs = tokenizer(prompt, @@ -274,8 +249,10 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self.info.get_hf_processor() - image_token_id: int = hf_processor.image_token_id + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + image_token_id = hf_processor.image_token_id + assert isinstance(image_token_id, int) def get_replacement_deepseek_vl2(item_idx: int): images = mm_items.get_items( @@ -356,13 +333,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): f"Only 2D tile_tag is supported currently, got: {self.tile_tag}" ) + if self.text_config.topk_method == "noaux_tc": + architectures = ["DeepseekV3ForCausalLM"] + elif not self.text_config.use_mla: + architectures = ["DeepseekForCausalLM"] + else: + architectures = ["DeepseekV2ForCausalLM"] + self.language_model = init_vllm_registered_model( vllm_config=vllm_config, hf_config=self.text_config, prefix=maybe_prefix(prefix, "language"), - architectures=["DeepseekV3ForCausalLM"] - if self.text_config.topk_method == "noaux_tc" else - ["DeepseekV2ForCausalLM"], + architectures=architectures, ) self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index ad15f835b1609..eab3bf0756fca 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -39,8 +39,6 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -439,6 +437,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = config self.lora_config = lora_config + self.quant_config = quant_config self.transformer = ExaoneModel( vllm_config=vllm_config, @@ -532,12 +531,14 @@ def load_weights(self, weights: Iterable[Tuple[str, # processed with quantization, LoRA, fine-tuning, etc. 
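The deepseek_vl2 processing change above splits the processor's stacked pixel_values back into per-image groups: each image contributes width_crops * height_crops local tiles plus one global view, which is exactly what `x.prod().item() + 1` counts. A toy version with fabricated crop grids and image sizes:

import torch

# two images: one cropped 2x2, one cropped 1x3 (fabricated numbers)
images_spatial_crop = torch.tensor([[2, 2], [1, 3]])
patches_per_image = [x.prod().item() + 1 for x in images_spatial_crop]  # [5, 4]

pixel_values = torch.randn(sum(patches_per_image), 3, 384, 384)
per_image = pixel_values.split(patches_per_image)
print([p.shape[0] for p in per_image])  # [5, 4]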
if self.config.tie_word_embeddings and "lm_head.weight" in name: continue - if scale_name := get_compressed_tensors_cache_scale(name): - # Loading kv cache scales for compressed-tensors quantization + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales param = params_dict[scale_name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py new file mode 100644 index 0000000000000..b93a68680375d --- /dev/null +++ b/vllm/model_executor/models/fairseq2_llama.py @@ -0,0 +1,151 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Meta Platforms, Inc. and affiliates. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Llama model for fairseq2 weights.""" + +from typing import Iterable, Set, Tuple + +import torch +from torch.nn import Parameter + +from vllm.config import VllmConfig +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.linear import set_weight_attrs +from vllm.model_executor.models.llama import LlamaForCausalLM + +from .utils import AutoWeightsLoader, WeightsMapper + + +class Fairseq2LlamaForCausalLM(LlamaForCausalLM): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + # For the model loader to read only the relevant checkpoint files + self.allow_patterns_overrides = [ + # either the full checkpoint + "model.pt", + # or the tp-sharded checkpoint of the current rank + f"model.{self.tp_rank}.pt", + ] + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + # fairseq2's serialization adds a wrapper to usual .pt state_dict's: + # { "model_key": my_model_name, "my_model_name": state_dict } + # which we first need to unpack + weights_wrapped = dict(weights) + weights = weights_wrapped[ + weights_wrapped["model_key"]].items() # type: ignore + + # remap keys + fs2_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "decoder_frontend.embed.": "model.embed_tokens.", + "decoder.": "model.", + "final_proj.": "lm_head.", + }, + orig_to_new_substr={ + ".self_attn_layer_norm.": ".input_layernorm.", + ".ffn_layer_norm.": ".post_attention_layernorm.", + ".self_attn.output_proj.": ".self_attn.o_proj.", + ".ffn.gate_proj.": ".mlp.gate_proj.", + ".ffn.inner_proj.": ".mlp.up_proj.", + ".ffn.output_proj.": ".mlp.down_proj.", + ".layer_norm.": ".norm.", + }, + ) + weights = fs2_to_vllm_mapper.apply(weights) + + params = dict(self.named_parameters()) + + loader = AutoWeightsLoader( + self, + 
skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + ) + return loader.load_weights( + (self.reshape_fairseq2_weights(name, loaded_weight, params) + for name, loaded_weight in weights)) + + def flag_sharded_weights(self, params: dict[str, Parameter]): + """Sets the `is_sharded_weight` flag to True for all sharded weights""" + for name, param in params.items(): + modules = name.split(".") + if "norm" in name and len(param.size()) < 2: + # layer norms are not sharded + continue + elif any(emb in modules for emb in ["embed_tokens", "lm_head"]): + # for now we repeat embedding layers for compatibility + continue + else: + # all other layers are sharded + set_weight_attrs(param, {"is_sharded_weight": True}) + + def reshape_fairseq2_weights( + self, + name: str, + loaded_weight: torch.Tensor, + params: dict[str, Parameter], + ) -> Tuple[str, torch.Tensor]: + """Reshape fairseq2's weights.""" + + def permute(w: torch.Tensor, n_heads: int) -> torch.Tensor: + attn_in = self.config.head_dim * n_heads + # check for a sharded weight on dim 0 + if attn_in // self.tp_size == w.size()[0]: + attn_in //= self.tp_size + n_heads //= self.tp_size + attn_out = self.config.hidden_size + return (w.view(n_heads, attn_in // n_heads // 2, 2, + attn_out).transpose(1, + 2).reshape(attn_in, attn_out)) + + modules = name.split(".") + + # rotary embeds should be sliced + if "k_proj" in modules: + loaded_weight = permute(loaded_weight, + self.config.num_key_value_heads) + + elif "q_proj" in modules: + loaded_weight = permute(loaded_weight, + self.config.num_attention_heads) + + # We make the loaded weights compatible with both + # full checkpoints and tp sharded checkpoints. + # Embeddings are repeated to fit the vocab size. + # Other weights are flagged for the weight_loader calls. + if any(emb in modules for emb in ["embed_tokens", "lm_head"]): + # Embeddings are sharded on dim 0 + dim = 0 + # In fairseq2, vocab size has to be divisible by tp_size + # so we don't worry about padding + if self.tp_size > 1 and loaded_weight.shape[ + dim] < self.config.vocab_size: + assert loaded_weight.shape[ + dim] * self.tp_size == self.config.vocab_size, \ + "vocab_size should be divisible by tp_size." 
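The permute() helper in reshape_fairseq2_weights above converts fairseq2's interleaved rotary layout for the q/k projection weights into the half-split layout expected here; numerically it is only a view/transpose/reshape. A standalone toy run with small fabricated dimensions (head_dim is passed explicitly instead of being read from the config, and the tensor-parallel check is dropped):

import torch

def permute(w: torch.Tensor, n_heads: int, head_dim: int) -> torch.Tensor:
    # same view/transpose/reshape as the helper above, minus the TP handling
    attn_in, attn_out = head_dim * n_heads, w.size(1)
    return (w.view(n_heads, attn_in // n_heads // 2, 2, attn_out)
             .transpose(1, 2)
             .reshape(attn_in, attn_out))

w = torch.arange(2 * 4 * 6, dtype=torch.float32).reshape(8, 6)  # 2 heads, head_dim 4
print(permute(w, n_heads=2, head_dim=4).shape)  # torch.Size([8, 6])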
+ repeats = [1] * len(loaded_weight.size()) + repeats[dim] = self.tp_size + # repeat to match vocab size and to be easily 'narrow'able + loaded_weight = loaded_weight.repeat(repeats) + set_weight_attrs(params[name], {"is_sharded_weight": False}) + # if embeddings are sharded, the rest is too + if "embed_tokens" in modules: + self.flag_sharded_weights(params) + + return name, loaded_weight diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 63e7147f84e03..dbf9da50cc9de 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -16,7 +16,7 @@ """ PyTorch Fuyu model.""" import math from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) + TypedDict) import torch import torch.nn as nn @@ -30,13 +30,13 @@ from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - NestedTensors, PlaceholderRange) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement) + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors @@ -183,7 +183,9 @@ def _apply_hf_processor_tokens_only( ) -> list[int]: # HF processor adds boa_token_id tokenizer = self.info.get_tokenizer() - boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore + vocab = tokenizer.get_vocab() + + boa_token_id = vocab["<0x04>"] return prompt_tokens + [boa_token_id] @@ -202,6 +204,7 @@ def _get_prompt_replacements( ) -> list[PromptReplacement]: hf_config = self.info.get_hf_config() bos_token_id = hf_config.bos_token_id + assert isinstance(bos_token_id, int) tokenizer = self.info.get_tokenizer() eot_token_id = tokenizer.bos_token_id @@ -215,9 +218,13 @@ def get_replacement_fuyu(item_idx: int): image_width=image_size.width, image_height=image_size.height, ) + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + + [_NEWLINE_TOKEN_ID]) * nrows - return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows + - [bos_token_id]) + return PromptReplacementDetails( + full=image_tokens + [bos_token_id], + features=image_tokens, + ) return [ PromptReplacement( @@ -227,26 +234,6 @@ def get_replacement_fuyu(item_idx: int): ) ] - def apply( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) - - # Only |SPEAKER| (image) tokens should be considered as placeholders, - # so we ignore the trailing bos_token_id - result["mm_placeholders"] = { - modality: [ - PlaceholderRange(offset=p["offset"], length=p["length"] - 1) - for p in ps - ] - for modality, ps in result["mm_placeholders"].items() - } - - return result - @MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor, info=FuyuProcessingInfo, diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 698b9a5b6b1d6..f0dc7693974be 100644 --- a/vllm/model_executor/models/gemma2.py +++ 
b/vllm/model_executor/models/gemma2.py @@ -31,8 +31,6 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -254,6 +252,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config + self.quant_config = quant_config self.embed_tokens = VocabParallelEmbedding( config.vocab_size, @@ -329,7 +328,8 @@ def load_weights(self, weights: Iterable[Tuple[str, params_dict = dict(self.named_parameters()) loaded_params: Set[str] = set() for name, loaded_weight in weights: - if scale_name := get_compressed_tensors_cache_scale(name): + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): # Loading kv cache scales for compressed-tensors quantization param = params_dict[scale_name] weight_loader = getattr(param, "weight_loader", diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py index 39a5736eb199b..51922e6f2d03d 100644 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -42,7 +42,8 @@ def forward(self, images: torch.Tensor) -> torch.Tensor: torch.Tensor Transformed tensor with shape (B, L, D) """ - images = images.to(self.proj.weight.device) + images = images.to(device=self.proj.weight.device, + dtype=self.proj.weight.dtype) x = self.proj(images) x = x.flatten(2).transpose(1, 2) cls_token = self.cls_embedding.expand(x.shape[0], -1, -1) diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 4829578a56959..08298cc0db36f 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -313,6 +313,19 @@ def load_weights(self, weights: Iterable[Tuple[str, for name, loaded_weight in weights: if "attn.bias" in name or "attn.masked_bias" in name: continue + + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 3e95926fd1e22..ddd2d7a16b242 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -39,8 +39,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from 
vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -371,6 +369,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = config self.lora_config = lora_config + self.quant_config = quant_config self.model = GraniteModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) @@ -474,12 +473,14 @@ def load_weights(self, weights: Iterable[Tuple[str, # processed with quantization, LoRA, fine-tuning, etc. if self.config.tie_word_embeddings and "lm_head.weight" in name: continue - if scale_name := get_compressed_tensors_cache_scale(name): - # Loading kv cache scales for compressed-tensors quantization + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales param = params_dict[scale_name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 4c353ae6ffc13..37b91a803d71e 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -3,7 +3,6 @@ import torch import torch.nn as nn -from transformers import PretrainedConfig from typing_extensions import TypeIs, TypeVar from vllm.logger import init_logger @@ -19,9 +18,6 @@ logger = init_logger(__name__) -# The type of HF config -C_co = TypeVar("C_co", bound=PretrainedConfig, covariant=True) - # The type of hidden states # Currently, T = torch.Tensor for all models except for Medusa # which has T = List[torch.Tensor] @@ -34,7 +30,7 @@ @runtime_checkable -class VllmModel(Protocol[C_co, T_co]): +class VllmModel(Protocol[T_co]): """The interface required for all models in vLLM.""" def __init__( @@ -97,7 +93,7 @@ def is_vllm_model( @runtime_checkable -class VllmModelForTextGeneration(VllmModel[C_co, T], Protocol[C_co, T]): +class VllmModelForTextGeneration(VllmModel[T], Protocol[T]): """The interface required for all generative models in vLLM.""" def compute_logits( @@ -143,7 +139,7 @@ def is_text_generation_model( @runtime_checkable -class VllmModelForPooling(VllmModel[C_co, T], Protocol[C_co, T]): +class VllmModelForPooling(VllmModel[T], Protocol[T]): """The interface required for all pooling models in vLLM.""" def pooler( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index e8732c57fad49..a5bd418801f2c 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -38,8 +38,6 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -107,9 +105,9 @@ def __init__(self, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, + bias_o_proj: bool = False, cache_config: Optional[CacheConfig] = None, - prefix: str = "", - bias_o_proj: bool = False) -> None: + prefix: str = "") -> None: super().__init__() 
layer_idx = extract_layer_index(prefix) self.hidden_size = hidden_size @@ -306,6 +304,7 @@ def __init__(self, lora_config = vllm_config.lora_config self.config = config + self.quant_config = quant_config self.padding_idx = config.pad_token_id lora_vocab = (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) if lora_config else 0 @@ -396,12 +395,14 @@ def load_weights(self, weights: Iterable[Tuple[str, # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - if scale_name := get_compressed_tensors_cache_scale(name): - # Loading kv cache scales for compressed-tensors quantization + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales param = params_dict[scale_name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 722fff98d5c19..296af2aac5660 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -5,9 +5,11 @@ import torch import torch.nn as nn +from packaging.version import Version from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, PretrainedConfig, SiglipVisionConfig) +from transformers import __version__ as TRANSFORMERS_VERSION from transformers.models.llava import LlavaProcessor from transformers.models.pixtral import PixtralProcessor @@ -22,7 +24,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, + MultiModalInputs, MultiModalKwargs, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) @@ -313,13 +315,14 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: + processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_config = self.info.get_hf_config() - image_token_id = hf_config.image_token_index + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() - processor = self.info.get_hf_processor() - image_token = processor.image_token - image_break_token = processor.image_break_token - image_end_token = processor.image_end_token + image_break_id = vocab[processor.image_break_token] + image_token_id = hf_config.image_token_index + image_end_id = vocab[processor.image_end_token] vision_config = hf_config.vision_config assert isinstance(vision_config, PixtralVisionConfig) @@ -334,10 +337,10 @@ def get_replacement(item_idx: int): image_height=image_size.height, ) - tokens = ([image_token] * ncols + [image_break_token]) * nrows - tokens[-1] = image_end_token + tokens = ([image_token_id] * ncols + [image_break_id]) * nrows + tokens[-1] = image_end_id - return "".join(tokens) + return tokens return [ PromptReplacement( @@ -716,6 +719,27 @@ def load_weights(self, weights: Iterable[Tuple[str, return loader.load_weights(weights) +class MantisProcessingInfo(LlavaProcessingInfo): + + def get_hf_processor(self): + hf_config = self.get_hf_config() + vision_info = self.get_vision_encoder_info() + + if 
Version(TRANSFORMERS_VERSION) < Version("4.48"): + # BUG: num_additional_image_tokens = 0 but treated as 1, + # so we set vision_feature_select_strategy to None to offset this + vision_feature_select_strategy = None + else: + # FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150 + vision_feature_select_strategy = hf_config.vision_feature_select_strategy # noqa: E501 + + return self.ctx.get_hf_processor( + LlavaProcessor, + patch_size=vision_info.get_patch_size(), + vision_feature_select_strategy=vision_feature_select_strategy, + ) + + class MantisMultiModalProcessor(LlavaMultiModalProcessor): def apply( @@ -723,7 +747,7 @@ def apply( prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index @@ -782,7 +806,7 @@ def get_replacement_mantis(item_idx: int): for modality, placeholders in mm_placeholders.items() } - return MultiModalInputsV2( + return MultiModalInputs( type="multimodal", prompt=prompt, prompt_token_ids=prompt_ids, @@ -794,7 +818,7 @@ def get_replacement_mantis(item_idx: int): # To use this model, please use # `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` @MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor, - info=LlavaProcessingInfo, + info=MantisProcessingInfo, dummy_inputs=LlavaDummyInputsBuilder) class MantisForConditionalGeneration(LlavaForConditionalGeneration): pass diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 78a47e64d9afc..5b0f35b08646b 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -19,8 +19,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) -from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems, - VideoProcessorItems) +from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, + VideoEmbeddingItems, VideoProcessorItems) from vllm.multimodal.processing import PromptReplacement from vllm.multimodal.profiling import ProcessorInputs from vllm.sequence import IntermediateTensors @@ -145,6 +145,10 @@ def _get_num_unpadded_features( return (unpadded_features, newline_features) + def get_image_size_with_most_features(self) -> ImageSize: + # NOTE: This hardcoded value is found via processor tests + return ImageSize(width=1153, height=944) + def _get_num_frame_tokens( self, *, @@ -550,10 +554,12 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: # Preserve the order of modalities if there are multiple of them # from the order of kwargs. 
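The MantisProcessingInfo override above gates its workaround on the installed transformers release using packaging.version, which compares version strings correctly (unlike plain string comparison). The same check in isolation:

from packaging.version import Version
import transformers

# older transformers releases need the vision_feature_select_strategy workaround
needs_workaround = Version(transformers.__version__) < Version("4.48")
print(needs_workaround)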
for input_key in kwargs: - if input_key == "pixel_values" and "images" not in modalities: + if input_key in ("pixel_values", + "image_embeds") and "images" not in modalities: modalities["images"] = self._parse_and_validate_image_input( **kwargs) - if input_key == "pixel_values_videos" and "videos" not in modalities: # noqa E501 + if input_key in ("pixel_values_videos", + "video_embeds") and "videos" not in modalities: modalities["videos"] = self._parse_and_validate_video_input( **kwargs) @@ -810,7 +816,7 @@ def apply_pooling(self, image_features, stride=2): return image_feature def get_multimodal_embeddings( - self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]: + self, **kwargs) -> Optional[tuple[torch.Tensor, ...]]: modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: return None @@ -836,8 +842,7 @@ def get_multimodal_embeddings( def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[List[Tuple[NestedTensors, - str]]] = None, + multimodal_embeddings: Optional[tuple[torch.Tensor, ...]] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: @@ -846,6 +851,34 @@ def get_input_embeddings( [self.config.image_token_index, self.config.video_token_index]) return inputs_embeds + def get_input_embeddings_v0( + self, + input_ids: torch.Tensor, + image_input: Optional[NestedTensors] = None, + video_input: Optional[NestedTensors] = None, + ) -> torch.Tensor: + + inputs_embeds = self.get_input_embeddings(input_ids) + if image_input is not None: + image_embeds = self._process_image_input(image_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + image_embeds, + placeholder_token_id=self.config.image_token_index, + ) + + if video_input is not None: + video_embeds = self._process_video_pixels(video_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + video_embeds, + placeholder_token_id=self.config.video_token_index, + ) + + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -865,13 +898,21 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. + # NOTE: In v1, inputs_embeds is always generated at model runner from + # `get_multimodal_embeddings` and `get_input_embeddings`, this + # condition is only for v0 compatibility. 
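get_input_embeddings_v0 above relies on merge_multimodal_embeddings to scatter the projected image/video features into the slots occupied by their placeholder token ids. A simplified toy version of that scatter (not the vLLM helper itself; 32000 is a made-up placeholder id):

import torch

def toy_merge(input_ids: torch.Tensor, inputs_embeds: torch.Tensor,
              mm_embeds: torch.Tensor, placeholder_id: int) -> torch.Tensor:
    # overwrite embeddings at placeholder positions with multimodal features
    mask = input_ids == placeholder_id
    assert mask.sum() == mm_embeds.shape[0]
    out = inputs_embeds.clone()
    out[mask] = mm_embeds
    return out

ids = torch.tensor([1, 32000, 32000, 2])
embeds = torch.zeros(4, 8)
img = torch.ones(2, 8)
print(toy_merge(ids, embeds, img, 32000)[:, 0])  # tensor([0., 1., 1., 0.])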
elif inputs_embeds is None: - multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - multimodal_embeddings) - input_ids = None + image_input = self._parse_and_validate_image_input(**kwargs) + video_input = self._parse_and_validate_video_input(**kwargs) + + if image_input is None and video_input is None: + inputs_embeds = None + else: + inputs_embeds = self.get_input_embeddings_v0( + input_ids, + image_input=image_input, + video_input=video_input) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index a5b364fe5ec85..da415cdae96ed 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -347,6 +347,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config + self.quant_config = quant_config self.model = MixtralModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) @@ -428,6 +429,18 @@ def load_weights(self, weights: Iterable[Tuple[str, if "rotary_emb.inv_freq" in name: continue + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index b2368ffff5412..2554281610a30 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1116,6 +1116,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config + self.quant_config = quant_config self.vocab_size = config.text_config.vocab_size self.hidden_size = config.text_config.hidden_size self.max_num_tiles = config.vision_config.max_num_tiles @@ -1429,6 +1430,17 @@ def load_weights(self, weights: Iterable[Tuple[str, name = name.replace('patch_embedding.weight', 'patch_embedding._linear.weight') loaded_weight = loaded_weight.view(loaded_weight.shape[0], -1) + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + updated_params.add(scale_name) + continue for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 8cc62d5c803cc..2340283b69665 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -405,6 +405,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = config self.lora_config = lora_config + self.quant_config = quant_config self.model = NemotronModel(vllm_config=vllm_config, 
prefix=maybe_prefix(prefix, "model")) @@ -489,6 +490,17 @@ def load_weights(self, weights: Iterable[Tuple[str, # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index f9ad0c67adaba..5a28b1ffbb7b4 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -136,6 +136,17 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: @INPUT_REGISTRY.register_input_processor(input_processor_for_paligemma) class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 7a230e5beb367..0fcda81da2800 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -30,15 +30,19 @@ VocabParallelEmbedding) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - NestedTensors, PlaceholderRange) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) +# yapf conflicts with isort for this block +# yapf: disable from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, BoundPromptReplacement, - PlaceholderInfo, PromptReplacement) + PlaceholderFeaturesInfo, + PromptReplacement, + PromptReplacementDetails) +# yapf: enable from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -437,7 +441,12 @@ def get_replacement_phi3v(item_idx: int): processor=hf_processor, ) - return [_IMAGE_TOKEN_ID] * num_image_tokens + [bos_token_id] + image_tokens = [_IMAGE_TOKEN_ID] * num_image_tokens + + return PromptReplacementDetails( + full=image_tokens + [bos_token_id], + features=image_tokens, + ) num_images = mm_items.get_count("image", strict=False) @@ -454,7 +463,7 @@ def _apply_prompt_replacements( token_ids: list[int], mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], mm_item_counts: Mapping[str, int], - ) -> tuple[list[int], str, Mapping[str, list[PlaceholderInfo]]]: + ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]: token_ids, text, placeholders = super()._apply_prompt_replacements( token_ids=token_ids, mm_prompt_repls=mm_prompt_repls, @@ -467,11 +476,11 @@ def _apply_prompt_replacements( token_ids = [token_ids[0], *token_ids[2:]] placeholders = { modality: [ - 
PlaceholderInfo( + PlaceholderFeaturesInfo( modality=p.modality, item_idx=p.item_idx, start_idx=p.start_idx - 1, - replacement=p.replacement, + tokens=p.tokens, ) for p in ps ] for modality, ps in placeholders.items() @@ -479,26 +488,6 @@ def _apply_prompt_replacements( return token_ids, text, placeholders - def apply( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) - - # Only <|image|> tokens should be considered as placeholders, - # so we ignore the trailing bos_token_id - result["mm_placeholders"] = { - modality: [ - PlaceholderRange(offset=p["offset"], length=p["length"] - 1) - for p in ps - ] - for modality, ps in result["mm_placeholders"].items() - } - - return result - @MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor, info=Phi3VProcessingInfo, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 1febd62f2f705..881c09ea9db99 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -546,6 +546,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config + self.quant_config = vllm_config.quant_config self.model = PhiMoEModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) @@ -623,6 +624,18 @@ def load_weights(self, weights: Iterable[Tuple[str, if "rotary_emb.inv_freq" in name: continue + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 0a99c87470850..82de1c3574090 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -256,7 +256,15 @@ def forward( return hidden_states, residual -@support_torch_compile +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl, + # otherwise (seq_len, ). 
+ "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + }) class Qwen2Model(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -279,6 +287,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): )) self.config = config + self.quant_config = quant_config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -364,6 +373,17 @@ def load_weights(self, weights: Iterable[Tuple[str, for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 0dff9595c6c08..fc5aed5c94abb 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -41,7 +41,8 @@ from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement) + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors @@ -153,29 +154,24 @@ def _call_hf_processor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, Any], ) -> BatchFeature: - mm_data = dict(mm_data) - audios = mm_data.pop("audios", []) - - if audios: - mm_data["audios"] = audios - - feature_extractor = self.info.get_feature_extractor(**mm_kwargs) - mm_kwargs = dict( - **mm_kwargs, - sampling_rate=feature_extractor.sampling_rate, - ) - else: - # NOTE: WhisperFeatureExtractor cannot handle empty list of audios - pass + # Text-only input not supported in composite processor + if not mm_data or not mm_data.get("audios", []): + prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + feature_extractor = self.info.get_feature_extractor(**mm_kwargs) + mm_kwargs = dict( + **mm_kwargs, + sampling_rate=feature_extractor.sampling_rate, + ) - processed_outputs = super()._call_hf_processor( + return super()._call_hf_processor( prompt=prompt, mm_data=mm_data, mm_kwargs=mm_kwargs, ) - return processed_outputs - def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -192,8 +188,20 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self.info.get_hf_config() - placeholder = hf_config.audio_token_index + processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + + # Use getattr with default to be compatible with transformers<4.48 + audio_token = getattr(processor, "audio_token", "<|AUDIO|>") + audio_bos_token = getattr(processor, "audio_bos_token", + "<|audio_bos|>") + audio_eos_token = getattr(processor, "audio_eos_token", + "<|audio_eos|>") + + 
audio_token_id = vocab[audio_token] + audio_bos_id = vocab[audio_bos_token] + audio_eos_id = vocab[audio_eos_token] feature_attention_mask = out_mm_kwargs.get("feature_attention_mask") if feature_attention_mask is None: @@ -206,20 +214,25 @@ def _get_prompt_replacements( audio_output_lengths = audio_output_lens.tolist() def get_replacement_qwen2_audio(item_idx: int): - num_placeholders = audio_output_lengths[item_idx] - if num_placeholders == 0: + num_features = audio_output_lengths[item_idx] + if num_features == 0: audios = mm_items.get_items("audio", AudioProcessorItems) audio = audios.get(item_idx) raise ValueError( f"The audio {audio} (len={len(audio)}) is too short " "to be represented inside the model") - return [placeholder] * num_placeholders + audio_tokens = [audio_token_id] * num_features + + return PromptReplacementDetails( + full=[audio_bos_id] + audio_tokens + [audio_eos_id], + features=audio_tokens, + ) return [ PromptReplacement( modality="audio", - target=[placeholder], + target=audio_token, replacement=get_replacement_qwen2_audio, ) ] diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 988d682d36be3..593ce4857af0f 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -12,7 +12,7 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.layers.pooler import Pooler, PoolingType, SimplePooler from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput @@ -32,7 +32,7 @@ def forward(self, input): return self.activation(input) -class Qwen2ForRewardModel(nn.Module, SupportsLoRA, SupportsPP): +class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -60,7 +60,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config - pooler_config = vllm_config.model_config.pooler_config self.config = config self.lora_config = lora_config @@ -74,14 +73,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.hidden_size, quant_config=quant_config), ReLU(), - RowParallelLinear(config.hidden_size, 1, + RowParallelLinear(config.hidden_size, + config.num_labels, quant_config=quant_config), ) - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.ALL, - normalize=False, - softmax=False) + self._pooler: SimplePooler self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -115,3 +111,31 @@ def load_weights(self, weights: Iterable[Tuple[str, loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["lm_head."]) return loader.load_weights(weights) + + +class Qwen2ForRewardModel(Qwen2RewardBaseModel): + + def __init__(self, *, vllm_config, prefix=""): + vllm_config.model_config.hf_config.num_labels = 1 + super().__init__(vllm_config=vllm_config, prefix=prefix) + pooler_config = vllm_config.model_config.pooler_config + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.ALL, + normalize=False, + softmax=False) + + +class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel): + + def __init__(self, *, vllm_config, prefix=""): + 
vllm_config.model_config.hf_config.num_labels = 2 + super().__init__(vllm_config=vllm_config, prefix=prefix) + pooler_config = vllm_config.model_config.pooler_config + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.STEP, + normalize=False, + softmax=True, + step_tag_id=151651, + ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index d00e5d362c8bc..a2778ee73810e 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -55,7 +55,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (ImageItem, ModalityData, MultiModalFieldConfig, MultiModalKwargs, - NestedTensors, VideoItem) + VideoItem) from vllm.multimodal.parse import (ImageSize, ModalityDataItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -67,11 +67,15 @@ from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, - init_vllm_registered_model, maybe_prefix) + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) from .vision import get_vit_attn_backend logger = init_logger(__name__) +# For profile run +_MAX_FRAMES_PER_VIDEO = 16 + # === Vision Inputs === # @@ -135,7 +139,7 @@ class Qwen2VLVideoEmbeddingInputs(TypedDict): - List[`torch.Tensor`]: A list of tensors holding all videos' features. Each tensor holds an video's features. - `torch.Tensor`: A tensor holding all videos' features - (concatenation of all videos' feature tensors). + (concatenation of all videos' feature tensors). Tensor shape: `(num_image_features, hidden_size)` - `num_image_features` varies based on @@ -611,6 +615,7 @@ def forward( # adapter x = self.merger(x) + return x def load_weights(self, weights: Iterable[Tuple[str, @@ -874,8 +879,8 @@ def get_num_frames_with_most_features(self, seq_len: int) -> int: max_image_tokens = self.get_max_image_tokens() * max_images max_total_frames = self._get_max_video_frames(seq_len - max_image_tokens) - - num_frames = max(max_total_frames // max(max_videos, 1), 1) + num_frames = min(max(max_total_frames // max(max_videos, 1), 1), + _MAX_FRAMES_PER_VIDEO) # Temporary workaround for https://github.com/huggingface/transformers/issues/35412 if num_frames > 1 and num_frames % 2 == 1: @@ -948,26 +953,29 @@ def _get_prompt_replacements( hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_processor = self.info.get_image_processor( **hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has # image_token and video_token registered placeholder = { - "image": hf_processor.image_token, - "video": hf_processor.video_token, + "image": vocab[hf_processor.image_token], + "video": vocab[hf_processor.video_token], } + merge_length = image_processor.merge_size**2 def get_replacement_qwen2vl(item_idx: int, modality: str): grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx] assert isinstance(grid_thw, torch.Tensor) - num_tokens = grid_thw.prod() // merge_length - return placeholder[modality] * num_tokens + num_tokens = int(grid_thw.prod()) // merge_length + return [placeholder[modality]] * num_tokens return [ PromptReplacement( modality=modality, - target=placeholder[modality], + target=[placeholder[modality]], replacement=partial(get_replacement_qwen2vl, modality=modality), ) for modality in ("image", "video") @@ 
-1047,11 +1055,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config: Qwen2VLConfig = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config - assert not cache_config.enable_prefix_caching, \ - "Qwen2-VL currently does not support prefix caching" self.config = config self.multimodal_config = multimodal_config @@ -1173,85 +1178,122 @@ def _parse_and_validate_video_input( video_embeds=video_embeds, video_grid_thw=video_grid_thw) - def _process_image_input(self, - image_input: Qwen2VLImageInputs) -> torch.Tensor: + def _process_image_input( + self, image_input: Qwen2VLImageInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + if image_input["type"] == "image_embeds": - return image_input["image_embeds"].type(self.visual.dtype) + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: + pixel_values = image_input["pixel_values"].type(self.visual.dtype) + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + + # Split concatenated embeddings for each image item. + merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + + return image_embeds.split(sizes.tolist()) + + def _process_video_input( + self, video_input: Qwen2VLVideoInputs) -> tuple[torch.Tensor, ...]: - pixel_values = image_input["pixel_values"].type(self.visual.dtype) - image_embeds = self.visual(pixel_values, - grid_thw=image_input["image_grid_thw"]) - return image_embeds + grid_thw = video_input["video_grid_thw"] + assert grid_thw.ndim == 2 - def _process_video_input(self, - video_input: Qwen2VLVideoInputs) -> torch.Tensor: if video_input["type"] == "video_embeds": - return video_input["video_embeds"].type(self.visual.dtype) + video_embeds = video_input["video_embeds"].type(self.visual.dtype) + else: + pixel_values_videos = video_input["pixel_values_videos"].type( + self.visual.dtype) + video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) - pixel_values_videos = video_input["pixel_values_videos"].type( - self.visual.dtype) - video_embeds = self.visual(pixel_values_videos, - grid_thw=video_input["video_grid_thw"]) - return video_embeds + # Split concatenated embeddings for each video item. + merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size - def _merge_multimodal_embeddings( - self, - input_ids: torch.Tensor, - inputs_embeds: torch.Tensor, - multimodal_embeddings: torch.Tensor, - placeholder_token_id: int, - ) -> torch.Tensor: - mask = (input_ids == placeholder_token_id) - inputs_embeds[mask, :] = multimodal_embeddings - return inputs_embeds + return video_embeds.split(sizes.tolist()) + + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + modalities = {} + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. 
+        for input_key in kwargs:
+            if input_key in ("pixel_values",
+                             "image_embeds") and "images" not in modalities:
+                modalities["images"] = self._parse_and_validate_image_input(
+                    **kwargs)
+            if input_key in ("pixel_values_videos",
+                             "video_embeds") and "videos" not in modalities:
+                modalities["videos"] = self._parse_and_validate_video_input(
+                    **kwargs)
+
+        return modalities
 
     def get_multimodal_embeddings(
-            self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]:
+            self, **kwargs) -> Optional[tuple[torch.Tensor, ...]]:
 
-        image_input = self._parse_and_validate_image_input(**kwargs)
-        video_input = self._parse_and_validate_video_input(**kwargs)
-        if image_input is None and video_input is None:
+        modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
+        if not modalities:
             return None
 
-        # We make a tuple of each embedding with its modality string. This is a
-        # temporary workaround for models to handle mixed modalities when
-        # get_multimodal_embeddings and get_input_embeddings are called
-        # separately.
-        # TODO(ywang96): Add support for mixed-modality inference for v1.
-        multimodal_embeddings: List[Tuple[NestedTensors, str]] = []
-
-        if image_input is not None:
-            image_embeds = self._process_image_input(image_input)
-            multimodal_embeddings.append((image_embeds, "image"))
-        if video_input is not None:
-            video_embeds = self._process_video_input(video_input)
-            multimodal_embeddings.append((video_embeds, "video"))
+        # The result multimodal_embeddings is a tuple of tensors, with each
+        # tensor corresponding to a multimodal data item (image or video).
+        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
+
+        # NOTE: It is important to iterate over the keys in this dictionary
+        # to preserve the order of the modalities.
+        for modality in modalities:
+            if modality == "images":
+                image_input = modalities["images"]
+                vision_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += vision_embeddings
+            if modality == "videos":
+                video_input = modalities["videos"]
+                video_embeddings = self._process_video_input(video_input)
+                multimodal_embeddings += video_embeddings
 
         return multimodal_embeddings
 
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[List[Tuple[NestedTensors,
-                                                   str]]] = None,
+        multimodal_embeddings: Optional[tuple[torch.Tensor, ...]] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         if multimodal_embeddings is not None:
-            for embeddings, modality in multimodal_embeddings:
-                if modality == "image":
-                    inputs_embeds = self._merge_multimodal_embeddings(
-                        input_ids,
-                        inputs_embeds,
-                        embeddings,
-                        placeholder_token_id=self.config.image_token_id,
-                    )
-                if modality == "video":
-                    inputs_embeds = self._merge_multimodal_embeddings(
-                        input_ids,
-                        inputs_embeds,
-                        embeddings,
-                        placeholder_token_id=self.config.video_token_id,
-                    )
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids, inputs_embeds, multimodal_embeddings,
+                [self.config.image_token_id, self.config.video_token_id])
+        return inputs_embeds
+
+    def get_input_embeddings_v0(
+        self,
+        input_ids: torch.Tensor,
+        image_input: Optional[tuple[torch.Tensor, ...]] = None,
+        video_input: Optional[tuple[torch.Tensor, ...]] = None,
+    ) -> torch.Tensor:
+
+        inputs_embeds = self.get_input_embeddings(input_ids)
+        if image_input is not None:
+            image_embeds = self._process_image_input(image_input)
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                image_embeds,
+
placeholder_token_id=self.config.image_token_id, + ) + + if video_input is not None: + video_embeds = self._process_video_input(video_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + video_embeds, + placeholder_token_id=self.config.video_token_id, + ) return inputs_embeds def forward( @@ -1287,22 +1329,25 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. + # NOTE: In v1, inputs_embeds is always generated at model runner from + # `get_multimodal_embeddings` and `get_input_embeddings`, this + # condition is only for v0 compatibility. elif inputs_embeds is None: - multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - - # We need to check for usage of mrope here in case there is - # multimodal data. - # TODO (ywang96): move this to model runner in V1. - if multimodal_embeddings is not None and uses_mrope(self.config): - assert positions.ndim == 2 and positions.size(0) == 3, ( - "multimodal section rotary embedding requires " - f"(3, seq_len) positions, but got {positions.size()}") - - inputs_embeds = self.get_input_embeddings(input_ids, - multimodal_embeddings) - input_ids = None + image_input = self._parse_and_validate_image_input(**kwargs) + video_input = self._parse_and_validate_video_input(**kwargs) + + if image_input is None and video_input is None: + inputs_embeds = None + else: + if uses_mrope(self.config): + assert positions.ndim == 2 and positions.size(0) == 3, ( + "multimodal section rotary embedding requires " + f"(3, seq_len) positions, but got {positions.size()}") + inputs_embeds = self.get_input_embeddings_v0( + input_ids, + image_input=image_input, + video_input=video_input) + input_ids = None hidden_states = self.language_model.model( input_ids=input_ids, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index a71f7f7029c7d..8d2719ca2d00d 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -47,6 +47,7 @@ "DeepseekV3ForCausalLM": ("deepseek_v3", "DeepseekV3ForCausalLM"), "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), + "Fairseq2LlamaForCausalLM": ("fairseq2_llama", "Fairseq2LlamaForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), "GlmForCausalLM": ("glm", "GlmForCausalLM"), @@ -126,6 +127,7 @@ "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), + "Qwen2ForProcessRewardModel": ("qwen2_rm", "Qwen2ForProcessRewardModel"), "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index cca42842bc06e..1e51018973e8c 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -344,10 +344,14 @@ def __init__( self.config = config self.activation_fn = get_act_fn(config.hidden_act) - - # For quantization, we require the hidden size to be a multiple of 64 - quantizable = (config.hidden_size % 64 == 0 - and config.intermediate_size % 64 == 0) + # Special handling for BNB quantization + if quant_config and quant_config.get_name() == 
"bitsandbytes": + quantizable = True + else: + # For other quantization, we require the hidden size to be a + # multiple of 64 + quantizable = (config.hidden_size % 64 == 0 + and config.intermediate_size % 64 == 0) self.fc1 = ColumnParallelLinear( config.hidden_size, config.intermediate_size, diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index e83d316f74de2..37c5a4b5713b8 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -39,8 +39,6 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -409,6 +407,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config + self.quant_config = quant_config self.model = SolarModel( vllm_config=vllm_config, @@ -491,12 +490,14 @@ def load_weights(self, weights: Iterable[Tuple[str, # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - if scale_name := get_compressed_tensors_cache_scale(name): - # Loading kv cache scales for compressed-tensors quantization + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales param = params_dict[scale_name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 587f18ccaf98f..d577e545a473b 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -137,7 +137,7 @@ def _call_hf_processor( mm_kwargs: Mapping[str, object], ) -> BatchFeature: # Text-only input not supported in composite processor - if not mm_data: + if not mm_data or not mm_data.get("audios", []): prompt_ids = self.info.get_tokenizer().encode(prompt) prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") @@ -146,13 +146,6 @@ def _call_hf_processor( audios = mm_data.pop("audios", []) assert isinstance(audios, list) - if not audios: - return super()._call_hf_processor( - prompt=prompt, - mm_data=mm_data, - mm_kwargs=mm_kwargs, - ) - feature_extractor = self.info.get_feature_extractor() mm_kwargs = dict( **mm_kwargs, @@ -212,11 +205,15 @@ def _get_prompt_replacements( out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - placeholder = hf_processor.audio_token_replacement # type: ignore + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + + replacement_id = vocab[ + hf_processor.audio_token_replacement] # type: ignore def get_replacement_ultravox(item_idx: int): audio_token_len = out_mm_kwargs["audio_token_len"][item_idx] - return placeholder * audio_token_len + return [replacement_id] * 
int(audio_token_len) # type: ignore return [ PromptReplacement( diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index a1395982af44c..57166f05cd9bf 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -82,23 +82,25 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend: if backend_by_env_var is not None: selected_backend = backend_name_to_enum(backend_by_env_var) if selected_backend is None: - # For Volta and Turing GPUs, use xformers instead. - device_available = current_platform.has_device_capability(80) - if device_available and support_fa: - from transformers.utils import is_flash_attn_2_available - if is_flash_attn_2_available(): - selected_backend = _Backend.FLASH_ATTN + if current_platform.is_cuda(): + device_available = current_platform.has_device_capability(80) + if device_available and support_fa: + from transformers.utils import is_flash_attn_2_available + if is_flash_attn_2_available(): + selected_backend = _Backend.FLASH_ATTN + else: + logger.warning_once( + "Current `vllm-flash-attn` has a bug inside vision " + "module, so we use xformers backend instead. You can " + "run `pip install flash-attn` to use flash-attention " + "backend.") + selected_backend = _Backend.XFORMERS else: - logger.warning_once( - "Current `vllm-flash-attn` has a bug inside vision module, " - "so we use xformers backend instead. You can run " - "`pip install flash-attn` to use flash-attention backend.") + # For Volta and Turing GPUs, use xformers instead. selected_backend = _Backend.XFORMERS - elif current_platform.is_cpu() or current_platform.is_rocm(): - # ROCM doesn't support xformers - selected_backend = _Backend.TORCH_SDPA else: - selected_backend = _Backend.XFORMERS + # Default to torch SDPA for other non-GPU platforms. 
+ selected_backend = _Backend.TORCH_SDPA return selected_backend diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py index fc5a3e7fba674..a9ce8af15d3bb 100644 --- a/vllm/model_executor/parameter.py +++ b/vllm/model_executor/parameter.py @@ -56,8 +56,14 @@ def __init__(self, data: torch.Tensor, weight_loader: Callable): def weight_loader(self): return self._weight_loader + def _is_1d_and_scalar(self, loaded_weight: torch.Tensor): + cond1 = self.data.ndim == 1 and self.data.numel() == 1 + cond2 = loaded_weight.ndim == 0 and loaded_weight.numel() == 1 + return (cond1 and cond2) + def _assert_and_load(self, loaded_weight: torch.Tensor): - assert self.data.shape == loaded_weight.shape + assert (self.data.shape == loaded_weight.shape + or self._is_1d_and_scalar(loaded_weight)) self.data.copy_(loaded_weight) def load_column_parallel_weight(self, loaded_weight: torch.Tensor): diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 4b63703585214..b35184f6855ab 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -491,7 +491,7 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: """ -class MultiModalInputsV2(TypedDict): +class MultiModalInputs(TypedDict): """ Represents the outputs of :class:`vllm.multimodal.processing.BaseMultiModalProcessor`, diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index fa199a07b4cf8..750646ac6e431 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,7 +1,8 @@ import re from abc import ABC, abstractmethod from collections import defaultdict -from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence +from collections.abc import (Callable, Generator, ItemsView, Iterable, Mapping, + Sequence) from dataclasses import dataclass, field from functools import lru_cache from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol, @@ -18,8 +19,8 @@ from .hasher import MultiModalHasher from .inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - MultiModalKwargsItem, PlaceholderRange) + MultiModalInputs, MultiModalKwargs, MultiModalKwargsItem, + PlaceholderRange) from .parse import MultiModalDataItems, MultiModalDataParser if TYPE_CHECKING: @@ -28,23 +29,101 @@ logger = init_logger(__name__) _S = TypeVar("_S", str, list[int]) -_PromptSeq = Union[str, list[int]] + +PromptSeq = Union[str, list[int]] +"""A token sequence (list of token IDs) or text.""" + + +@dataclass +class PromptReplacementDetails: + """Details about the replacement token sequence or text.""" + + full: PromptSeq + """The full replacement.""" + + features: PromptSeq + """ + The part of the replacement that corresponds to feature placeholders; + this will be replaced by the output of the vision encoder during model + inference. + """ + + @staticmethod + def from_seq(seq: PromptSeq) -> "PromptReplacementDetails": + return PromptReplacementDetails(full=seq, features=seq) + + +PromptRepl = Union[PromptSeq, PromptReplacementDetails] +""" +The replacement token sequence or text. + +If only part of the replacement corresponds to feature placeholders, you can +use :class:`PromptReplacementDetails` to specify which part. +""" @dataclass class PromptReplacement: """ Defines how to replace portions of an input prompt with placeholder tokens. 
+
+    Example:
+
+        For each image, replace one ``<image>`` input placeholder in the prompt
+        with a number of ``<image>`` feature placeholders
+        equal to the feature size of the vision encoder:
+
+        .. code-block:: python
+
+            PromptReplacement(
+                modality="image",
+                target="<image>",
+                replacement="<image>" * image_feature_size,
+            )
+
+        As above, but further pad the feature placeholders with ``<image_bos>``
+        and ``<image_eos>``, which are not supposed to be passed to the vision
+        encoder:
+
+        .. code-block:: python
+
+            PromptReplacement(
+                modality="image",
+                target="<image>",
+                replacement=PromptReplacementDetails(
+                    full="".join([
+                        "<image_bos>",
+                        "<image>" * image_feature_size,
+                        "<image_eos>",
+                    ]),
+                    features="<image>" * image_feature_size,
+                ),
+            )
+
+        To avoid unnecessary tokenization during prompt replacement,
+        we recommend passing token sequences instead of text:
+
+        .. code-block:: python
+
+            PromptReplacement(
+                modality="image",
+                target=[image_token_id],
+                replacement=PromptReplacementDetails(
+                    full=([image_bos_id] + [image_token_id] * image_feature_size +
+                          [image_eos_id]),
+                    features=[image_token_id] * image_feature_size,
+                ),
+            )
     """
 
     modality: str
     """The modality for which the replacement is made."""
 
-    target: _PromptSeq
+    target: PromptSeq
     """The token sequence (or text) to find and replace."""
 
-    replacement: Union[Callable[[int], _PromptSeq],
-                       _PromptSeq] = field(repr=False)
+    replacement: Union[Callable[[int], PromptRepl],
+                       PromptRepl] = field(repr=False)
     """
     Given the index of the processed item within :attr:`modality`,
     output the replacement token sequence (or text).
@@ -107,11 +186,26 @@ def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
 
 @dataclass
 class _BoundPromptSequence:
+    """
+    A :data:`PromptSeq` bound to a tokenizer to automatically
+    convert between token sequence and text representations.
+ """ tokenizer: AnyTokenizer = field(repr=False) _text: Optional[str] _token_ids: Optional[list[int]] + @staticmethod + def from_seq( + tokenizer: AnyTokenizer, + seq: PromptSeq, + ) -> "_BoundPromptSequence": + return _BoundPromptSequence( + tokenizer=tokenizer, + _text=seq if isinstance(seq, str) else None, + _token_ids=seq if isinstance(seq, list) else None, + ) + def __post_init__(self) -> None: if self._text is None and self._token_ids is None: raise ValueError("At least one of 'text' and 'token_ids' must be " @@ -134,6 +228,12 @@ def token_ids(self) -> list[int]: return self._token_ids +@dataclass +class _BoundPromptReplacementGroup: + full: _BoundPromptSequence + features: _BoundPromptSequence + + @dataclass class BoundPromptReplacement: """ @@ -144,25 +244,19 @@ class BoundPromptReplacement: tokenizer: AnyTokenizer = field(repr=False) modality: str - _target: _PromptSeq - _replacement: Union[Callable[[int], _PromptSeq], - _PromptSeq] = field(repr=False) + _target: PromptSeq + _replacement: Union[Callable[[int], PromptRepl], + PromptRepl] = field(repr=False) def __post_init__(self) -> None: - self._replacement_cache = dict[int, _BoundPromptSequence]() + self._replacement_cache = dict[int, _BoundPromptReplacementGroup]() @property def target(self) -> _BoundPromptSequence: """The token sequence (or text) to find and replace.""" - target = self._target + return _BoundPromptSequence.from_seq(self.tokenizer, self._target) - return _BoundPromptSequence( - tokenizer=self.tokenizer, - _text=target if isinstance(target, str) else None, - _token_ids=target if isinstance(target, list) else None, - ) - - def get_replacement(self, item_idx: int) -> _BoundPromptSequence: + def get_replacement(self, item_idx: int) -> _BoundPromptReplacementGroup: """ Given the index of the processed item within :attr:`modality`, output the replacement token sequence (or text). @@ -177,10 +271,16 @@ def get_replacement(self, item_idx: int) -> _BoundPromptSequence: else: cache_key = None - bound_replacement = _BoundPromptSequence( - tokenizer=self.tokenizer, - _text=replacement if isinstance(replacement, str) else None, - _token_ids=replacement if isinstance(replacement, list) else None, + if not isinstance(replacement, PromptReplacementDetails): + replacement = PromptReplacementDetails.from_seq(replacement) + + bound_full = _BoundPromptSequence.from_seq(self.tokenizer, + replacement.full) + bound_features = _BoundPromptSequence.from_seq(self.tokenizer, + replacement.features) + bound_replacement = _BoundPromptReplacementGroup( + full=bound_full, + features=bound_features, ) if cache_key is not None: @@ -197,7 +297,7 @@ class _TokenMatch(NamedTuple): def iter_token_matches( token_ids: list[int], match_ids: list[int], -) -> Iterable[_TokenMatch]: +) -> Generator[_TokenMatch]: """ Yield each occurrence of :code:`match_ids` in :code:`token_ids`. 
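(Illustration, not part of the patch: `iter_token_matches` above scans a tokenized prompt for occurrences of a target token sub-sequence, and the placeholder logic further down reuses it to locate the `features` tokens inside each `full` replacement. The following is a minimal, self-contained sketch of that kind of scan; the names `iter_token_matches_sketch` and `TokenMatch` are made up here, and the simple non-overlapping left-to-right strategy is only an approximation of vLLM's actual helper.)

from typing import Iterator, NamedTuple


class TokenMatch(NamedTuple):
    start_idx: int
    end_idx: int


def iter_token_matches_sketch(
    token_ids: list[int],
    match_ids: list[int],
) -> Iterator[TokenMatch]:
    """Yield non-overlapping occurrences of match_ids inside token_ids."""
    n = len(match_ids)
    if n == 0:
        return
    i = 0
    while i + n <= len(token_ids):
        if token_ids[i:i + n] == match_ids:
            yield TokenMatch(start_idx=i, end_idx=i + n)
            i += n  # jump past the match so results never overlap
        else:
            i += 1


# Example: locate two runs of three image-placeholder tokens (id 32000 here)
# in an already-tokenized prompt.
prompt = [1, 32000, 32000, 32000, 2, 9, 32000, 32000, 32000, 2]
print(list(iter_token_matches_sketch(prompt, [32000, 32000, 32000])))
# [TokenMatch(start_idx=1, end_idx=4), TokenMatch(start_idx=6, end_idx=9)]

(Advancing past the full match length is what keeps the yielded ranges non-overlapping, mirroring how the placeholder search below skips past each full replacement to exclude overlapping matches.)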
@@ -272,15 +372,15 @@ def end_idx(self) -> int: @dataclass -class PlaceholderInfo: +class PlaceholderFeaturesInfo: modality: str item_idx: int start_idx: int - replacement: list[int] + tokens: list[int] @property def length(self) -> int: - return len(self.replacement) + return len(self.tokens) def to_range(self) -> PlaceholderRange: return PlaceholderRange( @@ -314,7 +414,7 @@ def find_text_matches( def _resolve_matches( - prompt: _PromptSeq, + prompt: PromptSeq, mm_matches: Mapping[str, Sequence[_PromptReplacementMatch]], ) -> list[_PromptReplacementMatch]: """ @@ -362,10 +462,10 @@ def _replace_matches( replacement = repl_info.get_replacement(item_idx) if isinstance(prompt, str): - repl_seq = replacement.text + repl_seq = replacement.full.text out_seqs.append(prompt[prev_end_idx:start_idx] + repl_seq) else: - repl_seq = replacement.token_ids + repl_seq = replacement.full.token_ids out_seqs.append(prompt[prev_end_idx:start_idx] + repl_seq) prev_end_idx = end_idx @@ -408,7 +508,7 @@ def _iter_placeholders( mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], prompt: list[int], mm_item_counts: Mapping[str, int], -) -> Iterable[PlaceholderInfo]: +) -> Iterable[PlaceholderFeaturesInfo]: """ Yield each set of placeholder tokens found in :code:`prompt`. @@ -432,23 +532,33 @@ def _iter_placeholders( for repl_info in modality_repls: replacement = repl_info.get_replacement(item_idx) - repl_tokens = replacement.token_ids - repl_len = len(repl_tokens) - end_idx = start_idx + repl_len + repl_tokens_full = replacement.full.token_ids + repl_len_full = len(repl_tokens_full) + end_idx_full = start_idx + repl_len_full - if repl_len == 0 or end_idx > prompt_len: + if repl_len_full == 0 or end_idx_full > prompt_len: continue - if prompt[start_idx:end_idx] == repl_tokens: - yield PlaceholderInfo( - modality=modality, - item_idx=item_idx, - start_idx=start_idx, - replacement=repl_tokens, - ) + if prompt[start_idx:end_idx_full] == repl_tokens_full: + repl_tokens_feat = replacement.features.token_ids + + try: + match = next( + iter_token_matches(repl_tokens_full, + repl_tokens_feat)) + yield PlaceholderFeaturesInfo( + modality=modality, + item_idx=item_idx, + start_idx=start_idx + match.start_idx, + tokens=repl_tokens_feat, + ) + except StopIteration: + raise AssertionError( + f"{repl_tokens_feat=} should be a " + f"subsequence of {repl_tokens_full=}") from None # Exclude overlapping matches - start_idx = end_idx + start_idx = end_idx_full item_idx_by_modality[modality] += 1 found = True break @@ -464,7 +574,7 @@ def find_mm_placeholders( mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], prompt: list[int], mm_item_counts: Mapping[str, int], -) -> Mapping[str, list[PlaceholderInfo]]: +) -> Mapping[str, list[PlaceholderFeaturesInfo]]: it = _iter_placeholders(mm_prompt_repls, prompt, mm_item_counts) return dict(full_groupby_modality(it)) @@ -609,7 +719,7 @@ def __call__( prompt: str, mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: return self.apply(prompt, mm_data, hf_processor_mm_kwargs) def _get_data_parser(self) -> MultiModalDataParser: @@ -679,7 +789,7 @@ def _find_mm_placeholders( mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], new_token_ids: list[int], mm_item_counts: Mapping[str, int], - ) -> Mapping[str, list[PlaceholderInfo]]: + ) -> Mapping[str, list[PlaceholderFeaturesInfo]]: return find_mm_placeholders(mm_prompt_repls, new_token_ids, mm_item_counts) @@ -948,7 +1058,7 @@ def 
_apply_prompt_replacements( token_ids: list[int], mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], mm_item_counts: Mapping[str, int], - ) -> tuple[list[int], str, Mapping[str, list[PlaceholderInfo]]]: + ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]: tokenizer = self.info.get_tokenizer() mm_token_matches = { @@ -1037,7 +1147,7 @@ def _validate_mm_kwargs( def _validate_mm_placeholders( self, - mm_placeholders: Mapping[str, list[PlaceholderInfo]], + mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]], mm_item_counts: Mapping[str, int], *, allow_missing: bool = False, @@ -1067,7 +1177,7 @@ def apply( prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. @@ -1169,7 +1279,7 @@ def apply( for modality, placeholders in mm_placeholders.items() } - return MultiModalInputsV2( + return MultiModalInputs( type="multimodal", prompt=prompt, prompt_token_ids=prompt_ids, diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index ec580cd6ecddd..c68edaff80167 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -11,7 +11,7 @@ from vllm.inputs import DummyData from vllm.logger import init_logger -from .inputs import MultiModalDataDict, MultiModalInputsV2 +from .inputs import MultiModalDataDict, MultiModalInputs from .processing import BaseMultiModalProcessor, BaseProcessingInfo logger = init_logger(__name__) @@ -106,7 +106,7 @@ def processing_info(self) -> BaseProcessingInfo: def dummy_inputs(self) -> BaseDummyInputsBuilder[_I]: return self.processor.dummy_inputs - def _get_mm_limits(self) -> Mapping[str, int]: + def get_mm_limits(self) -> Mapping[str, int]: mm_config = self.processing_info.ctx.get_mm_config() mm_limit_per_prompt = mm_config.limit_per_prompt @@ -131,7 +131,7 @@ def _get_dummy_mm_inputs( self, seq_len: int, mm_counts: Mapping[str, int], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: factory = self.dummy_inputs processor_inputs = factory.get_dummy_processor_inputs( seq_len, mm_counts) @@ -146,7 +146,7 @@ def get_dummy_data(self, seq_len: int) -> DummyData: # Avoid circular import from vllm.sequence import SequenceData - mm_counts = self._get_mm_limits() + mm_counts = self.get_mm_limits() info = self.processing_info mm_max_tokens_per_item = info.get_mm_max_tokens_per_item(seq_len) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 2961f7c76ca12..7a4b85385cac9 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -17,7 +17,7 @@ from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors from .processing import (BaseMultiModalProcessor, BaseProcessingInfo, ProcessingCache) -from .profiling import BaseDummyInputsBuilder +from .profiling import BaseDummyInputsBuilder, MultiModalProfiler from .utils import cached_get_tokenizer from .video import VideoPlugin @@ -252,11 +252,8 @@ def get_max_tokens_per_item_by_modality( model_config: "ModelConfig", ) -> Mapping[str, int]: """ - Get the maximum number of tokens per data item from each modality - for profiling the memory usage of a model. - - Note: - This is currently directly used only in V1. + Get the maximum number of tokens per data item from each modality based + on underlying model configuration. 
""" if self.has_processor(model_config): tokenizer = cached_get_tokenizer( @@ -272,6 +269,28 @@ def get_max_tokens_per_item_by_modality( for key, plugin in self._plugins.items() } + def get_max_tokens_per_item_by_nonzero_modality( + self, + model_config: "ModelConfig", + ) -> Mapping[str, int]: + """ + Get the maximum number of tokens per data item from each modality based + on underlying model configuration, excluding modalities that user + explicitly disabled via `limit_mm_per_prompt`. + + Note: + This is currently directly used only in V1 for profiling the memory + usage of a model. + """ + mm_limits = self.get_mm_limits_per_prompt(model_config) + + return { + key: max_tokens_per_mm_item + for key, max_tokens_per_mm_item in + self.get_max_tokens_per_item_by_modality(model_config).items() + if mm_limits[key] > 0 + } + def get_max_tokens_by_modality( self, model_config: "ModelConfig", @@ -285,10 +304,10 @@ def get_max_tokens_by_modality( Note: This should be called after :meth:`init_mm_limits_per_prompt`. """ - limits_per_plugin = self._limits_by_model[model_config] + mm_limits = self.get_mm_limits_per_prompt(model_config) return { - key: limits_per_plugin[key] * max_tokens_per_mm_item + key: mm_limits[key] * max_tokens_per_mm_item for key, max_tokens_per_mm_item in self.get_max_tokens_per_item_by_modality(model_config).items() } @@ -352,6 +371,15 @@ def get_mm_limits_per_prompt( Note: This should be called after :meth:`init_mm_limits_per_prompt`. """ + if self.has_processor(model_config): + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code, + ) + processor = self.create_processor(model_config, tokenizer) + profiler = MultiModalProfiler(processor) + return profiler.get_mm_limits() + return self._limits_by_model[model_config] def register_processor( diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 1c6bbf77b926f..094e0682a068b 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,4 +1,5 @@ from functools import lru_cache +from itertools import groupby from pathlib import Path from typing import TYPE_CHECKING, Optional, TypeVar, Union from urllib.parse import ParseResult, urlparse @@ -26,7 +27,7 @@ if TYPE_CHECKING: from .hasher import MultiModalHashDict - from .inputs import MultiModalPlaceholderDict + from .inputs import MultiModalKwargs, MultiModalPlaceholderDict class MediaConnector: @@ -477,3 +478,34 @@ def merge_and_sort_multimodal_metadata( merged_hashes = None return sorted_modalities, merged_placeholders, merged_hashes + + +def group_mm_inputs_by_modality( + mm_inputs: list["MultiModalKwargs"]) -> list[list["MultiModalKwargs"]]: + """Group consecutive MultiModalKwargs from mm_inputs with the same modality + together into the same list for batching purpose. For MultiModalKwargs with + multiple modalities, put them into their own list. + + Args: + mm_inputs: List of MultiModalKwargs. + + Returns: + list[list[MultiModalKwargs]]: List of list of MultiModalKwargs, each + inner list contains consecutive MultiModalKwargs with same modality, or + one with multimodal modalities. + """ + if not mm_inputs: + return [] + + def modality_group_func(mm_input: "MultiModalKwargs") -> Union[str, int]: + # If the input has multiple modalities, return a id as the unique key + # for the mm_input input. 
+ if len(mm_input.modalities) > 1: + return id(mm_input) + + # Otherwise return the modality string + return list(mm_input.modalities)[0] + + return [ + list(group) for _, group in groupby(mm_inputs, key=modality_group_func) + ] diff --git a/vllm/outputs.py b/vllm/outputs.py index b519c159b1531..63df7dcf519b5 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -172,9 +172,9 @@ def from_seq_group( if seq_group.request_id in seq_id_to_seq_group: group: SequenceGroupBase = seq_id_to_seq_group[ seq_group.request_id] + assembled_seq_group = group.maybe_assemble_group(seq_group) if finished: group.finish_seq(seq_group) - assembled_seq_group = group.maybe_assemble_group(seq_group) if assembled_seq_group is None: return None return cls.from_seq_group(assembled_seq_group, use_cache, diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 6ca95b41dbb07..ddbdc43ca5710 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -101,6 +101,10 @@ def cpu_platform_plugin() -> Optional[str]: try: from importlib.metadata import version is_cpu = "cpu" in version("vllm") + if not is_cpu: + import platform + is_cpu = platform.machine().lower().startswith("arm") + except Exception: pass @@ -213,8 +217,11 @@ def __getattr__(name: str): global _init_trace _init_trace = "".join(traceback.format_stack()) return _current_platform - else: + elif name in globals(): return globals()[name] + else: + raise AttributeError( + f"No attribute named '{name}' exists in {__name__}.") __all__ = [ diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 8350177b68ade..2587e3a11dde3 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -139,28 +139,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: else: parallel_config.worker_cls = "vllm.worker.worker.Worker" - world_size = parallel_config.world_size - tensor_parallel_size = parallel_config.tensor_parallel_size - - from vllm.utils import (cuda_device_count_stateless, - update_environment_variables) - - # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers - if "CUDA_VISIBLE_DEVICES" not in os.environ: - update_environment_variables({ - "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size)))) - }) - - cuda_device_count = cuda_device_count_stateless() - # Use confusing message for more common TP-only case. 
- assert tensor_parallel_size <= cuda_device_count, ( - f"please set tensor_parallel_size ({tensor_parallel_size}) " - f"to less than max local gpu count ({cuda_device_count})") - - assert world_size <= cuda_device_count, ( - f"please ensure that world_size ({world_size}) " - f"is less than than max local gpu count ({cuda_device_count})") - cache_config = vllm_config.cache_config if cache_config and cache_config.block_size is None: cache_config.block_size = 16 diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 242c2c127979a..a32c262c84efa 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -1,7 +1,9 @@ +import os from typing import TYPE_CHECKING, Optional import torch +from vllm import envs from vllm.logger import init_logger from .interface import Platform, PlatformEnum, _Backend @@ -58,6 +60,22 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: cache_config = vllm_config.cache_config if cache_config and cache_config.block_size is None: cache_config.block_size = 128 + if (parallel_config.distributed_executor_backend == 'mp' + and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'): + if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", + None) is not None: + logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork " + "might cause application hangs on exit. Using " + "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, " + "as it was explicitly requested.") + else: + logger.warning( + "On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork " + "might cause application hangs on exit. Setting " + "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " + "To override that behavior, please set " + "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.") + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" @classmethod def is_pin_memory_available(cls): diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 67a9e816cb658..5ef56406e1935 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -70,7 +70,7 @@ class RocmPlatform(Platform): supported_quantization: list[str] = [ "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors", - "fbgemm_fp8", "gguf" + "fbgemm_fp8", "gguf", "quark" ] @classmethod diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index e5fa4f0e4a2f6..a78a054917756 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -50,22 +50,18 @@ def load_general_plugins(): processes. They should be designed in a way that they can be loaded multiple times without causing issues. """ + global plugins_loaded + if plugins_loaded: + return + plugins_loaded = True - # all processes created by vllm will load plugins, - # and here we can inject some common environment variables - # for all processes. 
- - # see https://github.com/vllm-project/vllm/issues/10480 - os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' - # see https://github.com/vllm-project/vllm/issues/10619 - torch._inductor.config.compile_threads = 1 - + # some platform-specific configurations from vllm.platforms import current_platform if current_platform.is_xpu(): # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 torch._dynamo.config.disable = True - if current_platform.is_hpu(): + elif current_platform.is_hpu(): # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1) # does not support torch.compile # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for @@ -78,10 +74,6 @@ def load_general_plugins(): # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501 os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true' - global plugins_loaded - if plugins_loaded: - return - plugins_loaded = True plugins = load_plugins_by_group(group='vllm.general_plugins') # general plugins, we only need to execute the loaded functions for func in plugins.values(): diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py index 33babfebdca1e..29c0edd0ee535 100644 --- a/vllm/profiler/layerwise_profile.py +++ b/vllm/profiler/layerwise_profile.py @@ -1,7 +1,7 @@ import copy from collections import defaultdict from dataclasses import asdict, dataclass, field -from typing import Callable, Dict, List, Optional, Tuple, TypeAlias, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, TypeAlias, Union import pandas as pd from torch._C._autograd import DeviceType, _KinetoEvent, _ProfilerResult @@ -128,7 +128,7 @@ def export_summary_stats_table_csv(self, filename: str): ]) df.to_csv(filename) - def convert_stats_to_dict(self) -> str: + def convert_stats_to_dict(self) -> dict[str, Any]: return { "metadata": { "num_running_seqs": self.num_running_seqs @@ -227,7 +227,7 @@ def _total_cuda_time(self): [self._cumulative_cuda_time(root) for root in self._module_tree]) def _build_stats_trees(self): - summary_dict: Dict[str, self.StatsTreeNode] = {} + summary_dict: Dict[str, _StatsTreeNode] = {} total_cuda_time = self._total_cuda_time() def pct_cuda_time(cuda_time_us): diff --git a/vllm/sequence.py b/vllm/sequence.py index 5857f656dfc10..74320db709f94 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -815,7 +815,9 @@ def set_finished_time(self, time: Optional[float]) -> None: def get_max_num_running_seqs(self) -> int: """The maximum number of sequences running in parallel in the remaining lifetime of the request.""" - return 0 if self.first_seq.is_finished() else 1 + if self.is_single_seq: + return 0 if self.first_seq.is_finished() else 1 + return self.num_seqs() - self.num_finished_seqs() def get_seqs( self, @@ -824,7 +826,10 @@ def get_seqs( if status is None: return self.seqs - return self.seqs if self.first_seq.status == status else [] + if self.is_single_seq: + return self.seqs if self.first_seq.status == status else [] + + return [seq for seq in self.seqs if seq.status == status] def is_encoder_decoder(self) -> bool: return self.encoder_seq is not None @@ -833,19 +838,22 @@ def get_encoder_seq(self) -> Optional[Sequence]: return self.encoder_seq def get_finished_seqs(self) -> List[Sequence]: - return self.seqs if self.first_seq.is_finished() else [] + if self.is_single_seq: + return self.seqs if self.first_seq.is_finished() else [] + + return [seq for seq in self.seqs if seq.is_finished()] def 
update_num_computed_tokens(self, num_new_computed_tokens: int): """Update number of tokens computed so far.""" - seq = self.first_seq - if not seq.is_finished(): - seq.data.update_num_computed_tokens(num_new_computed_tokens) + for seq in self.seqs: + if not seq.is_finished(): + seq.data.update_num_computed_tokens(num_new_computed_tokens) def get_num_uncomputed_tokens(self) -> int: num_uncomputed_tokens = 0 - seq = self.first_seq - if not seq.is_finished(): - num_uncomputed_tokens += seq.data.get_num_uncomputed_tokens() + for seq in self.seqs: + if not seq.is_finished(): + num_uncomputed_tokens += seq.data.get_num_uncomputed_tokens() return num_uncomputed_tokens def num_seqs(self, status: Optional[SequenceStatus] = None) -> int: @@ -860,10 +868,14 @@ def num_seqs(self, status: Optional[SequenceStatus] = None) -> int: return len(self.get_seqs(status)) def num_finished_seqs(self) -> int: - return 1 if self.first_seq.is_finished() else 0 + if self.is_single_seq: + return 1 if self.seqs[0].is_finished() else 0 + return len(self.get_finished_seqs()) def is_finished(self) -> bool: - return self.first_seq.is_finished() + if self.is_single_seq: + return self.first_seq.is_finished() + return all(seq.is_finished() for seq in self.seqs) def is_prefill(self) -> bool: return self.first_seq.is_prefill() @@ -1391,13 +1403,15 @@ class ParallelSampleSequenceGroup(SequenceGroupBase): @staticmethod def add_request(request_id: str, engine, params, **kwargs): original_params = params - params = original_params.clone() - params.n = 1 group = ParallelSampleSequenceGroup(request_id) seqs = [] for i in range(original_params.n): request_id_i = f"{request_id}_parallel_sample_{i}" group.seq_id_to_index[request_id_i] = i + params = copy.deepcopy(original_params) + params.n = 1 + if params.seed is not None: + params.seed += i seq_group = engine._add_processed_request( request_id_i, params=params, @@ -1432,33 +1446,34 @@ def maybe_assemble_group( self, seq_group: SequenceGroup) -> Optional[SequenceGroup]: # in the streaming mode, we will return the assembled sequence - # for the first sequence, and then return None for the rest of - # sequences + # for the first remaining sequence, and then return None for the + # rest of sequences if self.streaming: - if self.seq_id_to_index[seq_group.request_id] == 0: + first_remaining_id = next(iter(self.to_be_finished)) + if seq_group.request_id == first_remaining_id: return self.assembled_seq_group return None # in the non-streaming mode, we will return the assembled sequence - # once after all sequences finish, and then return None for the + # when the last sequences finishes, and then return None for the # rest of the time - - if len(self.to_be_finished) > 0: - return None - - assert self.assembled_seq_group is not None - params = self.assembled_seq_group.sampling_params - assert isinstance(params, SamplingParams) - if not self.output_produced: - self.output_produced = True - if params._real_n is not None: - # Get the top-n sequences. 
- n = params._real_n or params.n - seqs = self.assembled_seq_group.seqs - sorting_key = lambda seq: seq.get_cumulative_logprob() - sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) - top_n_seqs = sorted_seqs[:n] - self.assembled_seq_group.seqs = top_n_seqs - return self.assembled_seq_group - if self.output_produced: - return None + if (len(self.to_be_finished) == 1 + and seq_group.request_id in self.to_be_finished + and seq_group.is_finished()): + assert self.assembled_seq_group is not None + params = self.assembled_seq_group.sampling_params + assert isinstance(params, SamplingParams) + if not self.output_produced: + self.output_produced = True + if params._real_n is not None: + # Get the top-n sequences. + n = params._real_n or params.n + seqs = self.assembled_seq_group.seqs + sorting_key = lambda seq: seq.get_cumulative_logprob() + sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) + top_n_seqs = sorted_seqs[:n] + self.assembled_seq_group.seqs = top_n_seqs + return self.assembled_seq_group + if self.output_produced: + return None + return None diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index bb6b99135580e..e906b1789cde8 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -2,6 +2,7 @@ from typing import List, Optional, Set, Tuple import torch +import torch.nn as nn from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest @@ -10,6 +11,10 @@ from vllm.spec_decode.top1_proposer import Top1Proposer +class _DummyModel(nn.Module): + pass + + class NGramWorker(NonLLMProposerWorkerBase): """NGramWorker provides a light drafter without need for model. @@ -36,7 +41,6 @@ def set_ngram_window_size(self, ngram_prompt_lookup_min: int, def init_device(self): self.device = torch.device(f"{self.device_type}:{self.local_rank}") - self.load_model = lambda *args, **kwargs: None # Current NGramWorker only supports Top1Proposer self._proposer = Top1Proposer( @@ -45,6 +49,12 @@ def init_device(self): vocab_size=self.vocab_size, ) + def load_model(self) -> None: + pass # Dummy + + def get_model(self) -> nn.Module: + return _DummyModel() + def sampler_output( self, execute_model_req: ExecuteModelRequest, diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 8896b7dbc6b8a..c6ff5e52f9388 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -1,6 +1,7 @@ from typing import List, Optional, Set, Tuple import torch +import torch.nn as nn from vllm.distributed.parallel_state import (get_tp_group, init_model_parallel_group, @@ -15,6 +16,10 @@ logger = init_logger(__name__) +class _DummyModel(nn.Module): + pass + + class SmallerTpProposerWorker(ProposerWorkerBase): """Class which allows a speculative draft model to run with smaller tensor parallel degree than target model. 
@@ -139,6 +144,13 @@ def get_spec_proposals( return self._worker.get_spec_proposals( execute_model_req, seq_ids_with_bonus_token_in_last_step) + def get_model(self) -> nn.Module: + if self._is_dummy: + return _DummyModel() + + with self._patch_tensor_parallel_group(): + return self._worker.get_model() + def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 540d118d65ecb..0d66ede3d907a 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Type import torch +import torch.nn as nn from vllm.config import ParallelConfig, SpeculativeConfig, VllmConfig from vllm.distributed.communication_op import broadcast_tensor_dict @@ -403,6 +404,9 @@ def initialize_cache(self, num_gpu_blocks: int, self.proposer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) + def get_model(self) -> nn.Module: + return self.scorer_worker.get_model() + @torch.inference_mode() def execute_model( self, diff --git a/vllm/tracing.py b/vllm/tracing.py index 50068d8cf9c25..72a3f85118d36 100644 --- a/vllm/tracing.py +++ b/vllm/tracing.py @@ -16,7 +16,6 @@ OTEL_EXPORTER_OTLP_TRACES_PROTOCOL) from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor - from opentelemetry.semconv_ai import SpanAttributes as BaseSpanAttributes from opentelemetry.trace import SpanKind, Tracer, set_tracer_provider from opentelemetry.trace.propagation.tracecontext import ( TraceContextTextMapPropagator) @@ -92,21 +91,30 @@ def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]: return {h: headers[h] for h in TRACE_HEADERS if h in headers} -class SpanAttributes(BaseSpanAttributes): - # The following span attribute names are added here because they are missing - # from the Semantic Conventions for LLM. 
- LLM_REQUEST_ID = "gen_ai.request.id" - LLM_REQUEST_N = "gen_ai.request.n" - LLM_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences" - LLM_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue" - LLM_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token" - LLM_LATENCY_E2E = "gen_ai.latency.e2e" - LLM_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler" +class SpanAttributes: + # Attribute names copied from here to avoid version conflicts: + # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-spans.md + GEN_AI_USAGE_COMPLETION_TOKENS = "gen_ai.usage.completion_tokens" + GEN_AI_USAGE_PROMPT_TOKENS = "gen_ai.usage.prompt_tokens" + GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens" + GEN_AI_REQUEST_TOP_P = "gen_ai.request.top_p" + GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature" + GEN_AI_RESPONSE_MODEL = "gen_ai.response.model" + # Attribute names added until they are added to the semantic conventions: + GEN_AI_REQUEST_ID = "gen_ai.request.id" + GEN_AI_REQUEST_N = "gen_ai.request.n" + GEN_AI_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences" + GEN_AI_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue" + GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token" + GEN_AI_LATENCY_E2E = "gen_ai.latency.e2e" + GEN_AI_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler" # Time taken in the forward pass for this across all workers - LLM_LATENCY_TIME_IN_MODEL_FORWARD = "gen_ai.latency.time_in_model_forward" + GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD = ( + "gen_ai.latency.time_in_model_forward") # Time taken in the model execute function. This will include model # forward, block/sync across workers, cpu-gpu sync time and sampling time. - LLM_LATENCY_TIME_IN_MODEL_EXECUTE = "gen_ai.latency.time_in_model_execute" + GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = ( + "gen_ai.latency.time_in_model_execute") def contains_trace_headers(headers: Mapping[str, str]) -> bool: diff --git a/vllm/transformers_utils/configs/aria.py b/vllm/transformers_utils/configs/aria.py deleted file mode 100644 index d253da0d96a34..0000000000000 --- a/vllm/transformers_utils/configs/aria.py +++ /dev/null @@ -1,47 +0,0 @@ -from transformers.models.idefics2.configuration_idefics2 import ( - Idefics2VisionConfig) -from transformers.models.llama.configuration_llama import LlamaConfig - - -class AriaVisionConfig(Idefics2VisionConfig): - model_type = "aria_vision_model" - - -class AriaMoELMConfig(LlamaConfig): - """ - Configuration class for AriaMoE language model. - - This class extends the LlamaConfig to include additional parameters specific - to the Mixture of Experts (MoE) architecture. - """ - - model_type = "aria_moe_lm" - - def __init__( - self, - moe_intermediate_size: int = 4096, - moe_num_experts: int = 8, - moe_topk: int = 2, - moe_num_shared_experts: int = 2, - **kwargs, - ): - """ - Initialize the AriaMoELMConfig. - - Args: - moe_intermediate_size (int): The intermediate size for MoE layers. - Default is 4096. - moe_num_experts (int): The number of experts in the MoE layer. - Default is 8. - moe_topk (int): The number of top experts to route to for each - token. Default is 2. - moe_num_shared_experts (int): The number of shared experts. Default - is 2. - **kwargs: Additional keyword arguments to be passed to the parent - LlamaConfig. 
- """ - super().__init__(**kwargs) - self.moe_intermediate_size = moe_intermediate_size - self.moe_num_experts = moe_num_experts - self.moe_topk = moe_topk - self.moe_num_shared_experts = moe_num_shared_experts diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py new file mode 100644 index 0000000000000..9c71b8cada32e --- /dev/null +++ b/vllm/transformers_utils/processors/__init__.py @@ -0,0 +1,4 @@ +from vllm.transformers_utils.processors.deepseek_vl2 import ( + DeepseekVLV2Processor) + +__all__ = ["DeepseekVLV2Processor"] diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py new file mode 100644 index 0000000000000..27cdf6bc22d0e --- /dev/null +++ b/vllm/transformers_utils/processors/deepseek_vl2.py @@ -0,0 +1,361 @@ +# yapf: disable +# ruff: noqa: E501 +# coding=utf-8 +# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py +# Copyright (c) 2023-2024 DeepSeek. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +import math +from typing import List, Tuple + +import torch +import torchvision.transforms as T +from PIL import Image, ImageOps +from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast +from transformers.processing_utils import ProcessorMixin + + +class ImageTransform: + + def __init__(self, + mean: Tuple[float, float, float] = (0.5, 0.5, 0.5), + std: Tuple[float, float, float] = (0.5, 0.5, 0.5), + normalize: bool = True): + self.mean = mean + self.std = std + self.normalize = normalize + + transform_pipelines = [T.ToTensor()] + + if normalize: + transform_pipelines.append(T.Normalize(mean, std)) + + self.transform = T.Compose(transform_pipelines) + + def __call__(self, pil_img: Image.Image): + x = self.transform(pil_img) + return x + + +class DeepseekVLV2Processor(ProcessorMixin): + tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") + attributes = ["tokenizer"] + + def __init__( + self, + tokenizer: LlamaTokenizerFast, + candidate_resolutions: Tuple[Tuple[int, int]], + patch_size: int, + downsample_ratio: int, + image_mean: Tuple[float, float, float] = (0.5, 0.5, 0.5), + image_std: Tuple[float, float, float] = (0.5, 0.5, 0.5), + normalize: bool = True, + image_token: str = "", + pad_token: str = "<|▁pad▁|>", + add_special_token: bool = False, + sft_format: str = "deepseek", + mask_prompt: bool = True, + ignore_id: int = -100, + **kwargs, + ): + + self.candidate_resolutions = candidate_resolutions + self.image_size = candidate_resolutions[0][0] + self.patch_size = patch_size + self.image_mean = image_mean + self.image_std = image_std + self.normalize = normalize + self.downsample_ratio = downsample_ratio + + self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize) + self.tokenizer = tokenizer + self.tokenizer.padding_side = 'left' # must set this,padding side with make a difference in batch inference + + # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id' + if tokenizer.pad_token is None: + self.tokenizer.add_special_tokens({'pad_token': pad_token}) + + # add image token + image_token_id = self.tokenizer.vocab.get(image_token) + if image_token_id is None: + special_tokens = [image_token] + special_tokens_dict = {"additional_special_tokens": special_tokens} + self.tokenizer.add_special_tokens(special_tokens_dict) + self.image_token_id = self.tokenizer.vocab.get(image_token) + + # add five special tokens for grounding-related tasks + # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|> + special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>'] + special_tokens_dict = {"additional_special_tokens": special_tokens} + self.tokenizer.add_special_tokens(special_tokens_dict) + + # add special tokens for SFT data + special_tokens = ["<|User|>", "<|Assistant|>"] + special_tokens_dict = {"additional_special_tokens": special_tokens} + self.tokenizer.add_special_tokens(special_tokens_dict) + + self.image_token = image_token + self.pad_token = pad_token + self.add_special_token = add_special_token + self.sft_format = sft_format + self.mask_prompt = mask_prompt + self.ignore_id = ignore_id + + super().__init__( + tokenizer, + **kwargs, + ) + + def select_best_resolution(self, image_size): + # used for cropping + original_width, original_height = image_size + best_fit = None + max_effective_resolution = 0 + min_wasted_resolution = float("inf") + + for width, height in self.candidate_resolutions: + scale = min(width / original_width, height / original_height) + downscaled_width, 
downscaled_height = int( + original_width * scale), int(original_height * scale) + effective_resolution = min(downscaled_width * downscaled_height, + original_width * original_height) + wasted_resolution = (width * height) - effective_resolution + + if effective_resolution > max_effective_resolution or ( + effective_resolution == max_effective_resolution + and wasted_resolution < min_wasted_resolution): + max_effective_resolution = effective_resolution + min_wasted_resolution = wasted_resolution + best_fit = (width, height) + + return best_fit + + @property + def bos_id(self): + return self.tokenizer.bos_token_id + + @property + def eos_id(self): + return self.tokenizer.eos_token_id + + @property + def pad_id(self): + return self.tokenizer.pad_token_id + + def encode(self, text: str, bos: bool = True, eos: bool = False): + t = self.tokenizer.encode(text, add_special_tokens=False) + + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + + return t + + def decode(self, t: List[int], **kwargs) -> str: + return self.tokenizer.decode(t, **kwargs) + + def process_one( + self, + prompt: str, + images: List[Image.Image], + inference_mode: bool = True, + **kwargs, + ): + """ + + Args: + prompt (str): the formatted prompt; + conversations (List[Dict]): conversations with a list of messages; + images (List[ImageType]): the list of images; + inference_mode (bool): if True, then remove the last eos token; + system_prompt (str): the system prompt; + **kwargs: + + Returns: + outputs (BaseProcessorOutput): the output of the processor, + - input_ids (torch.LongTensor): [N + image tokens] + - target_ids (torch.LongTensor): [N + image tokens] + - pixel_values (torch.FloatTensor): [n_patches, 3, H, W] + - image_id (int): the id of the image token + - num_image_tokens (List[int]): the number of image tokens + """ + + assert (prompt is not None and images is not None + ), "prompt and images must be used at the same time." 
+
+        sft_format = prompt
+        tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images(
+            sft_format, images, bos=True, eos=True, cropping=len(images) <= 2)
+        masked_tokenized_str = []
+        for token_index in tokenized_str:
+            if token_index != self.image_token_id:
+                masked_tokenized_str.append(token_index)
+            else:
+                masked_tokenized_str.append(self.ignore_id)
+
+        assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
+            (f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
+             f"images_seq_mask's length {len(images_seq_mask)}, are not equal")
+
+        input_ids = torch.LongTensor(tokenized_str)
+        target_ids = torch.LongTensor(masked_tokenized_str)
+        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
+
+        # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
+        target_ids[(input_ids < 0) |
+                   (input_ids == self.image_token_id)] = self.ignore_id
+        input_ids[input_ids < 0] = self.pad_id
+
+        if inference_mode:
+            # remove the trailing eos token
+            assert input_ids[-1] == self.eos_id
+            input_ids = input_ids[:-1]
+            target_ids = target_ids[:-1]
+            images_seq_mask = images_seq_mask[:-1]
+
+        if len(images_list) == 0:
+            pixel_values = torch.zeros((1, 3, self.image_size, self.image_size))
+            images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
+        else:
+            pixel_values = torch.stack(images_list, dim=0)
+            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
+
+        input_ids = input_ids.unsqueeze(0)
+
+        prepare = BatchFeature(
+            data=dict(
+                input_ids=input_ids,
+                pixel_values=pixel_values,
+                images_seq_mask=images_seq_mask,
+                images_spatial_crop=images_spatial_crop,
+                num_image_tokens=num_image_tokens,
+            ),
+            tensor_type="pt",
+        )
+        return prepare
+
+    def __call__(
+        self,
+        *,
+        prompt: str,
+        images: List[Image.Image],
+        inference_mode: bool = True,
+        **kwargs,
+    ):
+        """
+
+        Args:
+            prompt (str): the formatted prompt;
+            images (List[ImageType]): the list of images;
+            inference_mode (bool): if True, then remove the last eos token;
+            **kwargs:
+
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - images (torch.FloatTensor): [n_images, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (List[int]): the number of image tokens
+        """
+
+        prepare = self.process_one(
+            prompt=prompt,
+            images=images,
+            inference_mode=inference_mode,
+        )
+
+        return prepare
+
+    def tokenize_with_images(
+        self,
+        conversation: str,
+        images: List[Image.Image],
+        bos: bool = True,
+        eos: bool = True,
+        cropping: bool = True,
+    ):
+        """Tokenize text with <image> tags."""
+        assert conversation.count(self.image_token) == len(images)
+        text_splits = conversation.split(self.image_token)
+        images_list, images_seq_mask, images_spatial_crop = [], [], []
+        num_image_tokens = []
+        tokenized_str = []
+        for text_sep, image in zip(text_splits, images):
+            """encode text_sep"""
+            tokenized_sep = self.encode(text_sep, bos=False, eos=False)
+            tokenized_str += tokenized_sep
+            images_seq_mask += [False] * len(tokenized_sep)
+
+            """select best resolution for anyres"""
+            if cropping:
+                best_width, best_height = self.select_best_resolution(image.size)
+            else:
+                best_width, best_height = self.image_size, self.image_size
+
+            """process the global view"""
+            global_view = ImageOps.pad(image, (self.image_size, self.image_size),
+                                       color=tuple(int(x * 255) for x in self.image_transform.mean))
+
images_list.append(self.image_transform(global_view)) + + """process the local views""" + local_view = ImageOps.pad(image, (best_width, best_height), + color=tuple(int(x * 255) for x in self.image_transform.mean)) + for i in range(0, best_height, self.image_size): + for j in range(0, best_width, self.image_size): + images_list.append( + self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size)))) + + """record height / width crop num""" + num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size + images_spatial_crop.append([num_width_tiles, num_height_tiles]) + + """add image tokens""" + h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio) + # global views tokens h * (w + 1), 1 is for line separator + tokenized_image = [self.image_token_id] * h * (w + 1) + # add a separator between global and local views + tokenized_image += [self.image_token_id] + # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1) + tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1) + + tokenized_str += tokenized_image + images_seq_mask += [True] * len(tokenized_image) + num_image_tokens.append(len(tokenized_image)) + + """process the last text split""" + tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False) + tokenized_str += tokenized_sep + images_seq_mask += [False] * len(tokenized_sep) + + """add the bos and eos tokens""" + if bos: + tokenized_str = [self.bos_id] + tokenized_str + images_seq_mask = [False] + images_seq_mask + if eos: + tokenized_str = tokenized_str + [self.eos_id] + images_seq_mask = images_seq_mask + [False] + + assert len(tokenized_str) == len( + images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}" + + return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens + + +AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 294262484f2fb..1f1d67fabb243 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -67,9 +67,10 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer: tokenizer_all_special_tokens_extended = ( tokenizer.all_special_tokens_extended) tokenizer_all_special_tokens = set(tokenizer.all_special_tokens) + tokenizer_vocab = tokenizer.get_vocab() tokenizer_len = len(tokenizer) - max_token_id = max(tokenizer.get_vocab().values()) + max_token_id = max(tokenizer_vocab.values()) # Some tokenizers (e.g., QwenTokenizer) have special tokens that # are added and included in the implementation of the vocab_size # property, but not in get_vocab(); if there is an implementation @@ -96,6 +97,9 @@ def all_special_tokens_extended(self): def max_token_id(self): return max_token_id + def get_vocab(self): + return tokenizer_vocab + def __len__(self): return tokenizer_len diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py index d400276796996..09569c564a58d 100644 --- a/vllm/transformers_utils/tokenizer_group/__init__.py +++ b/vllm/transformers_utils/tokenizer_group/__init__.py @@ -24,7 +24,8 @@ def init_tokenizer_from_configs(model_config: ModelConfig, max_input_length=None, tokenizer_mode=model_config.tokenizer_mode, trust_remote_code=model_config.trust_remote_code, - 
revision=model_config.tokenizer_revision) + revision=model_config.tokenizer_revision, + truncation_side=model_config.truncation_side) return get_tokenizer_group(parallel_config.tokenizer_pool_config, **init_kwargs) diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 17d722e3d88fe..d801cf4e4c7b1 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -18,6 +18,7 @@ Tekkenizer) from vllm.logger import init_logger +from vllm.utils import is_list_of if TYPE_CHECKING: from vllm.entrypoints.chat_utils import ChatCompletionMessageParam @@ -27,7 +28,7 @@ @dataclass class Encoding: - input_ids: List[int] + input_ids: Union[List[int], List[List[int]]] def maybe_serialize_tool_calls(request: ChatCompletionRequest): @@ -223,17 +224,25 @@ def __len__(self) -> int: def __call__( self, - prompt: str, + prompt: Union[str, List[str], List[int]], add_special_tokens: bool = False, truncation: bool = False, max_length: Optional[int] = None, ): - # Mistral Tokenizers should not add special tokens - input_ids = self.encode(prompt) - - if truncation: - input_ids = input_ids[:max_length] - + input_ids: Union[List[int], List[List[int]]] + # For List[str], original prompt text + if is_list_of(prompt, str): + input_ids_: List[List[int]] = [] + for p in prompt: + each_input_ids = self.encode_one(p, truncation, max_length) + input_ids_.append(each_input_ids) + input_ids = input_ids_ + # For List[int], apply chat template output, already tokens. + elif is_list_of(prompt, int): + input_ids = prompt + # For str, single prompt text + else: + input_ids = self.encode_one(prompt, truncation, max_length) return Encoding(input_ids=input_ids) def get_vocab(self) -> Dict[str, int]: @@ -245,6 +254,19 @@ def get_added_vocab(self) -> Dict[str, int]: # Mistral tokenizers have no added vocabulary return {} + def encode_one( + self, + prompt: str, + truncation: bool = False, + max_length: Optional[int] = None, + ) -> List[int]: + # Mistral Tokenizers should not add special tokens + input_ids = self.encode(prompt) + + if truncation: + input_ids = input_ids[:max_length] + return input_ids + def encode(self, prompt: str) -> List[int]: # `encode` should only be used for prompt completion # it should never be used for chat_completion. 
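Note on the MistralTokenizer hunk above: the new __call__ now dispatches on the prompt type (a single prompt string, a batch of prompt strings, or an already-tokenized List[int] coming from the chat template). The minimal standalone sketch below is plain Python, not vLLM code; toy_encode is a hypothetical stand-in for encode_one, and only the branching mirrors the diff.

    from typing import List, Optional, Union

    def toy_encode(prompt: str, max_length: Optional[int] = None) -> List[int]:
        # Hypothetical encoder: one fake token id per whitespace-separated word.
        ids = [len(word) for word in prompt.split()]
        return ids[:max_length] if max_length is not None else ids

    def call_like_mistral(prompt: Union[str, List[str], List[int]],
                          max_length: Optional[int] = None):
        # Batch of prompt texts -> list of token-id lists.
        if isinstance(prompt, list) and prompt and isinstance(prompt[0], str):
            return [toy_encode(p, max_length) for p in prompt]
        # Already-tokenized input (e.g. chat-template output) -> passed through.
        if isinstance(prompt, list):
            return prompt
        # Single prompt text -> one token-id list.
        return toy_encode(prompt, max_length)

    assert call_like_mistral("hello world") == [5, 5]
    assert call_like_mistral(["a b", "c d e"]) == [[1, 1], [1, 1, 1]]
    assert call_like_mistral([1, 2, 3]) == [1, 2, 3]

Returning pre-tokenized input unchanged keeps the Encoding dataclass usable for both the single-prompt and batched paths without a separate code path in the callers.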
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index a9deee881f41a..7f5cc906382af 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -27,6 +27,17 @@ _GLOBAL_RUNTIME_DATA: Dict[str, Union[str, int, bool]] = {} +_USAGE_ENV_VARS_TO_COLLECT = [ + "VLLM_USE_MODELSCOPE", + "VLLM_USE_TRITON_FLASH_ATTN", + "VLLM_ATTENTION_BACKEND", + "VLLM_USE_FLASHINFER_SAMPLER", + "VLLM_PP_LAYER_PARTITION", + "VLLM_USE_TRITON_AWQ", + "VLLM_USE_V1", + "VLLM_ENABLE_V1_MULTIPROCESSING", +] + def set_runtime_usage_data(key: str, value: Union[str, int, bool]) -> None: """Set global usage data that will be sent with every usage heartbeat.""" @@ -119,9 +130,11 @@ def __init__(self) -> None: self.total_memory: Optional[int] = None self.architecture: Optional[str] = None self.platform: Optional[str] = None + self.cuda_runtime: Optional[str] = None self.gpu_count: Optional[int] = None self.gpu_type: Optional[str] = None self.gpu_memory_per_device: Optional[int] = None + self.env_var_json: Optional[str] = None # vLLM Information self.model_architecture: Optional[str] = None @@ -157,6 +170,8 @@ def _report_usage_once(self, model_architecture: str, self.gpu_count = torch.cuda.device_count() self.gpu_type = device_property.name self.gpu_memory_per_device = device_property.total_memory + if current_platform.is_cuda(): + self.cuda_runtime = torch.version.cuda self.provider = _detect_cloud_provider() self.architecture = platform.machine() self.platform = platform.platform() @@ -176,6 +191,12 @@ def _report_usage_once(self, model_architecture: str, self.vllm_version = VLLM_VERSION self.model_architecture = model_architecture + # Environment variables + self.env_var_json = json.dumps({ + env_var: getattr(envs, env_var) + for env_var in _USAGE_ENV_VARS_TO_COLLECT + }) + # Metadata self.log_time = _get_current_timestamp_ns() self.source = envs.VLLM_USAGE_SOURCE diff --git a/vllm/utils.py b/vllm/utils.py index 7477e7028f5ef..17bffd2846b46 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -36,6 +36,7 @@ overload) from uuid import uuid4 +import cloudpickle import numpy as np import numpy.typing as npt import psutil @@ -1922,36 +1923,57 @@ def kill_process_tree(pid: int): @dataclass class MemorySnapshot: """Memory snapshot.""" - torch_peak_in_bytes: int = 0 - torch_memory_in_bytes: int = 0 + torch_peak: int = 0 + cuda_memory: int = 0 + torch_memory: int = 0 + non_torch_memory: int = 0 timestamp: float = 0.0 + auto_measure: bool = True + + def __post_init__(self): + if self.auto_measure: + self.measure() def measure(self): - self.torch_peak_in_bytes = torch.cuda.max_memory_reserved() + # we measure the torch peak memory usage via allocated_bytes, + # rather than `torch.cuda.memory_reserved()` . + # After `torch.cuda.reset_peak_memory_stats()`, + # `torch.cuda.memory_reserved()` will keep growing, and only shrink + # when we call `torch.cuda.empty_cache()` or OOM happens. + self.torch_peak = torch.cuda.memory_stats().get( + "allocated_bytes.all.peak", 0) + + self.cuda_memory = torch.cuda.mem_get_info( + )[1] - torch.cuda.mem_get_info()[0] + # torch.cuda.memory_reserved() is how many bytes # PyTorch gets from cuda (by calling cudaMalloc, etc.) 
- self.torch_memory_in_bytes = torch.cuda.memory_reserved() + # this is used to measure the non-torch memory usage + self.torch_memory = torch.cuda.memory_reserved() + + self.non_torch_memory = self.cuda_memory - self.torch_memory self.timestamp = time.time() def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot": - """support a - b""" return MemorySnapshot( - torch_peak_in_bytes=self.torch_peak_in_bytes - - other.torch_peak_in_bytes, - torch_memory_in_bytes=self.torch_memory_in_bytes - - other.torch_memory_in_bytes, - timestamp=self.timestamp - other.timestamp) + torch_peak=self.torch_peak - other.torch_peak, + cuda_memory=self.cuda_memory - other.cuda_memory, + torch_memory=self.torch_memory - other.torch_memory, + non_torch_memory=self.non_torch_memory - other.non_torch_memory, + timestamp=self.timestamp - other.timestamp, + auto_measure=False, + ) @dataclass class MemoryProfilingResult: - """Memory profiling result. - """ # noqa - baseline_memory_in_bytes: int = 0 - non_kv_cache_memory_in_bytes: int = 0 - torch_peak_increase_in_bytes: int = 0 - non_torch_increase_in_bytes: int = 0 - weights_memory_in_bytes: float = 0 + """Memory profiling result. All numbers are in bytes. + """ + non_kv_cache_memory: int = 0 + torch_peak_increase: int = 0 + non_torch_increase: int = 0 + weights_memory: float = 0 + before_create: MemorySnapshot = field(default_factory=MemorySnapshot) before_profile: MemorySnapshot = field(default_factory=MemorySnapshot) after_profile: MemorySnapshot = field(default_factory=MemorySnapshot) profile_time: float = 0.0 @@ -1959,18 +1981,14 @@ class MemoryProfilingResult: @contextlib.contextmanager def memory_profiling( - baseline_memory_in_bytes: int, weights_memory_in_bytes: int -) -> Generator[MemoryProfilingResult, None, None]: + baseline_snapshot: MemorySnapshot, + weights_memory: int) -> Generator[MemoryProfilingResult, None, None]: """Memory profiling context manager. - baseline_memory_in_bytes: memory used by all the components other than - the current vLLM instance. It contains: memory used by other processes, memory - used by another vLLM instance in the same process, etc. It is usually measured - before the current vLLM instance initialize the device. And we assume it is - constant during the profiling of the current vLLM instance. - weights_memory_in_bytes: memory used by PyTorch when loading the model weights. + baseline_snapshot: the memory snapshot before the current vLLM instance. + weights_memory: memory used by PyTorch when loading the model weights. Note that, before loading the model weights, we also initialize the device and distributed environment, which may consume some memory. This part is not - included in the weights_memory_in_bytes because PyTorch does not control it. + included in the weights_memory because PyTorch does not control it. The memory in one GPU can be classified into 3 categories: 1. memory used by anything other than the current vLLM instance. @@ -2005,20 +2023,21 @@ def memory_profiling( b. 2 GiB reserved for the peak activation tensors (category 2) c. 1 GiB used by non-torch components (category 3) - The memory used for loading weights (a.) is directly given from the argument `weights_memory_in_bytes`. + The memory used for loading weights (a.) is directly given from the argument `weights_memory`. - The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.). + The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` during profiling gives (b.). - (c.) is tricky. 
We measure the total memory used in this GPU (`torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]`), - subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_reserved()`. + The increase of `non_torch_memory` from creating the current vLLM instance until after profiling to get (c.). """ # noqa + gc.collect() + torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() result = MemoryProfilingResult() - result.baseline_memory_in_bytes = baseline_memory_in_bytes + result.before_create = baseline_snapshot # the part of memory used for holding the model weights - result.weights_memory_in_bytes = weights_memory_in_bytes + result.weights_memory = weights_memory result.before_profile.measure() @@ -2029,13 +2048,12 @@ def memory_profiling( result.after_profile.measure() - diff = result.after_profile - result.before_profile - result.torch_peak_increase_in_bytes = diff.torch_peak_in_bytes - current_cuda_memory_bytes = torch.cuda.mem_get_info( - )[1] - torch.cuda.mem_get_info()[0] - result.non_torch_increase_in_bytes = current_cuda_memory_bytes - baseline_memory_in_bytes - weights_memory_in_bytes - diff.torch_memory_in_bytes # noqa - result.profile_time = diff.timestamp - result.non_kv_cache_memory_in_bytes = result.non_torch_increase_in_bytes + result.torch_peak_increase_in_bytes + result.weights_memory_in_bytes # noqa + diff_profile = result.after_profile - result.before_profile + diff_from_create = result.after_profile - result.before_create + result.torch_peak_increase = diff_profile.torch_peak + result.non_torch_increase = diff_from_create.non_torch_memory + result.profile_time = diff_profile.timestamp + result.non_kv_cache_memory = result.non_torch_increase + result.torch_peak_increase + result.weights_memory # noqa # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501 @@ -2166,3 +2184,25 @@ def bind_kv_cache( assert len(forward_ctx.kv_cache) == len(kv_cache) for ve, ve_kv_cache in enumerate(kv_cache): forward_ctx.kv_cache[ve] = ve_kv_cache[kv_cache_idx] + + +def run_method(obj: Any, method: Union[str, bytes, Callable], args: Tuple[Any], + kwargs: Dict[str, Any]) -> Any: + """ + Run a method of an object with the given arguments and keyword arguments. + If the method is string, it will be converted to a method using getattr. + If the method is serialized bytes and will be deserialized using + cloudpickle. + If the method is a callable, it will be called directly. + """ + if isinstance(method, bytes): + func = partial(cloudpickle.loads(method), obj) + elif isinstance(method, str): + try: + func = getattr(obj, method) + except AttributeError: + raise NotImplementedError(f"Method {method!r} is not" + " implemented.") from None + else: + func = partial(method, obj) # type: ignore + return func(*args, **kwargs) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 7b0786261a6a6..fd36ea8d8806b 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -130,13 +130,12 @@ def __init__( def forward( self, + layer: torch.nn.Module, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: FlashAttentionMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. 
@@ -151,7 +150,7 @@ def forward( shape = [num_tokens, num_heads * head_size] """ # NOTE(woosuk): FlashAttention does not support FP8 KV cache. - assert k_scale == 1.0 and v_scale == 1.0, ( + assert layer._k_scale == 1.0 and layer._v_scale == 1.0, ( "key/v_scale is not supported in FlashAttention.") assert output is not None, "Output tensor must be provided." @@ -183,8 +182,8 @@ def forward( value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) # Compute attention and update output up to `num_actual_tokens`. diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 845bd5ea05e3c..0cd8c806a3e47 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -1,7 +1,14 @@ -from typing import Dict, List, Set, Tuple +from typing import TYPE_CHECKING, Dict, List, Set, Tuple +from vllm.logger import init_logger +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.v1.request import Request +if TYPE_CHECKING: + from vllm.config import ModelConfig, SchedulerConfig + +logger = init_logger(__name__) + class EncoderCacheManager: @@ -46,3 +53,72 @@ def get_freed_ids(self) -> List[Tuple[str, int]]: freed = self.freed self.freed = [] return freed + + +def compute_encoder_budget( + model_config: "ModelConfig", + scheduler_config: "SchedulerConfig", +) -> Tuple[int, int]: + """Compute the encoder cache budget based on the model and scheduler + configurations. + + Args: + model_config: Model configuration. + scheduler_config: Scheduler configuration. + + Returns: + - Compute budget for encoder execution, in unit of number of tokens + in the input sequence. + - Space budget for encoder cache size, in unit of number of tokens + in the input sequence. + """ + + if not model_config.is_multimodal_model: + return 0, 0 + + # TODO: handle encoder-decoder models once we support them. + ( + encoder_compute_budget, + encoder_cache_size, + ) = _compute_encoder_budget_multimodal(model_config, scheduler_config) + + return encoder_compute_budget, encoder_cache_size + + +def _compute_encoder_budget_multimodal( + model_config: "ModelConfig", + scheduler_config: "SchedulerConfig", +) -> Tuple[int, int]: + """Compute the encoder cache budget based on the model and scheduler + configurations for a multimodal model. + + Args: + model_config: Model configuration. + scheduler_config: Scheduler configuration. + + Returns: + - Compute budget for encoder execution, in unit of number of tokens + in the input sequence. + - Space budget for encoder cache size, in unit of number of tokens + in the input sequence. + """ + + max_tokens_by_modality_dict = MULTIMODAL_REGISTRY.get_max_tokens_per_item_by_nonzero_modality( # noqa: E501 + model_config) + + if not max_tokens_by_modality_dict: + logger.warning( + "All non-text modalities supported by the model have been " + "explicitly disabled via limit_mm_per_prompt. 
Encoder cache will " + "not be initialized.") + return 0, 0 + + _, max_tokens_per_mm_item = max(max_tokens_by_modality_dict.items(), + key=lambda item: item[1]) + + encoder_compute_budget = max(scheduler_config.max_num_encoder_input_tokens, + max_tokens_per_mm_item) + encoder_cache_size = max(scheduler_config.encoder_cache_size, + max_tokens_per_mm_item) + + return encoder_compute_budget, encoder_cache_size diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index bac77443c8560..18fdfdfe4a010 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -285,6 +285,56 @@ def free(self, request: Request) -> None: if block.ref_cnt == 0: self.free_block_queue.append(block) + def uncache_blocks(self, request: Request) -> int: + """Uncache the blocks that are no longer full based on the + num_computed_tokens in the given request. This happens when + the blocks were full and cached due to speculative tokens, but the + speculative tokens are not accepted. + + Args: + request: The request. + + Returns: + The number of uncached blocks. + """ + blocks = self.req_to_blocks[request.request_id] + num_computed_tokens = request.num_computed_tokens + num_full_blocks = num_computed_tokens // self.block_size + num_uncached_blocks = 0 + for block in blocks[num_full_blocks:]: + # If the block is not cached, the following blocks are not cached. + if not self._maybe_evict_cached_block(block): + break + num_uncached_blocks += 1 + return num_uncached_blocks + + def reset_prefix_cache(self) -> bool: + """Reset prefix cache. This function may be used in RLHF + flows to invalid prefix caching after the weights are updated, + or used for resetting prefix caching status for benchmarking. + + Returns: + bool: True if the prefix cache is successfully reset, + False otherwise. + """ + num_used_blocks = (self.num_gpu_blocks - + self.free_block_queue.num_free_blocks) + if num_used_blocks > 0: + logger.warning( + "Failed to reset prefix cache because some " + "blocks (%d) are not freed yet", num_used_blocks) + return False + + # Remove all hashes so that no new blocks will hit. + self.cached_block_hash_to_block = defaultdict(dict) + + # Remove all hashes from all blocks. + for block in self.block_pool: + block.reset_hash() + + logger.info("Successfully reset prefix cache") + return True + def get_num_common_prefix_blocks( self, request: Request, @@ -359,7 +409,7 @@ def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]: # If the block is cached, evict it. if self.enable_caching: - self._evict_cached_block(curr_block) + self._maybe_evict_cached_block(curr_block) curr_block.incr_ref() ret.append(curr_block) @@ -367,13 +417,16 @@ def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]: return ret - def _evict_cached_block(self, block: KVCacheBlock) -> None: + def _maybe_evict_cached_block(self, block: KVCacheBlock) -> bool: """ If a block is cached in `cached_block_hash_to_block`, we reset its hash metadata and evict it from the cache. Args: block: The block to evict. + + Returns: + True if the block is evicted, False otherwise. 
""" block_hash = block.block_hash if block_hash and block_hash in self.cached_block_hash_to_block: @@ -383,6 +436,9 @@ def _evict_cached_block(self, block: KVCacheBlock) -> None: if len(self.cached_block_hash_to_block[block_hash]) == 0: del self.cached_block_hash_to_block[block_hash] + return True + return False + def _get_cached_block(self, block_hash: BlockHashType) -> Optional[KVCacheBlock]: """Get a cached block by the block hash, or None if cache miss. diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 22a5d2fb08a48..bab99fe37caee 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -3,7 +3,10 @@ from dataclasses import dataclass from typing import Any, List, NamedTuple, Optional, Tuple +from vllm.config import VllmConfig from vllm.logger import init_logger +from vllm.v1.kv_cache_interface import (KVCacheConfig, KVCacheSpec, + KVCacheTensor) from vllm.v1.request import Request logger = init_logger(__name__) @@ -305,3 +308,124 @@ def hash_request_tokens(block_size: int, ret.append(block_hash) parent_block_hash_value = block_hash.hash_value return ret + + +def check_enough_kv_cache_memory(vllm_config: VllmConfig, + kv_cache_spec: KVCacheSpec, + available_memory: int): + """ + Checks whether `available_memory` is enough for the KV cache to hold at + least one request with the model's max_model_len. + + Args: + vllm_config: The global VllmConfig + kv_cache_spec: The kv cache spec of the model + available_memory: Memory available for KV cache in bytes. + + Raises: + ValueError: If there is not enough memory available for the KV cache. + """ + + if available_memory <= 0: + raise ValueError("No available memory for the cache blocks. " + "Try increasing `gpu_memory_utilization` when " + "initializing the engine.") + + max_model_len = vllm_config.model_config.max_model_len + needed_memory = 0 + for layer_spec in kv_cache_spec.values(): + needed_memory += layer_spec.bytes_for_tokens(max_model_len) + + if needed_memory > available_memory: + raise ValueError( + f"To serve at least one request with the models's max seq len " + f"({max_model_len}), ({needed_memory/1024/1024/1024:.2f} GB KV " + f"cache is needed, which is larger than the available KV cache " + f"memory ({available_memory/1024/1024/1024:.2f} GB). Try " + f"increasing `gpu_memory_utilization` or decreasing " + f"`max_model_len` when initializing the engine.") + + +def is_kv_cache_type_uniform(kv_cache_spec: KVCacheSpec) -> bool: + """ + Whether all layers in the given KVCacheSpec have the same type of KV cache. + + Args: + kv_cache_spec: The KVCacheSpec of the model + + Returns: + True if all layers have the same type, False otherwise. + """ + + layer_keys = set(layer.type_id for layer in kv_cache_spec.values()) + return len(layer_keys) == 1 + + +def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, + kv_cache_spec: KVCacheSpec, + available_memory: int) -> KVCacheConfig: + """ + Generates the KV cache configuration for a model with one type of KV cache. + Divide the available memory equally among all layers. + + Args: + vllm_config: The global VllmConfig + kv_cache_spec: The kv cache spec of the model + available_memory: Memory available for KV cache in bytes. 
+ + Returns: + The generated KVCacheConfig + """ + + page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()} + assert len(page_sizes) == 1 + page_size = page_sizes.pop() + + num_blocks = int(available_memory // page_size // len(kv_cache_spec)) + num_blocks = max(num_blocks, 0) + + if vllm_config.cache_config.num_gpu_blocks_override is not None: + num_gpu_blocks_override = \ + vllm_config.cache_config.num_gpu_blocks_override + logger.info( + "Overriding num_gpu_blocks=%d with " + "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override) + num_blocks = num_gpu_blocks_override + + logger.info("# GPU blocks: %d", num_blocks) + + per_layer_size = page_size * num_blocks + + kv_cache_config = KVCacheConfig( + num_blocks=num_blocks, + tensors={ + layer_name: KVCacheTensor(size=per_layer_size) + for layer_name in kv_cache_spec + }, + groups=[[layer_name for layer_name in kv_cache_spec]], + kv_cache_spec=kv_cache_spec) + return kv_cache_config + + +def get_kv_cache_config(vllm_config: VllmConfig, kv_cache_spec: KVCacheSpec, + available_memory: int) -> KVCacheConfig: + """ + Generates the KV cache configuration for a model + TODO: support hybrid models with more than one type of KV cache. + + Args: + vllm_config: The global VllmConfig + kv_cache_spec: The kv cache spec of the model + available_memory: Memory available for KV cache in bytes. + + Returns: + The generated KVCacheConfig + """ + check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory) + if is_kv_cache_type_uniform(kv_cache_spec): + # KV cache of all layers are the same, which is true for most models. + # Allocate the same amount of memory for each layer. + return _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec, + available_memory) + else: + raise NotImplementedError diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 45e67c94f8f15..8ded5e5787133 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -3,10 +3,11 @@ from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set, Tuple, Union) -from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig +from vllm.config import CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig from vllm.logger import init_logger from vllm.sampling_params import SamplingParams -from vllm.v1.core.encoder_cache_manager import EncoderCacheManager +from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager, + compute_encoder_budget) from vllm.v1.core.kv_cache_manager import KVCacheManager from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs from vllm.v1.metrics.stats import SchedulerStats @@ -25,6 +26,7 @@ class Scheduler: def __init__( self, scheduler_config: SchedulerConfig, + model_config: ModelConfig, cache_config: CacheConfig, lora_config: Optional[LoRAConfig], ) -> None: @@ -69,16 +71,24 @@ def __init__( self.running_reqs_data: Dict[str, RunningRequestData] = {} # Encoder-related. + # Calculate encoder cache size if applicable + # NOTE: For now we use the same budget for both compute and space. + # This can be changed when we make encoder cache for embedding caching + # across requests. + encoder_compute_budget, encoder_cache_size = compute_encoder_budget( + model_config=model_config, + scheduler_config=scheduler_config, + ) + # NOTE(woosuk): Here, "encoder" includes the vision encoder (and # projector if needed). Currently, we assume that the encoder also # has the Transformer architecture (e.g., ViT). 
- self.max_num_encoder_input_tokens = self.scheduler_config.max_num_encoder_input_tokens #noqa: E501 - # NOTE(woosuk): For the models without encoder (e.g., text-only models), - # the encoder cache will not be initialized and used, regardless of - # the cache size. This is because the memory space for the encoder cache - # is preallocated in the profiling run. + self.max_num_encoder_input_tokens = encoder_compute_budget + # NOTE: For the models without encoder (e.g., text-only models), + # the encoder cache will not be initialized because cache size is 0 + # for these models. self.encoder_cache_manager = EncoderCacheManager( - cache_size=self.scheduler_config.encoder_cache_size) + cache_size=encoder_cache_size) def schedule(self) -> "SchedulerOutput": # NOTE(woosuk) on the scheduling algorithm: @@ -519,6 +529,9 @@ def get_num_unfinished_requests(self) -> int: def has_unfinished_requests(self) -> bool: return self.get_num_unfinished_requests() > 0 + def reset_prefix_cache(self) -> bool: + return self.kv_cache_manager.reset_prefix_cache() + def make_stats(self) -> SchedulerStats: return SchedulerStats( num_running_reqs=len(self.running), diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 6d90c38c72cf5..abe4952c4baff 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -66,6 +66,11 @@ class EngineCoreProfile: is_start: bool +@dataclass +class EngineCoreResetPrefixCache: + pass + + class EngineCoreRequestType(enum.Enum): """ Request types defined as hex byte strings, so it can be sent over sockets @@ -74,6 +79,8 @@ class EngineCoreRequestType(enum.Enum): ADD = b'\x00' ABORT = b'\x01' PROFILE = b'\x02' + RESET_PREFIX_CACHE = b'\x03' -EngineCoreRequestUnion = Union[EngineCoreRequest, EngineCoreProfile, List[str]] +EngineCoreRequestUnion = Union[EngineCoreRequest, EngineCoreProfile, + EngineCoreResetPrefixCache, List[str]] diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a74699f7513e6..1505b62504a2f 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -2,9 +2,12 @@ import os from typing import AsyncGenerator, List, Mapping, Optional, Type, Union +import numpy as np + from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient +from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger @@ -16,7 +19,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import kill_process_tree +from vllm.utils import cdiv, kill_process_tree from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor @@ -205,17 +208,15 @@ async def generate( # The output_handler task pushes items into the queue. # This task pulls from the queue and yields to caller. - while True: + finished = False + while not finished: # Note: drain queue without await if possible (avoids # task switching under load which helps performance). - out = q.get_nowait() if q.qsize() > 0 else await q.get() + out = q.get_nowait() if not q.empty() else await q.get() # Note: both OutputProcessor and EngineCore handle their # own request cleanup based on finished. 
- if out.finished: - yield out - break - + finished = out.finished yield out # If the request is disconnected by the client, the @@ -233,22 +234,41 @@ async def _run_output_handler(self): # 1) Pull EngineCoreOutputs from the EngineCore. outputs = await self.engine_core.get_output_async() - # 2) Process EngineCoreOutputs. - processed_outputs = self.output_processor.process_outputs( - outputs.outputs) - # NOTE: RequestOutputs are pushed to their queues. - assert len(processed_outputs.request_outputs) == 0 - - # 3) Abort any reqs that finished due to stop strings. - await self.engine_core.abort_requests_async( - processed_outputs.reqs_to_abort) + # Split outputs into chunks of at most + # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the + # event loop for too long. + num_outputs = len(outputs.outputs) + if num_outputs <= VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: + slices = (outputs.outputs, ) + else: + slices = np.array_split( + outputs.outputs, + cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE)) + + iteration_stats = None + for i, outputs_slice in enumerate(slices): + # 2) Process EngineCoreOutputs. + processed_outputs = self.output_processor.process_outputs( + outputs_slice, iteration_stats) + # NOTE: RequestOutputs are pushed to their queues. + assert not processed_outputs.request_outputs + iteration_stats = processed_outputs.iteration_stats + + # Allow other asyncio tasks to run between chunks + if i + 1 < len(slices): + await asyncio.sleep(0) + + # 3) Abort any reqs that finished due to stop strings. + await self.engine_core.abort_requests_async( + processed_outputs.reqs_to_abort) # 4) Logging. # TODO(rob): make into a coroutine and launch it in # background thread once we add Prometheus. + assert iteration_stats is not None self._log_stats( scheduler_stats=outputs.scheduler_stats, - iteration_stats=processed_outputs.iteration_stats, + iteration_stats=iteration_stats, ) except Exception as e: @@ -321,6 +341,9 @@ async def start_profile(self) -> None: async def stop_profile(self) -> None: await self.engine_core.profile_async(False) + async def reset_prefix_cache(self) -> None: + await self.engine_core.reset_prefix_cache_async() + @property def is_running(self) -> bool: return True diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e7f90d3c62142..cf94033a38d96 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -11,15 +11,16 @@ import zmq.asyncio from msgspec import msgpack -from vllm.config import CacheConfig, VllmConfig +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.utils import get_exception_traceback, zmq_socket_ctx +from vllm.v1.core.kv_cache_utils import get_kv_cache_config from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, - EngineCoreRequestUnion) + EngineCoreRequestUnion, EngineCoreResetPrefixCache) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus @@ -49,34 +50,41 @@ def __init__( # Setup KV Caches and update CacheConfig after profiling. num_gpu_blocks, num_cpu_blocks = self._initialize_kv_caches( - vllm_config.cache_config) + vllm_config) vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks # Setup scheduler. 
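The chunked handler above caps each processing burst at VLLM_V1_OUTPUT_PROC_CHUNK_SIZE outputs and awaits between slices so a large batch cannot monopolize the event loop. A simplified, self-contained sketch of the same pattern, using plain list slicing instead of np.array_split; CHUNK_SIZE here is a stand-in value, not the real default:

import asyncio
import math

CHUNK_SIZE = 128  # stand-in for VLLM_V1_OUTPUT_PROC_CHUNK_SIZE

async def process_in_chunks(outputs, handle_slice):
    # Handle at most CHUNK_SIZE items, then yield control back to the loop.
    num_chunks = max(math.ceil(len(outputs) / CHUNK_SIZE), 1)
    for i in range(num_chunks):
        handle_slice(outputs[i * CHUNK_SIZE:(i + 1) * CHUNK_SIZE])
        if i + 1 < num_chunks:
            await asyncio.sleep(0)  # let other coroutines run between chunks

asyncio.run(process_in_chunks(list(range(1000)), lambda chunk: None))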
-        self.scheduler = Scheduler(vllm_config.scheduler_config,
-                                   vllm_config.cache_config,
-                                   vllm_config.lora_config)
+        self.scheduler = Scheduler(
+            scheduler_config=vllm_config.scheduler_config,
+            model_config=vllm_config.model_config,
+            cache_config=vllm_config.cache_config,
+            lora_config=vllm_config.lora_config,
+        )
         self.mm_input_mapper_server = MMInputMapperServer(
             vllm_config.model_config)

     def _initialize_kv_caches(self,
-                              cache_config: CacheConfig) -> Tuple[int, int]:
+                              vllm_config: VllmConfig) -> Tuple[int, int]:
         start = time.time()
-        num_gpu_blocks, _ = self.model_executor.determine_num_available_blocks(
-        )
-        if cache_config.num_gpu_blocks_override is not None:
-            num_gpu_blocks_override = cache_config.num_gpu_blocks_override
-            logger.info(
-                "Overriding num_gpu_blocks=%d with "
-                "num_gpu_blocks_override=%d", num_gpu_blocks,
-                num_gpu_blocks_override)
-            num_gpu_blocks = num_gpu_blocks_override
+        # Get all kv cache needed by the model
+        kv_cache_spec = self.model_executor.get_kv_cache_spec()
+
+        # Profiles the peak memory usage of the model to determine how much
+        # memory can be allocated for kv cache.
+        available_gpu_memory = self.model_executor.determine_available_memory()
+        # Get the kv cache tensor size
+        kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
+                                              available_gpu_memory)
+        num_gpu_blocks = kv_cache_config.num_blocks
         num_cpu_blocks = 0
-        self.model_executor.initialize(num_gpu_blocks)
+
+        # Initialize kv cache and warmup the execution
+        self.model_executor.initialize(kv_cache_config)
+
         elapsed = time.time() - start
         logger.info(("init engine (profile, create kv cache, "
                      "warmup model) took %.2f seconds"), elapsed)
@@ -127,6 +135,9 @@ def shutdown(self):
     def profile(self, is_start: bool = True):
         self.model_executor.profile(is_start)
+    def reset_prefix_cache(self):
+        self.scheduler.reset_prefix_cache()
+
 class EngineCoreProc(EngineCore):
     """ZMQ-wrapper for running EngineCore in background process."""
@@ -239,6 +250,8 @@ def _handle_client_request(self, request: EngineCoreRequestUnion) -> None:
             self.add_request(request)
         elif isinstance(request, EngineCoreProfile):
             self.model_executor.profile(request.is_start)
+        elif isinstance(request, EngineCoreResetPrefixCache):
+            self.reset_prefix_cache()
         else:
             # TODO: make an EngineCoreAbort wrapper
             assert isinstance(request, list)
@@ -263,7 +276,9 @@ def process_input_socket(self, input_path: str):
                     request = decoder_add_req.decode(request_data)
                 elif request_type == EngineCoreRequestType.ABORT.value:
                     request = decoder_abort_req.decode(request_data)
-                elif request_type == EngineCoreRequestType.PROFILE.value:
+                elif request_type in (
+                        EngineCoreRequestType.PROFILE.value,
+                        EngineCoreRequestType.RESET_PREFIX_CACHE.value):
                     request = pickle.loads(request_data)
                 else:
                     raise ValueError(f"Unknown RequestType: {request_type}")
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index ac0f0f14bf1ab..f3b992d6873e7 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -1,8 +1,9 @@
+import asyncio
 import os
 import signal
 import weakref
 from abc import ABC, abstractmethod
-from typing import List, Type
+from typing import List, Optional, Type
 import msgspec
 import zmq
@@ -14,7 +15,7 @@
                         make_zmq_socket)
 from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile,
                             EngineCoreRequest, EngineCoreRequestType,
-                            EngineCoreRequestUnion)
+                            EngineCoreRequestUnion, EngineCoreResetPrefixCache)
 from vllm.v1.engine.core import EngineCore, EngineCoreProc
 from vllm.v1.executor.abstract import Executor
from vllm.v1.serial_utils import PickleEncoder @@ -69,6 +70,9 @@ def add_request(self, request: EngineCoreRequest) -> None: def profile(self, is_start: bool = True) -> None: raise NotImplementedError + def reset_prefix_cache(self) -> None: + raise NotImplementedError + def abort_requests(self, request_ids: List[str]) -> None: raise NotImplementedError @@ -81,6 +85,9 @@ async def add_request_async(self, request: EngineCoreRequest) -> None: async def profile_async(self, is_start: bool = True) -> None: raise NotImplementedError + async def reset_prefix_cache_async(self) -> None: + raise NotImplementedError + async def abort_requests_async(self, request_ids: List[str]) -> None: raise NotImplementedError @@ -108,12 +115,15 @@ def abort_requests(self, request_ids: List[str]) -> None: if len(request_ids) > 0: self.engine_core.abort_requests(request_ids) - def shutdown(self): + def shutdown(self) -> None: self.engine_core.shutdown() def profile(self, is_start: bool = True) -> None: self.engine_core.profile(is_start) + def reset_prefix_cache(self) -> None: + self.engine_core.reset_prefix_cache() + class MPClient(EngineCoreClient): """ @@ -229,6 +239,10 @@ def profile(self, is_start: bool = True) -> None: self._send_input(EngineCoreRequestType.PROFILE, EngineCoreProfile(is_start)) + def reset_prefix_cache(self) -> None: + self._send_input(EngineCoreRequestType.RESET_PREFIX_CACHE, + EngineCoreResetPrefixCache()) + class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" @@ -242,10 +256,24 @@ def __init__(self, vllm_config: VllmConfig, log_stats=True, ) + self.outputs_queue: Optional[asyncio.Queue[bytes]] = None + self.queue_task: Optional[asyncio.Task] = None + async def get_output_async(self) -> EngineCoreOutputs: + if self.outputs_queue is None: + # Perform IO in separate task to parallelize as much as possible + self.outputs_queue = asyncio.Queue() + + async def process_outputs_socket(): + assert self.outputs_queue is not None + while True: + (frame, ) = await self.output_socket.recv_multipart( + copy=False) + self.outputs_queue.put_nowait(frame.buffer) - frames = await self.output_socket.recv_multipart(copy=False) - return self.decoder.decode(frames[0].buffer) + self.queue_task = asyncio.create_task(process_outputs_socket()) + + return self.decoder.decode(await self.outputs_queue.get()) async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: @@ -266,3 +294,7 @@ async def abort_requests_async(self, request_ids: List[str]) -> None: async def profile_async(self, is_start: bool = True) -> None: await self._send_input(EngineCoreRequestType.PROFILE, EngineCoreProfile(is_start)) + + async def reset_prefix_cache_async(self) -> None: + await self._send_input(EngineCoreRequestType.RESET_PREFIX_CACHE, + EngineCoreResetPrefixCache()) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index f5999ccda6447..55d314ebeb955 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -162,6 +162,9 @@ def start_profile(self): def stop_profile(self): self.engine_core.profile(False) + def reset_prefix_cache(self): + self.engine_core.reset_prefix_cache() + def get_tokenizer_group( self, group_type: Type[_G] = BaseTokenizerGroup, diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 749f4f5043c97..564eab51bd3a8 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -101,6 +101,7 @@ def add_request( def process_outputs( 
self, engine_core_outputs: List[EngineCoreOutput], + iteration_stats: Optional[IterationStats] = None, ) -> OutputProcessorOutput: """ Process the EngineCoreOutputs: @@ -133,7 +134,8 @@ def process_outputs( request_outputs: List[RequestOutput] = [] reqs_to_abort: List[str] = [] - iteration_stats = IterationStats(self.log_stats) + if not iteration_stats: + iteration_stats = IterationStats(self.log_stats) for engine_core_output in engine_core_outputs: req_id = engine_core_output.request_id req_state = self.request_states.get(req_id) @@ -175,8 +177,8 @@ def process_outputs( iteration_stats=iteration_stats, ) + @staticmethod def _make_request_output( - self, request_state: RequestState, detokenizer_output: Optional[DetokenizerOutput], ) -> Optional[RequestOutput]: diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 7c17f60510ae1..131be759842c7 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -1,58 +1,92 @@ -from abc import ABC, abstractmethod -from typing import Tuple, Type +from typing import Type from vllm.config import VllmConfig +from vllm.executor.executor_base import ExecutorBase +from vllm.executor.ray_distributed_executor import ( # noqa + RayDistributedExecutor as RayDistributedExecutorV0) +from vllm.executor.uniproc_executor import ( # noqa + ExecutorWithExternalLauncher as ExecutorWithExternalLauncherV0) +from vllm.executor.uniproc_executor import ( # noqa + UniProcExecutor as UniProcExecutorV0) +from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput -class Executor(ABC): - """Abstract class for executors.""" +class Executor(ExecutorBase): + """ + Abstract class for v1 executors, mainly define some methods for v1. + For methods shared by v0 and v1, define them in ExecutorBase""" @staticmethod def get_class(vllm_config: VllmConfig) -> Type["Executor"]: executor_class: Type[Executor] + parallel_config = vllm_config.parallel_config distributed_executor_backend = ( - vllm_config.parallel_config.distributed_executor_backend) + parallel_config.distributed_executor_backend) + if distributed_executor_backend is None: + # If the user does not specify the distributed executor backend, + # we will choose the backend based on the world size. + if parallel_config.world_size > 1: + distributed_executor_backend = "mp" + else: + distributed_executor_backend = "uni" + if distributed_executor_backend == "ray": - from vllm.executor.ray_distributed_executor import ( # noqa - RayDistributedExecutor) executor_class = RayDistributedExecutor elif distributed_executor_backend == "mp": from vllm.v1.executor.multiproc_executor import MultiprocExecutor executor_class = MultiprocExecutor + elif distributed_executor_backend == "uni": + executor_class = UniProcExecutor + elif distributed_executor_backend == "external_launcher": + # TODO: make v1 scheduling deterministic + # to support external launcher + executor_class = ExecutorWithExternalLauncher else: - assert (distributed_executor_backend is None) - from vllm.v1.executor.uniproc_executor import UniprocExecutor - executor_class = UniprocExecutor + raise ValueError("Unknown distributed executor backend: " + f"{distributed_executor_backend}") return executor_class - @abstractmethod - def __init__(self, vllm_config: VllmConfig) -> None: - raise NotImplementedError + def initialize(self, kv_cache_config: KVCacheConfig) -> None: + """ + Initialize the KV caches and begin the model execution loop of the + underlying workers. 
+ """ + self.collective_rpc("initialize_cache", args=(kv_cache_config, )) + self.collective_rpc("compile_or_warm_up_model") - @abstractmethod - def initialize(self, num_gpu_blocks: int) -> None: - raise NotImplementedError + def determine_available_memory(self) -> int: # in bytes + output = self.collective_rpc("determine_available_memory") + # Since we use a shared centralized controller, we take the minimum + # memory size across all workers to make sure all the memory + # operators can be applied to all workers. + return min(output) - @abstractmethod - def determine_num_available_blocks(self) -> Tuple[int, int]: - raise NotImplementedError + def get_kv_cache_spec(self) -> KVCacheSpec: + output = self.collective_rpc("get_kv_cache_spec") + for x in output: + assert x == output[0] + return output[0] - @abstractmethod def execute_model( self, scheduler_output, ) -> ModelRunnerOutput: - raise NotImplementedError + output = self.collective_rpc("execute_model", + args=(scheduler_output, )) + return output[0] - @abstractmethod def profile(self, is_start: bool = True): - raise NotImplementedError + self.collective_rpc("profile", args=(is_start, )) + + +class UniProcExecutor(UniProcExecutorV0, Executor): + pass + + +class ExecutorWithExternalLauncher(ExecutorWithExternalLauncherV0, Executor): + pass - @abstractmethod - def shutdown(self): - pass - @abstractmethod - def check_health(self) -> None: - raise NotImplementedError +class RayDistributedExecutor(RayDistributedExecutorV0, Executor): + pass diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index cee0fcc0bad68..f6cf35da0106b 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -6,9 +6,11 @@ import weakref from dataclasses import dataclass from enum import Enum, auto +from functools import partial from multiprocessing.process import BaseProcess -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import cloudpickle import psutil import zmq @@ -23,7 +25,6 @@ from vllm.utils import (get_distributed_init_method, get_mp_context, get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx) from vllm.v1.executor.abstract import Executor -from vllm.v1.outputs import ModelRunnerOutput from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -34,7 +35,7 @@ class MultiprocExecutor(Executor): - def __init__(self, vllm_config: VllmConfig) -> None: + def _init_executor(self) -> None: # Call self.shutdown at exit to clean up # and ensure workers will be terminated. 
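Executor.get_class above now infers a default backend from the world size when none is configured. A condensed sketch of just that selection rule, with string names standing in for the actual executor classes:

from typing import Optional

def pick_backend(distributed_executor_backend: Optional[str],
                 world_size: int) -> str:
    # No explicit choice: multiprocessing for multi-GPU, single process
    # otherwise.
    if distributed_executor_backend is None:
        return "mp" if world_size > 1 else "uni"
    if distributed_executor_backend in ("ray", "mp", "uni",
                                        "external_launcher"):
        return distributed_executor_backend
    raise ValueError("Unknown distributed executor backend: "
                     f"{distributed_executor_backend}")

assert pick_backend(None, 4) == "mp"
assert pick_backend(None, 1) == "uni"
assert pick_backend("ray", 8) == "ray"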
self._finalizer = weakref.finalize(self, self.shutdown) @@ -52,9 +53,6 @@ def sigusr1_handler(signum, frame): signal.signal(signal.SIGUSR1, sigusr1_handler) - self.vllm_config = vllm_config - self.parallel_config = vllm_config.parallel_config - self.world_size = self.parallel_config.world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size assert self.world_size == tensor_parallel_size, ( @@ -79,7 +77,8 @@ def sigusr1_handler(signum, frame): # Create workers self.workers: List[WorkerProcHandle] = [] for rank in range(self.world_size): - worker = WorkerProc.make_worker_process(vllm_config, rank, rank, + worker = WorkerProc.make_worker_process(self.vllm_config, rank, + rank, distributed_init_method, scheduler_output_handle) self.workers.append(worker) @@ -90,53 +89,24 @@ def sigusr1_handler(signum, frame): for w in self.workers: w.worker_response_mq.wait_until_ready() - def initialize(self, num_gpu_blocks: int) -> None: - """ - Initialize the KV caches and begin the model execution loop of the - underlying workers. - """ - logger.info("# GPU blocks: %d", num_gpu_blocks) - self.collective_rpc("initialize_cache", args=(num_gpu_blocks, )) - self.collective_rpc("compile_or_warm_up_model") - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """ - Determine the number of available KV blocks by invoking the - underlying worker. - """ - num_blocks = self.collective_rpc("determine_num_available_blocks") - - # Since we use a shared centralized controller, we take the minimum - # number of blocks across all workers to make sure all the memory - # operators can be applied to all workers. - num_gpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) - - return num_gpu_blocks, num_cpu_blocks - def collective_rpc(self, - method: str, + method: Union[str, Callable], timeout: Optional[float] = None, args: Tuple = (), kwargs: Optional[Dict] = None) -> List[Any]: - """ - Execute an RPC call on workers. - - Args: - method: Name of the worker method to execute - timeout: Maximum time in seconds to wait for execution. Rases a - TimeoutError on timeout. None means wait indefinitely. - args: Positional arguments to pass to the worker method - kwargs: Keyword arguments to pass to the worker method - - Returns: - List of results from each worker - """ start_time = time.monotonic() kwargs = kwargs or {} + # NOTE: If the args are heterogeneous, then we pack them into a list, + # and unpack them in the method of every worker, because every worker + # knows their own rank. try: - self.rpc_broadcast_mq.enqueue((method, args, kwargs)) + if isinstance(method, str): + send_method = method + else: + send_method = cloudpickle.dumps( + method, protocol=pickle.HIGHEST_PROTOCOL) + self.rpc_broadcast_mq.enqueue((send_method, args, kwargs)) responses = [None] * self.world_size for w in self.workers: @@ -160,18 +130,6 @@ def collective_rpc(self, # Re-raise any other exceptions raise e - def execute_model( - self, - scheduler_output, - ) -> ModelRunnerOutput: - model_output = self.collective_rpc("execute_model", - args=(scheduler_output, ))[0] - return model_output - - def profile(self, is_start: bool = True): - self.collective_rpc("profile", args=(is_start, )) - return - def _ensure_worker_termination(self): """Ensure that all worker processes are terminated. Assumes workers have received termination requests. 
Waits for processing, then sends @@ -246,7 +204,7 @@ def __init__( ready_path: str, ): self.rank = rank - wrapper = WorkerWrapperBase(vllm_config=vllm_config, rank=rank) + wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) # TODO: move `init_worker` to executor level as a collective rpc call all_kwargs: List[Dict] = [ {} for _ in range(vllm_config.parallel_config.world_size) @@ -403,7 +361,11 @@ def worker_busy_loop(self): method, args, kwargs = self.rpc_broadcast_mq.dequeue() try: - output = getattr(self.worker, method)(*args, **kwargs) + if isinstance(method, str): + func = getattr(self.worker, method) + elif isinstance(method, bytes): + func = partial(cloudpickle.loads(method), self.worker) + output = func(*args, **kwargs) except Exception as e: self.worker_response_mq.enqueue( (WorkerProc.ResponseStatus.FAILURE, e)) diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py deleted file mode 100644 index 79acc60001c99..0000000000000 --- a/vllm/v1/executor/ray_executor.py +++ /dev/null @@ -1,342 +0,0 @@ -import os -from collections import defaultdict -from itertools import islice, repeat -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple - -import vllm.envs as envs -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.v1.executor.abstract import Executor -from vllm.v1.executor.ray_utils import (RayWorkerWrapper, - initialize_ray_cluster, ray) -from vllm.v1.outputs import ModelRunnerOutput - -if ray is not None: - from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - -if TYPE_CHECKING: - from ray.util.placement_group import PlacementGroup - -logger = init_logger(__name__) - - -class RayExecutor(Executor): - - def __init__(self, vllm_config: VllmConfig) -> None: - self.vllm_config = vllm_config - self.parallel_config = vllm_config.parallel_config - self.model_config = vllm_config.model_config - self.forward_dag: Optional[ray.dag.CompiledDAG] = None - - # Disable Ray usage stats collection. - ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") - if ray_usage != "1": - os.environ["RAY_USAGE_STATS_ENABLED"] = "0" - - initialize_ray_cluster(self.parallel_config) - placement_group = self.parallel_config.placement_group - - # Create the parallel GPU workers. - self._init_workers_ray(placement_group) - - def _init_workers_ray(self, placement_group: "PlacementGroup", - **ray_remote_kwargs): - # A list of workers to run a model. - self.workers: List[RayWorkerWrapper] = [] - if self.parallel_config.ray_workers_use_nsight: - ray_remote_kwargs = self._configure_ray_workers_use_nsight( - ray_remote_kwargs) - - # Create the workers. - driver_ip = get_ip() - for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get("GPU", 0): - # Skip bundles that don't have GPUs, - # as each worker needs one GPU. 
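Returning to the multiproc executor hunk above: collective_rpc now accepts either a method name or an arbitrary callable, serializing callables with cloudpickle so that worker_busy_loop can rebind them to its local worker. A self-contained sketch of that round trip; the Worker class below is a hypothetical stand-in, not vLLM's worker:

import pickle
from functools import partial

import cloudpickle

class Worker:
    rank = 0

    def ping(self) -> str:
        return f"pong from rank {self.rank}"

def encode(method):
    # Method names pass through; callables are pickled by value.
    if isinstance(method, str):
        return method
    return cloudpickle.dumps(method, protocol=pickle.HIGHEST_PROTOCOL)

def dispatch(worker: Worker, method, *args, **kwargs):
    # Mirror of worker_busy_loop: bind the deserialized callable to the
    # local worker object before invoking it.
    if isinstance(method, str):
        func = getattr(worker, method)
    else:
        func = partial(cloudpickle.loads(method), worker)
    return func(*args, **kwargs)

w = Worker()
print(dispatch(w, encode("ping")))                      # pong from rank 0
print(dispatch(w, encode(lambda self: self.rank + 1)))  # 1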
- continue - scheduling_strategy = PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_capture_child_tasks=True, - placement_group_bundle_index=bundle_id, - ) - - worker = ray.remote( - num_cpus=0, - num_gpus=1, - scheduling_strategy=scheduling_strategy, - **ray_remote_kwargs, - )(RayWorkerWrapper).remote(vllm_config=self.vllm_config) - self.workers.append(worker) - - logger.debug("workers: %s", self.workers) - worker_ips = [ - ray.get(worker.get_node_ip.remote()) # type: ignore[attr-defined] - for worker in self.workers - ] - ip_counts: Dict[str, int] = {} - for ip in worker_ips: - ip_counts[ip] = ip_counts.get(ip, 0) + 1 - - worker_to_ip = dict(zip(self.workers, worker_ips)) - - def sort_by_driver_then_worker_ip(worker): - """ - Sort the workers based on 3 properties: - 1. If the worker is on the same node as the driver (vllm engine), - it should be placed first. - 2. Then, if the worker is on a node with fewer workers, it should - be placed first. - 3. Finally, if the work is on a node with smaller IP address, it - should be placed first. This is simply a tiebreaker to make - sure the workers are sorted in a deterministic way. - """ - ip = worker_to_ip[worker] - return (ip != driver_ip, ip_counts[ip], ip) - - # After sorting, the workers on the same node will be - # close to each other, and the workers on the driver - # node will be placed first. - self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip) - - # Get the set of GPU IDs used on each node. - worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids") - - node_workers = defaultdict(list) # node id -> list of worker ranks - node_gpus = defaultdict(list) # node id -> list of gpu ids - - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): - node_workers[node_id].append(i) - # `gpu_ids` can be a list of strings or integers. - # convert them to integers for consistency. - # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), - # string sorting is not sufficient. - # see https://github.com/vllm-project/vllm/issues/5590 - gpu_ids = [int(x) for x in gpu_ids] - node_gpus[node_id].extend(gpu_ids) - - for node_id, gpu_ids in node_gpus.items(): - node_gpus[node_id] = sorted(gpu_ids) - - all_ips = set(worker_ips) - n_ips = len(all_ips) - n_nodes = len(node_workers) - - if n_nodes != n_ips: - raise RuntimeError( - f"Every node should have a unique IP address. Got {n_nodes}" - f" nodes with node ids {list(node_workers.keys())} and " - f"{n_ips} unique IP addresses {all_ips}. Please check your" - " network configuration. If you set `VLLM_HOST_IP` or " - "`HOST_IP` environment variable, make sure it is unique for" - " each node.") - - # Set environment variables for the driver and workers. - all_args_to_update_environment_variables = [({ - "CUDA_VISIBLE_DEVICES": - ",".join(map(str, node_gpus[node_id])), - "VLLM_TRACE_FUNCTION": - str(envs.VLLM_TRACE_FUNCTION), - "VLLM_USE_V1": - str(int(envs.VLLM_USE_V1)), - **({ - "VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND - } if envs.VLLM_ATTENTION_BACKEND is not None else {}) - }, ) for (node_id, _) in worker_node_and_gpu_ids] - - self._env_vars_for_all_workers = ( - all_args_to_update_environment_variables) - - self._run_workers("update_environment_variables", - all_args=self._get_env_vars_to_be_updated()) - - if len(node_gpus) == 1: - # in single node case, we don't need to get the IP address. - # the loopback address is sufficient - # NOTE: a node may have several IP addresses, one for each - # network interface. 
`get_ip()` might return any of them, - # while they might not work for communication inside the node - # if the network setup is complicated. Using the loopback address - # solves this issue, as it always works for communication inside - # the node. - driver_ip = "127.0.0.1" - distributed_init_method = get_distributed_init_method( - driver_ip, get_open_port()) - - # Initialize the actual workers inside worker wrapper. - init_worker_all_kwargs = [ - self._get_worker_kwargs( - local_rank=node_workers[node_id].index(rank), - rank=rank, - distributed_init_method=distributed_init_method, - ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) - ] - self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) - self._run_workers("initialize") - self._run_workers("load_model") - - def _configure_ray_workers_use_nsight(self, - ray_remote_kwargs) -> Dict[str, Any]: - # If nsight profiling is enabled, we need to set the profiling - # configuration for the ray workers as runtime env. - runtime_env = ray_remote_kwargs.setdefault("runtime_env", {}) - runtime_env.update({ - "nsight": { - "t": "cuda,cudnn,cublas", - "o": "'worker_process_%p'", - "cuda-graph-trace": "node", - } - }) - - return ray_remote_kwargs - - def _get_env_vars_to_be_updated(self): - return self._env_vars_for_all_workers - - def _get_worker_kwargs( - self, - local_rank: int = 0, - rank: int = 0, - distributed_init_method: Optional[str] = None) -> Dict[str, Any]: - """ - Return worker init args for a given rank. - """ - if distributed_init_method is None: - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - return dict( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - ) - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """ - Determine the number of available KV blocks. - - This invokes `determine_num_available_blocks` on each worker and takes - the min of the results, guaranteeing that the selected cache sizes are - compatible with all workers. - - Returns: - - tuple[num_gpu_blocks, num_cpu_blocks] - """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_blocks = self._run_workers("determine_num_available_blocks") - - # Since we use a shared centralized controller, we take the minimum - # number of blocks across all workers to make sure all the memory - # operators can be applied to all workers. - num_gpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) - - return num_gpu_blocks, num_cpu_blocks - - def initialize(self, num_gpu_blocks: int) -> None: - """ - Initialize the KV cache in all workers. - """ - # NOTE: This is logged in the executor because there can be >1 worker - # with other executors. We could log in the engine level, but work - # remains to abstract away the device for non-GPU configurations. - logger.info("# GPU blocks: %d", num_gpu_blocks) - self._run_workers("initialize_cache", num_gpu_blocks) - self._run_workers("compile_or_warm_up_model") - - def _run_workers( - self, - method: str, - *args, - all_args: Optional[List[Tuple[Any, ...]]] = None, - all_kwargs: Optional[List[Dict[str, Any]]] = None, - **kwargs, - ) -> Any: - """ - Runs the given method on all workers. 
Can be used in the following - ways: - - Args: - - args/kwargs: All workers share the same args/kwargs - - all_args/all_kwargs: args/kwargs for each worker are specified - individually - """ - count = len(self.workers) - all_worker_args = repeat(args, count) if all_args is None \ - else islice(all_args, 0, None) - all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ - else islice(all_kwargs, 0, None) - - ray_worker_refs = [ - worker.execute_method.remote( # type: ignore[attr-defined] - method, *worker_args, **worker_kwargs) - for (worker, worker_args, worker_kwargs - ) in zip(self.workers, all_worker_args, all_worker_kwargs) - ] - return ray.get(ray_worker_refs) - - def execute_model( - self, - scheduler_output, - ) -> ModelRunnerOutput: - if self.forward_dag is None: - self.forward_dag = self._compiled_ray_dag() - # Only the first worker (with rank 0) returns the execution result. - # Others return None. - output = ray.get(self.forward_dag.execute(scheduler_output))[0] - return output - - def profile(self, is_start=True): - raise NotImplementedError - - def shutdown(self): - if hasattr(self, "forward_dag") and self.forward_dag is not None: - self.forward_dag.teardown() - import ray - for worker in self.workers: - ray.kill(worker) - self.forward_dag = None - - def check_health(self) -> None: - logger.debug("Called check_health.") - - def _check_ray_compiled_graph_installation(self): - import pkg_resources - from packaging import version - - required_version = version.parse("2.39") - current_version = version.parse( - pkg_resources.get_distribution("ray").version) - if current_version < required_version: - raise ValueError(f"Ray version {required_version} is " - f"required, but found {current_version}") - - import importlib.util - raycg = importlib.util.find_spec("ray.experimental.compiled_dag_ref") - if raycg is None: - raise ValueError("Ray Compiled Graph is not installed. " - "Run `pip install ray[adag]` to install it.") - - cupy_spec = importlib.util.find_spec("cupy") - if cupy_spec is None and envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: - raise ValueError( - "cupy is not installed but required since " - "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL is set." 
- "Run `pip install ray[adag]` and check cupy installation.") - - def _compiled_ray_dag(self): - assert self.parallel_config.use_ray - self._check_ray_compiled_graph_installation() - from ray.dag import InputNode, MultiOutputNode - - with InputNode() as input_batches: - outputs = [ - worker.execute_model.bind( # type: ignore[attr-defined] - input_batches) for worker in self.workers - ] - forward_dag = MultiOutputNode(outputs) - - return forward_dag.experimental_compile() - - def __del__(self): - self.shutdown() diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py deleted file mode 100644 index fc9715b7a5909..0000000000000 --- a/vllm/v1/executor/ray_utils.py +++ /dev/null @@ -1,280 +0,0 @@ -import time -from collections import defaultdict -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple - -from vllm.config import ParallelConfig -from vllm.logger import init_logger -from vllm.platforms import current_platform -from vllm.utils import get_ip -from vllm.v1.outputs import ModelRunnerOutput -from vllm.worker.worker_base import WorkerWrapperBase - -if TYPE_CHECKING: - from vllm.v1.core.scheduler import SchedulerOutput - -logger = init_logger(__name__) -PG_WAIT_TIMEOUT = 60 - -try: - import ray - from ray.util import placement_group_table - from ray.util.placement_group import PlacementGroup - try: - from ray._private.state import available_resources_per_node - except ImportError: - # Ray 2.9.x doesn't expose `available_resources_per_node` - from ray._private.state import state as _state - available_resources_per_node = _state._available_resources_per_node - - class RayWorkerWrapper(WorkerWrapperBase): - - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - # Since the compiled DAG runs a main execution - # in a different thread that calls cuda.set_device. - # The flag indicates is set_device is called on - # that thread. It will be removed soon. - self.compiled_dag_cuda_device_set = False - - def get_node_ip(self) -> str: - return get_ip() - - def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: - node_id = ray.get_runtime_context().get_node_id() - device_key = current_platform.ray_device_key - if not device_key: - raise RuntimeError("current platform %s does not support ray.", - current_platform.device_name) - gpu_ids = ray.get_runtime_context().get_accelerator_ids( - )[device_key] - return node_id, gpu_ids - - def setup_device_if_necessary(self): - # TODO(swang): This is needed right now because Ray CG executes - # on a background thread, so we need to reset torch's current - # device. - # We can remove this API after it is fixed in compiled graph. - import torch - assert self.worker is not None, "Worker is not initialized" - if not self.compiled_dag_cuda_device_set: - torch.cuda.set_device(self.worker.device) - self.compiled_dag_cuda_device_set = True - - def execute_model( - self, - scheduler_output: "SchedulerOutput", - ) -> ModelRunnerOutput: - self.setup_device_if_necessary() - assert self.worker is not None, "Worker is not initialized" - output = self.worker.model_runner.execute_model(scheduler_output) - return output - - ray_import_err = None - -except ImportError as e: - ray = None # type: ignore - ray_import_err = e - RayWorkerWrapper = None # type: ignore - - -def ray_is_available() -> bool: - """Returns True if Ray is available.""" - return ray is not None - - -def assert_ray_available(): - """ - Raise an exception if Ray is not available. 
- """ - if ray is None: - raise ValueError("Failed to import Ray, please install Ray with " - "`pip install ray`.") from ray_import_err - - -def _verify_bundles(placement_group: "PlacementGroup", - parallel_config: ParallelConfig, device_str: str): - """ - Verify a given placement group has bundles located in the right place. - - There are 2 rules. - - Warn if all tensor parallel workers cannot fit in a single node. - - Fail if driver node is not included in a placement group. - - Args: - placement_group: The placement group to verify. - parallel_config: The parallel configuration. - device_str: The required device. - """ - assert ray.is_initialized(), ( - "Ray is not initialized although distributed-executor-backend is ray.") - pg_data = placement_group_table(placement_group) - # bundle_idx -> node_id - bundle_to_node_ids = pg_data["bundles_to_node_id"] - # bundle_idx -> bundle (e.g., {"GPU": 1}) - bundles = pg_data["bundles"] - # node_id -> List of bundle (e.g., {"GPU": 1}) - node_id_to_bundle: Dict[str, List[Dict[str, float]]] = defaultdict(list) - - for bundle_idx, node_id in bundle_to_node_ids.items(): - node_id_to_bundle[node_id].append(bundles[bundle_idx]) - driver_node_id = ray.get_runtime_context().get_node_id() - - if driver_node_id not in node_id_to_bundle: - raise RuntimeError( - f"driver node id {driver_node_id} is not included in a placement " - f"group {placement_group.id}. Node id -> bundles " - f"{node_id_to_bundle}. " - "You don't have enough GPUs available in a current node. Check " - "`ray status` to see if you have available GPUs in a node " - f"{driver_node_id} before starting an vLLM engine.") - - for node_id, bundles in node_id_to_bundle.items(): - if len(bundles) < parallel_config.tensor_parallel_size: - logger.warning( - "tensor_parallel_size=%d " - "is bigger than a reserved number of %ss (%d " - "%ss) in a node %s. Tensor parallel workers can be " - "spread out to 2+ nodes which can degrade the performance " - "unless you have fast interconnect across nodes, like " - "Infiniband. To resolve this issue, make sure you have more " - "than %d GPUs available at each node.", - parallel_config.tensor_parallel_size, device_str, len(bundles), - device_str, node_id, parallel_config.tensor_parallel_size) - - -def _wait_until_pg_ready(current_placement_group: "PlacementGroup"): - """Wait until a placement group is ready. - - It prints the informative log messages if the placement group is - not created within time. - - """ - # Wait until PG is ready - this will block until all - # requested resources are available, and will timeout - # if they cannot be provisioned. - placement_group_specs = current_placement_group.bundle_specs - - s = time.time() - pg_ready_ref = current_placement_group.ready() - wait_interval = 10 - while time.time() - s < PG_WAIT_TIMEOUT: - ready, _ = ray.wait([pg_ready_ref], timeout=wait_interval) - if len(ready) > 0: - break - - # Exponential backoff for warning print. - wait_interval *= 2 - logger.info( - "Waiting for creating a placement group of specs for " - "%d seconds. specs=%s. Check " - "`ray status` to see if you have enough resources.", - int(time.time() - s), placement_group_specs) - - try: - ray.get(pg_ready_ref, timeout=0) - except ray.exceptions.GetTimeoutError: - raise ValueError( - "Cannot provide a placement group of " - f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See " - "`ray status` to make sure the cluster has enough resources." 
- ) from None - - -def initialize_ray_cluster( - parallel_config: ParallelConfig, - ray_address: Optional[str] = None, -): - """Initialize the distributed cluster with Ray. - - it will connect to the Ray cluster and create a placement group - for the workers, which includes the specification of the resources - for each distributed worker. - - Args: - parallel_config: The configurations for parallel execution. - ray_address: The address of the Ray cluster. If None, uses - the default Ray cluster address. - """ - assert_ray_available() - - # Connect to a ray cluster. - if current_platform.is_rocm() or current_platform.is_xpu(): - # Try to connect existing ray instance and create a new one if not found - try: - ray.init("auto") - except ConnectionError: - logger.warning( - "No existing RAY instance detected. " - "A new instance will be launched with current node resources.") - ray.init(address=ray_address, - ignore_reinit_error=True, - num_gpus=parallel_config.world_size) - else: - ray.init(address=ray_address, ignore_reinit_error=True) - - if parallel_config.placement_group: - # Placement group is already set. - return - - device_str = current_platform.ray_device_key - if not device_str: - raise ValueError( - f"current platform {current_platform.device_name} does not " - "support ray.") - # Create placement group for worker processes - current_placement_group = ray.util.get_current_placement_group() - if current_placement_group: - # We are in a placement group - bundles = current_placement_group.bundle_specs - # Verify that we can use the placement group. - device_bundles = 0 - for bundle in bundles: - bundle_devices = bundle.get(device_str, 0) - if bundle_devices > 1: - raise ValueError( - "Placement group bundle cannot have more than 1 " - f"{device_str}.") - if bundle_devices: - device_bundles += 1 - if parallel_config.world_size > device_bundles: - raise ValueError( - f"The number of required {device_str}s exceeds the total " - f"number of available {device_str}s in the placement group." - f"Required number of devices: {parallel_config.world_size}. " - f"Total number of devices: {device_bundles}.") - else: - num_devices_in_cluster = ray.cluster_resources().get(device_str, 0) - if parallel_config.world_size > num_devices_in_cluster: - raise ValueError( - f"The number of required {device_str}s exceeds the total " - f"number of available {device_str}s in the placement group.") - # Create a new placement group - placement_group_specs: List[Dict[str, float]] = ([{ - device_str: 1.0 - } for _ in range(parallel_config.world_size)]) - - # vLLM engine is also a worker to execute model with an accelerator, - # so it requires to have the device in a current node. Check if - # the current node has at least one device. - current_ip = get_ip() - current_node_id = ray.get_runtime_context().get_node_id() - current_node_resource = available_resources_per_node()[current_node_id] - if current_node_resource.get(device_str, 0) < 1: - raise ValueError( - f"Current node has no {device_str} available. " - f"{current_node_resource=}. vLLM engine cannot start without " - f"{device_str}. Make sure you have at least 1 {device_str} " - f"available in a node {current_node_id=} {current_ip=}.") - # This way, at least bundle is required to be created in a current - # node. - placement_group_specs[0][f"node:{current_ip}"] = 0.001 - - # By default, Ray packs resources as much as possible. 
- current_placement_group = ray.util.placement_group( - placement_group_specs, strategy="PACK") - _wait_until_pg_ready(current_placement_group) - - assert current_placement_group is not None - _verify_bundles(current_placement_group, parallel_config, device_str) - # Set the placement group in the parallel config - parallel_config.placement_group = current_placement_group diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py deleted file mode 100644 index c63d7a4c47c15..0000000000000 --- a/vllm/v1/executor/uniproc_executor.py +++ /dev/null @@ -1,85 +0,0 @@ -import os -from typing import Optional, Tuple - -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.v1.executor.abstract import Executor -from vllm.v1.outputs import ModelRunnerOutput -from vllm.v1.worker.gpu_worker import Worker - -logger = init_logger(__name__) - - -class UniprocExecutor(Executor): - - def __init__(self, vllm_config: VllmConfig) -> None: - self.vllm_config = vllm_config - self.model_config = vllm_config.model_config - self.cache_config = vllm_config.cache_config - self.lora_config = vllm_config.lora_config - self.load_config = vllm_config.load_config - self.parallel_config = vllm_config.parallel_config - self.scheduler_config = vllm_config.scheduler_config - self.device_config = vllm_config.device_config - self.speculative_config = vllm_config.speculative_config - self.prompt_adapter_config = vllm_config.prompt_adapter_config - self.observability_config = vllm_config.observability_config - - self.worker: Worker = self._create_worker() - self.worker.init_device() - self.worker.load_model() - - def _create_worker( - self, - local_rank: int = 0, - rank: int = 0, - distributed_init_method: Optional[str] = None) -> Worker: - """Return worker init args for a given rank.""" - # see https://github.com/NVIDIA/nccl/issues/1234 - os.environ['NCCL_CUMEM_ENABLE'] = '0' - - if distributed_init_method is None: - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - return Worker( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - ) - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of available KV blocks by invoking the - underlying worker. - """ - return self.worker.determine_num_available_blocks() - - def initialize(self, num_gpu_blocks: int) -> None: - """Initialize the KV cache by invoking the underlying worker. - """ - # NOTE: This is logged in the executor because there can be >1 worker - # with other executors. We could log in the engine level, but work - # remains to abstract away the device for non-GPU configurations. - logger.info("# GPU blocks: %d", num_gpu_blocks) - self.worker.initialize_cache(num_gpu_blocks) - self.worker.compile_or_warm_up_model() - - def execute_model( - self, - scheduler_output, - ) -> ModelRunnerOutput: - output = self.worker.execute_model(scheduler_output) - assert output is not None - return output - - def profile(self, is_start: bool = True): - self.worker.profile(is_start) - - def shutdown(self): - pass - - def check_health(self) -> None: - # UniprocExecutor will always be healthy as long as - # it's running. 
- return diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py new file mode 100644 index 0000000000000..6d5cc32ffc5b8 --- /dev/null +++ b/vllm/v1/kv_cache_interface.py @@ -0,0 +1,111 @@ +from dataclasses import dataclass +from typing import Dict, List + +import torch + +from vllm.logger import init_logger +from vllm.utils import cdiv, get_dtype_size + +logger = init_logger(__name__) + + +@dataclass +class KVCacheSpecBase: + """ + A base class for specifying the KV cache format of one layer. + """ + + # number of tokens in a block + block_size: int + + @property + def type_id(self) -> str: + """ + The type identifier of this KV cache. + Return different strings for layers with different KV cache type (e.g., + different number of tokens like full attention vs sliding window + attention, different KV cache size per token like layers with different + number of heads) + + Returns: + The type identifier of this KV cache. + """ + raise NotImplementedError + + @property + def page_size_bytes(self) -> int: + """ + The size of a page with `block_size` tokens in bytes. + + Returns: + The page size + """ + raise NotImplementedError + + def bytes_for_tokens(self, num_tokens: int) -> int: + """ + The KV cache size for `num_tokens` tokens in bytes. Returns the real + memory size after padding `num_tokens` to full blocks. + + Returns: + The KV cache size + """ + raise NotImplementedError + + +@dataclass +class FullAttentionSpec(KVCacheSpecBase): + num_kv_heads: int + head_size: int + dtype: torch.dtype + + @property + def type_id(self) -> str: + return f"full_attention_{self.block_size}_{self.page_size_bytes}" + + @property + def page_size_bytes(self) -> int: + return 2 * self.block_size * self.num_kv_heads * self.head_size \ + * get_dtype_size(self.dtype) + + def bytes_for_tokens(self, num_tokens: int) -> int: + return cdiv(num_tokens, self.block_size) * self.page_size_bytes + + +KVCacheSpec = Dict[str, KVCacheSpecBase] + + +@dataclass +class KVCacheTensor: + """ + A dataclass for specifying how the workers should initialize the KV cache + for a layer. Only contains the size of KV cache for that layer for now. Will + be extended to support multiple layers sharing the same memory pool. + """ + size: int # The size of KV cache Tensor in bytes + + +@dataclass +class KVCacheConfig: + """ + The KV cache configuration of a model. + """ + """The number of KV cache blocks""" + num_blocks: int + """layer_name -> how to initialize KV cache for that layer""" + tensors: Dict[str, KVCacheTensor] + """ + A list of kv-cache groups. Each group includes a set of layers with + the same kv-cache spec, and the total page_size of layers inside a group + is same across all groups (as the KVCacheManager only supports allocating + pages of the same size). For example: + 1. A model only uses full attention: one group with all layers in the model. + 2. (not implemented yet) A model with the same number of full attention + layers and sliding window attention layers: two groups, one for full + attention layers and one for sliding window attention layers. + 3. (not implemented yet) A model with 2 full attention layers and 4 sliding + window attention layers: three groups, (full * 2), (sw * 2), (sw * 2). + """ + groups: List[List[str]] + """the KVCacheSpec of the model""" + kv_cache_spec: KVCacheSpec diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 45450165eaefe..eefcdaf29e753 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -64,6 +64,12 @@ def __init__( # recomputing. 
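As a quick illustration of the spec introduced in kv_cache_interface.py above (assuming this patch is applied so the module is importable), with a made-up layer shape:

import torch
from vllm.v1.kv_cache_interface import FullAttentionSpec

spec = FullAttentionSpec(block_size=16, num_kv_heads=8, head_size=128,
                         dtype=torch.bfloat16)
print(spec.type_id)                # full_attention_16_65536
print(spec.page_size_bytes)        # 2 * 16 * 8 * 128 * 2 = 65536 bytes
print(spec.bytes_for_tokens(100))  # ceil(100 / 16) = 7 blocks -> 458752 bytes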
self._kv_block_hashes: List[BlockHashType] = [] + # Read-only views + # Prevent directly appending to the these lists since + # they should also be updated simultaneously. + self.output_token_ids = ConstantList(self._output_token_ids) + self.all_token_ids = ConstantList(self._all_token_ids) + @classmethod def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": return cls( @@ -79,18 +85,6 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": lora_request=request.lora_request, ) - @property - def output_token_ids(self) -> ConstantList[int]: - # Prevent directly appending to the output_token_ids since - # all_token_ids should also be updated simultaneously. - return ConstantList(self._output_token_ids) - - @property - def all_token_ids(self) -> ConstantList[int]: - # Prevent directly appending to the all_token_ids since - # output_token_ids should also be updated simultaneously - return ConstantList(self._all_token_ids) - def append_output_token_ids( self, token_ids: Union[int, List[int]], diff --git a/vllm/v1/stats/__init__.py b/vllm/v1/stats/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/stats/common.py b/vllm/v1/stats/common.py new file mode 100644 index 0000000000000..500bc356fc179 --- /dev/null +++ b/vllm/v1/stats/common.py @@ -0,0 +1,451 @@ +import time +from dataclasses import dataclass +from dataclasses import field as dataclass_field +from enum import IntEnum +from typing import ClassVar, Dict, List, Optional, Set + +import msgspec +from msgspec import field as msgspec_field + +from vllm.sampling_params import SamplingParams + + +class RequestStatsUpdate( + msgspec.Struct, # type: ignore + array_like=True, + omit_defaults=True, + gc=False): + """ + An update to the request stats. + + This represents a stats update at a specific timestamp with metadata + associated with the update. + + NOTE: since there might be multiple processes generating updates at + different parts of the engine (e.g. input processor, scheduler, engine core, + etc.), we use the monotonic timestamp to record the update to compute any + intervals, and explicit wall-clock timestamp should be used for timestamps. + + WARNING: This assumes stats are generated in a single machine. If there are + potentially multiple machines, one should always generate the stats updates + on one single machine or use something else. + """ + + class Type(IntEnum): + """See `RequestStats` for the lifecycle of a request.""" + + # Request arrived at the engine frontend. + ARRIVED = 0 + # Input processed by the input processor. + INPUT_PROCESSED = 1 + # Queued on the engine core. + QUEUED = 2 + # Scheduled running prefill by the scheduler. + # A request could be running a new prefill on the prompt tokens or + # a resumed prefill on the original prefill tokens + generated output + # tokens before preemption. + PREFILLING = 3 + # Preempted by the scheduler. + PREEMPTED = 4 + # Output token is generated by the engine core. + DECODING = 5 + # Token detokenized by the detokenizer. + # We will record the timestamp for each output token, as well as the + # finish reason. + DETOKENIZED = 6 + # Request finishes (or aborts). 
+ FINISHED = 7 + + """ + Valid state updates: + ARRIVED + │ + ├──────► INPUT_PROCESSED ──────► QUEUED ──────► PREFILLING ◄────┐ + │ │ │ │ │ + │ │ │ ▼ │ + │ │ │ -──► DECODING │ + │ │ │ | │ │ + │ │ │ | ▼ │ + │ │ │ └─ DETOKENIZED │ + │ │ │ │ │ + │ │ │ ▼ │ + │ ▼ ▼ PREEMPTED ◄──────┘ + │ │ │ │ + └──────────────┴───────────────────┴──────────────┴ + │ + ▼ + FINISHED (All could go to FINISHED) + """ + _VALID_TRANSITIONS: ClassVar[Dict[Type, Set[Type]]] = { + Type.ARRIVED: { + Type.INPUT_PROCESSED, + Type.FINISHED, + }, + Type.INPUT_PROCESSED: { + Type.QUEUED, + Type.FINISHED, + }, + Type.QUEUED: { + Type.PREFILLING, + Type.FINISHED, + }, + Type.PREFILLING: { + Type.DECODING, + Type.PREEMPTED, + Type.FINISHED, + }, + Type.DECODING: { + Type.DETOKENIZED, + Type.FINISHED, + }, + Type.DETOKENIZED: { + Type.DECODING, + Type.PREEMPTED, + Type.FINISHED, + }, + Type.PREEMPTED: {Type.PREFILLING, Type.FINISHED}, + Type.FINISHED: set(), + } + + request_id: str + + type: Type + + # Timestamp when the update is recorded. This is used to record time + # intervals between events rather than wall clock time. + monotonic_ts_s: float = msgspec_field( + default_factory=lambda: time.monotonic()) + + ############################################################ + # Metadata associated with the update. + ############################################################ + # For input_processed. Metadata needed for stats logging. + num_prompt_tokens: Optional[int] = None + sampling_params: Optional[SamplingParams] = None + + # For running. + # Number of tokens computed when scheduled to run. + num_computed_tokens: Optional[int] = None + # Number of cached tokens when scheduled to run. + num_cached_tokens: Optional[int] = None + + # For decoded. + # The number of new output tokens generated. + num_new_tokens: Optional[int] = None + + # For both detokenized and decoded. + # Finished reason. + finish_reason: Optional[str] = None + + # Non-optional fields for each update type. 
+ _REQUIRED_FIELDS: ClassVar[Dict[Type, List[str]]] = { + Type.INPUT_PROCESSED: ["num_prompt_tokens", "sampling_params"], + Type.PREFILLING: ["num_computed_tokens", "num_cached_tokens"], + Type.DETOKENIZED: ["num_new_tokens"], + Type.FINISHED: ["finish_reason"], + } + + def __post_init__(self): + required_fields = self._REQUIRED_FIELDS.get(self.type, []) + for field in required_fields: + if getattr(self, field) is None: + raise ValueError( + f"Field {field} is required for update type {self.type}.") + + @staticmethod + def check_valid_update( + update: "RequestStatsUpdate", + last_update_type: Optional[Type], + last_updated_ts_s: Optional[float], + ): + if last_update_type is None: + assert update.type == RequestStatsUpdate.Type.ARRIVED + else: + valid_cur_update_types = RequestStatsUpdate._VALID_TRANSITIONS[ + last_update_type] + assert update.type in valid_cur_update_types, ( + f"Invalid update type: {update.type} for last_update_type: " + f"{last_update_type}.") + + if last_updated_ts_s is not None: + assert update.monotonic_ts_s >= last_updated_ts_s, ( + "Update timestamp must be monotonically increasing, but " + f"last_updated_ts_s={last_updated_ts_s} and " + f"update.monotonic_ts_s={update.monotonic_ts_s}.") + + +@dataclass +class RequestStats: + """Stats associated with a request (`Request`).""" + + ############################################################ + # Metadata + ############################################################ + request_id: str + sampling_params: Optional[SamplingParams] = None + num_prompt_tokens: Optional[int] = None + + ############################################################ + # Metrics and Stats + ############################################################ + # Timestamp when the request was last updated. + last_updated_ts_s: Optional[float] = None + + # Last update stats type. + last_update_type: Optional[RequestStatsUpdate.Type] = None + + # Timestamp when the request arrived at the llm engine. + arrival_ts_s: Optional[float] = None + + # Number of tokens cached. When part of the request prefix is cached, + # this will be set. + num_cached_tokens: int = 0 + + # Number of tokens computed. + num_computed_tokens: int = 0 + + # The timestamp when the request become waiting in the queue. + queued_ts_s: Optional[float] = None + + # When the input processor is completed. + input_processor_end_ts_s: Optional[float] = None + + # A sorted list of timestamps when the request was scheduled to prefill. + # This could be when: + # 1. the request is newly scheduled, so it's a new prefill. + # 2. the request was preempted and resumed. It is equivalent to running + # a prefill of the original prefill tokens + generated output tokens + # before preemption. + prefill_start_ts_s_lst: List[float] = dataclass_field(default_factory=list) + + # A list of timestamps when a token is decoded by the engine core. + decoding_ts_s_lst: List[float] = dataclass_field(default_factory=list) + + # A sorted list of timestamps for each output token. + output_token_ts_s_lst: List[float] = dataclass_field(default_factory=list) + + # First token's timestamp. + first_token_ts_s: Optional[float] = None + + # TODO(rickyx): we need model runner to surface these. + model_forward_duration_s: float = 0.0 + # Includes model forward, block/sync across workers, cpu-gpu sync time + # and sampling time. + model_execute_duration_s: float = 0.0 + + # A sorted list of timestamps when the request was preempted at the + # scheduler. 
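For intuition, the happy path through the lifecycle diagram above can be checked directly against the transition table; this pokes at the private _VALID_TRANSITIONS mapping purely for illustration and assumes the patch is applied:

from vllm.v1.stats.common import RequestStatsUpdate

T = RequestStatsUpdate.Type
happy_path = [T.ARRIVED, T.INPUT_PROCESSED, T.QUEUED, T.PREFILLING,
              T.DECODING, T.DETOKENIZED, T.FINISHED]
prev = None
for cur in happy_path:
    if prev is not None:
        # Same rule that check_valid_update enforces above.
        assert cur in RequestStatsUpdate._VALID_TRANSITIONS[prev], (prev, cur)
    prev = cur
print("lifecycle is valid")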
+ # TODO(rickyx): right now, we don't actually have a good high-level + # metric to measure the impact of preemption other than observation of + # large P99 TPOT. Ideally we could quantify the impact of preemption by + # measuring the number of tokens re-computed due to preemption. + preempted_ts_s_lst: List[float] = dataclass_field(default_factory=list) + + # Timestamp when the request was finished at the engine core. + finished_ts_s: Optional[float] = None + + # Finish reason. + finish_reason: Optional[str] = None + + ############################################################ + # Derived properties. + ############################################################ + @property + def prefill_ts_s(self) -> Optional[float]: + """The timestamp when the request started prefilling. + Since a request could be preempted in decoding and later resumed + to prefill the decoded tokens, we use the first prefill start timestamp. + """ + return (self.prefill_start_ts_s_lst[0] + if self.prefill_start_ts_s_lst else None) + + @property + def e2e_latency_s(self) -> Optional[float]: + if self.finished_ts_s is None or self.arrival_ts_s is None: + return None + assert self.finished_ts_s >= self.arrival_ts_s + return self.finished_ts_s - self.arrival_ts_s + + @property + def queue_duration_s(self) -> Optional[float]: + """How long the request was waiting to run.""" + if self.queued_ts_s is None or self.prefill_ts_s is None: + # Either not queued or not running yet. + return None + assert self.queued_ts_s <= self.prefill_ts_s + return self.prefill_ts_s - self.queued_ts_s + + @property + def inference_latency_s(self) -> Optional[float]: + """How long the request was running inference + (prefill and decode).""" + if self.finished_ts_s is None or self.prefill_ts_s is None: + return None + assert self.finished_ts_s >= self.prefill_ts_s + return self.finished_ts_s - self.prefill_ts_s + + @property + def first_token_latency_s(self) -> Optional[float]: + if self.first_token_ts_s is None or self.arrival_ts_s is None: + return None + assert self.first_token_ts_s >= self.arrival_ts_s + return self.first_token_ts_s - self.arrival_ts_s + + @property + def prefill_latency_s(self) -> Optional[float]: + if self.first_token_ts_s is None or self.prefill_ts_s is None: + return None + assert self.first_token_ts_s >= self.prefill_ts_s + return self.first_token_ts_s - self.prefill_ts_s + + @property + def decode_latency_s(self) -> Optional[float]: + if self.e2e_latency_s is None or self.first_token_latency_s is None: + return None + assert self.e2e_latency_s >= self.first_token_latency_s + return self.e2e_latency_s - self.first_token_latency_s + + @property + def output_token_latency_s_lst(self) -> List[float]: + if len(self.output_token_ts_s_lst) == 0: + return [] + latency_s_lst = [] + for i in range(1, len(self.output_token_ts_s_lst)): + assert (self.output_token_ts_s_lst[i] >= + self.output_token_ts_s_lst[i - 1]) + latency_s = (self.output_token_ts_s_lst[i] - + self.output_token_ts_s_lst[i - 1]) + latency_s_lst.append(latency_s) + return latency_s_lst + + @property + def num_output_tokens(self) -> int: + return len(self.output_token_ts_s_lst) + + @property + def is_finished(self) -> bool: + return self.finished_ts_s is not None + + def update_from(self, update: "RequestStatsUpdate"): + RequestStatsUpdate.check_valid_update(update, self.last_update_type, + self.last_updated_ts_s) + ts = update.monotonic_ts_s + self.last_updated_ts_s = ts + self.last_update_type = update.type + if update.type == RequestStatsUpdate.Type.ARRIVED: + 
self.arrival_ts_s = ts + elif update.type == RequestStatsUpdate.Type.INPUT_PROCESSED: + self.input_processor_end_ts_s = ts + self.sampling_params = update.sampling_params + self.num_prompt_tokens = update.num_prompt_tokens + elif update.type == RequestStatsUpdate.Type.QUEUED: + self.queued_ts_s = ts + elif update.type == RequestStatsUpdate.Type.PREFILLING: + self.prefill_start_ts_s_lst.append(ts) + self.num_cached_tokens = update.num_cached_tokens or 0 + self.num_computed_tokens = update.num_computed_tokens or 0 + elif update.type == RequestStatsUpdate.Type.PREEMPTED: + self._reset_for_preemption(ts) + elif update.type == RequestStatsUpdate.Type.DECODING: + self.decoding_ts_s_lst.append(ts) + elif update.type == RequestStatsUpdate.Type.DETOKENIZED: + self._record_detokenized_output( + ts, + update.num_new_tokens or 0, + ) + elif update.type == RequestStatsUpdate.Type.FINISHED: + self.finished_ts_s = ts + self.finish_reason = update.finish_reason + else: + raise ValueError(f"Unknown update type: {update.type}") + + def _record_detokenized_output( + self, + ts_s: float, + num_new_tokens: int, + ): + # Update if first output token is generated. + if len(self.output_token_ts_s_lst) == 0: + self.first_token_ts_s = ts_s + assert ( + self.prefill_ts_s is not None + ), "Request must be running before generating output tokens." + + # Some X new tokens were generated at the ts. + self.output_token_ts_s_lst.extend([ts_s] * num_new_tokens) + + def _reset_for_preemption(self, ts_s: float): + self.preempted_ts_s_lst.append(ts_s) + # Reset the computed tokens since it might restart the prefill. + self.num_computed_tokens = 0 + # Cached token count might also change when resumed. + self.num_cached_tokens = 0 + # These stats don't change since they happen before request running. + # - arrival_ts_s + # - input_processor_end_ts_s + # - sampling_params + # - num_prompt_tokens + # - first_token_ts_s + # + # These stats are accumulated over preemptions: + # - output_token_ts_s_lst + # - prefill_start_ts_s_lst (after preemption, it will prefill the + # original prefill tokens and any output tokens generated before + # preemption.) + + +@dataclass +class KVCacheStats: + # KV Cache Usage in % + gpu_cache_usage_sys: float = 0.0 + gpu_prefix_cache_hit_rate: float = 0.0 + + +@dataclass +class SchedulerStats: + """Stats associated with the scheduler.""" + + # Number of requests currently running. + num_running_reqs: int = 0 + # Number of requests currently waiting. + num_waiting_reqs: int = 0 + + kv_cache_stats: KVCacheStats = dataclass_field( + default_factory=KVCacheStats) + + +@dataclass +class EngineCoreProcessStats: + """Stats associated with the engine core process.""" + + # Number of requests currently in the input queue. None if the engine core + # is not running in multiprocess mode. + input_queue_size: Optional[int] = None + # Number of outputs currently in the output queue. None if the engine core + # is not running in multiprocess mode. + output_queue_size: Optional[int] = None + + +class EngineCoreStatsSnapshot( + msgspec.Struct, # type: ignore + array_like=True, + omit_defaults=True, + gc=False): + """ + A snapshot of the EngineCore's current stats over a period of time. + """ + + # Snapshot of the scheduler stats. + scheduler_stats: SchedulerStats = msgspec_field( + default_factory=SchedulerStats) + + # Per request stats updates. + requests_stats_updates: List[RequestStatsUpdate] = msgspec_field( + default_factory=list) + + # Engine core's queue stats. 
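# --- Usage sketch (not part of the patch) ----------------------------------
# The classes above are meant to be driven as a small state machine: each
# RequestStatsUpdate is validated against _VALID_TRANSITIONS and folded into
# a RequestStats via update_from(), after which the derived latency
# properties become available. The import path and the concrete field values
# below are illustrative assumptions, not taken from this diff.
from vllm.sampling_params import SamplingParams
from vllm.v1.stats.common import RequestStats, RequestStatsUpdate  # assumed path

Update = RequestStatsUpdate.Type
stats = RequestStats(request_id="req-0")
lifecycle = [
    RequestStatsUpdate(request_id="req-0", type=Update.ARRIVED),
    RequestStatsUpdate(request_id="req-0", type=Update.INPUT_PROCESSED,
                       num_prompt_tokens=16,
                       sampling_params=SamplingParams(max_tokens=8)),
    RequestStatsUpdate(request_id="req-0", type=Update.QUEUED),
    RequestStatsUpdate(request_id="req-0", type=Update.PREFILLING,
                       num_computed_tokens=16, num_cached_tokens=0),
    RequestStatsUpdate(request_id="req-0", type=Update.DECODING),
    RequestStatsUpdate(request_id="req-0", type=Update.DETOKENIZED,
                       num_new_tokens=1),
    RequestStatsUpdate(request_id="req-0", type=Update.FINISHED,
                       finish_reason="stop"),
]
for update in lifecycle:
    stats.update_from(update)  # checks the transition and records timestamps
print(stats.e2e_latency_s, stats.first_token_latency_s, stats.num_output_tokens)
# ---------------------------------------------------------------------------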
+ engine_core_process_stats: EngineCoreProcessStats = msgspec_field( + default_factory=EngineCoreProcessStats) + + # TODO(rickyx): Add other components' stats, + # e.g. model runner/worker and etc. diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index b0a7affbebb7e..8dfcf2dd78606 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,13 +1,20 @@ import multiprocessing import os import weakref +from collections import defaultdict from collections.abc import Sequence -from typing import (Any, Callable, Dict, Generic, List, Optional, TypeVar, - Union, overload) +from typing import (TYPE_CHECKING, Any, Callable, Dict, Generic, List, + Optional, TypeVar, Union, overload) + +import torch from vllm.logger import init_logger +from vllm.model_executor.models.utils import extract_layer_index from vllm.utils import get_mp_context, kill_process_tree +if TYPE_CHECKING: + from vllm.attention.layer import Attention + logger = init_logger(__name__) T = TypeVar("T") @@ -134,3 +141,48 @@ def shutdown(proc: multiprocessing.Process, input_path: str, output_path: str): socket_file = ipc_socket.replace("ipc://", "") if os and os.path.exists(socket_file): os.remove(socket_file) + + +def bind_kv_cache( + kv_caches: Dict[str, torch.Tensor], + forward_context: Dict[str, "Attention"], + runner_kv_caches: List[torch.Tensor], +) -> None: + """ + Bind the allocated KV cache to both ModelRunner and forward context so + that the KV cache can be used in the forward pass. + + This function: + 1) Fills the ModelRunner's kv cache list (`runner_kv_caches`) with + kv_caches. + 2) Associates each attention layer in the `forward_context` with its + corresponding KV cache in kv_caches. + + Args: + kv_caches: The allocated kv_caches with layer names as keys. + forward_context: The global forward context containing all Attention + layers with layer names as keys. + runner_kv_caches: The kv_cache declared by ModelRunner. + """ + # Bind kv_caches to ModelRunner + assert len(runner_kv_caches) == 0 + + # Convert kv_caches dict to a list of tensors in the order of layer_index. + index2name = defaultdict(list) + for layer_name in kv_caches: + index2name[extract_layer_index(layer_name)].append(layer_name) + + for layer_index in sorted(index2name.keys()): + layer_names = index2name[layer_index] + if len(layer_names) > 1: + # One typical case is encoder-decoder model, e.g., bart. + # The cross attention and self attention in the same decoder layer + # has different layer_name but the same layer_index. + raise NotImplementedError + layer_name = layer_names[0] + runner_kv_caches.append(kv_caches[layer_name]) + + # Bind kv_caches to forward context + for layer_name, kv_cache in kv_caches.items(): + # NOTE: Use list because of v0 PP virtual engine. 
+ forward_context[layer_name].kv_cache = [kv_cache] diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 40494e64b22f0..28d8e39053874 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -30,6 +30,9 @@ class CachedRequestState: num_computed_tokens: int output_token_ids: List[int] + mrope_positions: Optional[torch.Tensor] = None + mrope_position_delta: Optional[int] = None + @property def num_tokens(self) -> int: return len(self.prompt_token_ids) + len(self.output_token_ids) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index fb87dc5a8222a..fdf39449a2c59 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -7,22 +7,29 @@ import torch.distributed import torch.nn as nn +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention from vllm.config import CompilationLevel, VllmConfig from vllm.distributed.parallel_state import graph_capture from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY from vllm.logger import init_logger +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import get_model from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.utils import group_mm_inputs_by_modality from vllm.sampling_params import SamplingType from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - LayerBlockType, bind_kv_cache, cdiv, - is_pin_memory_available) + LayerBlockType, cdiv, is_pin_memory_available) from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend, FlashAttentionMetadata) +from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.engine.mm_input_mapper import MMInputMapperClient +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheSpec) from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.utils import bind_kv_cache from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch if TYPE_CHECKING: @@ -88,8 +95,12 @@ def __init__( self.mm_input_mapper_profiling = MMInputMapperClient(self.model_config) self.mm_input_mapper_profiling.use_cache = False - self.max_num_encoder_input_tokens = self.scheduler_config.max_num_encoder_input_tokens # noqa: E501 - self.encoder_cache_size = self.scheduler_config.encoder_cache_size + encoder_compute_budget, encoder_cache_size = compute_encoder_budget( + model_config=model_config, + scheduler_config=scheduler_config, + ) + self.max_num_encoder_input_tokens = encoder_compute_budget + self.encoder_cache_size = encoder_cache_size # Lazy initialization # self.model: nn.Module # Set after load_model @@ -130,6 +141,32 @@ def __init__( self.positions = torch.zeros(self.max_num_tokens, dtype=torch.int64, device=self.device) + + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.model_config.uses_mrope: + # NOTE: `mrope_positions` is implemented as a permuted tensor to + # satisfy the following properties to allow `torch.compile` to work + # properly: + # - shape: (3, ) + # - stride: (1, 3) + # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1921022256 + + # NOTE: When M-RoPE is enabled, position ids are 3D regardless of + # the modality of inputs. 
For text-only inputs, each dimension has + # identical position IDs, making M-RoPE functionally equivalent to + # 1D-RoPE. + # See page 5 of https://arxiv.org/abs/2409.12191 + self.mrope_positions = torch.zeros((self.max_num_tokens, 3), + dtype=torch.int64, + device=self.device) + self.mrope_positions_cpu = torch.zeros((self.max_num_tokens, 3), + dtype=torch.int64, + device="cpu", + pin_memory=self.pin_memory) + + self.mrope_positions = self.mrope_positions.permute((1, 0)) + self.mrope_positions_cpu = self.mrope_positions_cpu.permute((1, 0)) + self.inputs_embeds = torch.zeros( (self.max_num_tokens, self.hidden_size), dtype=self.dtype, @@ -237,6 +274,35 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: num_computed_tokens=new_req_data.num_computed_tokens, output_token_ids=[], ) + + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.model_config.uses_mrope: + image_grid_thw = [] + video_grid_thw = [] + for mm_input in self.requests[req_id].mm_inputs: + if mm_input.get("image_grid_thw") is not None: + image_grid_thw.extend( + mm_input["image_grid_thw"].tolist()) + if mm_input.get("video_grid_thw") is not None: + video_grid_thw.extend( + mm_input["video_grid_thw"].tolist()) + + hf_config = self.model_config.hf_config + + self.requests[req_id].mrope_positions, \ + self.requests[req_id].mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + self.requests[req_id].prompt_token_ids, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + image_token_id=hf_config.image_token_id, + video_token_id=hf_config.video_token_id, + vision_start_token_id=hf_config.vision_start_token_id, + vision_end_token_id=hf_config.vision_end_token_id, + spatial_merge_size=hf_config.vision_config. + spatial_merge_size, + ) + req_ids_to_add.append(req_id) # Update the cached states of the resumed requests. @@ -304,6 +370,11 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): arange, out=positions_np) + # Calculate M-RoPE positions. + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.model_config.uses_mrope: + self._calc_mrope_positions(scheduler_output) + # Get token indices. # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2] @@ -350,8 +421,16 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # Copy the tensors to the GPU. 
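# --- Standalone sketch (not part of the patch) ------------------------------
# The permuted M-RoPE buffer noted above can be checked in isolation:
# allocating (num_tokens, 3) and permuting yields a view with shape
# (3, num_tokens) and stride (1, 3), the layout the comment says
# torch.compile needs. The buffer size here is an arbitrary example.
import torch

max_num_tokens = 8  # illustrative capacity
mrope_positions = torch.zeros((max_num_tokens, 3),
                              dtype=torch.int64).permute((1, 0))
assert tuple(mrope_positions.shape) == (3, max_num_tokens)
assert mrope_positions.stride() == (1, 3)
# ---------------------------------------------------------------------------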
self.input_ids[:total_num_scheduled_tokens].copy_( self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True) - self.positions[:total_num_scheduled_tokens].copy_( - self.positions_cpu[:total_num_scheduled_tokens], non_blocking=True) + if self.model_config.uses_mrope: + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + self.mrope_positions[:, :total_num_scheduled_tokens].copy_( + self.mrope_positions_cpu[:, :total_num_scheduled_tokens], + non_blocking=True) + else: + # Common case (1D positions) + self.positions[:total_num_scheduled_tokens].copy_( + self.positions_cpu[:total_num_scheduled_tokens], + non_blocking=True) query_start_loc = self.query_start_loc_cpu[:num_reqs + 1].to( self.device, non_blocking=True) seq_start_loc = self.seq_start_loc_cpu[:num_reqs + 1].to( @@ -463,6 +542,61 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): logits_indices = query_start_loc[1:] - 1 return attn_metadata, logits_indices + def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): + mrope_pos_ptr = 0 + num_reqs = self.input_batch.num_reqs + for index, req_id in enumerate(self.input_batch.req_ids[:num_reqs]): + assert req_id is not None + + req = self.requests[req_id] + assert req.mrope_positions is not None + + num_computed_tokens = \ + self.input_batch.num_computed_tokens_cpu[index] + num_scheduled_tokens = \ + scheduler_output.num_scheduled_tokens[req_id] + num_prompt_tokens = len(req.prompt_token_ids) + + if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens: + prompt_part_len = max(0, + num_prompt_tokens - num_computed_tokens) + completion_part_len = max( + 0, num_scheduled_tokens - prompt_part_len) + else: + prompt_part_len = num_scheduled_tokens + completion_part_len = 0 + + assert num_scheduled_tokens == prompt_part_len + completion_part_len + + if prompt_part_len > 0: + # prompt's mrope_positions are pre-computed + dst_start = mrope_pos_ptr + dst_end = mrope_pos_ptr + prompt_part_len + src_start = num_computed_tokens + src_end = num_computed_tokens + prompt_part_len + + self.mrope_positions_cpu[:, dst_start:dst_end] = \ + req.mrope_positions[:,src_start:src_end] + + mrope_pos_ptr += prompt_part_len + + if completion_part_len > 0: + # compute completion's mrope_positions on-the-fly + dst_start = mrope_pos_ptr + dst_end = mrope_pos_ptr + completion_part_len + + self.mrope_positions_cpu[:, dst_start:dst_end] = \ + MRotaryEmbedding.get_next_input_positions_tensor( + req.mrope_position_delta, + context_len=num_computed_tokens + + prompt_part_len, + seq_len=num_computed_tokens + + prompt_part_len + + completion_part_len, + ) + + mrope_pos_ptr += completion_part_len + def _prepare_sampling( self, scheduler_output: "SchedulerOutput", @@ -496,19 +630,34 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"): for input_id in encoder_input_ids: mm_inputs.append(req_state.mm_inputs[input_id]) req_input_ids.append((req_id, input_id)) - batched_mm_inputs = MultiModalKwargs.batch(mm_inputs) - batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, - device=self.device) - - # Run the encoder. - # `encoder_outputs` is either of the following: - # 1. A tensor of shape [num_images, feature_size, hidden_size] - # in case when feature_size is fixed across all images. - # 2. A list (length: num_images) of tensors, each of shape - # [feature_size, hidden_size] in case when the feature size is - # dynamic depending on input images. 
- encoder_outputs = self.model.get_multimodal_embeddings( - **batched_mm_inputs) + + # Batch mm inputs as much as we can: if a request in the batch has + # multiple modalities or a different modality than the previous one, + # we process it separately to preserve item order. + # FIXME(ywang96): This is a hacky way to deal with multiple modalities + # in the same batch while still being able to benefit from batching + # multimodal inputs. The proper solution should be reordering the + # encoder outputs. + grouped_mm_inputs_list = group_mm_inputs_by_modality(mm_inputs) + + encoder_outputs = [] + for grouped_mm_inputs in grouped_mm_inputs_list: + batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs) + batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, + device=self.device) + + # Run the encoder. + # `curr_group_outputs` is either of the following: + # 1. A tensor of shape (num_items, feature_size, hidden_size) + # in case feature_size is fixed across all multimodal items. + # 2. A list or tuple (length: num_items) of tensors, each of shape + # (feature_size, hidden_size) in case the feature size is dynamic + # depending on the input multimodal items. + curr_group_outputs = self.model.get_multimodal_embeddings( + **batched_mm_inputs) + + for output in curr_group_outputs: + encoder_outputs.append(output) # Cache the encoder outputs. for (req_id, input_id), output in zip(req_input_ids, encoder_outputs): @@ -556,6 +705,9 @@ def _gather_encoder_outputs( encoder_outputs.append(encoder_output[start_idx:end_idx]) return encoder_outputs + def get_model(self) -> nn.Module: + return self.model + @torch.inference_mode() def execute_model( self, @@ -609,9 +761,12 @@ def execute_model( # Run the decoder. # Use persistent buffers for CUDA graphs. with set_forward_context(attn_metadata, self.vllm_config): + positions = self.mrope_positions[:, :num_input_tokens] \ + if self.model_config.uses_mrope \ + else self.positions[:num_input_tokens] hidden_states = self.model( input_ids=input_ids, - positions=self.positions[:num_input_tokens], + positions=positions, kv_caches=self.kv_caches, attn_metadata=None, inputs_embeds=inputs_embeds, @@ -698,9 +853,12 @@ def _dummy_run( input_ids = self.input_ids[:num_tokens] inputs_embeds = None with set_forward_context(None, self.vllm_config): + positions = self.mrope_positions[:, :num_tokens] \ + if self.model_config.uses_mrope \ + else self.positions[:num_tokens] hidden_states = model( input_ids=input_ids, - positions=self.positions[:num_tokens], + positions=positions, kv_caches=kv_caches, attn_metadata=None, inputs_embeds=inputs_embeds, @@ -721,44 +879,30 @@ def profile_run(self) -> None: ] # Profile with multimodal encoder & encoder cache. - if self.is_multimodal_model: - - # Create dummy batch of multimodal inputs. - dummy_request_data = self.input_registry.dummy_data_for_profiling( - model_config=self.model_config, - seq_len=self.max_num_tokens, - mm_registry=self.mm_registry, - ) - dummy_mm_data = dummy_request_data.multi_modal_data + # TODO: handle encoder-decoder models once we support them. + if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0 + and self.encoder_cache_size > 0): # NOTE: Currently model is profiled with a single non-text # modality with the max possible input tokens even when # it supports multiple. 
- max_tokens_by_modality_dict = self.mm_registry.get_max_tokens_per_item_by_modality( # noqa: E501 + max_tokens_by_modality_dict = MULTIMODAL_REGISTRY.get_max_tokens_per_item_by_nonzero_modality( # noqa: E501 self.model_config) - dummy_data_modality, max_tokens_per_mm_item = max( max_tokens_by_modality_dict.items(), key=lambda item: item[1]) # Check how many items of this modality can be supported by - # the encoder cache budget. - encoder_cache_budget = min(self.max_num_encoder_input_tokens, - self.encoder_cache_size) - max_num_mm_items_encoder_budget = encoder_cache_budget // \ - max_tokens_per_mm_item - - # TODO: Allow users to set encoder_cache_budget in case this - # happens. - assert max_num_mm_items_encoder_budget > 0, ( - f"Encoder cache budget={encoder_cache_budget} is too small to " - f"support the maximum possible size of multimodal embeddings" - f"={max_tokens_per_mm_item}.") + # the encoder budget. + encoder_budget = min(self.max_num_encoder_input_tokens, + self.encoder_cache_size) + + max_num_mm_items_encoder_budget = cdiv(encoder_budget, + max_tokens_per_mm_item) # Check how many items of this modality can be supported by # the decoder budget. - max_mm_items_per_req = max( - self.mm_registry.get_mm_limits_per_prompt( - self.model_config).values()) + max_mm_items_per_req = self.mm_registry.get_mm_limits_per_prompt( + self.model_config)[dummy_data_modality] # NOTE: We do not consider max_num_batched_tokens on purpose # because the multimodal embeddings can be generated in advance @@ -769,6 +913,19 @@ def profile_run(self) -> None: max_num_mm_items = min(max_num_mm_items_encoder_budget, max_num_mm_items_decoder_budget) + logger.info( + "Encoder cache will be initialized with a budget of %s tokens," + " and profiled with %s %s items of the maximum feature size.", + encoder_budget, max_num_mm_items, dummy_data_modality) + + # Create dummy batch of multimodal inputs. + dummy_request_data = self.input_registry.dummy_data_for_profiling( + model_config=self.model_config, + seq_len=self.max_num_tokens, + mm_registry=self.mm_registry, + ) + dummy_mm_data = dummy_request_data.multi_modal_data + # Dummy data definition in V0 may contain multiple multimodal items # (e.g, multiple images) for a single request, therefore here we # always replicate first item by max_num_mm_items times since in V1 @@ -852,15 +1009,71 @@ def capture_model(self) -> None: logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", elapsed_time, cuda_graph_size / (1 << 30)) - def initialize_kv_cache(self, num_blocks: int) -> None: - assert len(self.kv_caches) == 0 - kv_cache_shape = FlashAttentionBackend.get_kv_cache_shape( - num_blocks, self.block_size, self.num_kv_heads, self.head_size) - for _ in range(self.num_attn_layers): - self.kv_caches.append( - torch.zeros(kv_cache_shape, - dtype=self.kv_cache_dtype, - device=self.device)) + def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: + """ + Initialize KV cache based on `kv_cache_config`. 
+ Args: + kv_cache_config: Configuration for the KV cache, including the KV + cache size of each layer + """ + if len(kv_cache_config.groups) > 1: + raise NotImplementedError( + "Hybrid models with more than one KV cache type are not " + "supported yet.") + + kv_caches: Dict[str, torch.Tensor] = {} + + for layer_name, layer_spec in kv_cache_config.kv_cache_spec.items(): + tensor_config = kv_cache_config.tensors[layer_name] + assert tensor_config.size % layer_spec.page_size_bytes == 0 + num_blocks = tensor_config.size // layer_spec.page_size_bytes + if isinstance(layer_spec, FullAttentionSpec): + kv_cache_shape = FlashAttentionBackend.get_kv_cache_shape( + num_blocks, layer_spec.block_size, layer_spec.num_kv_heads, + layer_spec.head_size) + dtype = layer_spec.dtype + kv_caches[layer_name] = torch.zeros(kv_cache_shape, + dtype=dtype, + device=self.device) + else: + raise NotImplementedError + bind_kv_cache( + kv_caches, self.vllm_config.compilation_config.static_forward_context, - [self.kv_caches]) + self.kv_caches) + + def get_kv_cache_spec(self) -> KVCacheSpec: + """ + Generates the KVCacheSpec by parsing the kv cache format from each + Attention module in the static forward context. + Returns: + KVCacheSpec: A dictionary mapping layer names to their KV cache + format. Layers that do not need KV cache are not included. + """ + + forward_ctx = self.vllm_config.compilation_config.static_forward_context + block_size = self.vllm_config.cache_config.block_size + kv_cache_spec: KVCacheSpec = {} + for layer_name, attn_module in forward_ctx.items(): + # TODO: Support other attention modules, e.g., sliding window, + # cross-attention, MLA. + assert isinstance(attn_module, Attention) + if attn_module.attn_type == AttentionType.DECODER: + kv_cache_spec[layer_name] = FullAttentionSpec( + block_size=block_size, + num_kv_heads=attn_module.num_kv_heads, + head_size=attn_module.head_size, + dtype=attn_module.dtype, + ) + elif attn_module.attn_type in (AttentionType.ENCODER, + AttentionType.ENCODER_ONLY): + # encoder-only attention does not need KV cache. 
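# --- Back-of-the-envelope sketch (not part of the patch) --------------------
# Rough numbers for the per-layer sizing above (tensor size divided by the
# spec's page size). The page-size formula is an assumption based on the
# usual K-plus-V-per-block layout and is not taken from this diff.
block_size, num_kv_heads, head_size, dtype_size = 16, 8, 128, 2  # bf16 cache
page_size_bytes = 2 * block_size * num_kv_heads * head_size * dtype_size
tensor_size = 2 * 1024**3                 # 2 GiB reserved for this layer
num_blocks = tensor_size // page_size_bytes
print(page_size_bytes, num_blocks)        # 65536 bytes/block -> 32768 blocks
# ---------------------------------------------------------------------------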
+ continue + elif attn_module.attn_type == AttentionType.ENCODER_DECODER: + raise NotImplementedError + else: + raise ValueError( + f"Unknown attention type: {attn_module.attn_type}") + + return kv_cache_spec diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 81b247e07ef4a..e69b6c90454c4 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -1,21 +1,24 @@ """A GPU worker class.""" import gc import os -from typing import TYPE_CHECKING, Optional, Tuple +from typing import TYPE_CHECKING, Optional import torch import torch.distributed +import torch.nn as nn import vllm.envs as envs -from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig +from vllm.config import ParallelConfig, VllmConfig +from vllm.device_allocator.cumem import CuMemAllocator from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce) from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.platforms import current_platform -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, get_dtype_size +from vllm.utils import GiB_bytes from vllm.v1.core.scheduler import SchedulerOutput +from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.worker.gpu_model_runner import GPUModelRunner @@ -76,6 +79,23 @@ def __init__( else: self.profiler = None + def sleep(self, level: int = 1) -> None: + free_bytes_before_sleep = torch.cuda.mem_get_info()[0] + allocator = CuMemAllocator.get_instance() + allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple()) + free_bytes_after_sleep, total = torch.cuda.mem_get_info() + freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep + used_bytes = total - free_bytes_after_sleep + assert freed_bytes >= 0, "Memory usage increased after sleeping." + logger.info( + "Sleep mode freed %.2f GiB memory, " + "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes, + used_bytes / GiB_bytes) + + def wake_up(self) -> None: + allocator = CuMemAllocator.get_instance() + allocator.wake_up() + def init_device(self): if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until @@ -109,23 +129,31 @@ def init_device(self): self.model_runner = GPUModelRunner(self.vllm_config, self.device) def load_model(self) -> None: - self.model_runner.load_model() + if self.vllm_config.model_config.enable_sleep_mode: + allocator = CuMemAllocator.get_instance() + assert allocator.get_current_usage() == 0, ( + "Sleep mode can only be " + "used for one instance per process.") + context = allocator.use_memory_pool(tag="weights") + else: + from contextlib import nullcontext + context = nullcontext() + with context: + self.model_runner.load_model() @torch.inference_mode() - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Profiles the peak memory usage of the model to determine how many - KV blocks may be allocated without OOMs. + def determine_available_memory(self) -> int: + """Profiles the peak memory usage of the model to determine how much + memory can be used for KV cache without OOMs. The engine will first conduct a profiling of the existing memory usage. - Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. + Then, it calculate the free memory that can be used for KV cache in + bytes. .. 
tip:: You may limit the usage of GPU memory by adjusting the `gpu_memory_utilization` parameter. """ - # Profile the memory usage of the model and get the maximum number of - # cache blocks that can be allocated with the remaining free memory. torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() @@ -161,33 +189,21 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: total_gpu_memory * self.cache_config.gpu_memory_utilization - peak_memory) - # Calculate the number of blocks that can be allocated with the - # profiled peak memory. - cache_block_size = _get_cache_block_size(self.cache_config, - self.model_config, - self.parallel_config) - num_gpu_blocks = int(available_kv_cache_memory // cache_block_size) - num_gpu_blocks = max(num_gpu_blocks, 0) - return num_gpu_blocks, 0 - - def initialize_cache(self, num_gpu_blocks: int) -> None: - """Allocate GPU and CPU KV cache with the specified number of blocks.""" - if num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - - max_seq_len = self.cache_config.block_size * num_gpu_blocks - max_model_len = self.model_config.max_model_len - if max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") + return int(available_kv_cache_memory) - self.model_runner.initialize_kv_cache(num_gpu_blocks) + def get_kv_cache_spec(self) -> KVCacheSpec: + return self.model_runner.get_kv_cache_spec() + + def initialize_cache(self, kv_cache_config: KVCacheConfig) -> None: + """Allocate GPU KV cache with the specified kv_cache_config.""" + if self.vllm_config.model_config.enable_sleep_mode: + allocator = CuMemAllocator.get_instance() + context = allocator.use_memory_pool(tag="kv_cache") + else: + from contextlib import nullcontext + context = nullcontext() + with context: + self.model_runner.initialize_kv_cache(kv_cache_config) def compile_or_warm_up_model(self) -> None: if not self.model_config.enforce_eager: @@ -196,6 +212,9 @@ def compile_or_warm_up_model(self) -> None: # the model initialization and profiling. set_random_seed(self.model_config.seed) + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + @torch.inference_mode() def execute_model( self, @@ -251,24 +270,3 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): f"of at least 8.0. Your {gpu_name} GPU {compute_str}. 
" "You can use float16 instead by explicitly setting the" "`dtype` flag in CLI, for example: --dtype=half.") - - -def _get_cache_block_size( - cache_config: CacheConfig, - model_config: ModelConfig, - parallel_config: ParallelConfig, -) -> int: - head_size = model_config.get_head_size() - num_heads = model_config.get_num_kv_heads(parallel_config) - num_attention_layers = model_config.get_num_layers_by_block_type( - parallel_config, LayerBlockType.attention) - - key_cache_block = cache_config.block_size * num_heads * head_size - value_cache_block = key_cache_block - total = num_attention_layers * (key_cache_block + value_cache_block) - if cache_config.cache_dtype == "auto": - dtype = model_config.dtype - else: - dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] - dtype_size = get_dtype_size(dtype) - return dtype_size * total diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 303d9a15e9c3c..4b429b67b36f8 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -144,9 +144,7 @@ def __init__(self, runner: "CPUModelRunner", finished_requests_ids: Optional[List[str]] = None) -> None: super().__init__() - self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] self.runner = runner - self.chunked_prefill = (runner.scheduler_config.chunked_prefill_enabled or runner.cache_config.enable_prefix_caching) self.model_input_cls = self.runner._model_input_cls @@ -156,10 +154,17 @@ def __init__(self, self.device = self.runner.device self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper self.enable_lora = self.runner.lora_config is not None + if self.runner.attn_backend is not None: + # spec decode (e.g. Medusa) does not have atten backend + attn_backend = self.runner.attn_backend + self.att_metadata_builder = attn_backend.get_builder_cls()(self) + + def prepare(self, + finished_requests_ids: Optional[List[str]] = None) -> None: + self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] self.input_data = ModelInputForCPUBuilder.ModelInputData( self.runner.model_config.uses_mrope) - self.att_metadata_builder = self.runner.attn_backend.get_builder_cls()( - self) + self.att_metadata_builder.prepare() def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): self.seq_group_metadata_list.append(seq_group_metadata) @@ -431,6 +436,7 @@ class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]): """ _model_input_cls: Type[TModelInputForCPU] _builder_cls: Type[ModelInputForCPUBuilder] + builder: ModelInputForCPUBuilder def __init__( self, @@ -477,6 +483,10 @@ def __init__( # Set after load_model. self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None + if hasattr(self, "_builder_cls"): + # multi-step model runner does not have `_builder_cls` + self.builder = self._builder_cls(weakref.proxy(self)) + def load_model(self) -> None: self.model = get_model(vllm_config=self.vllm_config) @@ -509,6 +519,9 @@ def load_model(self) -> None: ) self.model = self.lora_manager.create_lora_manager(self.model) + def get_model(self) -> nn.Module: + return self.model + def _prepare_model_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -519,10 +532,10 @@ def _prepare_model_input_tensors( metadata for possible additional steps, e.g., sampling. 
""" - builder = self._builder_cls(weakref.proxy(self), finished_requests_ids) - builder.set_seq_group_list(seq_group_metadata_list) + self.builder.prepare(finished_requests_ids) + self.builder.set_seq_group_list(seq_group_metadata_list) - return builder.build() # type: ignore + return self.builder.build() # type: ignore # sampler property will be used by spec_decode_worker @property diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 203b6dc416d65..1c39e0d875400 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -21,6 +21,7 @@ import habana_frameworks.torch as htorch import habana_frameworks.torch.internal.bridge_config as bc import torch +import torch.nn as nn from vllm_hpu_extension.ops import LoraMask as LoraMask from vllm_hpu_extension.ops import batch2block, block2batch from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler, @@ -699,6 +700,9 @@ def load_model(self) -> None: msg = f"Loading model weights took in total {m.get_summary_string()}" logger.info(msg) + def get_model(self) -> nn.Module: + return self.model + def _use_graphs(self, batch_size, seq_len, is_prompt): if self.enforce_eager: return False diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 9401241073c7d..3c570212625c4 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -130,7 +130,6 @@ def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None, ) -> Optional[List[SamplerOutput]]: - assert execute_model_req is not None # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501 # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501 # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501 @@ -144,7 +143,8 @@ def execute_model( 'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0' log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS', '0') != '0' or log_cpu_fallbacks_all - if log_graph_compilation or log_cpu_fallbacks: + if (log_graph_compilation or log_cpu_fallbacks) and \ + execute_model_req is not None: from habana_frameworks.torch.hpu.metrics import metric_localcontext seq_group_metadata_list = execute_model_req.seq_group_metadata_list is_prompt = any([ diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index ae8b7f97c827d..e311c14111d49 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -457,17 +457,13 @@ def __init__(self, self.enable_prompt_adapter = (self.runner.prompt_adapter_config is not None) self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper - self.finished_requests_ids = finished_requests_ids self.decode_only = True - # Intermediate data (data in CPU before going to GPU) for - # the current sequence group. - self.inter_data_list: List[ - ModelInputForGPUBuilder.InterDataForSeqGroup] = [] - # Attention metadata inputs. - self.attn_metadata_builder = self.attn_backend.make_metadata_builder( - weakref.proxy(self)) + if self.attn_backend is not None: + # spec decode (e.g. Medusa) does not have atten backend + self.attn_metadata_builder = self.attn_backend.get_builder_cls()( + weakref.proxy(self)) # Engine/Model configurations. 
self.chunked_prefill_enabled = ( @@ -479,6 +475,17 @@ def __init__(self, self.block_aligned_sliding_window = \ self.sliding_window_blocks * self.block_size + def prepare(self, + finished_requests_ids: Optional[List[str]] = None) -> None: + self.finished_requests_ids = finished_requests_ids + + # Intermediate data (data in CPU before going to GPU) for + # the current sequence group. + self.inter_data_list: List[ + ModelInputForGPUBuilder.InterDataForSeqGroup] = [] + + self.attn_metadata_builder.prepare() + def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int, seq_group_metadata: SequenceGroupMetadata): """Compute context length, sequence length and tokens @@ -993,6 +1000,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): """ _model_input_cls: Type[TModelInputForGPU] _builder_cls: Type[ModelInputForGPUBuilder] + builder: ModelInputForGPUBuilder def __init__( self, @@ -1093,6 +1101,10 @@ def __init__( SamplingMetadataCache() \ if self.parallel_config.pipeline_parallel_size == 1 else None + if hasattr(self, "_builder_cls"): + # multi-step model runner does not have `_builder_cls` + self.builder = self._builder_cls(weakref.proxy(self)) + def load_model(self) -> None: logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: @@ -1176,6 +1188,9 @@ def load_model(self) -> None: fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, backend=backend) + def get_model(self) -> nn.Module: + return self.model + def save_sharded_state( self, path: str, @@ -1223,13 +1238,13 @@ def _prepare_model_input_tensors( If cuda graph is required, this API automatically pads inputs. """ - builder = self._builder_cls(weakref.proxy(self), finished_requests_ids) + self.builder.prepare(finished_requests_ids) for seq_group_metadata in seq_group_metadata_list: - builder.add_seq_group(seq_group_metadata) + self.builder.add_seq_group(seq_group_metadata) - builder.reset_cached_inter_data() + self.builder.reset_cached_inter_data() - return builder.build() # type: ignore + return self.builder.build() # type: ignore @contextmanager def set_in_profile_run(self): diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index c7abad7e0258d..aef4bdcdd4bf9 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -7,6 +7,7 @@ Optional, Type, TypeVar) import torch +import torch.nn as nn from torch import is_tensor from vllm.config import VllmConfig @@ -199,6 +200,11 @@ class ModelRunnerInputBuilderBase(ABC, Generic[T]): """A builder to create ModelRunnerInputBase objects. 
""" + @abstractmethod + def prepare(self, + finished_requests_ids: Optional[List[str]] = None) -> None: + raise NotImplementedError + @abstractmethod def add_seq_group(self, seq_group_metadata): """TBA""" @@ -264,6 +270,10 @@ def prepare_model_input( """ raise NotImplementedError + @abstractmethod + def get_model(self) -> nn.Module: + raise NotImplementedError + def execute_model( self, model_input: T, @@ -297,9 +307,9 @@ class ModelRunnerWrapperBase: def __init__( self, - moderl_runner: ModelRunnerBase, + model_runner: ModelRunnerBase, ) -> None: - self.model_runner: ModelRunnerBase = moderl_runner + self.model_runner: ModelRunnerBase = model_runner def __getattr__(self, attr): return getattr(self.model_runner, attr) diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index a35f5467e1a1f..596c26eac28bd 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -113,6 +113,9 @@ def load_model(self) -> None: raise NotImplementedError( "Supports only Transformer-NeuronX based models.") + def get_model(self) -> nn.Module: + return self.model + def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index a38b5a4e6e8d5..9d0a759ca2f21 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -84,6 +84,9 @@ def load_model(self) -> None: kv_cache_dtype=self.kv_cache_dtype, ov_core=self.ov_core) + def get_model(self) -> nn.Module: + return self.model + def _prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index 50a155d22c666..f5b46cde3969c 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -4,6 +4,7 @@ import openvino as ov import torch import torch.distributed +import torch.nn as nn import vllm.envs as envs from vllm.attention import get_attn_backend @@ -362,6 +363,9 @@ def cache_copy( ) -> None: self.cache_engine.copy(blocks_to_copy) # type: ignore + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + @torch.inference_mode() def execute_model( self, diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 52c577bccab9c..f5c7bc955a673 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -158,6 +158,9 @@ def load_model(self) -> None: fullgraph=True, dynamic=False) + def get_model(self) -> nn.Module: + return self.model.model + def _dummy_run( self, batch_size: int, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index a3e377ef2b19d..c95d2a9a69ff3 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -8,6 +8,7 @@ import vllm.envs as envs from vllm.config import VllmConfig +from vllm.device_allocator.cumem import CuMemAllocator from vllm.distributed import (ensure_kv_transfer_initialized, ensure_model_parallel_initialized, init_distributed_environment, @@ -21,7 +22,8 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, SequenceGroupMetadata, SequenceGroupMetadataDelta) -from vllm.utils import GiB_bytes, bind_kv_cache, memory_profiling +from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache, + memory_profiling) from vllm.worker.cache_engine import CacheEngine from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner from 
vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner @@ -55,9 +57,6 @@ def __init__( self.rank = rank self.distributed_init_method = distributed_init_method self.is_driver_worker = is_driver_worker - if is_driver_worker: - assert rank % self.parallel_config.tensor_parallel_size == 0, \ - "Driver worker should be rank 0 of tensor parallel group." if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing from vllm.utils import init_cached_hf_modules @@ -122,6 +121,23 @@ def stop_profile(self): raise RuntimeError("Profiler is not enabled.") self.profiler.stop() + def sleep(self, level: int = 1) -> None: + free_bytes_before_sleep = torch.cuda.mem_get_info()[0] + allocator = CuMemAllocator.get_instance() + allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple()) + free_bytes_after_sleep, total = torch.cuda.mem_get_info() + freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep + used_bytes = total - free_bytes_after_sleep + assert freed_bytes >= 0, "Memory usage increased after sleeping." + logger.info( + "Sleep mode freed %.2f GiB memory, " + "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes, + used_bytes / GiB_bytes) + + def wake_up(self) -> None: + allocator = CuMemAllocator.get_instance() + allocator.wake_up() + def init_device(self) -> None: if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until @@ -140,7 +156,8 @@ def init_device(self) -> None: _check_if_gpu_supports_dtype(self.model_config.dtype) gc.collect() torch.cuda.empty_cache() - self.init_gpu_memory = torch.cuda.mem_get_info()[0] + torch.cuda.reset_peak_memory_stats() + self.baseline_snapshot = MemorySnapshot() else: raise RuntimeError( f"Not support device type: {self.device_config.device}") @@ -152,7 +169,17 @@ def init_device(self) -> None: set_random_seed(self.model_config.seed) def load_model(self): - self.model_runner.load_model() + if self.vllm_config.model_config.enable_sleep_mode: + allocator = CuMemAllocator.get_instance() + assert allocator.get_current_usage() == 0, ( + "Sleep mode can only be " + "used for one instance per process.") + context = allocator.use_memory_pool(tag="weights") + else: + from contextlib import nullcontext + context = nullcontext() + with context: + self.model_runner.load_model() def save_sharded_state( self, @@ -195,10 +222,9 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Execute a forward pass with dummy inputs to profile the memory usage # of the model. - with memory_profiling(baseline_memory_in_bytes=total_gpu_memory - - self.init_gpu_memory, - weights_memory_in_bytes=self.model_runner. - model_memory_usage) as result: + with memory_profiling( + self.baseline_snapshot, + weights_memory=self.model_runner.model_memory_usage) as result: self.model_runner.profile_run() self._assert_memory_footprint_increased_during_profiling() @@ -206,7 +232,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: memory_for_current_instance = total_gpu_memory * \ self.cache_config.gpu_memory_utilization available_kv_cache_memory = (memory_for_current_instance - - result.non_kv_cache_memory_in_bytes) + result.non_kv_cache_memory) # Calculate the number of blocks that can be allocated with the # profiled peak memory. 
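The sleep-mode plumbing added to this v0 worker mirrors what the v1 GPU worker gains earlier in the diff: an allocation phase is routed through a tagged CuMemAllocator pool only when sleep mode is enabled, so a later sleep() can offload or discard exactly those allocations. A condensed sketch of that pattern follows; the wrapper function name is made up for illustration.

    from contextlib import nullcontext

    def run_in_tagged_pool(enable_sleep_mode: bool, tag: str, allocate_fn) -> None:
        # Route allocate_fn's CUDA allocations into a tagged pool when sleep
        # mode is on; otherwise run it unchanged.
        if enable_sleep_mode:
            from vllm.device_allocator.cumem import CuMemAllocator
            context = CuMemAllocator.get_instance().use_memory_pool(tag=tag)
        else:
            context = nullcontext()
        with context:
            allocate_fn()

    # e.g. run_in_tagged_pool(True, "weights", model_runner.load_model)

The same shape covers both call sites in this file: model loading under the "weights" tag and cache-engine initialization under the "kv_cache" tag.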
@@ -229,11 +255,11 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: f"({self.cache_config.gpu_memory_utilization:.2f})" f" = {(memory_for_current_instance / GiB_bytes):.2f}GiB\n" "model weights take " - f"{(result.weights_memory_in_bytes / GiB_bytes):.2f}GiB;" + f"{(result.weights_memory / GiB_bytes):.2f}GiB;" " non_torch_memory takes " - f"{(result.non_torch_increase_in_bytes / GiB_bytes):.2f}GiB;" + f"{(result.non_torch_increase / GiB_bytes):.2f}GiB;" " PyTorch activation peak memory takes " - f"{(result.torch_peak_increase_in_bytes / GiB_bytes):.2f}GiB;" + f"{(result.torch_peak_increase / GiB_bytes):.2f}GiB;" " the rest of the memory reserved for KV Cache is " f"{(available_kv_cache_memory / GiB_bytes):.2f}GiB.") @@ -249,11 +275,13 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: def _assert_memory_footprint_increased_during_profiling(self): # NOTE(woosuk): Here we assume that the other processes using the same # GPU did not change their memory usage during the profiling. - free_gpu_memory, _ = torch.cuda.mem_get_info() - assert self.init_gpu_memory - free_gpu_memory > 0, ( + free_gpu_memory, total = torch.cuda.mem_get_info() + cuda_memory = total - free_gpu_memory + assert self.baseline_snapshot.cuda_memory < cuda_memory, ( "Error in memory profiling. " - f"Initial free memory {self.init_gpu_memory}, current free memory" - f" {free_gpu_memory}. This happens when the GPU memory was " + f"Initial used memory {self.baseline_snapshot.cuda_memory}, " + f"currently used memory {cuda_memory}. " + f"This happens when the GPU memory was " "not properly cleaned up before initializing the vLLM instance.") def initialize_cache(self, num_gpu_blocks: int, @@ -270,7 +298,14 @@ def initialize_cache(self, num_gpu_blocks: int, self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks - self._init_cache_engine() + if self.vllm_config.model_config.enable_sleep_mode: + allocator = CuMemAllocator.get_instance() + context = allocator.use_memory_pool(tag="kv_cache") + else: + from contextlib import nullcontext + context = nullcontext() + with context: + self._init_cache_engine() self._warm_up_model() def _init_cache_engine(self): diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 7c14b8344b49e..c6e6693c54f57 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -4,7 +4,9 @@ from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union +import cloudpickle import torch +import torch.nn as nn from vllm.config import ObservabilityConfig, VllmConfig from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group @@ -13,7 +15,8 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.utils import (enable_trace_function_call_for_thread, - resolve_obj_by_qualname, update_environment_variables) + resolve_obj_by_qualname, run_method, + update_environment_variables) from vllm.worker.model_runner_base import (BroadcastableModelInput, ModelRunnerBase, ModelRunnerInputBase) @@ -88,6 +91,11 @@ def start_worker_execution_loop(self) -> None: if output is None: return None + @abstractmethod + def get_model(self) -> nn.Module: + raise NotImplementedError + + @abstractmethod def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None @@ -145,6 +153,9 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: 
self.worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + def get_model(self) -> nn.Module: + return self.worker.get_model() + def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None @@ -361,6 +372,9 @@ def prepare_input( else: return self._get_worker_input_from_broadcast() + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None, @@ -461,7 +475,8 @@ def _execute_model_spmd( class WorkerWrapperBase: """ - The whole point of this class is to lazily initialize the worker. + This class represents one process in an executor/engine. It is responsible + for lazily initializing the worker and handling the worker's lifecycle. We first instantiate the WorkerWrapper, which remembers the worker module and class name. Then, when we call `update_environment_variables`, and the real initialization happens in `init_worker`. @@ -470,9 +485,19 @@ class WorkerWrapperBase: def __init__( self, vllm_config: VllmConfig, - rank: int = 0, + rpc_rank: int = 0, ) -> None: - self.rank = rank + """ + Initialize the worker wrapper with the given vllm_config and rpc_rank. + Note: rpc_rank is the rank of the worker in the executor. In most cases, + it is also the rank of the worker in the distributed group. However, + when multiple executors work together, they can be different. + e.g. in the case of SPMD-style offline inference with TP=2, + users can launch 2 engines/executors, each with only 1 worker. + All workers have rpc_rank=0, but they have different ranks in the TP + group. + """ + self.rpc_rank = rpc_rank self.vllm_config = vllm_config self.worker: Optional[WorkerBase] = None if vllm_config.model_config is not None: @@ -485,16 +510,16 @@ def __init__( def adjust_rank(self, rank_mapping: Dict[int, int]) -> None: """ - Adjust the rank based on the given mapping. + Adjust the rpc_rank based on the given mapping. It is only used during the initialization of the executor, - to adjust the rank of workers after we create all workers. + to adjust the rpc_rank of workers after we create all workers. """ - if self.rank in rank_mapping: - self.rank = rank_mapping[self.rank] + if self.rpc_rank in rank_mapping: + self.rpc_rank = rank_mapping[self.rpc_rank] def update_environment_variables(self, envs_list: List[Dict[str, str]]) -> None: - envs = envs_list[self.rank] + envs = envs_list[self.rpc_rank] key = 'CUDA_VISIBLE_DEVICES' if key in envs and key in os.environ: # overwriting CUDA_VISIBLE_DEVICES is desired behavior @@ -507,31 +532,33 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: Here we inject some common logic before initializing the worker. Arguments are passed to the worker class constructor. 
""" - kwargs = all_kwargs[self.rank] + kwargs = all_kwargs[self.rpc_rank] enable_trace_function_call_for_thread(self.vllm_config) - # see https://github.com/NVIDIA/nccl/issues/1234 - os.environ['NCCL_CUMEM_ENABLE'] = '0' - from vllm.plugins import load_general_plugins load_general_plugins() - worker_class = resolve_obj_by_qualname( - self.vllm_config.parallel_config.worker_cls) + if isinstance(self.vllm_config.parallel_config.worker_cls, str): + worker_class = resolve_obj_by_qualname( + self.vllm_config.parallel_config.worker_cls) + else: + assert isinstance(self.vllm_config.parallel_config.worker_cls, + bytes) + worker_class = cloudpickle.loads( + self.vllm_config.parallel_config.worker_cls) self.worker = worker_class(**kwargs) assert self.worker is not None - def execute_method(self, method: str, *args, **kwargs): + def execute_method(self, method: Union[str, bytes], *args, **kwargs): try: target = self if self.worker is None else self.worker - executor = getattr(target, method) - return executor(*args, **kwargs) + return run_method(target, method, args, kwargs) except Exception as e: # if the driver worker also execute methods, # exceptions in the rest worker may cause deadlock in rpc like ray # see https://github.com/vllm-project/vllm/issues/3455 # print the error and inform the user to solve the error - msg = (f"Error executing method {method}. " + msg = (f"Error executing method {method!r}. " "This might cause deadlock in distributed execution.") logger.exception(msg) raise e diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 82b8f22a5af33..053658d047311 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -113,7 +113,6 @@ def __init__(self, runner: "XPUModelRunner", finished_requests_ids: Optional[List[str]] = None) -> None: super().__init__() - self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] self.runner = runner self.model_input_cls = self.runner._model_input_cls self.attn_backend = self.runner.attn_backend @@ -121,6 +120,10 @@ def __init__(self, self.block_size = self.runner.block_size self.device = self.runner.device + def prepare(self, + finished_requests_ids: Optional[List[str]] = None) -> None: + self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] + def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): self.seq_group_metadata_list.append(seq_group_metadata) @@ -408,6 +411,8 @@ def __init__( SamplingMetadataCache() \ if self.parallel_config.pipeline_parallel_size == 1 else None + self.builder = self._builder_cls(weakref.proxy(self)) + def load_model(self) -> None: with DeviceMemoryProfiler() as m: self.model = get_model(vllm_config=self.vllm_config) @@ -416,6 +421,9 @@ def load_model(self) -> None: logger.info("Loading model weights took %.4f GB", self.model_memory_usage / float(2**30)) + def get_model(self) -> nn.Module: + return self.model + @property def vocab_size(self) -> int: return self.model_config.get_vocab_size() @@ -514,7 +522,8 @@ def _prepare_model_input_tensors( metadata for possible additional steps, e.g., sampling. """ - builder = self._builder_cls(weakref.proxy(self), finished_requests_ids) + builder = self.builder + builder.prepare(finished_requests_ids) for seq_group_metadata in seq_group_metadata_list: builder.add_seq_group(seq_group_metadata)