From 36b63b553cae0b5184f7021a32e88dfe576519e7 Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Tue, 2 Jul 2024 15:32:40 +0200
Subject: [PATCH] Revert "habana_main rebase (#71)"

This reverts commit 5e1a5653e6471e1e80e1cdebcb0ff64921126246.

---
.buildkite/check-wheel-size.py | 2 +-
.buildkite/download-images.sh | 4 +
.../configs/Meta-Llama-3-70B-Instruct.yaml | 11 -
.../configs/Meta-Llama-3-8B-Instruct-FP8.yaml | 11 -
.../configs/Meta-Llama-3-8B-Instruct.yaml | 11 -
.../configs/Mixtral-8x7B-Instruct-v0.1.yaml | 11 -
.../lm-eval-harness/configs/models-large.txt | 2 -
.../lm-eval-harness/configs/models-small.txt | 2 -
.../run-lm-eval-gsm-hf-baseline.sh | 46 -
.../run-lm-eval-gsm-vllm-baseline.sh | 51 -
.buildkite/lm-eval-harness/run-tests.sh | 59 -
.../test_lm_eval_correctness.py | 54 -
.buildkite/nightly-benchmarks/README.md | 103 -
.../benchmark-pipeline.yaml | 62 -
.../nightly-benchmarks/kickoff-pipeline.sh | 27 -
.../run-benchmarks-suite.sh | 358 --
.../convert-results-json-to-markdown.py | 192 -
.../scripts/wait-for-image.sh | 17 -
.../nightly-benchmarks/tests/descriptions.md | 67 -
.../tests/latency-tests.json | 32 -
.../tests/serving-tests.json | 59 -
.../tests/throughput-tests.json | 35 -
.buildkite/release-pipeline.yaml | 21 -
.buildkite/run-amd-test.sh | 39 +-
.buildkite/run-benchmarks.sh | 13 +-
.buildkite/run-cpu-test.sh | 18 +-
.buildkite/run-openvino-test.sh | 14 -
.buildkite/run-xpu-test.sh | 14 -
.buildkite/test-pipeline.yaml | 159 +-
.buildkite/test-template.j2 | 94 +
.clang-format | 26 -
.github/ISSUE_TEMPLATE/400-bug report.yml | 2 -
.github/workflows/clang-format.yml | 42 -
.github/workflows/mypy.yaml | 3 +-
.github/workflows/ruff.yml | 2 +-
CMakeLists.txt | 86 +-
Dockerfile | 101 +-
Dockerfile.cpu | 20 +-
Dockerfile.neuron | 2 +-
Dockerfile.openvino | 26 -
Dockerfile.ppc64le | 22 -
Dockerfile.rocm | 215 +-
Dockerfile.tpu | 19 -
Dockerfile.xpu | 22 -
README.md | 89 +-
benchmarks/backend_request_func.py | 63 +-
benchmarks/benchmark_latency.py | 152 +-
benchmarks/benchmark_prefix_caching.py | 4 +-
benchmarks/benchmark_serving.py | 95 +-
benchmarks/benchmark_throughput.py | 82 +-
.../cutlass_benchmarks/w8a8_benchmarks.py | 353 --
.../cutlass_benchmarks/weight_shapes.py | 43 -
benchmarks/kernels/benchmark_aqlm.py | 14 +-
benchmarks/kernels/benchmark_marlin.py | 235 --
benchmarks/kernels/benchmark_mixtral_moe.py | 215 +
benchmarks/kernels/benchmark_moe.py | 333 --
.../kernels/benchmark_paged_attention.py | 31 +-
benchmarks/kernels/benchmark_rope.py | 13 +-
benchmarks/kernels/benchmark_shapes.py | 75 -
benchmarks/launch_tgi_server.sh | 2 +-
benchmarks/overheads/benchmark_hashing.py | 63 -
cmake/cpu_extension.cmake | 40 +-
cmake/utils.cmake | 32 +-
collect_env.py | 7 -
csrc/activation_kernels.cu | 153 +-
csrc/attention/attention_generic.cuh | 19 +-
csrc/attention/attention_kernels.cu | 871 ++---
csrc/attention/attention_utils.cuh | 11 +-
csrc/attention/dtype_bfloat16.cuh | 74 +-
csrc/attention/dtype_float16.cuh | 92 +-
csrc/attention/dtype_float32.cuh | 88 +-
csrc/attention/dtype_fp8.cuh | 36 +-
csrc/cache.h | 48 +-
csrc/cache_kernels.cu | 384 +-
csrc/cpu/activation.cpp | 79 +-
csrc/cpu/attention.cpp | 430 +-
csrc/cpu/cache.cpp | 60 +-
csrc/cpu/cpu_types.hpp | 351 +-
csrc/cpu/cpu_types_vsx.hpp | 491 ---
csrc/cpu/cpu_types_x86.hpp | 515 ---
csrc/cpu/layernorm.cpp | 32 +-
csrc/cpu/pos_encoding.cpp | 166 +-
csrc/cpu/pybind.cpp | 73 +
csrc/cpu/torch_bindings.cpp | 110 -
csrc/cuda_compat.h | 17 +-
csrc/cuda_utils.h | 9 +-
csrc/cuda_utils_kernels.cu | 40 +-
csrc/custom_all_reduce.cu | 73 +-
csrc/custom_all_reduce.cuh | 105 +-
csrc/custom_all_reduce_test.cu | 38 +-
csrc/dispatch_utils.h | 44 +-
csrc/layernorm_kernels.cu | 244 +-
csrc/moe/moe_ops.cpp | 7 +
csrc/moe/moe_ops.h | 10 +-
csrc/moe/topk_softmax_kernels.cu | 29 +-
csrc/moe/torch_bindings.cpp | 12 -
csrc/moe_align_block_size_kernels.cu | 213 +-
csrc/ops.h | 328 +-
csrc/pos_encoding_kernels.cu | 235 +-
csrc/punica/bgmv/bgmv_config.h | 60 +-
csrc/punica/bgmv/bgmv_impl.cuh | 154 -
csrc/punica/bgmv/vec_dtypes.cuh | 5 +-
csrc/punica/{punica_ops.cu => punica_ops.cc} | 23 +-
csrc/punica/punica_ops.h | 11 -
csrc/punica/torch_bindings.cpp | 18 -
csrc/punica/type_convert.h | 82 -
csrc/pybind.cpp | 136 +
csrc/quantization/aqlm/gemm_kernels.cu | 538 ++-
csrc/quantization/awq/dequantize.cuh | 138 +-
csrc/quantization/awq/gemm_kernels.cu | 613 ++-
.../compressed_tensors/int8_quant_kernels.cu | 115 -
.../broadcast_load_epilogue_c2x.hpp | 346 --
.../broadcast_load_epilogue_c3x.hpp | 389 --
csrc/quantization/cutlass_w8a8/common.hpp | 27 -
.../cutlass_w8a8/scaled_mm_c2x.cu | 609 ---
.../cutlass_w8a8/scaled_mm_c3x.cu | 557 ---
.../cutlass_w8a8/scaled_mm_entry.cu | 101 -
csrc/quantization/fp8/amd/hip_float8.h | 137 -
csrc/quantization/fp8/amd/hip_float8_impl.h | 316 --
csrc/quantization/fp8/amd/quant_utils.cuh | 575 ---
csrc/quantization/fp8/amd_detail/hip_float8.h | 167 +
.../fp8/amd_detail/hip_float8_impl.h | 316 ++
.../fp8/amd_detail/quant_utils.cuh | 517 +++
csrc/quantization/fp8/common.cu | 165 -
csrc/quantization/fp8/fp8_cuda_kernels.cu | 135 +
csrc/quantization/fp8/nvidia/quant_utils.cuh | 570 ---
.../fp8_e5m2_kvcache/quant_utils.cuh | 277 ++
csrc/quantization/gptq/compat.cuh | 70 +-
csrc/quantization/gptq/matrix_view.cuh | 503 ++-
csrc/quantization/gptq/q_gemm.cu | 3443 +++++++++--------
csrc/quantization/gptq/qdq_2.cuh | 107 +-
csrc/quantization/gptq/qdq_3.cuh | 246 +-
csrc/quantization/gptq/qdq_4.cuh | 203 +-
csrc/quantization/gptq/qdq_8.cuh | 34 +-
csrc/quantization/gptq/qdq_util.cuh | 58 +-
csrc/quantization/gptq_marlin/gptq_marlin.cu | 838 ++--
csrc/quantization/gptq_marlin/gptq_marlin.cuh | 52 +-
.../gptq_marlin/gptq_marlin_dtypes.cuh | 77 -
.../gptq_marlin/gptq_marlin_repack.cu | 94 +-
csrc/quantization/marlin/{dense => }/LICENSE | 0
.../marlin/{dense => }/marlin_cuda_kernel.cu | 462 +--
csrc/quantization/marlin/sparse/LICENSE | 203 -
csrc/quantization/marlin/sparse/common/base.h | 51 -
csrc/quantization/marlin/sparse/common/mem.h | 136 -
csrc/quantization/marlin/sparse/common/mma.h | 191 -
.../marlin/sparse/marlin_24_cuda_kernel.cu | 1125 ------
.../squeezellm/quant_cuda_kernel.cu | 64 +-
csrc/reduction_utils.cuh | 72 +-
csrc/registration.h | 22 -
csrc/torch_bindings.cpp | 293 --
docs/requirements-docs.txt | 9 +-
docs/source/automatic_prefix_caching/apc.rst | 110 -
.../automatic_prefix_caching/details.md | 43 -
docs/source/community/meetups.rst | 13 -
docs/source/community/sponsors.md | 27 -
docs/source/conf.py | 17 +-
docs/source/dev/dockerfile/dockerfile.rst | 22 +-
.../input_processing_pipeline.rst | 20 -
.../input_processing/model_inputs_index.rst | 39 -
.../dev/multimodal/multimodal_index.rst | 47 -
docs/source/dev/offline_inference/llm.rst | 6 -
.../dev/offline_inference/llm_inputs.rst | 14 -
.../dev/offline_inference/offline_index.rst | 8 -
docs/source/dev/sampling_params.rst | 4 +-
.../getting_started/amd-installation.rst | 6 +-
.../getting_started/cpu-installation.rst | 25 +-
docs/source/getting_started/debugging.rst | 65 -
docs/source/getting_started/installation.rst | 4 -
.../getting_started/openvino-installation.rst | 95 -
.../getting_started/tpu-installation.rst | 93 -
.../getting_started/xpu-installation.rst | 61 -
docs/source/index.rst | 31 +-
docs/source/models/adding_model.rst | 4 +-
docs/source/models/lora.rst | 3 -
docs/source/models/performance.rst | 55 +-
docs/source/models/spec_decode.rst | 77 -
docs/source/models/supported_models.rst | 51 +-
docs/source/models/vlm.rst | 148 -
docs/source/quantization/fp8.rst | 208 -
.../quantization/supported_hardware.rst | 30 -
.../serving/deploying_with_cerebrium.rst | 109 -
docs/source/serving/deploying_with_docker.rst | 9 +-
docs/source/serving/deploying_with_dstack.rst | 103 -
docs/source/serving/deploying_with_lws.rst | 12 -
docs/source/serving/distributed_serving.rst | 17 +-
docs/source/serving/env_vars.rst | 5 -
docs/source/serving/faq.rst | 12 -
docs/source/serving/integrations.rst | 3 -
.../serving/openai_compatible_server.md | 17 +-
docs/source/serving/tensorizer.rst | 12 -
examples/api_client.py | 7 +-
examples/aqlm_example.py | 7 +-
examples/fp8/extract_scales.py | 20 +-
examples/fp8/quantizer/quantize.py | 16 +-
examples/llava_example.py | 58 +-
examples/llava_next_example.py | 47 -
examples/llm_engine_example.py | 3 +-
examples/lora_with_quantization_inference.py | 140 -
examples/offline_inference_arctic.py | 26 -
examples/offline_inference_distributed.py | 54 +-
examples/offline_inference_embedding.py | 17 -
examples/offline_inference_mlpspeculator.py | 58 -
examples/offline_inference_neuron.py | 0
examples/offline_inference_openai.md | 172 -
examples/offline_inference_with_prefix.py | 51 +-
examples/openai_embedding_client.py | 23 -
examples/openai_example_batch.jsonl | 2 -
examples/openai_vision_api_client.py | 89 -
examples/phi3v_example.py | 62 -
examples/production_monitoring/Otel.md | 82 -
examples/production_monitoring/README.md | 3 +-
.../production_monitoring/dummy_client.py | 35 -
examples/production_monitoring/grafana.json | 432 +--
examples/save_sharded_state.py | 75 -
examples/template_llava.jinja | 23 -
examples/tensorize_vllm_model.py | 274 +-
format.sh | 78 +-
requirements-common.txt | 11 +-
requirements-cpu.txt | 5 +-
requirements-cuda.txt | 4 +-
requirements-dev.txt | 36 +-
requirements-lint.txt | 14 -
requirements-openvino.txt | 9 -
requirements-rocm.txt | 3 +-
requirements-test.txt | 24 -
requirements-tpu.txt | 7 -
requirements-xpu.txt | 11 -
setup.py | 89 +-
tests/async_engine/__init__.py | 0
tests/async_engine/api_server_async_engine.py | 4 +-
tests/async_engine/test_api_server.py | 2 +
tests/async_engine/test_async_llm_engine.py | 40 +-
tests/async_engine/test_chat_template.py | 30 +-
.../test_merge_async_iterators.py | 41 +
tests/async_engine/test_openapi_server_ray.py | 106 +-
tests/basic_correctness/__init__.py | 0
.../test_basic_correctness.py | 56 +-
.../basic_correctness/test_chunked_prefill.py | 43 +-
tests/basic_correctness/test_preemption.py | 207 +-
tests/conftest.py | 412 +-
tests/core/block/e2e/__init__.py | 0
tests/core/block/e2e/conftest.py | 29 +-
tests/core/block/e2e/test_correctness.py | 128 +-
.../e2e/test_correctness_sliding_window.py | 168 -
tests/core/block/test_block_manager_v2.py | 279 +-
tests/core/block/test_block_table.py | 13 +-
.../block/test_cpu_gpu_block_allocator.py | 24 +-
tests/core/block/test_naive_block.py | 6 +-
tests/core/block/test_prefix_caching_block.py | 191 +-
tests/core/test_block_manager.py | 250 +-
tests/core/test_chunked_prefill_scheduler.py | 51 +-
tests/core/test_scheduler.py | 81 +-
tests/core/utils.py | 128 +-
tests/distributed/__init__.py | 0
.../test_basic_distributed_correctness.py | 60 +-
.../test_chunked_prefill_distributed.py | 53 +-
tests/distributed/test_comm_ops.py | 99 +-
tests/distributed/test_custom_all_reduce.py | 95 +-
.../distributed/test_multimodal_broadcast.py | 51 -
tests/distributed/test_parallel_state.py | 57 -
tests/distributed/test_pynccl.py | 180 +-
tests/distributed/test_pynccl_library.py | 46 +
tests/distributed/test_same_node.py | 11 -
tests/distributed/test_shm_broadcast.py | 99 -
tests/distributed/test_utils.py | 38 -
tests/engine/__init__.py | 0
tests/engine/output_processor/__init__.py | 0
.../output_processor/test_multi_step.py | 15 +-
.../output_processor/test_stop_checker.py | 85 -
tests/engine/test_computed_prefix_blocks.py | 2 +
tests/engine/test_skip_tokenizer_init.py | 4 +-
tests/engine/test_stop_reason.py | 13 +-
tests/engine/test_stop_strings.py | 6 +-
tests/entrypoints/__init__.py | 0
tests/entrypoints/llm/__init__.py | 0
tests/entrypoints/llm/test_encode.py | 142 -
tests/entrypoints/llm/test_generate.py | 142 -
.../llm/test_generate_multiple_loras.py | 67 -
tests/entrypoints/openai/__init__.py | 0
tests/entrypoints/openai/test_completion.py | 648 ----
tests/entrypoints/openai/test_embedding.py | 144 -
tests/entrypoints/openai/test_models.py | 69 -
tests/entrypoints/openai/test_run_batch.py | 53 -
tests/entrypoints/openai/test_serving_chat.py | 9 +-
tests/entrypoints/openai/test_vision.py | 276 --
.../{openai => }/test_guided_processors.py | 2 +
tests/entrypoints/test_llm_generate.py | 41 +
.../test_chat.py => test_openai_server.py} | 882 +++--
...ion.py => test_server_oot_registration.py} | 12 +-
tests/kernels/__init__.py | 0
tests/kernels/test_activation.py | 32 +-
tests/kernels/test_attention.py | 128 +-
tests/kernels/test_attention_selector.py | 82 -
tests/kernels/test_blocksparse_attention.py | 442 ---
tests/kernels/test_cache.py | 253 +-
tests/kernels/test_cutlass.py | 296 --
tests/kernels/test_flash_attn.py | 208 -
tests/kernels/test_int8_quant.py | 66 -
tests/kernels/test_layernorm.py | 20 +-
tests/kernels/test_marlin_gemm.py | 219 --
tests/kernels/test_moe.py | 5 +-
tests/kernels/test_pos_encoding.py | 75 +-
tests/kernels/test_prefix_prefill.py | 253 +-
tests/kernels/test_rand.py | 2 +
tests/kernels/test_sampler.py | 4 +
tests/kernels/utils.py | 22 -
tests/lora/conftest.py | 116 +-
tests/lora/data/__init__.py | 0
tests/lora/data/long_context_test_data.py | 119 -
tests/lora/test_baichuan.py | 23 +-
tests/lora/test_chatglm3.py | 8 +-
tests/lora/test_gemma.py | 8 +-
tests/lora/test_layer_variation.py | 8 +-
tests/lora/test_layers.py | 349 +-
tests/lora/test_llama.py | 27 +-
tests/lora/test_long_context.py | 296 --
tests/lora/test_lora.py | 4 +
tests/lora/test_lora_checkpoints.py | 4 +-
tests/lora/test_lora_manager.py | 79 +-
tests/lora/test_mixtral.py | 13 +-
tests/lora/test_phi.py | 69 -
tests/lora/test_punica.py | 31 +-
tests/lora/test_quant_model.py | 9 +-
tests/lora/test_utils.py | 14 +-
tests/lora/test_worker.py | 4 +
tests/lora/utils.py | 18 +-
tests/metrics/__init__.py | 0
tests/metrics/test_metrics.py | 94 +-
tests/model_executor/__init__.py | 0
tests/models/__init__.py | 0
tests/models/test_aqlm.py | 24 +-
tests/models/test_big_models.py | 60 +-
tests/models/test_compressed_tensors.py | 49 -
tests/models/test_embedding.py | 42 -
tests/models/test_fp8.py | 105 +-
tests/models/test_gptq_marlin.py | 71 +-
tests/models/test_gptq_marlin_24.py | 71 -
tests/models/test_llava.py | 204 +-
tests/models/test_llava_next.py | 124 -
tests/models/test_marlin.py | 38 +-
tests/models/test_mistral.py | 41 +-
tests/models/test_models.py | 40 +-
tests/models/test_oot_registration.py | 1 +
tests/models/test_phi3v.py | 162 -
tests/models/test_registry.py | 9 -
tests/models/utils.py | 40 +-
tests/multimodal/__init__.py | 0
tests/multimodal/test_mapper.py | 116 -
tests/multimodal/test_utils.py | 75 -
tests/prefix_caching/__init__.py | 0
.../test_disable_sliding_window.py | 44 -
tests/prefix_caching/test_prefix_caching.py | 5 +-
tests/quantization/__init__.py | 0
tests/quantization/test_bitsandbytes.py | 76 -
tests/quantization/test_compressed_tensors.py | 124 -
tests/quantization/test_configs.py | 8 +-
tests/quantization/test_fp8.py | 91 +-
tests/quantization/utils.py | 15 -
tests/samplers/__init__.py | 0
tests/samplers/test_beam_search.py | 22 +-
tests/samplers/test_ignore_eos.py | 24 +-
tests/samplers/test_logits_processor.py | 87 +-
tests/samplers/test_logprobs.py | 78 +-
tests/samplers/test_ranks.py | 38 +-
tests/samplers/test_rejection_sampler.py | 37 +-
tests/samplers/test_sampler.py | 219 +-
tests/samplers/test_seeded_generate.py | 13 +-
.../test_typical_acceptance_sampler.py | 496 ---
tests/spec_decode/e2e/conftest.py | 60 +-
tests/spec_decode/e2e/test_compatibility.py | 55 +
tests/spec_decode/e2e/test_integration.py | 44 -
.../e2e/test_integration_dist_tp2.py | 111 -
.../e2e/test_integration_dist_tp4.py | 60 -
tests/spec_decode/e2e/test_logprobs.py | 6 +
tests/spec_decode/e2e/test_mlp_correctness.py | 216 --
.../e2e/test_multistep_correctness.py | 100 +-
.../spec_decode/e2e/test_ngram_correctness.py | 48 +-
tests/spec_decode/test_batch_expansion.py | 16 +-
tests/spec_decode/test_dynamic_spec_decode.py | 83 -
tests/spec_decode/test_metrics.py | 100 +-
tests/spec_decode/test_multi_step_worker.py | 49 +-
tests/spec_decode/test_ngram_worker.py | 43 +-
tests/spec_decode/test_spec_decode_worker.py | 188 +-
tests/spec_decode/test_utils.py | 26 +-
tests/spec_decode/utils.py | 23 +-
.../tensorize_vllm_model_for_testing.py | 245 ++
tests/tensorizer_loader/test_tensorizer.py | 358 +-
tests/test_cache_block_hashing.py | 12 +-
tests/test_config.py | 86 +-
tests/test_inputs.py | 53 -
tests/test_logger.py | 1 -
tests/test_logits_processor.py | 36 +-
tests/test_regression.py | 21 -
tests/test_sequence.py | 43 +-
tests/test_sharded_state_loader.py | 127 -
tests/test_utils.py | 191 -
tests/tokenization/test_detokenize.py | 19 +-
tests/tokenization/test_get_eos.py | 31 -
tests/tokenization/test_tokenizer_group.py | 99 -
tests/tracing/__init__.py | 0
tests/tracing/test_tracing.py | 116 -
tests/utils.py | 224 --
tests/worker/test_model_input.py | 152 -
tests/worker/test_model_runner.py | 268 +-
tests/worker/test_swap.py | 28 +-
vllm/__init__.py | 14 +-
vllm/_custom_ops.py | 393 +-
vllm/_ipex_ops.py | 244 --
vllm/attention/__init__.py | 5 +-
vllm/attention/backends/abstract.py | 85 +-
vllm/attention/backends/blocksparse_attn.py | 410 --
vllm/attention/backends/flash_attn.py | 256 +-
vllm/attention/backends/flashinfer.py | 159 +-
vllm/attention/backends/habana_attn.py | 57 +-
vllm/attention/backends/ipex_attn.py | 355 --
vllm/attention/backends/openvino.py | 101 -
vllm/attention/backends/pallas.py | 237 --
vllm/attention/backends/rocm_flash_attn.py | 204 +-
vllm/attention/backends/torch_sdpa.py | 106 +-
vllm/attention/backends/xformers.py | 129 +-
vllm/attention/layer.py | 67 +-
.../ops/blocksparse_attention/__init__.py | 0
.../blocksparse_attention_kernel.py | 423 --
.../ops/blocksparse_attention/interface.py | 238 --
.../ops/blocksparse_attention/utils.py | 216 --
vllm/attention/ops/ipex_attn.py | 120 -
vllm/attention/ops/paged_attn.py | 41 +-
vllm/attention/ops/prefix_prefill.py | 51 +-
vllm/attention/ops/triton_flash_attention.py | 10 -
vllm/attention/selector.py | 180 +-
vllm/block.py | 43 +
vllm/config.py | 611 +--
vllm/core/block/block_table.py | 121 +-
vllm/core/block/common.py | 199 +-
vllm/core/block/cpu_gpu_block_allocator.py | 210 +-
vllm/core/block/interfaces.py | 97 +-
vllm/core/block/naive_block.py | 266 +-
vllm/core/block/prefix_caching_block.py | 747 +---
vllm/core/block/utils.py | 56 -
vllm/core/block_manager_v1.py | 210 +-
vllm/core/block_manager_v2.py | 319 +-
vllm/core/embedding_model_block_manager.py | 83 -
vllm/core/evictor_v1.py | 5 +-
vllm/core/evictor_v2.py | 5 +-
vllm/core/interfaces.py | 12 +-
vllm/core/scheduler.py | 130 +-
vllm/distributed/communication_op.py | 261 +-
.../device_communicators/cuda_wrapper.py | 146 -
.../device_communicators/custom_all_reduce.py | 366 +-
.../custom_all_reduce_utils.py | 244 --
.../device_communicators/pynccl.py | 331 +-
.../device_communicators/pynccl_utils.py | 66 +
.../device_communicators/pynccl_wrapper.py | 278 --
.../device_communicators/shm_broadcast.py | 295 --
vllm/distributed/parallel_state.py | 1090 +-----
vllm/distributed/utils.py | 90 +-
vllm/engine/arg_utils.py | 400 +-
vllm/engine/async_llm_engine.py | 380 +-
vllm/engine/async_timeout.py | 189 -
vllm/engine/llm_engine.py | 553 +--
vllm/engine/metrics.py | 302 +-
vllm/engine/output_processor/multi_step.py | 6 +-
vllm/engine/output_processor/single_step.py | 22 +-
vllm/engine/output_processor/stop_checker.py | 26 +-
vllm/engine/output_processor/util.py | 10 +-
vllm/entrypoints/api_server.py | 5 +-
vllm/entrypoints/llm.py | 469 +--
vllm/entrypoints/openai/api_server.py | 83 +-
vllm/entrypoints/openai/cli_args.py | 3 +-
vllm/entrypoints/openai/protocol.py | 345 +-
vllm/entrypoints/openai/run_batch.py | 146 -
vllm/entrypoints/openai/serving_chat.py | 408 +-
vllm/entrypoints/openai/serving_completion.py | 191 +-
vllm/entrypoints/openai/serving_embedding.py | 144 -
vllm/entrypoints/openai/serving_engine.py | 166 +-
vllm/envs.py | 68 +-
vllm/executor/cpu_executor.py | 3 -
vllm/executor/distributed_gpu_executor.py | 141 +-
vllm/executor/executor_base.py | 16 +-
vllm/executor/gpu_executor.py | 83 +-
vllm/executor/habana_executor.py | 5 +-
vllm/executor/multiproc_gpu_executor.py | 159 -
vllm/executor/multiproc_worker_utils.py | 9 +-
vllm/executor/neuron_executor.py | 12 +-
vllm/executor/openvino_executor.py | 163 -
vllm/executor/ray_gpu_executor.py | 141 +-
vllm/executor/ray_habana_executor.py | 90 +-
vllm/executor/ray_utils.py | 8 +-
vllm/executor/ray_xpu_executor.py | 401 --
vllm/executor/tpu_executor.py | 120 -
vllm/executor/xpu_executor.py | 98 -
vllm/hpu/attn_bias.py | 764 ++++
vllm/hpu/ops.py | 77 +-
vllm/hpu/utils.py | 83 +-
vllm/hpu/xops.py | 41 +
vllm/inputs/__init__.py | 19 -
vllm/inputs/data.py | 143 -
vllm/inputs/registry.py | 208 -
vllm/logger.py | 3 +-
vllm/lora/fully_sharded_layers.py | 58 +-
vllm/lora/layers.py | 223 +-
vllm/lora/lora.py | 6 +-
vllm/lora/models.py | 285 +-
vllm/lora/punica.py | 48 +-
vllm/lora/request.py | 2 -
vllm/lora/utils.py | 39 +-
vllm/lora/worker_manager.py | 59 +-
vllm/model_executor/custom_op.py | 63 -
.../guided_decoding/__init__.py | 30 +-
.../guided_decoding/outlines_decoding.py | 31 +-
.../outlines_logits_processors.py | 62 +-
vllm/model_executor/layers/activation.py | 97 +-
.../layers/fused_moe/__init__.py | 5 +-
...280,device_name=NVIDIA_A100-SXM4-80GB.json | 146 -
...280,device_name=NVIDIA_H100_80GB_HBM3.json | 146 -
...640,device_name=NVIDIA_A100-SXM4-80GB.json | 146 -
...640,device_name=NVIDIA_H100_80GB_HBM3.json | 146 -
...14336,device_name=AMD_Instinct_MI300X.json | 200 -
...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 138 -
...=1792,device_name=AMD_Instinct_MI300X.json | 200 -
...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 146 -
...=3584,device_name=AMD_Instinct_MI300X.json | 200 -
...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 116 +-
...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 146 -
...=7168,device_name=AMD_Instinct_MI300X.json | 200 -
...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 88 +-
...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 146 -
.../layers/fused_moe/fused_moe.py | 358 +-
vllm/model_executor/layers/layernorm.py | 105 +-
vllm/model_executor/layers/linear.py | 304 +-
.../model_executor/layers/logits_processor.py | 32 +-
vllm/model_executor/layers/pooler.py | 56 -
.../layers/quantization/__init__.py | 18 +-
.../layers/quantization/aqlm.py | 2 +-
.../model_executor/layers/quantization/awq.py | 3 +-
.../layers/quantization/base_config.py | 14 +-
.../layers/quantization/bitsandbytes.py | 175 -
.../compressed_tensors/__init__.py | 0
.../compressed_tensors/compressed_tensors.py | 235 --
.../compressed_tensors/schemes/__init__.py | 11 -
.../schemes/compressed_tensors_scheme.py | 41 -
.../schemes/compressed_tensors_unquantized.py | 42 -
.../schemes/compressed_tensors_w4a16_24.py | 138 -
.../schemes/compressed_tensors_w8a8.py | 77 -
.../compressed_tensors_w8a8_dynamictoken.py | 33 -
.../compressed_tensors_w8a8_statictensor.py | 47 -
.../schemes/compressed_tensors_wNa16.py | 175 -
.../quantization/compressed_tensors/utils.py | 122 -
.../layers/quantization/deepspeedfp.py | 194 -
.../model_executor/layers/quantization/fp8.py | 190 +-
.../layers/quantization/gptq_marlin.py | 57 +-
.../layers/quantization/gptq_marlin_24.py | 291 --
.../layers/quantization/marlin.py | 22 -
.../layers/quantization/squeezellm.py | 3 +-
.../layers/quantization/utils/__init__.py | 0
.../layers/quantization/utils/format_24.py | 308 --
.../quantization/utils/marlin_24_perms.py | 60 -
.../layers/quantization/utils/marlin_perms.py | 60 -
.../layers/quantization/utils/marlin_utils.py | 225 --
.../layers/quantization/utils/quant_utils.py | 146 -
.../layers/rejection_sampler.py | 192 +-
.../model_executor/layers/rotary_embedding.py | 369 +-
vllm/model_executor/layers/sampler.py | 157 +-
.../layers/spec_decode_base_sampler.py | 219 --
.../layers/typical_acceptance_sampler.py | 184 -
.../layers/vocab_parallel_embedding.py | 264 +-
vllm/model_executor/model_loader/__init__.py | 19 +-
vllm/model_executor/model_loader/loader.py | 592 +--
vllm/model_executor/model_loader/openvino.py | 210 -
.../model_executor/model_loader/tensorizer.py | 187 +-
.../model_loader/weight_utils.py | 95 +-
vllm/model_executor/models/__init__.py | 24 +-
vllm/model_executor/models/arctic.py | 532 ---
vllm/model_executor/models/baichuan.py | 42 +-
vllm/model_executor/models/bloom.py | 16 +-
vllm/model_executor/models/chatglm.py | 42 +-
vllm/model_executor/models/clip.py | 248 --
vllm/model_executor/models/commandr.py | 73 +-
vllm/model_executor/models/dbrx.py | 31 +-
vllm/model_executor/models/decilm.py | 8 +-
vllm/model_executor/models/deepseek.py | 17 +-
vllm/model_executor/models/deepseek_v2.py | 534 ---
vllm/model_executor/models/falcon.py | 77 +-
vllm/model_executor/models/gemma.py | 52 +-
vllm/model_executor/models/gemma2.py | 401 --
vllm/model_executor/models/gpt2.py | 17 +-
vllm/model_executor/models/gpt_bigcode.py | 57 +-
vllm/model_executor/models/gpt_j.py | 21 +-
vllm/model_executor/models/gpt_neox.py | 17 +-
vllm/model_executor/models/interfaces.py | 144 -
vllm/model_executor/models/internlm2.py | 14 +-
vllm/model_executor/models/jais.py | 23 +-
vllm/model_executor/models/llama.py | 105 +-
vllm/model_executor/models/llama_embedding.py | 87 -
vllm/model_executor/models/llava.py | 218 +-
vllm/model_executor/models/llava_next.py | 492 ---
vllm/model_executor/models/minicpm.py | 24 +-
vllm/model_executor/models/mixtral.py | 188 +-
vllm/model_executor/models/mixtral_quant.py | 44 +-
vllm/model_executor/models/mlp_speculator.py | 188 -
vllm/model_executor/models/mpt.py | 17 +-
vllm/model_executor/models/olmo.py | 15 +-
vllm/model_executor/models/opt.py | 17 +-
vllm/model_executor/models/orion.py | 14 +-
vllm/model_executor/models/phi.py | 62 +-
vllm/model_executor/models/phi3_small.py | 447 ---
vllm/model_executor/models/phi3v.py | 456 ---
vllm/model_executor/models/qwen.py | 26 +-
vllm/model_executor/models/qwen2.py | 62 +-
vllm/model_executor/models/qwen2_moe.py | 28 +-
vllm/model_executor/models/stablelm.py | 15 +-
vllm/model_executor/models/starcoder2.py | 31 +-
vllm/model_executor/models/xverse.py | 26 +-
vllm/model_executor/pooling_metadata.py | 69 -
vllm/model_executor/sampling_metadata.py | 92 +-
vllm/multimodal/__init__.py | 18 -
vllm/multimodal/base.py | 121 -
vllm/multimodal/image.py | 44 -
vllm/multimodal/registry.py | 111 -
vllm/multimodal/utils.py | 94 -
vllm/outputs.py | 112 +-
vllm/pooling_params.py | 20 -
vllm/sampling_params.py | 44 +-
vllm/sequence.py | 385 +-
vllm/spec_decode/batch_expansion.py | 68 +-
vllm/spec_decode/draft_model_runner.py | 170 -
vllm/spec_decode/interfaces.py | 6 +-
vllm/spec_decode/metrics.py | 24 +-
vllm/spec_decode/mlp_speculator_worker.py | 86 -
vllm/spec_decode/multi_step_worker.py | 65 +-
vllm/spec_decode/ngram_worker.py | 129 +-
vllm/spec_decode/proposer_worker_base.py | 44 -
.../spec_decode/smaller_tp_proposer_worker.py | 149 -
vllm/spec_decode/spec_decode_worker.py | 328 +-
vllm/spec_decode/top1_proposer.py | 139 +-
vllm/spec_decode/util.py | 26 +-
vllm/test_utils.py | 41 +
vllm/tracing.py | 104 -
vllm/transformers_utils/config.py | 56 +-
vllm/transformers_utils/configs/__init__.py | 2 -
vllm/transformers_utils/configs/arctic.py | 204 -
vllm/transformers_utils/configs/chatglm.py | 2 -
.../configs/mlp_speculator.py | 65 -
vllm/transformers_utils/detokenizer.py | 2 +-
vllm/transformers_utils/image_processor.py | 37 -
.../tokenizer_group/base_tokenizer_group.py | 4 -
.../tokenizer_group/ray_tokenizer_group.py | 116 +-
.../tokenizer_group/tokenizer_group.py | 20 +-
vllm/usage/usage_lib.py | 5 +-
vllm/utils.py | 404 +-
vllm/version.py | 1 -
vllm/worker/cache_engine.py | 50 +-
vllm/worker/cpu_model_runner.py | 250 +-
vllm/worker/cpu_worker.py | 97 +-
vllm/worker/embedding_model_runner.py | 152 -
vllm/worker/habana_model_runner.py | 111 +-
vllm/worker/habana_worker.py | 43 +-
vllm/worker/model_runner.py | 1615 ++++----
vllm/worker/model_runner_base.py | 158 -
vllm/worker/neuron_model_runner.py | 71 +-
vllm/worker/neuron_worker.py | 39 +-
vllm/worker/openvino_model_runner.py | 330 --
vllm/worker/openvino_worker.py | 353 --
vllm/worker/tpu_model_runner.py | 583 ---
vllm/worker/tpu_worker.py | 291 --
vllm/worker/worker.py | 199 +-
vllm/worker/worker_base.py | 202 +-
vllm/worker/xpu_model_runner.py | 475 ---
vllm/worker/xpu_worker.py | 193 -
669 files changed, 19646 insertions(+), 64037 deletions(-)
delete mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
delete mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
delete mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
delete mode 100644 .buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
delete mode 100644 .buildkite/lm-eval-harness/configs/models-large.txt
delete mode 100644 .buildkite/lm-eval-harness/configs/models-small.txt
delete mode 100644 .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
delete mode 100644 .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
delete mode 100644 .buildkite/lm-eval-harness/run-tests.sh
delete mode 100644 .buildkite/lm-eval-harness/test_lm_eval_correctness.py
delete mode 100644 .buildkite/nightly-benchmarks/README.md
delete mode 100644 .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
delete mode 100755 .buildkite/nightly-benchmarks/kickoff-pipeline.sh
delete mode 100644 .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
delete mode 100644 .buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
delete mode 100644 .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
delete mode 100644 .buildkite/nightly-benchmarks/tests/descriptions.md
delete mode 100644 .buildkite/nightly-benchmarks/tests/latency-tests.json
delete mode 100644 .buildkite/nightly-benchmarks/tests/serving-tests.json
delete mode 100644 .buildkite/nightly-benchmarks/tests/throughput-tests.json
delete mode 100644 .buildkite/release-pipeline.yaml
delete mode 100755 .buildkite/run-openvino-test.sh
delete mode 100644 .buildkite/run-xpu-test.sh
create mode 100644 .buildkite/test-template.j2
delete mode 100644 .clang-format
delete mode 100644 .github/workflows/clang-format.yml
delete mode 100644 Dockerfile.openvino
delete mode 100644 Dockerfile.ppc64le
delete mode 100644 Dockerfile.tpu
delete mode 100644 Dockerfile.xpu
delete mode 100644 benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
delete mode 100644 benchmarks/cutlass_benchmarks/weight_shapes.py
delete mode 100644 benchmarks/kernels/benchmark_marlin.py
create mode 100644 benchmarks/kernels/benchmark_mixtral_moe.py
delete mode 100644 benchmarks/kernels/benchmark_moe.py
delete mode 100644 benchmarks/kernels/benchmark_shapes.py
delete mode 100644 benchmarks/overheads/benchmark_hashing.py
delete mode 100644 csrc/cpu/cpu_types_vsx.hpp
delete mode 100644 csrc/cpu/cpu_types_x86.hpp
create mode 100644 csrc/cpu/pybind.cpp
delete mode 100644 csrc/cpu/torch_bindings.cpp
create mode 100644 csrc/moe/moe_ops.cpp
delete mode 100644 csrc/moe/torch_bindings.cpp
rename csrc/punica/{punica_ops.cu => punica_ops.cc} (97%)
delete mode 100644 csrc/punica/punica_ops.h
delete mode 100644 csrc/punica/torch_bindings.cpp
delete mode 100644 csrc/punica/type_convert.h
create mode 100644 csrc/pybind.cpp
delete mode 100644 csrc/quantization/compressed_tensors/int8_quant_kernels.cu
delete mode 100644 csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp
delete mode 100644 csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp
delete mode 100644 csrc/quantization/cutlass_w8a8/common.hpp
delete mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
delete mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
delete mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
delete mode 100644 csrc/quantization/fp8/amd/hip_float8.h
delete mode 100644 csrc/quantization/fp8/amd/hip_float8_impl.h
delete mode 100644 csrc/quantization/fp8/amd/quant_utils.cuh
create mode 100644 csrc/quantization/fp8/amd_detail/hip_float8.h
create mode 100644 csrc/quantization/fp8/amd_detail/hip_float8_impl.h
create mode 100644 csrc/quantization/fp8/amd_detail/quant_utils.cuh
delete mode 100644 csrc/quantization/fp8/common.cu
create mode 100644 csrc/quantization/fp8/fp8_cuda_kernels.cu
delete mode 100644 csrc/quantization/fp8/nvidia/quant_utils.cuh
create mode 100644 csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh
delete mode 100644 csrc/quantization/gptq_marlin/gptq_marlin_dtypes.cuh
rename csrc/quantization/marlin/{dense => }/LICENSE (100%)
rename csrc/quantization/marlin/{dense => }/marlin_cuda_kernel.cu (73%)
delete mode 100644 csrc/quantization/marlin/sparse/LICENSE
delete mode 100644 csrc/quantization/marlin/sparse/common/base.h
delete mode 100644 csrc/quantization/marlin/sparse/common/mem.h
delete mode 100644 csrc/quantization/marlin/sparse/common/mma.h
delete mode 100644 csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
delete mode 100644 csrc/registration.h
delete mode 100644 csrc/torch_bindings.cpp
delete mode 100644 docs/source/automatic_prefix_caching/apc.rst
delete mode 100644 docs/source/automatic_prefix_caching/details.md
delete mode 100644 docs/source/community/meetups.rst
delete mode 100644 docs/source/community/sponsors.md
delete mode 100644 docs/source/dev/input_processing/input_processing_pipeline.rst
delete mode 100644 docs/source/dev/input_processing/model_inputs_index.rst
delete mode 100644 docs/source/dev/multimodal/multimodal_index.rst
delete mode 100644 docs/source/dev/offline_inference/llm.rst
delete mode 100644 docs/source/dev/offline_inference/llm_inputs.rst
delete mode 100644 docs/source/dev/offline_inference/offline_index.rst
delete mode 100644 docs/source/getting_started/debugging.rst
delete mode 100644 docs/source/getting_started/openvino-installation.rst
delete mode 100644 docs/source/getting_started/tpu-installation.rst
delete mode 100644 docs/source/getting_started/xpu-installation.rst
delete mode 100644 docs/source/models/spec_decode.rst
delete mode 100644 docs/source/models/vlm.rst
delete mode 100644 docs/source/quantization/fp8.rst
delete mode 100644 docs/source/quantization/supported_hardware.rst
delete mode 100644 docs/source/serving/deploying_with_cerebrium.rst
delete mode 100644 docs/source/serving/deploying_with_dstack.rst
delete mode 100644 docs/source/serving/deploying_with_lws.rst
delete mode 100644 docs/source/serving/faq.rst
delete mode 100644 docs/source/serving/tensorizer.rst
delete mode 100644 examples/llava_next_example.py
delete mode 100644 examples/lora_with_quantization_inference.py
delete mode 100644 examples/offline_inference_arctic.py
delete mode 100644 examples/offline_inference_embedding.py
delete mode 100644 examples/offline_inference_mlpspeculator.py
mode change 100644 => 100755 examples/offline_inference_neuron.py
delete mode 100644 examples/offline_inference_openai.md
delete mode 100644 examples/openai_embedding_client.py
delete mode 100644 examples/openai_example_batch.jsonl
delete mode 100644 examples/openai_vision_api_client.py
delete mode 100644 examples/phi3v_example.py
delete mode 100644 examples/production_monitoring/Otel.md
delete mode 100644 examples/production_monitoring/dummy_client.py
delete mode 100644 examples/save_sharded_state.py
delete mode 100644 examples/template_llava.jinja
delete mode 100644 requirements-lint.txt
delete mode 100644 requirements-openvino.txt
delete mode 100644 requirements-test.txt
delete mode 100644 requirements-tpu.txt
delete mode 100644 requirements-xpu.txt
delete mode 100644 tests/async_engine/__init__.py
create mode 100644 tests/async_engine/test_merge_async_iterators.py
delete mode 100644 tests/basic_correctness/__init__.py
delete mode 100644 tests/core/block/e2e/__init__.py
delete mode 100644 tests/core/block/e2e/test_correctness_sliding_window.py
delete mode 100644 tests/distributed/__init__.py
delete mode 100644 tests/distributed/test_multimodal_broadcast.py
delete mode 100644 tests/distributed/test_parallel_state.py
create mode 100644 tests/distributed/test_pynccl_library.py
delete mode 100644 tests/distributed/test_same_node.py
delete mode 100644 tests/distributed/test_shm_broadcast.py
delete mode 100644 tests/distributed/test_utils.py
delete mode 100644 tests/engine/__init__.py
delete mode 100644 tests/engine/output_processor/__init__.py
delete mode 100644 tests/engine/output_processor/test_stop_checker.py
delete mode 100644 tests/entrypoints/__init__.py
delete mode 100644 tests/entrypoints/llm/__init__.py
delete mode 100644 tests/entrypoints/llm/test_encode.py
delete mode 100644 tests/entrypoints/llm/test_generate.py
delete mode 100644 tests/entrypoints/llm/test_generate_multiple_loras.py
delete mode 100644 tests/entrypoints/openai/__init__.py
delete mode 100644 tests/entrypoints/openai/test_completion.py
delete mode 100644 tests/entrypoints/openai/test_embedding.py
delete mode 100644 tests/entrypoints/openai/test_models.py
delete mode 100644 tests/entrypoints/openai/test_run_batch.py
delete mode 100644 tests/entrypoints/openai/test_vision.py
rename tests/entrypoints/{openai => }/test_guided_processors.py (98%)
create mode 100644 tests/entrypoints/test_llm_generate.py
rename tests/entrypoints/{openai/test_chat.py => test_openai_server.py} (50%)
rename tests/entrypoints/{openai/test_oot_registration.py => test_server_oot_registration.py} (84%)
delete mode 100644 tests/kernels/__init__.py
delete mode 100644 tests/kernels/test_attention_selector.py
delete mode 100644 tests/kernels/test_blocksparse_attention.py
delete mode 100644 tests/kernels/test_cutlass.py
delete mode 100644 tests/kernels/test_flash_attn.py
delete mode 100644 tests/kernels/test_int8_quant.py
delete mode 100644 tests/kernels/test_marlin_gemm.py
delete mode 100644 tests/kernels/utils.py
delete mode 100644 tests/lora/data/__init__.py
delete mode 100644 tests/lora/data/long_context_test_data.py
delete mode 100644 tests/lora/test_long_context.py
delete mode 100644 tests/lora/test_phi.py
delete mode 100644 tests/metrics/__init__.py
delete mode 100644 tests/model_executor/__init__.py
delete mode 100644 tests/models/__init__.py
delete mode 100644 tests/models/test_compressed_tensors.py
delete mode 100644 tests/models/test_embedding.py
delete mode 100644 tests/models/test_gptq_marlin_24.py
delete mode 100644 tests/models/test_llava_next.py
delete mode 100644 tests/models/test_phi3v.py
delete mode 100644 tests/models/test_registry.py
delete mode 100644 tests/multimodal/__init__.py
delete mode 100644 tests/multimodal/test_mapper.py
delete mode 100644 tests/multimodal/test_utils.py
delete mode 100644 tests/prefix_caching/__init__.py
delete mode 100644 tests/prefix_caching/test_disable_sliding_window.py
delete mode 100644 tests/quantization/__init__.py
delete mode 100644 tests/quantization/test_bitsandbytes.py
delete mode 100644 tests/quantization/test_compressed_tensors.py
delete mode 100644 tests/quantization/utils.py
delete mode 100644 tests/samplers/__init__.py
delete mode 100644 tests/samplers/test_typical_acceptance_sampler.py
delete mode 100644 tests/spec_decode/e2e/test_integration.py
delete mode 100644 tests/spec_decode/e2e/test_integration_dist_tp2.py
delete mode 100644 tests/spec_decode/e2e/test_integration_dist_tp4.py
delete mode 100644 tests/spec_decode/e2e/test_mlp_correctness.py
delete mode 100644 tests/spec_decode/test_dynamic_spec_decode.py
create mode 100644 tests/tensorizer_loader/tensorize_vllm_model_for_testing.py
delete mode 100644 tests/test_inputs.py
delete mode 100644 tests/test_sharded_state_loader.py
delete mode 100644 tests/test_utils.py
delete mode 100644 tests/tokenization/test_get_eos.py
delete mode 100644 tests/tracing/__init__.py
delete mode 100644 tests/tracing/test_tracing.py
delete mode 100644 tests/utils.py
delete mode 100644 tests/worker/test_model_input.py
delete mode 100644 vllm/_ipex_ops.py
delete mode 100644 vllm/attention/backends/blocksparse_attn.py
delete mode 100644 vllm/attention/backends/ipex_attn.py
delete mode 100644 vllm/attention/backends/openvino.py
delete mode 100644 vllm/attention/backends/pallas.py
delete mode 100644 vllm/attention/ops/blocksparse_attention/__init__.py
delete mode 100644 vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py
delete mode 100644 vllm/attention/ops/blocksparse_attention/interface.py
delete mode 100644 vllm/attention/ops/blocksparse_attention/utils.py
delete mode 100644 vllm/attention/ops/ipex_attn.py
delete mode 100644 vllm/core/block/utils.py
delete mode 100644 vllm/core/embedding_model_block_manager.py
delete mode 100644 vllm/distributed/device_communicators/cuda_wrapper.py
delete mode 100644 vllm/distributed/device_communicators/custom_all_reduce_utils.py
create mode 100644 vllm/distributed/device_communicators/pynccl_utils.py
delete mode 100644 vllm/distributed/device_communicators/pynccl_wrapper.py
delete mode 100644 vllm/distributed/device_communicators/shm_broadcast.py
delete mode 100644 vllm/engine/async_timeout.py
delete mode 100644 vllm/entrypoints/openai/run_batch.py
delete mode 100644 vllm/entrypoints/openai/serving_embedding.py
delete mode 100644 vllm/executor/multiproc_gpu_executor.py
delete mode 100644 vllm/executor/openvino_executor.py
delete mode 100644 vllm/executor/ray_xpu_executor.py
delete mode 100644 vllm/executor/tpu_executor.py
delete mode 100644 vllm/executor/xpu_executor.py
create mode 100644 vllm/hpu/attn_bias.py
create mode 100644 vllm/hpu/xops.py
delete mode 100644 vllm/inputs/__init__.py
delete mode 100644 vllm/inputs/data.py
delete mode 100644 vllm/inputs/registry.py
delete mode 100644 vllm/model_executor/custom_op.py
delete mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
delete mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
delete mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
delete mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
delete mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json
delete mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
delete mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
delete mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
delete mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
delete mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
delete mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json
delete mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
delete mode 100644 vllm/model_executor/layers/pooler.py
delete mode 100644 vllm/model_executor/layers/quantization/bitsandbytes.py
delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/__init__.py
delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py
delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py
delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/utils.py
delete mode 100644 vllm/model_executor/layers/quantization/deepspeedfp.py
delete mode 100644 vllm/model_executor/layers/quantization/gptq_marlin_24.py
delete mode 100644 vllm/model_executor/layers/quantization/utils/__init__.py
delete mode 100644 vllm/model_executor/layers/quantization/utils/format_24.py
delete mode 100644 vllm/model_executor/layers/quantization/utils/marlin_24_perms.py
delete mode 100644 vllm/model_executor/layers/quantization/utils/marlin_perms.py
delete mode 100644 vllm/model_executor/layers/quantization/utils/marlin_utils.py
delete mode 100644 vllm/model_executor/layers/quantization/utils/quant_utils.py
delete mode 100644 vllm/model_executor/layers/spec_decode_base_sampler.py
delete mode 100644 vllm/model_executor/layers/typical_acceptance_sampler.py
delete mode 100644 vllm/model_executor/model_loader/openvino.py
mode change 100644 => 100755 vllm/model_executor/models/__init__.py
delete mode 100644 vllm/model_executor/models/arctic.py
delete mode 100644 vllm/model_executor/models/clip.py
delete mode 100644 vllm/model_executor/models/deepseek_v2.py
delete mode 100644 vllm/model_executor/models/gemma2.py
delete mode 100644 vllm/model_executor/models/interfaces.py
delete mode 100644 vllm/model_executor/models/llama_embedding.py
delete mode 100644 vllm/model_executor/models/llava_next.py
delete mode 100644 vllm/model_executor/models/mlp_speculator.py
delete mode 100644 vllm/model_executor/models/phi3_small.py
delete mode 100644 vllm/model_executor/models/phi3v.py
delete mode 100644 vllm/model_executor/pooling_metadata.py
delete mode 100644 vllm/multimodal/__init__.py
delete mode 100644 vllm/multimodal/base.py
delete mode 100644 vllm/multimodal/image.py
delete mode 100644 vllm/multimodal/registry.py
delete mode 100644 vllm/multimodal/utils.py
delete mode 100644 vllm/pooling_params.py
delete mode 100644 vllm/spec_decode/draft_model_runner.py
delete mode 100644 vllm/spec_decode/mlp_speculator_worker.py
delete mode 100644 vllm/spec_decode/proposer_worker_base.py
delete mode 100644 vllm/spec_decode/smaller_tp_proposer_worker.py
create mode 100644 vllm/test_utils.py
delete mode 100644 vllm/tracing.py
delete mode 100644 vllm/transformers_utils/configs/arctic.py
delete mode 100644 vllm/transformers_utils/configs/mlp_speculator.py
delete mode 100644 vllm/transformers_utils/image_processor.py
delete mode 100644 vllm/version.py
delete mode 100644 vllm/worker/embedding_model_runner.py
delete mode 100644 vllm/worker/model_runner_base.py
delete mode 100644 vllm/worker/openvino_model_runner.py
delete mode 100644 vllm/worker/openvino_worker.py
delete mode 100644 vllm/worker/tpu_model_runner.py
delete mode 100644 vllm/worker/tpu_worker.py
delete mode 100644 vllm/worker/xpu_model_runner.py
delete mode 100644 vllm/worker/xpu_worker.py
diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index 75ad094fa1382..90a5e54736cf3 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -1,7 +1,7 @@
import os
import zipfile
-MAX_SIZE_MB = 200
+MAX_SIZE_MB = 100
def print_top_10_largest_files(zip_file):
diff --git a/.buildkite/download-images.sh b/.buildkite/download-images.sh
index 360a7584bccf1..389a12956c3c3 100644
--- a/.buildkite/download-images.sh
+++ b/.buildkite/download-images.sh
@@ -8,6 +8,10 @@ set -o pipefail
# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
mkdir -p images
cd images
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
deleted file mode 100644
index fa6ea236ef04f..0000000000000
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
-model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
-tasks:
-- name: "gsm8k"
- metrics:
- - name: "exact_match,strict-match"
- value: 0.892
- - name: "exact_match,flexible-extract"
- value: 0.892
-limit: 250
-num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
deleted file mode 100644
index 02668702b83af..0000000000000
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
-model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
-tasks:
-- name: "gsm8k"
- metrics:
- - name: "exact_match,strict-match"
- value: 0.756
- - name: "exact_match,flexible-extract"
- value: 0.752
-limit: 250
-num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
deleted file mode 100644
index fb4b4915ab955..0000000000000
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
-model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
-tasks:
-- name: "gsm8k"
- metrics:
- - name: "exact_match,strict-match"
- value: 0.756
- - name: "exact_match,flexible-extract"
- value: 0.752
-limit: 250
-num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
deleted file mode 100644
index dec9164d1b84e..0000000000000
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
-model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
-tasks:
-- name: "gsm8k"
- metrics:
- - name: "exact_match,strict-match"
- value: 0.616
- - name: "exact_match,flexible-extract"
- value: 0.632
-limit: 250
-num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt
deleted file mode 100644
index 127ec5d97bcff..0000000000000
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-Meta-Llama-3-70B-Instruct.yaml
-Mixtral-8x7B-Instruct-v0.1.yaml
diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt
deleted file mode 100644
index 273c5482db264..0000000000000
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-Meta-Llama-3-8B-Instruct.yaml
-Meta-Llama-3-8B-Instruct-FP8.yaml
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
deleted file mode 100644
index fdb8ec5393b36..0000000000000
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-# We can use this script to compute baseline accuracy on GSM for transformers.
-#
-# Make sure you have lm-eval-harness installed:
-# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
-
-usage() {
- echo``
- echo "Runs lm eval harness on GSM8k using huggingface transformers."
- echo "This pathway is intended to be used to create baselines for "
- echo "our automated nm-test-accuracy workflow"
- echo
- echo "usage: ${0} "
- echo
- echo " -m - huggingface stub or local directory of the model"
- echo " -b - batch size to run the evaluation at"
- echo " -l - limit number of samples to run"
- echo " -f - number of fewshot samples to use"
- echo
-}
-
-while getopts "m:b:l:f:" OPT; do
- case ${OPT} in
- m )
- MODEL="$OPTARG"
- ;;
- b )
- BATCH_SIZE="$OPTARG"
- ;;
- l )
- LIMIT="$OPTARG"
- ;;
- f )
- FEWSHOT="$OPTARG"
- ;;
- \? )
- usage
- exit 1
- ;;
- esac
-done
-
-lm_eval --model hf \
- --model_args pretrained=$MODEL,parallelize=True \
- --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
- --batch_size $BATCH_SIZE
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
deleted file mode 100644
index a2876bade8893..0000000000000
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/bash
-# We can use this script to compute baseline accuracy on GSM for vllm.
-# We use this for fp8, which HF does not support.
-#
-# Make sure you have lm-eval-harness installed:
-# pip install lm-eval==0.4.2
-
-usage() {
- echo``
- echo "Runs lm eval harness on GSM8k using huggingface transformers."
- echo "This pathway is intended to be used to create baselines for "
- echo "our automated nm-test-accuracy workflow"
- echo
- echo "usage: ${0} "
- echo
- echo " -m - huggingface stub or local directory of the model"
- echo " -b - batch size to run the evaluation at"
- echo " -l - limit number of samples to run"
- echo " -f - number of fewshot samples to use"
- echo " -t - tensor parallel size to run at"
- echo
-}
-
-while getopts "m:b:l:f:t:" OPT; do
- case ${OPT} in
- m )
- MODEL="$OPTARG"
- ;;
- b )
- BATCH_SIZE="$OPTARG"
- ;;
- l )
- LIMIT="$OPTARG"
- ;;
- f )
- FEWSHOT="$OPTARG"
- ;;
- t )
- TP_SIZE="$OPTARG"
- ;;
- \? )
- usage
- exit 1
- ;;
- esac
-done
-
-lm_eval --model vllm \
- --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE \
- --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
- --batch_size $BATCH_SIZE
diff --git a/.buildkite/lm-eval-harness/run-tests.sh b/.buildkite/lm-eval-harness/run-tests.sh
deleted file mode 100644
index b4fdde6dab425..0000000000000
--- a/.buildkite/lm-eval-harness/run-tests.sh
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/bin/bash
-
-usage() {
- echo``
- echo "Runs lm eval harness on GSM8k using vllm and compares to "
- echo "precomputed baseline (measured by HF transformers.)"
- echo
- echo "usage: ${0} "
- echo
- echo " -c - path to the test data config (e.g. configs/small-models.txt)"
- echo " -t - tensor parallel size"
- echo
-}
-
-SUCCESS=0
-
-while getopts "c:t:" OPT; do
- case ${OPT} in
- c )
- CONFIG="$OPTARG"
- ;;
- t )
- TP_SIZE="$OPTARG"
- ;;
- \? )
- usage
- exit 1
- ;;
- esac
-done
-
-# Parse list of configs.
-IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
-
-for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
-do
- LOCAL_SUCCESS=0
-
- echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
-
- export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
- export LM_EVAL_TP_SIZE=$TP_SIZE
- pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
-
- if [[ $LOCAL_SUCCESS == 0 ]]; then
- echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
- else
- echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
- fi
-
- SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
-
-done
-
-if [ "${SUCCESS}" -eq "0" ]; then
- exit 0
-else
- exit 1
-fi
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
deleted file mode 100644
index 975841dad1c29..0000000000000
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ /dev/null
@@ -1,54 +0,0 @@
-"""
-LM eval harness on model to compare vs HF baseline computed offline.
-Configs are found in configs/$MODEL.yaml
-
-* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
-* export LM_EVAL_TP_SIZE=4
-* pytest -s test_lm_eval_correctness.py
-"""
-
-import os
-from pathlib import Path
-
-import lm_eval
-import numpy
-import yaml
-
-RTOL = 0.02
-TEST_DATA_FILE = os.environ.get(
- "LM_EVAL_TEST_DATA_FILE",
- ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
-
-TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
-
-
-def launch_lm_eval(eval_config):
- model_args = f"pretrained={eval_config['model_name']}," \
- f"tensor_parallel_size={TP_SIZE}"
-
- results = lm_eval.simple_evaluate(
- model="vllm",
- model_args=model_args,
- tasks=[task["name"] for task in eval_config["tasks"]],
- num_fewshot=eval_config["num_fewshot"],
- limit=eval_config["limit"],
- batch_size="auto")
-
- return results
-
-
-def test_lm_eval_correctness():
- eval_config = yaml.safe_load(
- Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
-
- # Launch eval requests.
- results = launch_lm_eval(eval_config)
-
- # Confirm scores match ground truth.
- for task in eval_config["tasks"]:
- for metric in task["metrics"]:
- ground_truth = metric["value"]
- measured_value = results["results"][task["name"]][metric["name"]]
- print(f'{task["name"]} | {metric["name"]}: '
- f'ground_truth={ground_truth} | measured={measured_value}')
- assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
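
As a quick illustration of the 2% relative tolerance applied above (the scores here are made up):

```python
import numpy

RTOL = 0.02
ground_truth = 0.750

# Within 2% of the measured value: the check passes.
assert numpy.isclose(ground_truth, 0.762, rtol=RTOL)

# More than 2% away from the measured value: the check fails.
assert not numpy.isclose(ground_truth, 0.710, rtol=RTOL)
```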
diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
deleted file mode 100644
index 4036b32a46bf7..0000000000000
--- a/.buildkite/nightly-benchmarks/README.md
+++ /dev/null
@@ -1,103 +0,0 @@
-# vLLM benchmark suite
-
-## Introduction
-
-This directory contains the performance benchmarking CI for vllm.
-The goal is to help developers understand the performance impact of their PRs on vllm.
-
-This benchmark will be *triggered* upon:
-- A PR being merged into vllm.
-- Every commit of a PR that carries the `perf-benchmarks` label.
-
-**Benchmarking Coverage**: latency, throughput and fixed-QPS serving on A100 (support for more GPUs is coming later), with different models.
-
-**Benchmarking Duration**: about 1hr.
-
-**For benchmarking developers**: please try to keep the total benchmarking duration under 1.5 hours so that the suite does not take too long to run.
-
-
-## Configuring the workload
-
-The benchmarking workload contains three parts:
-- Latency tests in `latency-tests.json`.
-- Throughput tests in `throughput-tests.json`.
-- Serving tests in `serving-tests.json`.
-
-See [descriptions.md](tests/descriptions.md) for detailed descriptions.
-
-### Latency test
-
-Here is an example of one test inside `latency-tests.json`:
-
-```json
-[
- {
- "test_name": "latency_llama8B_tp1",
- "parameters": {
- "model": "meta-llama/Meta-Llama-3-8B",
- "tensor_parallel_size": 1,
- "load_format": "dummy",
- "num_iters_warmup": 5,
- "num_iters": 15
- }
- },
-]
-```
-
-In this example:
-- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
-- The `parameters` attribute controls the command-line arguments passed to `benchmark_latency.py`. Use an underscore `_` instead of a dash `-` when specifying the arguments; `run-benchmarks-suite.sh` converts the underscores to dashes when feeding the arguments to `benchmark_latency.py` (see the short Python sketch below). For example, the corresponding command-line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`.
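
For clarity, here is a minimal Python sketch of the underscore-to-dash conversion described above (the suite itself does this with `jq` inside the `json2args` helper of `run-benchmarks-suite.sh`; the `params_to_cli_args` function below is illustrative only):

```python
# Illustrative only: mirrors the underscore-to-dash conversion that
# run-benchmarks-suite.sh performs with jq before invoking benchmark_latency.py.
def params_to_cli_args(parameters: dict) -> str:
    return " ".join(f"--{key.replace('_', '-')} {value}"
                    for key, value in parameters.items())

params = {
    "model": "meta-llama/Meta-Llama-3-8B",
    "tensor_parallel_size": 1,
    "load_format": "dummy",
    "num_iters_warmup": 5,
    "num_iters": 15,
}
print(params_to_cli_args(params))
# --model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy \
# --num-iters-warmup 5 --num-iters 15
```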
-
-Note that the performance numbers are highly sensitive to the parameter values, so please make sure the parameters are set correctly.
-
-WARNING: the benchmarking script saves JSON results by itself, so please do not set the `--output-json` parameter in the JSON file.
-
-
-### Throughput test
-The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are forwarded to `benchmark_throughput.py`.
-
-The result of this test is also stable across runs -- even a slight change in the reported number usually reflects a real change in performance.
-
-### Serving test
-We test the online serving throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
-
-```json
-[
- {
- "test_name": "serving_llama8B_tp1_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
- "server_parameters": {
- "model": "meta-llama/Meta-Llama-3-8B",
- "tensor_parallel_size": 1,
- "swap_space": 16,
- "disable_log_stats": "",
- "disable_log_requests": "",
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Meta-Llama-3-8B",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
-]
-```
-
-Inside this example:
-- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
-- The `server_parameters` attribute includes the command-line arguments for the vLLM server.
-- The `client_parameters` attribute includes the command-line arguments for `benchmark_serving.py`.
-- The `qps_list` attribute controls the list of QPS values to test. Each value is used to set the `--request-rate` parameter of `benchmark_serving.py`.
-
-The result of this test is less stable than the latency and throughput benchmarks (due to randomized ShareGPT dataset sampling inside `benchmark_serving.py`), but a large change in the number (e.g. a 5% change) still indicates a real difference in performance.
-
-WARNING: the benchmarking script saves JSON results by itself, so please do not set `--save-result` or other result-saving parameters in `serving-tests.json`.
-
-## Visualizing the results
-The `convert-results-json-to-markdown.py` script puts the benchmarking results into markdown tables by formatting [descriptions.md](tests/descriptions.md) with the real benchmarking results.
-You can find the results presented as tables inside the `buildkite/performance-benchmark` job page.
-If you do not see the tables, please wait until the benchmark finishes running.
-The JSON version of the tables (together with the JSON version of the benchmarking results) is also attached to the markdown file.
-The raw benchmarking results (as JSON files) are available in the `Artifacts` tab of the benchmarking job.
diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
deleted file mode 100644
index 2b25c954b5c5c..0000000000000
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ /dev/null
@@ -1,62 +0,0 @@
-steps:
- - label: "Wait for container to be ready"
- agents:
- queue: A100
- plugins:
- - kubernetes:
- podSpec:
- containers:
- - image: badouralix/curl-jq
- command:
- - sh
- - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- - wait
- - label: "A100 Benchmark"
- agents:
- queue: A100
- plugins:
- - kubernetes:
- podSpec:
- priorityClassName: perf-benchmark
- containers:
- - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
- command:
- - bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
- resources:
- limits:
- nvidia.com/gpu: 8
- volumeMounts:
- - name: devshm
- mountPath: /dev/shm
- env:
- - name: VLLM_USAGE_SOURCE
- value: ci-test
- - name: HF_TOKEN
- valueFrom:
- secretKeyRef:
- name: hf-token-secret
- key: token
- nodeSelector:
- nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
- volumes:
- - name: devshm
- emptyDir:
- medium: Memory
- # - label: "H100: NVIDIA SMI"
- # agents:
- # queue: H100
- # plugins:
- # - docker#v5.11.0:
- # image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
- # command:
- # - bash
- # - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
- # mount-buildkite-agent: true
- # propagate-environment: true
- # propagate-uid-gid: false
- # ipc: host
- # gpus: all
- # environment:
- # - VLLM_USAGE_SOURCE
- # - HF_TOKEN
-
diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
deleted file mode 100755
index 15d411febcee1..0000000000000
--- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env bash
-
-# NOTE(simon): this script runs inside a buildkite agent with CPU only access.
-set -euo pipefail
-
-# Install system packages
-apt update
-apt install -y curl jq
-
-# Install minijinja for templating
-curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
-source $HOME/.cargo/env
-
-# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
-if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
- PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
-
- if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
- echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
- else
- echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
- exit 0
- fi
-fi
-
-# Upload the benchmark pipeline definition
-buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
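
For reference, the same label check could be written in Python roughly as follows (illustrative sketch only; the script above uses curl and jq, and unauthenticated GitHub API rate limits apply):

```python
# Illustrative sketch of the perf-benchmarks label check done above with curl/jq.
import json
import urllib.request

def pr_has_label(pr_number: str, label: str = "perf-benchmarks") -> bool:
    url = f"https://api.github.com/repos/vllm-project/vllm/pulls/{pr_number}"
    with urllib.request.urlopen(url) as resp:
        pr = json.load(resp)
    return any(item["name"] == label for item in pr.get("labels", []))
```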
diff --git a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
deleted file mode 100644
index 021473f76d0e5..0000000000000
--- a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+++ /dev/null
@@ -1,358 +0,0 @@
-#!/bin/bash
-
-# This script should be run inside the CI process
-# This script assumes that we are already inside the vllm/ directory
-# Benchmarking results will be available inside vllm/benchmarks/results/
-
-# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
-# and we still want to see other benchmarking results even when mixtral crashes.
-set -o pipefail
-
-check_gpus() {
- # check the number of GPUs and GPU type.
- declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
- if [[ $gpu_count -gt 0 ]]; then
- echo "GPU found."
- else
- echo "Need at least 1 GPU to run benchmarking."
- exit 1
- fi
- declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
- echo "GPU type is $gpu_type"
-}
-
-check_hf_token() {
- # check if HF_TOKEN is available and valid
- if [[ -z "$HF_TOKEN" ]]; then
- echo "Error: HF_TOKEN is not set."
- exit 1
- elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
- echo "Error: HF_TOKEN does not start with 'hf_'."
- exit 1
- else
- echo "HF_TOKEN is set and valid."
- fi
-}
-
-json2args() {
- # transforms the JSON string into command line args; '_' is replaced with '-'
- # example:
- # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
- # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
- local json_string=$1
- local args=$(
- echo "$json_string" | jq -r '
- to_entries |
- map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
- join(" ")
- '
- )
- echo "$args"
-}
-
-wait_for_server() {
- # wait for vllm server to start
- # return 1 if vllm server crashes
- timeout 1200 bash -c '
- until curl localhost:8000/v1/completions; do
- sleep 1
- done' && return 0 || return 1
-}
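
A rough Python equivalent of this readiness poll, for illustration (the suite itself shells out to curl under `timeout`; the endpoint and timeout below mirror the script):

```python
# Illustrative readiness poll mirroring wait_for_server above: any HTTP
# response (even an error status) means the server is accepting connections.
import time
import urllib.error
import urllib.request

def wait_for_server(url="http://localhost:8000/v1/completions", timeout_s=1200):
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            urllib.request.urlopen(url, timeout=5)
            return True
        except urllib.error.HTTPError:
            return True  # The server answered, so it is up.
        except (urllib.error.URLError, OSError):
            time.sleep(1)
    return False
```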
-
-kill_gpu_processes() {
- # kill all processes on GPU.
- pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
- if [ -z "$pids" ]; then
- echo "No GPU processes found."
- else
- for pid in $pids; do
- kill -9 "$pid"
- echo "Killed process with PID: $pid"
- done
-
- echo "All GPU processes have been killed."
- fi
-
- # waiting for GPU processes to be fully killed
- sleep 10
-
- # remove vllm config file
- rm -rf ~/.config/vllm
-
- # Print the GPU memory usage
- # so that we know if all GPU processes are killed.
- gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
- # The memory usage should be 0 MB.
- echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
-}
-
-upload_to_buildkite() {
- # upload the benchmarking results to buildkite
-
- # if the agent binary is not found, skip uploading the results, exit 0
- if [ ! -f /workspace/buildkite-agent ]; then
- echo "buildkite-agent binary not found. Skip uploading the results."
- return 0
- fi
- /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
- /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
-}
-
-run_latency_tests() {
- # run latency tests using `benchmark_latency.py`
- # $1: a json file specifying latency test cases
-
- local latency_test_file
- latency_test_file=$1
-
- # Iterate over latency tests
- jq -c '.[]' "$latency_test_file" | while read -r params; do
- # get the test name, and append the GPU type back to it.
- test_name=$(echo "$params" | jq -r '.test_name')
- if [[ ! "$test_name" =~ ^latency_ ]]; then
- echo "In latency-test.json, test_name must start with \"latency_\"."
- exit 1
- fi
-
- # if TEST_SELECTOR is set, only run the test cases that match the selector
- if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
- echo "Skip test case $test_name."
- continue
- fi
-
- # get arguments
- latency_params=$(echo "$params" | jq -r '.parameters')
- latency_args=$(json2args "$latency_params")
-
- # check if there is enough GPU to run the test
- tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
- if [[ $gpu_count -lt $tp ]]; then
- echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
- continue
- fi
-
- latency_command="python3 benchmark_latency.py \
- --output-json $RESULTS_FOLDER/${test_name}.json \
- $latency_args"
-
- echo "Running test case $test_name"
- echo "Latency command: $latency_command"
-
- # record the benchmarking command and the GPU type
- jq_output=$(jq -n \
- --arg latency "$latency_command" \
- --arg gpu "$gpu_type" \
- '{
- latency_command: $latency,
- gpu_type: $gpu
- }')
- echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
-
- # run the benchmark
- eval "$latency_command"
-
- kill_gpu_processes
-
- done
-}
-
-
-run_throughput_tests() {
- # run throughput tests using `benchmark_throughput.py`
- # $1: a json file specifying throughput test cases
-
- local throughput_test_file
- throughput_test_file=$1
-
- # Iterate over throughput tests
- jq -c '.[]' "$throughput_test_file" | while read -r params; do
- # get the test name, and append the GPU type back to it.
- test_name=$(echo "$params" | jq -r '.test_name')
- if [[ ! "$test_name" =~ ^throughput_ ]]; then
- echo "In throughput-test.json, test_name must start with \"throughput_\"."
- exit 1
- fi
-
- # if TEST_SELECTOR is set, only run the test cases that match the selector
- if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
- echo "Skip test case $test_name."
- continue
- fi
-
- # get arguments
- throughput_params=$(echo "$params" | jq -r '.parameters')
- throughput_args=$(json2args "$throughput_params")
-
- # check if there is enough GPU to run the test
- tp=$(echo $throughput_params | jq -r '.tensor_parallel_size')
- if [[ $gpu_count -lt $tp ]]; then
- echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
- continue
- fi
-
- throughput_command="python3 benchmark_throughput.py \
- --output-json $RESULTS_FOLDER/${test_name}.json \
- $throughput_args"
-
- echo "Running test case $test_name"
- echo "Throughput command: $throughput_command"
- # record the benchmarking command and the GPU type
- jq_output=$(jq -n \
- --arg command "$throughput_command" \
- --arg gpu "$gpu_type" \
- '{
- throughput_command: $command,
- gpu_type: $gpu
- }')
- echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
-
- # run the benchmark
- eval "$throughput_command"
-
- kill_gpu_processes
-
- done
-}
-
-run_serving_tests() {
- # run serving tests using `benchmark_serving.py`
- # $1: a json file specifying serving test cases
-
- local serving_test_file
- serving_test_file=$1
-
- # Iterate over serving tests
- jq -c '.[]' "$serving_test_file" | while read -r params; do
- # get the test name, and append the GPU type back to it.
- test_name=$(echo "$params" | jq -r '.test_name')
- if [[ ! "$test_name" =~ ^serving_ ]]; then
- echo "In serving-test.json, test_name must start with \"serving_\"."
- exit 1
- fi
-
- # if TEST_SELECTOR is set, only run the test cases that match the selector
- if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
- echo "Skip test case $test_name."
- continue
- fi
-
-
- # get client and server arguments
- server_params=$(echo "$params" | jq -r '.server_parameters')
- client_params=$(echo "$params" | jq -r '.client_parameters')
- server_args=$(json2args "$server_params")
- client_args=$(json2args "$client_params")
- qps_list=$(echo "$params" | jq -r '.qps_list')
- qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
- echo "Running over qps list $qps_list"
-
- # check if there is enough GPU to run the test
- tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
- if [[ $gpu_count -lt $tp ]]; then
- echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
- continue
- fi
-
- # check if the server model and the client model are aligned
- server_model=$(echo "$server_params" | jq -r '.model')
- client_model=$(echo "$client_params" | jq -r '.model')
- if [[ $server_model != "$client_model" ]]; then
- echo "Server model and client model must be the same. Skip testcase $testname."
- continue
- fi
-
- server_command="python3 \
- -m vllm.entrypoints.openai.api_server \
- $server_args"
-
- # run the server
- echo "Running test case $test_name"
- echo "Server command: $server_command"
- eval "$server_command" &
-
- # wait until the server is alive
- wait_for_server
- if [ $? -eq 0 ]; then
- echo ""
- echo "vllm server is up and running."
- else
- echo ""
- echo "vllm failed to start within the timeout period."
- fi
-
- # iterate over different QPS
- for qps in $qps_list; do
- # remove the surrounding single quote from qps
- if [[ "$qps" == *"inf"* ]]; then
- echo "qps was $qps"
- qps="inf"
- echo "now qps is $qps"
- fi
-
- new_test_name=$test_name"_qps_"$qps
-
- client_command="python3 benchmark_serving.py \
- --save-result \
- --result-dir $RESULTS_FOLDER \
- --result-filename ${new_test_name}.json \
- --request-rate $qps \
- $client_args"
-
- echo "Running test case $test_name with qps $qps"
- echo "Client command: $client_command"
-
- eval "$client_command"
-
- # record the benchmarking commands
- jq_output=$(jq -n \
- --arg server "$server_command" \
- --arg client "$client_command" \
- --arg gpu "$gpu_type" \
- '{
- server_command: $server,
- client_command: $client,
- gpu_type: $gpu
- }')
- echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
-
- done
-
- # clean up
- kill_gpu_processes
- done
-}
-
-main() {
- check_gpus
- check_hf_token
-
- # dependencies
- (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
- (which jq) || (apt-get update && apt-get -y install jq)
-
- # get the current IP address, required by benchmark_serving.py
- export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
- # turn off the reporting of the status of each request, to clean up the terminal output
- export VLLM_LOG_LEVEL="WARNING"
-
- # prepare for benchmarking
- cd benchmarks || exit 1
- wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
- declare -g RESULTS_FOLDER=results/
- mkdir -p $RESULTS_FOLDER
- QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
-
- # benchmarking
- run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
- run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
- run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
-
-
- # postprocess benchmarking results
- pip install tabulate pandas
- python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
-
- upload_to_buildkite
-}
-
-main "$@"
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
deleted file mode 100644
index 534ecf17930e9..0000000000000
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ /dev/null
@@ -1,192 +0,0 @@
-import json
-import os
-from pathlib import Path
-
-import pandas as pd
-from tabulate import tabulate
-
-results_folder = Path("results/")
-
-# latency results and the keys that will be printed into markdown
-latency_results = []
-latency_column_mapping = {
- "test_name": "Test name",
- "gpu_type": "GPU",
- "avg_latency": "Mean latency (ms)",
- # "P10": "P10 (s)",
- # "P25": "P25 (s)",
- "P50": "Median latency (ms)",
- # "P75": "P75 (s)",
- # "P90": "P90 (s)",
- "P99": "P99 latency (ms)",
-}
-
-# throughput tests and the keys that will be printed into markdown
-throughput_results = []
-throughput_results_column_mapping = {
- "test_name": "Test name",
- "gpu_type": "GPU",
- # "num_requests": "# of req.",
- # "total_num_tokens": "Total # of tokens",
- # "elapsed_time": "Elapsed time (s)",
- "requests_per_second": "Tput (req/s)",
- # "tokens_per_second": "Tput (tok/s)",
-}
-
-# serving results and the keys that will be printed into markdown
-serving_results = []
-serving_column_mapping = {
- "test_name": "Test name",
- "gpu_type": "GPU",
- # "completed": "# of req.",
- "request_throughput": "Tput (req/s)",
- # "input_throughput": "Input Tput (tok/s)",
- # "output_throughput": "Output Tput (tok/s)",
- "mean_ttft_ms": "Mean TTFT (ms)",
- "median_ttft_ms": "Median TTFT (ms)",
- "p99_ttft_ms": "P99 TTFT (ms)",
- # "mean_tpot_ms": "Mean TPOT (ms)",
- # "median_tpot_ms": "Median",
- # "p99_tpot_ms": "P99",
- "mean_itl_ms": "Mean ITL (ms)",
- "median_itl_ms": "Median ITL (ms)",
- "p99_itl_ms": "P99 ITL (ms)",
-}
-
-
-def read_markdown(file):
- if os.path.exists(file):
- with open(file, "r") as f:
- return f.read() + "\n"
- else:
- return f"{file} not found.\n"
-
-
-def results_to_json(latency, throughput, serving):
- return json.dumps({
- 'latency': latency.to_dict(),
- 'throughput': throughput.to_dict(),
- 'serving': serving.to_dict()
- })
-
-
-if __name__ == "__main__":
-
- # collect results
- for test_file in results_folder.glob("*.json"):
-
- with open(test_file, "r") as f:
- raw_result = json.loads(f.read())
-
- if "serving" in str(test_file):
- # this result is generated via `benchmark_serving.py`
-
- # attach the benchmarking command to raw_result
- with open(test_file.with_suffix(".commands"), "r") as f:
- command = json.loads(f.read())
- raw_result.update(command)
-
- # update the test name of this result
- raw_result.update({"test_name": test_file.stem})
-
- # add the result to serving_results
- serving_results.append(raw_result)
- continue
-
- elif "latency" in f.name:
- # this result is generated via `benchmark_latency.py`
-
- # attach the benchmarking command to raw_result
- with open(test_file.with_suffix(".commands"), "r") as f:
- command = json.loads(f.read())
- raw_result.update(command)
-
- # update the test name of this result
- raw_result.update({"test_name": test_file.stem})
-
- # get different percentiles
- for perc in [10, 25, 50, 75, 90, 99]:
- # Multiply by 1000 to convert the time unit from s to ms
- raw_result.update(
- {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
- raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
-
- # add the result to latency_results
- latency_results.append(raw_result)
- continue
-
- elif "throughput" in f.name:
- # this result is generated via `benchmark_throughput.py`
-
- # attach the benchmarking command to raw_result
- with open(test_file.with_suffix(".commands"), "r") as f:
- command = json.loads(f.read())
- raw_result.update(command)
-
- # update the test name of this result
- raw_result.update({"test_name": test_file.stem})
-
- # add the result to throughput_results
- throughput_results.append(raw_result)
- continue
-
- print(f"Skipping {test_file}")
-
- latency_results = pd.DataFrame.from_dict(latency_results)
- serving_results = pd.DataFrame.from_dict(serving_results)
- throughput_results = pd.DataFrame.from_dict(throughput_results)
-
- raw_results_json = results_to_json(latency_results, throughput_results,
- serving_results)
-
- # remap the keys for visualization purposes
- if not latency_results.empty:
- latency_results = latency_results[list(
- latency_column_mapping.keys())].rename(
- columns=latency_column_mapping)
- if not serving_results.empty:
- serving_results = serving_results[list(
- serving_column_mapping.keys())].rename(
- columns=serving_column_mapping)
- if not throughput_results.empty:
- throughput_results = throughput_results[list(
- throughput_results_column_mapping.keys())].rename(
- columns=throughput_results_column_mapping)
-
- processed_results_json = results_to_json(latency_results,
- throughput_results,
- serving_results)
-
- # get markdown tables
- latency_md_table = tabulate(latency_results,
- headers='keys',
- tablefmt='pipe',
- showindex=False)
- serving_md_table = tabulate(serving_results,
- headers='keys',
- tablefmt='pipe',
- showindex=False)
- throughput_md_table = tabulate(throughput_results,
- headers='keys',
- tablefmt='pipe',
- showindex=False)
-
- # document the result
- with open(results_folder / "benchmark_results.md", "w") as f:
-
- results = read_markdown(
- "../.buildkite/nightly-benchmarks/tests/descriptions.md")
- results = results.format(
- latency_tests_markdown_table=latency_md_table,
- throughput_tests_markdown_table=throughput_md_table,
- serving_tests_markdown_table=serving_md_table,
- benchmarking_results_in_json_string=processed_results_json)
- f.write(results)
-
- # document benchmarking results in json
- with open(results_folder / "benchmark_results.json", "w") as f:
-
- results = latency_results.to_dict(
- orient='records') + throughput_results.to_dict(
- orient='records') + serving_results.to_dict(orient='records')
- f.write(json.dumps(results))
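
As a small illustration of how `tabulate` renders a DataFrame into the pipe-style markdown tables used above (values are made up):

```python
import pandas as pd
from tabulate import tabulate

df = pd.DataFrame([
    {"Test name": "latency_llama8B_tp1", "GPU": "A100", "Mean latency (ms)": 123.4},
    {"Test name": "latency_llama70B_tp4", "GPU": "A100", "Mean latency (ms)": 456.7},
])
# Prints a GitHub-flavored (pipe) markdown table, one row per result.
print(tabulate(df, headers="keys", tablefmt="pipe", showindex=False))
```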
diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
deleted file mode 100644
index c785e6a0da628..0000000000000
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/sh
-TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
-URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
-
-retries=0
-while [ $retries -lt 1000 ]; do
- if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
- exit 0
- fi
-
- echo "Waiting for image to be available..."
-
- retries=$((retries + 1))
- sleep 5
-done
-
-exit 1
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/tests/descriptions.md b/.buildkite/nightly-benchmarks/tests/descriptions.md
deleted file mode 100644
index 891e4917070d9..0000000000000
--- a/.buildkite/nightly-benchmarks/tests/descriptions.md
+++ /dev/null
@@ -1,67 +0,0 @@
-
-## Latency tests
-
-This test suite aims to test vllm's end-to-end latency under a controlled setup.
-
-- Input length: 32 tokens.
-- Output length: 128 tokens.
-- Batch size: fixed (8).
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
-- Evaluation metrics: end-to-end latency (mean, median, p99).
-
-### Latency benchmarking results
-
-{latency_tests_markdown_table}
-
-## Throughput tests
-
-This test suite aims to test vllm's throughput.
-
-- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
-- Output length: the corresponding output length of these 200 prompts.
-- Batch size: dynamically determined by vllm to achieve maximum throughput.
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
-- Evaluation metrics: throughput.
-
-### Throughput benchmarking results
-
-{throughput_tests_markdown_table}
-
-## Serving tests
-
-This test suite aims to test vllm's real serving metrics.
-
-- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
-- Output length: the corresponding output length of these 200 prompts.
-- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
-- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed); a short sampling sketch follows this list.
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
-- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
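
A short sketch of how Poisson arrivals at a given QPS can be sampled, as referenced in the QPS bullet above (illustrative only; `benchmark_serving.py` implements its own request-rate logic):

```python
# Illustrative: arrival times of a Poisson process at `qps` requests per second.
# Inter-arrival gaps are exponentially distributed with mean 1/qps.
import numpy as np

def poisson_arrival_times(num_requests: int, qps: float, seed: int = 0):
    rng = np.random.default_rng(seed)
    gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
    return np.cumsum(gaps)

print(poisson_arrival_times(num_requests=5, qps=4))
```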
-
-### Serving benchmarking results
-
-{serving_tests_markdown_table}
-
-## json version of the benchmarking tables
-
-This section contains the data of the markdown tables above in JSON format.
-You can load the benchmarking tables into pandas dataframes as follows:
-
-```python
-import json
-import pandas as pd
-
-benchmarking_results_json = """The json string"""
-benchmarking_results = json.loads(benchmarking_results_json)
-latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
-throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
-serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
-```
-
-The json string for all benchmarking tables:
-```json
-{benchmarking_results_in_json_string}
-```
-
-You can also check the raw experiment data in the Artifact tab of the Buildkite page.
-
diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests.json b/.buildkite/nightly-benchmarks/tests/latency-tests.json
deleted file mode 100644
index 06488cd79110a..0000000000000
--- a/.buildkite/nightly-benchmarks/tests/latency-tests.json
+++ /dev/null
@@ -1,32 +0,0 @@
-[
- {
- "test_name": "latency_llama8B_tp1",
- "parameters": {
- "model": "meta-llama/Meta-Llama-3-8B",
- "tensor_parallel_size": 1,
- "load_format": "dummy",
- "num_iters_warmup": 5,
- "num_iters": 15
- }
- },
- {
- "test_name": "latency_llama70B_tp4",
- "parameters": {
- "model": "meta-llama/Meta-Llama-3-70B-Instruct",
- "tensor_parallel_size": 4,
- "load_format": "dummy",
- "num-iters-warmup": 5,
- "num-iters": 15
- }
- },
- {
- "test_name": "latency_mixtral8x7B_tp2",
- "parameters": {
- "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
- "tensor_parallel_size": 2,
- "load_format": "dummy",
- "num-iters-warmup": 5,
- "num-iters": 15
- }
- }
-]
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json
deleted file mode 100644
index 86a0fefa339f7..0000000000000
--- a/.buildkite/nightly-benchmarks/tests/serving-tests.json
+++ /dev/null
@@ -1,59 +0,0 @@
-[
- {
- "test_name": "serving_llama8B_tp1_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
- "server_parameters": {
- "model": "meta-llama/Meta-Llama-3-8B",
- "tensor_parallel_size": 1,
- "swap_space": 16,
- "disable_log_stats": "",
- "disable_log_requests": "",
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Meta-Llama-3-8B",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama70B_tp4_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
- "server_parameters": {
- "model": "meta-llama/Meta-Llama-3-70B-Instruct",
- "tensor_parallel_size": 4,
- "swap_space": 16,
- "disable_log_stats": "",
- "disable_log_requests": "",
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Meta-Llama-3-70B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_mixtral8x7B_tp2_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
- "server_parameters": {
- "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
- "tensor_parallel_size": 2,
- "swap_space": 16,
- "disable_log_stats": "",
- "disable_log_requests": "",
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- }
-]
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests.json b/.buildkite/nightly-benchmarks/tests/throughput-tests.json
deleted file mode 100644
index 41ac135748704..0000000000000
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests.json
+++ /dev/null
@@ -1,35 +0,0 @@
-[
- {
- "test_name": "throughput_llama8B_tp1",
- "parameters": {
- "model": "meta-llama/Meta-Llama-3-8B",
- "tensor_parallel_size": 1,
- "load_format": "dummy",
- "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200,
- "backend": "vllm"
- }
- },
- {
- "test_name": "throughput_llama70B_tp4",
- "parameters": {
- "model": "meta-llama/Meta-Llama-3-70B-Instruct",
- "tensor_parallel_size": 4,
- "load_format": "dummy",
- "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200,
- "backend": "vllm"
- }
- },
- {
- "test_name": "throughput_mixtral8x7B_tp2",
- "parameters": {
- "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
- "tensor_parallel_size": 2,
- "load_format": "dummy",
- "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200,
- "backend": "vllm"
- }
- }
-]
\ No newline at end of file
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
deleted file mode 100644
index 1959f9752069f..0000000000000
--- a/.buildkite/release-pipeline.yaml
+++ /dev/null
@@ -1,21 +0,0 @@
-steps:
- - block: "Build wheels"
-
- - label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}"
- agents:
- queue: cpu_queue
- commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
- - "mkdir artifacts"
- - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
- - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
- matrix:
- setup:
- cuda_version:
- - "11.8.0"
- - "12.1.0"
- python_version:
- - "3.8"
- - "3.9"
- - "3.10"
- - "3.11"
diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index bde8ab6184d3c..ce508e4748aba 100644
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -1,38 +1,10 @@
-# This script runs test inside the corresponding ROCm docker container.
+# This script builds the ROCm docker image and runs tests inside it.
set -ex
# Print ROCm version
echo "--- ROCm info"
rocminfo
-# cleanup older docker images
-cleanup_docker() {
- # Get Docker's root directory
- docker_root=$(docker info -f '{{.DockerRootDir}}')
- if [ -z "$docker_root" ]; then
- echo "Failed to determine Docker root directory."
- exit 1
- fi
- echo "Docker root directory: $docker_root"
- # Check disk usage of the filesystem where Docker's root directory is located
- disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
- # Define the threshold
- threshold=70
- if [ "$disk_usage" -gt "$threshold" ]; then
- echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
- # Remove dangling images (those that are not tagged and not used by any container)
- docker image prune -f
- # Remove unused volumes
- docker volume prune -f
- echo "Docker images and volumes cleanup completed."
- else
- echo "Disk usage is below $threshold%. No cleanup needed."
- fi
-}
-
-# Call the cleanup docker function
-cleanup_docker
-
echo "--- Resetting GPUs"
echo "reset" > /opt/amdgpu/etc/gpu_state
@@ -47,16 +19,15 @@ done
echo "--- Building container"
sha=$(git rev-parse --short HEAD)
-image_name=rocm_${sha}
-container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
+container_name=rocm_${sha}
docker build \
- -t ${image_name} \
+ -t ${container_name} \
-f Dockerfile.rocm \
--progress plain \
.
remove_docker_container() {
- docker rm -f ${container_name} || docker image rm -f ${image_name} || true
+ docker rm -f ${container_name} || docker image rm -f ${container_name} || true
}
trap remove_docker_container EXIT
@@ -68,6 +39,6 @@ docker run \
--rm \
-e HF_TOKEN \
--name ${container_name} \
- ${image_name} \
+ ${container_name} \
/bin/bash -c "${@}"
diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh
index cbf6dda677c53..7fbad1c4bd950 100644
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@@ -9,10 +9,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# run python-based benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
+python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$?
-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
+python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
@@ -50,16 +50,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
echo '```' >> benchmark_results.md
-tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
+tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
echo '```' >> benchmark_results.md
# if the agent binary is not found, skip uploading the results, exit 0
-if [ ! -f /usr/bin/buildkite-agent ]; then
+if [ ! -f /workspace/buildkite-agent ]; then
exit 0
fi
# upload the results to buildkite
-buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
+/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
# exit with the exit code of the benchmarks
if [ $bench_latency_exit_code -ne 0 ]; then
@@ -74,5 +74,4 @@ if [ $bench_serving_exit_code -ne 0 ]; then
exit $bench_serving_exit_code
fi
-rm ShareGPT_V3_unfiltered_cleaned_split.json
-buildkite-agent artifact upload "*.json"
+/workspace/buildkite-agent artifact upload openai-*.json
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index f4fa24be1f20f..f187d1f181724 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -4,23 +4,11 @@ set -ex
# Try building the docker image
docker build -t cpu-test -f Dockerfile.cpu .
-docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
# Setup cleanup
-remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
+remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container
-# Run the image
-docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
-docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2
-
-# offline inference
-docker exec cpu-test bash -c "python3 examples/offline_inference.py"
-docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
-
-# Run basic model test
-docker exec cpu-test bash -c "cd tests;
- pip install pytest Pillow protobuf
- cd ../
- pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
+# Run the image and launch offline inference
+docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 examples/offline_inference.py
diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh
deleted file mode 100755
index 70e56596c4a86..0000000000000
--- a/.buildkite/run-openvino-test.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-# This script builds the OpenVINO docker image and runs the offline inference inside the container.
-# It serves as a sanity check for compilation and basic model usage.
-set -ex
-
-# Try building the docker image
-docker build -t openvino-test -f Dockerfile.openvino .
-
-# Setup cleanup
-remove_docker_container() { docker rm -f openvino-test || true; }
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh
deleted file mode 100644
index 22a7e76937a76..0000000000000
--- a/.buildkite/run-xpu-test.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-# This script builds the XPU docker image and runs the offline inference inside the container.
-# It serves as a sanity check for compilation and basic model usage.
-set -ex
-
-# Try building the docker image
-docker build -t xpu-test -f Dockerfile.xpu .
-
-# Setup cleanup
-remove_docker_container() { docker rm -f xpu-test || true; }
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Run the image and launch offline inference
-docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d96e3c6d192e2..cee5e7e9d2a73 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -1,23 +1,17 @@
# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.
-
-# This script will be fed into the Jinja template in `test-template-aws.j2` at
-# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
-# to generate the final pipeline yaml file.
-
+# This script will be fed into the Jinja template in `test-template.j2` to generate
+# the final pipeline yaml file.
steps:
- label: Regression Test
- mirror_hardwares: [amd]
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: AsyncEngine Test
- #mirror_hardwares: [amd]
command: pytest -v -s async_engine
- label: Basic Correctness Test
- mirror_hardwares: [amd]
commands:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
@@ -27,99 +21,68 @@ steps:
- label: Core Test
mirror_hardwares: [amd]
- commands:
- - pytest -v -s core
- - pytest -v -s distributed/test_parallel_state.py
+ command: pytest -v -s core
- label: Distributed Comm Ops Test
- #mirror_hardwares: [amd]
- working_dir: "/vllm-workspace/tests"
+ command: pytest -v -s test_comm_ops.py
+ working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2
- commands:
- - pytest -v -s distributed/test_comm_ops.py
- - pytest -v -s distributed/test_shm_broadcast.py
-- label: Distributed Tests (2 GPUs)
+- label: Distributed Tests
+ working_dir: "/vllm-workspace/tests/distributed"
+
+ num_gpus: 2 # only supports 1 or 2 for now.
mirror_hardwares: [amd]
- working_dir: "/vllm-workspace/tests"
- num_gpus: 2
+
commands:
- - bash ../.buildkite/download-images.sh
- - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
- - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
- - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
- - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
- - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
-
-- label: Distributed Tests (4 GPUs)
- #mirror_hardwares: [amd]
- working_dir: "/vllm-workspace/tests"
+ - pytest -v -s test_pynccl_library.py
+ - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
+ - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
+ - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
+ - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
+
+- label: Distributed Tests (Multiple Groups)
+ working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 4
commands:
- - pytest -v -s distributed/test_pynccl.py
- # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
- # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
- - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
+ - pytest -v -s test_pynccl.py
- label: Engine Test
- mirror_hardwares: [amd]
+ #mirror_hardwares: [amd]
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
- label: Entrypoints Test
- mirror_hardwares: [amd]
-
commands:
- - pytest -v -s entrypoints/llm
- - pytest -v -s entrypoints/openai
+ # these tests have to be separated, because each one will allocate all available GPU memory
+ - pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py
+ - pytest -v -s entrypoints/test_server_oot_registration.py
- label: Examples Test
working_dir: "/vllm-workspace/examples"
mirror_hardwares: [amd]
commands:
# install aws cli for llava_example.py
- # install tensorizer for tensorize_vllm_model.py
- - pip install awscli tensorizer
+ - pip install awscli
- python3 offline_inference.py
- python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py
- python3 llava_example.py
- - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-
-- label: Inputs Test
- #mirror_hardwares: [amd]
- commands:
- - bash ../.buildkite/download-images.sh
- - pytest -v -s test_inputs.py
- - pytest -v -s multimodal
- label: Kernels Test %N
- #mirror_hardwares: [amd]
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4
- label: Models Test
#mirror_hardwares: [amd]
commands:
- - pytest -v -s models -m \"not vlm\"
+ - bash ../.buildkite/download-images.sh
+ - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
-- label: Vision Language Models Test
- mirror_hardwares: [amd]
+- label: Llava Test
+ #mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- - pytest -v -s models -m vlm
+ - pytest -v -s models/test_llava.py
- label: Prefix Caching Test
mirror_hardwares: [amd]
@@ -127,63 +90,33 @@ steps:
- pytest -v -s prefix_caching
- label: Samplers Test
- #mirror_hardwares: [amd]
command: pytest -v -s samplers
- label: LogitsProcessor Test
mirror_hardwares: [amd]
command: pytest -v -s test_logits_processor.py
-- label: Utils Test
- command: pytest -v -s test_utils.py
-
- label: Worker Test
mirror_hardwares: [amd]
command: pytest -v -s worker
- label: Speculative decoding tests
#mirror_hardwares: [amd]
- commands:
- # See https://github.com/vllm-project/vllm/issues/5152
- - export VLLM_ATTENTION_BACKEND=XFORMERS
- - pytest -v -s spec_decode
+ command: pytest -v -s spec_decode
- label: LoRA Test %N
- #mirror_hardwares: [amd]
- command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
+ command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4
-- label: LoRA Long Context (Distributed)
- #mirror_hardwares: [amd]
- num_gpus: 4
- # This test runs llama 13B, so it is required to run on 4 GPUs.
- commands:
- # FIXIT: find out which code initialize cuda before running the test
- # before the fix, we need to use spawn to test it
- - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - pytest -v -s -x lora/test_long_context.py
-
- label: Tensorizer Test
- #mirror_hardwares: [amd]
command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
- label: Metrics Test
- mirror_hardwares: [amd]
command: pytest -v -s metrics
- label: Quantization Test
- #mirror_hardwares: [amd]
command: pytest -v -s quantization
-- label: Tracing Test
- commands:
- - "pip install \
- opentelemetry-sdk \
- opentelemetry-api \
- opentelemetry-exporter-otlp \
- opentelemetry-semantic-conventions-ai"
- - pytest -v -s tracing
-
- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd]
@@ -191,39 +124,9 @@ steps:
- pip install aiohttp
- bash run-benchmarks.sh
-- label: LM Eval Small Models
- working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
- commands:
- - pip install lm-eval
- - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - bash ./run-tests.sh -c configs/models-small.txt -t 1
-
-- label: LM Eval Large Models
- gpu: a100
- num_gpus: 4
- working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
- commands:
- - pip install lm-eval
- - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - bash ./run-tests.sh -c configs/models-large.txt -t 4
-
- label: Documentation Build
working_dir: "/vllm-workspace/test_docs/docs"
no_gpu: True
commands:
- pip install -r requirements-docs.txt
- SPHINXOPTS=\"-W\" make html
-
-- label: Distributed Tests (A100)
- gpu: a100
- num_gpus: 4
- commands:
- # NOTE: don't test llama model here, it seems hf implementation is buggy
- # see https://github.com/vllm-project/vllm/pull/5689 for details
- - pytest -v -s distributed/test_custom_all_reduce.py
- - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.5/flashinfer-0.0.5+cu121torch2.3-cp310-cp310-linux_x86_64.whl
- - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- - pytest -v -s -x lora/test_mixtral.py
diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
new file mode 100644
index 0000000000000..174c756ae74a3
--- /dev/null
+++ b/.buildkite/test-template.j2
@@ -0,0 +1,94 @@
+{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
+{% set default_num_gpu = 1 %}
+{% set default_working_dir = "/vllm-workspace/tests" %}
+
+steps:
+
+ - label: ":docker: build image"
+ commands:
+ - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
+ - "docker push {{ docker_image }}"
+ env:
+ DOCKER_BUILDKIT: "1"
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 5
+ - exit_status: -10 # Agent was lost
+ limit: 5
+ - wait
+
+ - group: "AMD Tests"
+ depends_on: ~
+ steps:
+ {% for step in steps %}
+ {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
+ - label: "AMD: {{ step.label }}"
+ agents:
+ queue: amd
+ command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
+ env:
+ DOCKER_BUILDKIT: "1"
+ {% endif %}
+ {% endfor %}
+
+ - label: "Neuron Test"
+ depends_on: ~
+ agents:
+ queue: neuron
+ command: bash .buildkite/run-neuron-test.sh
+ soft_fail: true
+
+ - label: "Intel Test"
+ depends_on: ~
+ command: bash .buildkite/run-cpu-test.sh
+
+ {% for step in steps %}
+ - label: "{{ step.label }}"
+ agents:
+ queue: kubernetes
+ soft_fail: {{ step.soft_fail or false }}
+ {% if step.parallelism %}
+ parallelism: {{ step.parallelism }}
+ {% endif %}
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 5
+ - exit_status: -10 # Agent was lost
+ limit: 5
+ plugins:
+ - kubernetes:
+ podSpec:
+ {% if step.num_gpus %}
+ priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
+ {% endif %}
+ volumes:
+ - name: dshm
+ emptyDir:
+ medium: Memory
+ containers:
+ - image: "{{ docker_image }}"
+ command: ["bash"]
+ args:
+ - '-c'
+ - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
+ {% if not step.no_gpu %}
+ resources:
+ requests:
+ nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
+ limits:
+ nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
+ {% endif %}
+ env:
+ - name: VLLM_USAGE_SOURCE
+ value: ci-test
+ - name: HF_TOKEN
+ valueFrom:
+ secretKeyRef:
+ name: hf-token-secret
+ key: token
+ volumeMounts:
+ - mountPath: /dev/shm
+ name: dshm
+ {% endfor %}
diff --git a/.clang-format b/.clang-format
deleted file mode 100644
index 7f9e6d720fae5..0000000000000
--- a/.clang-format
+++ /dev/null
@@ -1,26 +0,0 @@
-BasedOnStyle: Google
-UseTab: Never
-IndentWidth: 2
-ColumnLimit: 80
-
-# Force pointers to the type for C++.
-DerivePointerAlignment: false
-PointerAlignment: Left
-
-# Reordering #include statements can (and currently will) introduce errors
-SortIncludes: false
-
-# Style choices
-AlignConsecutiveAssignments: false
-AlignConsecutiveDeclarations: false
-IndentPPDirectives: BeforeHash
-
-IncludeCategories:
- - Regex: '^<'
- Priority: 4
- - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
- Priority: 3
- - Regex: '^"(qoda|\.\.)/'
- Priority: 2
- - Regex: '.*'
- Priority: 1
diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug report.yml
index ce980c3f4a01d..08120ad8e5a60 100644
--- a/.github/ISSUE_TEMPLATE/400-bug report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug report.yml
@@ -59,8 +59,6 @@ body:
Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
- Please set the environment variable `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging to help debugging potential issues.
-
If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1`. All the function calls in vllm will be recorded. Inspect these log files to find out which function crashes or hangs.
placeholder: |
A clear and concise description of what the bug is.
diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml
deleted file mode 100644
index e9b6e28fa6bcb..0000000000000
--- a/.github/workflows/clang-format.yml
+++ /dev/null
@@ -1,42 +0,0 @@
-name: clang-format
-
-on:
- # Trigger the workflow on push or pull request,
- # but only for the main branch
- push:
- branches:
- - main
- pull_request:
- branches:
- - main
-
-jobs:
- clang-format:
- runs-on: ubuntu-latest
- strategy:
- matrix:
- python-version: ["3.11"]
- steps:
- - uses: actions/checkout@v2
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v2
- with:
- python-version: ${{ matrix.python-version }}
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install clang-format==18.1.5
- - name: Running clang-format
- run: |
- EXCLUDES=(
- 'csrc/moe/topk_softmax_kernels.cu'
- 'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
- 'csrc/punica/bgmv/bgmv_config.h'
- 'csrc/punica/bgmv/bgmv_impl.cuh'
- 'csrc/punica/bgmv/vec_dtypes.cuh'
- 'csrc/punica/punica_ops.cu'
- 'csrc/punica/type_convert.h'
- )
- find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
- | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
- | xargs clang-format --dry-run --Werror
\ No newline at end of file
diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index 62f0dbcd93eff..a20753d8a7702 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -37,7 +37,6 @@ jobs:
mypy vllm/distributed --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml
- mypy vllm/multimodal --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml
@@ -47,5 +46,5 @@ jobs:
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
- mypy tests --config-file pyproject.toml
+ mypy vllm/model_executor --config-file pyproject.toml
diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
index 773def58fd966..e71033f828006 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -25,7 +25,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
+ pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
- name: Analysing the code with ruff
run: |
ruff .
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ede9192cd1dbb..f817f3382c5e1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,8 +2,7 @@ cmake_minimum_required(VERSION 3.21)
project(vllm_extensions LANGUAGES CXX)
-# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
-set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
+option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
@@ -33,7 +32,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.4.0")
+set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
+set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
#
# Try to find python package with an executable that exactly matches
@@ -66,6 +66,19 @@ endif()
#
find_package(Torch REQUIRED)
+#
+# Normally `torch.utils.cpp_extension.CUDAExtension` would add
+# `libtorch_python.so` for linking against an extension. Torch's cmake
+# configuration does not include this library (presumably since the cmake
+# config is used for standalone C++ binaries that link against torch).
+# The `libtorch_python.so` library defines some of the glue code between
+# torch/python via pybind and is required by VLLM extensions for this
+# reason. So, add it manually with `find_library` using torch's
+# installed library path.
+#
+find_library(torch_python_LIBRARY torch_python PATHS
+ "${TORCH_INSTALL_PREFIX}/lib")
+
#
# Forward the non-CUDA device extensions to external CMake scripts.
#
@@ -98,11 +111,18 @@ elseif(HIP_FOUND)
# .hip extension automatically, HIP must be enabled explicitly.
enable_language(HIP)
- # ROCm 5.X and 6.X
- if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
- NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
- message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM} "
- "expected for ROCm build, saw ${Torch_VERSION} instead.")
+ # ROCm 5.x
+ if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
+ NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
+ message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
+ "expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.")
+ endif()
+
+ # ROCm 6.x
+ if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
+ NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
+ message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
+ "expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.")
endif()
else()
message(FATAL_ERROR "Can't find CUDA or HIP installation.")
@@ -147,47 +167,19 @@ set(VLLM_EXT_SRC
"csrc/layernorm_kernels.cu"
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
"csrc/quantization/gptq/q_gemm.cu"
- "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
- "csrc/quantization/fp8/common.cu"
+ "csrc/quantization/fp8/fp8_cuda_kernels.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/moe_align_block_size_kernels.cu"
- "csrc/torch_bindings.cpp")
+ "csrc/pybind.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA")
- include(FetchContent)
- SET(CUTLASS_ENABLE_HEADERS_ONLY=ON)
- FetchContent_Declare(
- cutlass
- GIT_REPOSITORY https://github.com/nvidia/cutlass.git
- # CUTLASS 3.5.0
- GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
- )
- FetchContent_MakeAvailable(cutlass)
-
list(APPEND VLLM_EXT_SRC
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
- "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
- "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
+ "csrc/quantization/marlin/marlin_cuda_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
- "csrc/custom_all_reduce.cu"
- "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
- "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
- "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
-
- #
- # The CUTLASS kernels for Hopper require sm90a to be enabled.
- # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
- # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
- if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
- set_source_files_properties(
- "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
- PROPERTIES
- COMPILE_FLAGS
- "-gencode arch=compute_90a,code=sm_90a")
- endif()
-
+ "csrc/custom_all_reduce.cu")
endif()
define_gpu_extension_target(
@@ -197,8 +189,6 @@ define_gpu_extension_target(
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
- INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
- USE_SABI 3
WITH_SOABI)
#
@@ -206,7 +196,7 @@ define_gpu_extension_target(
#
set(VLLM_MOE_EXT_SRC
- "csrc/moe/torch_bindings.cpp"
+ "csrc/moe/moe_ops.cpp"
"csrc/moe/topk_softmax_kernels.cu")
define_gpu_extension_target(
@@ -216,7 +206,6 @@ define_gpu_extension_target(
SOURCES ${VLLM_MOE_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
- USE_SABI 3
WITH_SOABI)
#
@@ -230,8 +219,7 @@ set(VLLM_PUNICA_EXT_SRC
"csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
- "csrc/punica/punica_ops.cu"
- "csrc/punica/torch_bindings.cpp")
+ "csrc/punica/punica_ops.cc")
#
# Copy GPU compilation flags+update for punica
@@ -255,9 +243,6 @@ if (${VLLM_GPU_LANG} STREQUAL "CUDA")
endif()
endforeach()
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
-elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
- set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
- message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
endif()
if (VLLM_PUNICA_GPU_ARCHES)
@@ -268,7 +253,6 @@ if (VLLM_PUNICA_GPU_ARCHES)
SOURCES ${VLLM_PUNICA_EXT_SRC}
COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
- USE_SABI 3
WITH_SOABI)
else()
message(WARNING "Unable to create _punica_C target because none of the "
@@ -293,7 +277,9 @@ add_custom_target(default)
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
message(STATUS "Enabling C extension.")
add_dependencies(default _C)
+endif()
+if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Enabling moe extension.")
add_dependencies(default _moe_C)
diff --git a/Dockerfile b/Dockerfile
index d031d98c5b7e4..90be3a30f89b1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,35 +5,18 @@
# docs/source/dev/dockerfile/dockerfile.rst and
# docs/source/assets/dev/dockerfile-stages-dependency.png
-ARG CUDA_VERSION=12.4.1
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
-FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base
-
-ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
- && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
- && apt-get update -y \
- && apt-get install -y ccache software-properties-common \
- && add-apt-repository ppa:deadsnakes/ppa \
- && apt-get update -y \
- && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv python3-pip \
- && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
- && python3 --version \
- && python3 -m pip --version
+FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
RUN apt-get update -y \
- && apt-get install -y python3-pip git curl sudo
+ && apt-get install -y python3-pip git
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
-RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
+RUN ldconfig /usr/local/cuda-12.4/compat/
WORKDIR /workspace
@@ -41,7 +24,12 @@ WORKDIR /workspace
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
- python3 -m pip install -r requirements-cuda.txt
+ pip install -r requirements-cuda.txt
+
+# install development dependencies
+COPY requirements-dev.txt requirements-dev.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+ pip install -r requirements-dev.txt
# cuda arch list used by torch
# can be useful for both `dev` and `test`
@@ -51,16 +39,14 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################
-#################### WHEEL BUILD IMAGE ####################
-FROM base AS build
-ARG PYTHON_VERSION=3
+#################### WHEEL BUILD IMAGE ####################
+FROM dev AS build
# install build dependencies
COPY requirements-build.txt requirements-build.txt
-
RUN --mount=type=cache,target=/root/.cache/pip \
- python3 -m pip install -r requirements-build.txt
+ pip install -r requirements-build.txt
# install compiler cache to speed up compilation leveraging local or remote caching
RUN apt-get update -y && apt-get install -y ccache
@@ -84,50 +70,43 @@ ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
-ARG USE_SCCACHE
-# if USE_SCCACHE is set, use sccache to speed up compilation
-RUN --mount=type=cache,target=/root/.cache/pip \
- if [ "$USE_SCCACHE" = "1" ]; then \
- echo "Installing sccache..." \
- && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
- && tar -xzf sccache.tar.gz \
- && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
- && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
- && export SCCACHE_BUCKET=vllm-build-sccache \
- && export SCCACHE_REGION=us-west-2 \
- && sccache --show-stats \
- && python3 setup.py bdist_wheel --dist-dir=dist \
- && sccache --show-stats; \
- fi
-
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
- if [ "$USE_SCCACHE" != "1" ]; then \
- python3 setup.py bdist_wheel --dist-dir=dist; \
- fi
+ python3 setup.py bdist_wheel --dist-dir=dist
# check the size of the wheel, we cannot upload wheels larger than 100MB
COPY .buildkite/check-wheel-size.py check-wheel-size.py
RUN python3 check-wheel-size.py dist
+# the `vllm_nccl` package must be installed from source distribution
+# pip is smart enough to store a wheel in the cache, and other CI jobs
+# will directly use the wheel from the cache, which is not what we want.
+# we need to remove it manually
+RUN --mount=type=cache,target=/root/.cache/pip \
+ pip cache remove vllm_nccl*
#################### EXTENSION Build IMAGE ####################
-#################### DEV IMAGE ####################
-FROM base as dev
+#################### FLASH_ATTENTION Build IMAGE ####################
+FROM dev as flash-attn-builder
+# max jobs used for build
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+# flash attention version
+ARG flash_attn_version=v2.5.8
+ENV FLASH_ATTN_VERSION=${flash_attn_version}
-COPY requirements-lint.txt requirements-lint.txt
-COPY requirements-test.txt requirements-test.txt
-COPY requirements-dev.txt requirements-dev.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
- python3 -m pip install -r requirements-dev.txt
+WORKDIR /usr/src/flash-attention-v2
+
+# Download the wheel or build it if a pre-compiled release doesn't exist
+RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
+ --no-build-isolation --no-deps --no-cache-dir
-#################### DEV IMAGE ####################
+#################### FLASH_ATTENTION Build IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
-FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
-ARG CUDA_VERSION=12.4.1
+FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
WORKDIR /vllm-workspace
RUN apt-get update -y \
@@ -137,12 +116,16 @@ RUN apt-get update -y \
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
-RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
+RUN ldconfig /usr/local/cuda-12.4/compat/
# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
- python3 -m pip install dist/*.whl --verbose
+ pip install dist/*.whl --verbose
+
+RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
+ --mount=type=cache,target=/root/.cache/pip \
+ pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
#################### vLLM installation IMAGE ####################
@@ -155,7 +138,7 @@ ADD . /vllm-workspace/
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
- python3 -m pip install -r requirements-dev.txt
+ pip install -r requirements-dev.txt
# doc requires source code
# we hide them inside `test_docs/` , so that this source code
@@ -172,7 +155,7 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
- pip install accelerate hf_transfer 'modelscope!=1.15.0'
+ pip install accelerate hf_transfer modelscope
ENV VLLM_USAGE_SOURCE production-docker-image
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index 6e55203decc56..4251fddd6cc3b 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -1,19 +1,13 @@
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
-FROM ubuntu:22.04 AS cpu-test-1
+FROM ubuntu:22.04
RUN apt-get update -y \
- && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \
+ && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
-RUN echo 'export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD' >> ~/.bashrc
-
-RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
-
RUN pip install --upgrade pip \
- && pip install wheel packaging ninja "setuptools>=49.4.0" numpy
-
-FROM cpu-test-1 AS build
+ && pip install wheel packaging ninja setuptools>=49.4.0 numpy
COPY ./ /workspace/vllm
@@ -21,14 +15,6 @@ WORKDIR /workspace/vllm
RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
-# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
-ARG VLLM_CPU_DISABLE_AVX512
-ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
-
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
-WORKDIR /workspace/
-
-RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
-
CMD ["/bin/bash"]
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
index 010f23a143010..fe42b4ef393f1 100644
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -28,7 +28,7 @@ COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
RUN cd /app/vllm \
&& python3 -m pip install -U -r requirements-neuron.txt
-ENV VLLM_TARGET_DEVICE neuron
+ENV VLLM_BUILD_WITH_NEURON 1
RUN cd /app/vllm \
&& pip install -e . \
&& cd ..
diff --git a/Dockerfile.openvino b/Dockerfile.openvino
deleted file mode 100644
index 9861997b451a9..0000000000000
--- a/Dockerfile.openvino
+++ /dev/null
@@ -1,26 +0,0 @@
-# The vLLM Dockerfile is used to construct vLLM image that can be directly used
-# to run the OpenAI compatible server.
-
-FROM ubuntu:22.04 AS dev
-
-RUN apt-get update -y && \
- apt-get install -y python3-pip git
-WORKDIR /workspace
-
-# copy requirements
-COPY requirements-build.txt /workspace/vllm/
-COPY requirements-common.txt /workspace/vllm/
-COPY requirements-openvino.txt /workspace/vllm/
-
-COPY vllm/ /workspace/vllm/vllm
-COPY setup.py /workspace/vllm/
-
-# install build requirements
-RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
-# build vLLM with OpenVINO backend
-RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
-
-COPY examples/ /workspace/vllm/examples
-COPY benchmarks/ /workspace/vllm/benchmarks
-
-CMD ["/bin/bash"]
diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le
deleted file mode 100644
index d4e4c483cada8..0000000000000
--- a/Dockerfile.ppc64le
+++ /dev/null
@@ -1,22 +0,0 @@
-FROM mambaorg/micromamba
-ARG MAMBA_DOCKERFILE_ACTIVATE=1
-USER root
-
-RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
-
-# Some packages in requirements-cpu are installed here
-# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
-# Currently these may not be available for venv or pip directly
-RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes
-
-COPY ./ /workspace/vllm
-
-WORKDIR /workspace/vllm
-
-# These packages will be in rocketce eventually
-RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
-
-RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
-
-WORKDIR /vllm-workspace
-ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 1b89b892bbf1c..d04bb9915e2ab 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -1,35 +1,35 @@
-# Default ROCm 6.1 base image
-ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
-
-# Tested and supported base rocm/pytorch images
-ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1" \
- ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \
- ROCM_6_1_BASE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
-
-# Default ROCm ARCHes to build vLLM for.
-ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
-
-# Whether to build CK-based flash-attention
-# If 0, will not build flash attention
-# This is useful for gfx target where flash-attention is not supported
-# (i.e. those that do not appear in `FA_GFX_ARCHS`)
-# Triton FA is used by default on ROCm now so this is unnecessary.
-ARG BUILD_FA="1"
+# default base image
+ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
+
+FROM $BASE_IMAGE
+
+ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
+
+RUN echo "Base image is $BASE_IMAGE"
+
+# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
+# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
+
+
ARG FA_GFX_ARCHS="gfx90a;gfx942"
-ARG FA_BRANCH="ae7928c"
+RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
-# Whether to build triton on rocm
-ARG BUILD_TRITON="1"
-ARG TRITON_BRANCH="0ef1848"
+ARG FA_BRANCH="ae7928c"
+RUN echo "FA_BRANCH is $FA_BRANCH"
-### Base image build stage
-FROM $BASE_IMAGE AS base
+# whether to build flash-attention
+# if 0, will not build flash attention
+# this is useful for gfx target where flash-attention is not supported
+# In that case, we need to use the python reference attention implementation in vllm
+ARG BUILD_FA="1"
-# Import arg(s) defined before this build stage
-ARG PYTORCH_ROCM_ARCH
+# whether to build triton on rocm
+ARG BUILD_TRITON="1"
# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y
+
+# Install some basic utilities
RUN apt-get update && apt-get install -y \
curl \
ca-certificates \
@@ -40,165 +40,68 @@ RUN apt-get update && apt-get install -y \
build-essential \
wget \
unzip \
+ nvidia-cuda-toolkit \
tmux \
- ccache \
&& rm -rf /var/lib/apt/lists/*
-# When launching the container, mount the code directory to /vllm-workspace
+### Mount Point ###
+# When launching the container, mount the code directory to /vllm-workspace
ARG APP_MOUNT=/vllm-workspace
+VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}
-RUN pip install --upgrade pip
-# Remove sccache so it doesn't interfere with ccache
-# TODO: implement sccache support across components
-RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)"
-# Install torch == 2.4.0 on ROCm
-RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
- *"rocm-5.7"*) \
- pip uninstall -y torch torchaudio torchvision \
- && pip install --no-cache-dir --pre \
- torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
- torchvision==0.19.0.dev20240612 \
- --index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \
- *"rocm-6.0"*) \
- pip uninstall -y torch torchaudio torchvision \
- && pip install --no-cache-dir --pre \
- torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
- torchvision==0.19.0.dev20240612 \
- --index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \
- *"rocm-6.1"*) \
- pip uninstall -y torch torchaudio torchvision \
- && pip install --no-cache-dir --pre \
- torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
- torchvision==0.19.0.dev20240612 \
- --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
- *) ;; esac
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
-ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
-ENV CCACHE_DIR=/root/.cache/ccache
-
-
-### AMD-SMI build stage
-FROM base AS build_amdsmi
-# Build amdsmi wheel always
-RUN cd /opt/rocm/share/amd_smi \
- && pip wheel . --wheel-dir=/install
-
-
-### Flash-Attention wheel build stage
-FROM base AS build_fa
-ARG BUILD_FA
-ARG FA_GFX_ARCHS
-ARG FA_BRANCH
-# Build ROCm flash-attention wheel if `BUILD_FA = 1`
-RUN --mount=type=cache,target=${CCACHE_DIR} \
- if [ "$BUILD_FA" = "1" ]; then \
- mkdir -p libs \
+# Install ROCm flash-attention
+RUN if [ "$BUILD_FA" = "1" ]; then \
+ mkdir libs \
&& cd libs \
&& git clone https://github.com/ROCm/flash-attention.git \
&& cd flash-attention \
- && git checkout "${FA_BRANCH}" \
+ && git checkout ${FA_BRANCH} \
&& git submodule update --init \
- && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
- *"rocm-5.7"*) \
- export VLLM_TORCH_PATH="$(python3 -c 'import torch; print(torch.__path__[0])')" \
- && patch "${VLLM_TORCH_PATH}"/utils/hipify/hipify_python.py hipify_patch.patch;; \
- *) ;; esac \
- && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
- # Create an empty directory otherwise as later build stages expect one
- else mkdir -p /install; \
+ && export GPU_ARCHS=${FA_GFX_ARCHS} \
+ && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \
+ patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
+ && python3 setup.py install \
+ && cd ..; \
fi
+# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
+# Manually remove it so that later steps of numpy upgrade can continue
+RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
+ rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
-### Triton wheel build stage
-FROM base AS build_triton
-ARG BUILD_TRITON
-ARG TRITON_BRANCH
-# Build triton wheel if `BUILD_TRITON = 1`
-RUN --mount=type=cache,target=${CCACHE_DIR} \
- if [ "$BUILD_TRITON" = "1" ]; then \
+# build triton
+RUN if [ "$BUILD_TRITON" = "1" ]; then \
mkdir -p libs \
&& cd libs \
- && git clone https://github.com/OpenAI/triton.git \
- && cd triton \
- && git checkout "${TRITON_BRANCH}" \
- && cd python \
- && python3 setup.py bdist_wheel --dist-dir=/install; \
- # Create an empty directory otherwise as later build stages expect one
- else mkdir -p /install; \
+ && pip uninstall -y triton \
+ && git clone https://github.com/ROCm/triton.git \
+ && cd triton/python \
+ && pip3 install . \
+ && cd ../..; \
fi
-
-### Final vLLM build stage
-FROM base AS final
-# Import the vLLM development directory from the build context
+WORKDIR /vllm-workspace
COPY . .
-# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
-# Manually remove it so that later steps of numpy upgrade can continue
-RUN case "$(which python3)" in \
- *"/opt/conda/envs/py_3.9"*) \
- rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
- *) ;; esac
+RUN python3 -m pip install --upgrade pip numba
-# Package upgrades for useful functionality or to avoid dependency issues
RUN --mount=type=cache,target=/root/.cache/pip \
- pip install --upgrade numba scipy huggingface-hub[cli]
-
-# Make sure punica kernels are built (for LoRA)
-ENV VLLM_INSTALL_PUNICA_KERNELS=1
-# Workaround for ray >= 2.10.0
-ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
-# Silences the HF Tokenizers warning
-ENV TOKENIZERS_PARALLELISM=false
-
-RUN --mount=type=cache,target=${CCACHE_DIR} \
- --mount=type=cache,target=/root/.cache/pip \
pip install -U -r requirements-rocm.txt \
- && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
- *"rocm-6.0"*) \
- patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \
- *"rocm-6.1"*) \
- # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
- wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P rocm_patch \
- && cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6 \
- # Prevent interference if torch bundles its own HIP runtime
- && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
- *) ;; esac \
- && python3 setup.py clean --all \
- && python3 setup.py develop
-
-# Copy amdsmi wheel into final image
-RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
- mkdir -p libs \
- && cp /install/*.whl libs \
- # Preemptively uninstall to avoid same-version no-installs
- && pip uninstall -y amdsmi;
-
-# Copy triton wheel(s) into final image if they were built
-RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
- mkdir -p libs \
- && if ls /install/*.whl; then \
- cp /install/*.whl libs \
- # Preemptively uninstall to avoid same-version no-installs
- && pip uninstall -y triton; fi
+ && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
+ && python3 setup.py install \
+ && cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
+ && cd ..
-# Copy flash-attn wheel(s) into final image if they were built
-RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
- mkdir -p libs \
- && if ls /install/*.whl; then \
- cp /install/*.whl libs \
- # Preemptively uninstall to avoid same-version no-installs
- && pip uninstall -y flash-attn; fi
-
-# Install wheels that were built to the final image
-RUN --mount=type=cache,target=/root/.cache/pip \
- if ls libs/*.whl; then \
- pip install libs/*.whl; fi
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3
CMD ["/bin/bash"]
diff --git a/Dockerfile.tpu b/Dockerfile.tpu
deleted file mode 100644
index 931c844c08dce..0000000000000
--- a/Dockerfile.tpu
+++ /dev/null
@@ -1,19 +0,0 @@
-ARG NIGHTLY_DATE="20240601"
-ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
-
-FROM $BASE_IMAGE
-
-WORKDIR /workspace
-COPY . /workspace/vllm
-
-ENV VLLM_TARGET_DEVICE="tpu"
-# Install aiohttp separately to avoid build errors.
-RUN pip install aiohttp
-# Install the TPU and Pallas dependencies.
-RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
-RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-
-# Build vLLM.
-RUN cd /workspace/vllm && python setup.py develop
-
-CMD ["/bin/bash"]
diff --git a/Dockerfile.xpu b/Dockerfile.xpu
deleted file mode 100644
index c39e551672d20..0000000000000
--- a/Dockerfile.xpu
+++ /dev/null
@@ -1,22 +0,0 @@
-FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04
-
-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
- echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
- chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
- rm /etc/apt/sources.list.d/intel-graphics.list && \
- wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
- echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
- chmod 644 /usr/share/keyrings/intel-graphics.gpg
-
-RUN apt-get update -y \
-&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
-
-COPY ./ /workspace/vllm
-
-WORKDIR /workspace/vllm
-
-RUN pip install -v -r requirements-xpu.txt
-
-RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
-
-CMD ["/bin/bash"]
diff --git a/README.md b/README.md
index d6957a7f5ee3a..9b180877a5a82 100644
--- a/README.md
+++ b/README.md
@@ -14,19 +14,7 @@ Easy, fast, and cheap LLM serving for everyone
----
-
-**Ray Summit CPF is Open (June 4th to June 20th)!**
-
-There will be a track for vLLM at the Ray Summit (09/30-10/02, SF) this year!
-If you have cool projects related to vLLM or LLM inference, we would love to see your proposals.
-This will be a great chance for everyone in the community to get together and learn.
-Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite)
-
----
-
*Latest News* 🔥
-- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
- [2024/05] vLLM-fork specific: Added Intel® Gaudi® 2 support with SynapseAI 1.16.0. For more information, please refer to Intel® Gaudi® README.
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
@@ -60,18 +48,45 @@ vLLM is flexible and easy to use with:
- Tensor parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD GPUs, Intel CPUs and GPUs
+- Support NVIDIA GPUs and AMD GPUs
- (Experimental) Prefix caching support
- (Experimental) Multi-lora support
-vLLM seamlessly supports most popular open-source models on HuggingFace, including:
-- Transformer-like LLMs (e.g., Llama)
-- Mixture-of-Expert LLMs (e.g., Mixtral)
-- Multi-modal LLMs (e.g., LLaVA)
-
-Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
-
-## Getting Started
+vLLM seamlessly supports many Hugging Face models, including the following architectures:
+
+- Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
+- Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
+- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
+- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
+- Command-R (`CohereForAI/c4ai-command-r-v01`, etc.)
+- DBRX (`databricks/dbrx-base`, `databricks/dbrx-instruct` etc.)
+- DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
+- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
+- Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
+- GPT-2 (`gpt2`, `gpt2-xl`, etc.)
+- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
+- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
+- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
+- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
+- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
+- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
+- LLaMA, Llama 2, and Meta Llama 3 (`meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
+- MiniCPM (`openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, etc.)
+- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
+- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.)
+- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
+- OLMo (`allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.)
+- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
+- Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.)
+- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
+- Phi-3 (`microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, etc.)
+- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
+- Qwen2 (`Qwen/Qwen1.5-7B`, `Qwen/Qwen1.5-7B-Chat`, etc.)
+- Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)
+- StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
+- Starcoder2(`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.)
+- Xverse (`xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.)
+- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
@@ -79,7 +94,9 @@ Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/get
pip install vllm
```
-Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
+## Getting Started
+
+Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started.
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
@@ -89,34 +106,6 @@ Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
We welcome and value any contributions and collaborations.
Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
-## Sponsors
-
-vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
-
-
-
-
-- a16z
-- AMD
-- Anyscale
-- AWS
-- Crusoe Cloud
-- Databricks
-- DeepInfra
-- Dropbox
-- Lambda Lab
-- NVIDIA
-- Replicate
-- Roblox
-- RunPod
-- Sequoia Capital
-- Trainy
-- UC Berkeley
-- UC San Diego
-- ZhenFund
-
-We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
-
## Citation
If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index fe29c67086158..f9d167590fe47 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -4,13 +4,10 @@
import time
import traceback
from dataclasses import dataclass, field
-from typing import List, Optional, Union
+from typing import List, Optional
import aiohttp
-import huggingface_hub.constants
from tqdm.asyncio import tqdm
-from transformers import (AutoTokenizer, PreTrainedTokenizer,
- PreTrainedTokenizerFast)
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@@ -71,13 +68,9 @@ async def async_request_tgi(
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
- chunk_bytes = chunk_bytes.decode("utf-8")
- #NOTE: Sometimes TGI returns a ping response without
- # any data, we should skip it.
- if chunk_bytes.startswith(":"):
- continue
- chunk = remove_prefix(chunk_bytes, "data:")
+ chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+ "data:")
data = json.loads(chunk)
timestamp = time.perf_counter()
@@ -96,9 +89,6 @@ async def async_request_tgi(
output.latency = most_recent_timestamp - st
output.success = True
output.generated_text = data["generated_text"]
- else:
- output.error = response.reason or ""
- output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
@@ -225,8 +215,8 @@ async def async_request_openai_completions(
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(
- "completions"
- ), "OpenAI Completions API URL must end with 'completions'."
+ "v1/completions"
+ ), "OpenAI Completions API URL must end with 'v1/completions'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
@@ -265,9 +255,6 @@ async def async_request_openai_completions(
else:
data = json.loads(chunk)
- # NOTE: Some completion API might have a last
- # usage summary response without a token so we
- # want to check a token was generated
if data["choices"][0]["text"]:
timestamp = time.perf_counter()
# First token
@@ -276,8 +263,12 @@ async def async_request_openai_completions(
output.ttft = ttft
# Decoding phase
- output.itl.append(timestamp -
- most_recent_timestamp)
+ # NOTE: Some completion API might have a last
+ # usage summary response without a token so we
+ # do not want to include as inter-token-latency
+ elif data.get("usage", None) is None:
+ output.itl.append(timestamp -
+ most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += data["choices"][0]["text"]
@@ -285,9 +276,6 @@ async def async_request_openai_completions(
output.generated_text = generated_text
output.success = True
output.latency = latency
- else:
- output.error = response.reason or ""
- output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
@@ -304,8 +292,8 @@ async def async_request_openai_chat_completions(
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(
- "chat/completions"
- ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
+ "v1/chat/completions"
+ ), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
@@ -390,30 +378,6 @@ def remove_prefix(text: str, prefix: str) -> str:
return text
-def get_model(pretrained_model_name_or_path: str):
- if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
- from modelscope import snapshot_download
- else:
- from huggingface_hub import snapshot_download
-
- model_path = snapshot_download(
- model_id=pretrained_model_name_or_path,
- local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
- ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
- return model_path
-
-
-def get_tokenizer(
- pretrained_model_name_or_path: str, trust_remote_code: bool
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
- if pretrained_model_name_or_path is not None and not os.path.exists(
- pretrained_model_name_or_path):
- pretrained_model_name_or_path = get_model(
- pretrained_model_name_or_path)
- return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
- trust_remote_code=trust_remote_code)
-
-
ASYNC_REQUEST_FUNCS = {
"tgi": async_request_tgi,
"vllm": async_request_openai_completions,
@@ -422,5 +386,4 @@ def get_tokenizer(
"openai": async_request_openai_completions,
"openai-chat": async_request_openai_chat_completions,
"tensorrt-llm": async_request_trt_llm,
- "scalellm": async_request_openai_completions,
}
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 16802d879c0ca..e8530c2761acf 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -1,19 +1,15 @@
"""Benchmark the latency of processing a single batch of requests."""
import argparse
-import json
import time
from pathlib import Path
-from typing import List, Optional
+from typing import Optional
import numpy as np
import torch
from tqdm import tqdm
from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
-from vllm.inputs import PromptStrictInputs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-from vllm.utils import FlexibleArgumentParser
def main(args: argparse.Namespace):
@@ -21,33 +17,20 @@ def main(args: argparse.Namespace):
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
- llm = LLM(
- model=args.model,
- speculative_model=args.speculative_model,
- num_speculative_tokens=args.num_speculative_tokens,
- speculative_draft_tensor_parallel_size=\
- args.speculative_draft_tensor_parallel_size,
- tokenizer=args.tokenizer,
- quantization=args.quantization,
- tensor_parallel_size=args.tensor_parallel_size,
- trust_remote_code=args.trust_remote_code,
- dtype=args.dtype,
- max_model_len=args.max_model_len,
- enforce_eager=args.enforce_eager,
- kv_cache_dtype=args.kv_cache_dtype,
- quantization_param_path=args.quantization_param_path,
- device=args.device,
- ray_workers_use_nsight=args.ray_workers_use_nsight,
- use_v2_block_manager=args.use_v2_block_manager,
- enable_chunked_prefill=args.enable_chunked_prefill,
- download_dir=args.download_dir,
- block_size=args.block_size,
- gpu_memory_utilization=args.gpu_memory_utilization,
- load_format=args.load_format,
- distributed_executor_backend=args.distributed_executor_backend,
- otlp_traces_endpoint=args.otlp_traces_endpoint,
- enable_prefix_caching=args.enable_prefix_caching,
- )
+ llm = LLM(model=args.model,
+ tokenizer=args.tokenizer,
+ quantization=args.quantization,
+ tensor_parallel_size=args.tensor_parallel_size,
+ trust_remote_code=args.trust_remote_code,
+ dtype=args.dtype,
+ enforce_eager=args.enforce_eager,
+ kv_cache_dtype=args.kv_cache_dtype,
+ quantization_param_path=args.quantization_param_path,
+ device=args.device,
+ ray_workers_use_nsight=args.ray_workers_use_nsight,
+ enable_chunked_prefill=args.enable_chunked_prefill,
+ download_dir=args.download_dir,
+ block_size=args.block_size)
sampling_params = SamplingParams(
n=args.n,
@@ -61,9 +44,7 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
- dummy_inputs: List[PromptStrictInputs] = [{
- "prompt_token_ids": batch
- } for batch in dummy_prompt_token_ids.tolist()]
+ dummy_prompt_token_ids = dummy_prompt_token_ids.tolist()
def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir:
@@ -74,13 +55,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir))) as p:
- llm.generate(dummy_inputs,
+ llm.generate(prompt_token_ids=dummy_prompt_token_ids,
sampling_params=sampling_params,
use_tqdm=False)
print(p.key_averages())
else:
start_time = time.perf_counter()
- llm.generate(dummy_inputs,
+ llm.generate(prompt_token_ids=dummy_prompt_token_ids,
sampling_params=sampling_params,
use_tqdm=False)
end_time = time.perf_counter()
@@ -106,34 +87,18 @@ def run_to_completion(profile_dir: Optional[str] = None):
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
latencies.append(run_to_completion(profile_dir=None))
latencies = np.array(latencies)
- percentages = [10, 25, 50, 75, 90, 99]
+ percentages = [10, 25, 50, 75, 90]
percentiles = np.percentile(latencies, percentages)
print(f'Avg latency: {np.mean(latencies)} seconds')
for percentage, percentile in zip(percentages, percentiles):
print(f'{percentage}% percentile latency: {percentile} seconds')
- # Output JSON results if specified
- if args.output_json:
- results = {
- "avg_latency": np.mean(latencies),
- "latencies": latencies.tolist(),
- "percentiles": dict(zip(percentages, percentiles.tolist())),
- }
- with open(args.output_json, "w") as f:
- json.dump(results, f, indent=4)
-
if __name__ == '__main__':
- parser = FlexibleArgumentParser(
+ parser = argparse.ArgumentParser(
description='Benchmark the latency of processing a single batch of '
'requests till completion.')
parser.add_argument('--model', type=str, default='facebook/opt-125m')
- parser.add_argument('--speculative-model', type=str, default=None)
- parser.add_argument('--num-speculative-tokens', type=int, default=None)
- parser.add_argument('--speculative-draft-tensor-parallel-size',
- '-spec-draft-tp',
- type=int,
- default=None)
parser.add_argument('--tokenizer', type=str, default=None)
parser.add_argument('--quantization',
'-q',
@@ -159,12 +124,6 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser.add_argument('--trust-remote-code',
action='store_true',
help='trust remote code from huggingface')
- parser.add_argument(
- '--max-model-len',
- type=int,
- default=None,
- help='Maximum length of a sequence (including prompt and output). '
- 'If None, will be derived from the model.')
parser.add_argument(
'--dtype',
type=str,
@@ -178,13 +137,15 @@ def run_to_completion(profile_dir: Optional[str] = None):
action='store_true',
help='enforce eager mode and disable CUDA graph')
parser.add_argument(
- '--kv-cache-dtype',
+ "--kv-cache-dtype",
type=str,
- choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
- default="auto",
- help='Data type for kv cache storage. If "auto", will use model '
- 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
- 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
+ choices=['auto', 'fp8'],
+ default='auto',
+ help=
+ 'Data type for kv cache storage. If "auto", will use model data type. '
+ 'FP8_E5M2 (without scaling) is only supported on cuda version greater '
+ 'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
+ 'common inference criteria.')
parser.add_argument(
'--quantization-param-path',
type=str,
@@ -208,10 +169,9 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser.add_argument(
"--device",
type=str,
- default="auto",
- choices=["auto", "cuda", "cpu", "hpu", "openvino", "tpu", "xpu"],
- help='device type for vLLM execution, supporting CUDA, HPU, '
- 'OpenVINO and CPU.')
+ default="cuda",
+ choices=["cuda", "cpu", "hpu"],
+ help='device type for vLLM execution, supporting CUDA, CPU and HPU.')
parser.add_argument('--block-size',
type=int,
default=16,
@@ -221,10 +181,6 @@ def run_to_completion(profile_dir: Optional[str] = None):
action='store_true',
help='If True, the prefill requests can be chunked based on the '
'max_num_batched_tokens')
- parser.add_argument("--enable-prefix-caching",
- action='store_true',
- help="Enable automatic prefix caching")
- parser.add_argument('--use-v2-block-manager', action='store_true')
parser.add_argument(
"--ray-workers-use-nsight",
action='store_true',
@@ -235,51 +191,5 @@ def run_to_completion(profile_dir: Optional[str] = None):
default=None,
help='directory to download and load the weights, '
'default to the default cache dir of huggingface')
- parser.add_argument(
- '--output-json',
- type=str,
- default=None,
- help='Path to save the latency results in JSON format.')
- parser.add_argument('--gpu-memory-utilization',
- type=float,
- default=0.9,
- help='the fraction of GPU memory to be used for '
- 'the model executor, which can range from 0 to 1.'
- 'If unspecified, will use the default value of 0.9.')
- parser.add_argument(
- '--load-format',
- type=str,
- default=EngineArgs.load_format,
- choices=[
- 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
- 'bitsandbytes'
- ],
- help='The format of the model weights to load.\n\n'
- '* "auto" will try to load the weights in the safetensors format '
- 'and fall back to the pytorch bin format if safetensors format '
- 'is not available.\n'
- '* "pt" will load the weights in the pytorch bin format.\n'
- '* "safetensors" will load the weights in the safetensors format.\n'
- '* "npcache" will load the weights in pytorch format and store '
- 'a numpy cache to speed up the loading.\n'
- '* "dummy" will initialize the weights with random values, '
- 'which is mainly for profiling.\n'
- '* "tensorizer" will load the weights using tensorizer from '
- 'CoreWeave. See the Tensorize vLLM Model script in the Examples'
- 'section for more information.\n'
- '* "bitsandbytes" will load the weights using bitsandbytes '
- 'quantization.\n')
- parser.add_argument(
- '--distributed-executor-backend',
- choices=['ray', 'mp'],
- default=None,
- help='Backend to use for distributed serving. When more than 1 GPU '
- 'is used, will be automatically set to "ray" if installed '
- 'or "mp" (multiprocessing) otherwise.')
- parser.add_argument(
- '--otlp-traces-endpoint',
- type=str,
- default=None,
- help='Target URL to which OpenTelemetry traces will be sent.')
args = parser.parse_args()
main(args)
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 395107a5ec747..089966986984f 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -1,7 +1,7 @@
+import argparse
import time
from vllm import LLM, SamplingParams
-from vllm.utils import FlexibleArgumentParser
PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501
@@ -44,7 +44,7 @@ def main(args):
if __name__ == "__main__":
- parser = FlexibleArgumentParser(
+ parser = argparse.ArgumentParser(
description='Benchmark the performance with or without automatic '
'prefix caching.')
parser.add_argument('--model',
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 42867fc40edd2..2c2d69da4a7d1 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -17,10 +17,6 @@
--dataset-path \
--request-rate \ # By default is inf
--num-prompts # By default is 1000
-
- when using tgi backend, add
- --endpoint /generate_stream
- to the end of the command above.
"""
import argparse
import asyncio
@@ -31,7 +27,7 @@
import warnings
from dataclasses import dataclass
from datetime import datetime
-from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
+from typing import AsyncGenerator, List, Optional, Tuple
import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
@@ -39,15 +35,7 @@
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
-try:
- from vllm.transformers_utils.tokenizer import get_tokenizer
-except ImportError:
- from backend_request_func import get_tokenizer
-
-try:
- from vllm.utils import FlexibleArgumentParser
-except ImportError:
- from argparse import ArgumentParser as FlexibleArgumentParser
+from vllm.transformers_utils.tokenizer import get_tokenizer
@dataclass
@@ -64,9 +52,6 @@ class BenchmarkMetrics:
mean_tpot_ms: float
median_tpot_ms: float
p99_tpot_ms: float
- mean_itl_ms: float
- median_itl_ms: float
- p99_itl_ms: float
def sample_sharegpt_requests(
@@ -208,37 +193,24 @@ def calculate_metrics(
dur_s: float,
tokenizer: PreTrainedTokenizerBase,
) -> Tuple[BenchmarkMetrics, List[int]]:
- actual_output_lens: List[int] = []
+ actual_output_lens = []
total_input = 0
completed = 0
- itls: List[float] = []
- tpots: List[float] = []
- ttfts: List[float] = []
+ tpots = []
+ ttfts = []
for i in range(len(outputs)):
if outputs[i].success:
- # We use the tokenizer to count the number of output tokens for all
- # serving backends instead of looking at len(outputs[i].itl) since
- # multiple output tokens may be bundled together
- # Note: this may inflate the output token count slightly
- output_len = len(
- tokenizer(outputs[i].generated_text,
- add_special_tokens=False).input_ids)
+ output_len = len(tokenizer(outputs[i].generated_text).input_ids)
actual_output_lens.append(output_len)
total_input += input_requests[i][1]
if output_len > 1:
tpots.append(
(outputs[i].latency - outputs[i].ttft) / (output_len - 1))
- itls += outputs[i].itl
ttfts.append(outputs[i].ttft)
completed += 1
else:
actual_output_lens.append(0)
- if completed == 0:
- warnings.warn(
- "All requests failed. This is likely due to a misconfiguration "
- "on the benchmark arguments.",
- stacklevel=2)
metrics = BenchmarkMetrics(
completed=completed,
total_input=total_input,
@@ -250,12 +222,9 @@ def calculate_metrics(
1000, # ttfts is empty if streaming is not supported by backend
median_ttft_ms=np.median(ttfts or 0) * 1000,
p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
- mean_tpot_ms=np.mean(tpots or 0) * 1000,
- median_tpot_ms=np.median(tpots or 0) * 1000,
- p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
- mean_itl_ms=np.mean(itls or 0) * 1000,
- median_itl_ms=np.median(itls or 0) * 1000,
- p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
+ mean_tpot_ms=np.mean(tpots) * 1000,
+ median_tpot_ms=np.median(tpots) * 1000,
+ p99_tpot_ms=np.percentile(tpots, 99) * 1000,
)
return metrics, actual_output_lens
@@ -273,34 +242,16 @@ async def benchmark(
disable_tqdm: bool,
):
if backend in ASYNC_REQUEST_FUNCS:
- request_func = ASYNC_REQUEST_FUNCS[backend]
+ request_func = ASYNC_REQUEST_FUNCS.get(backend)
else:
raise ValueError(f"Unknown backend: {backend}")
- print("Starting initial single prompt test run...")
- test_prompt, test_prompt_len, test_output_len = input_requests[0]
- test_input = RequestFuncInput(
- model=model_id,
- prompt=test_prompt,
- api_url=api_url,
- prompt_len=test_prompt_len,
- output_len=test_output_len,
- best_of=best_of,
- use_beam_search=use_beam_search,
- )
- test_output = await request_func(request_func_input=test_input)
- if not test_output.success:
- raise ValueError(
- "Initial test run failed - Please make sure benchmark arguments "
- f"are correctly specified. Error: {test_output.error}")
- else:
- print("Initial test run completed. Starting main benchmark run...")
print(f"Traffic request rate: {request_rate}")
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
benchmark_start_time = time.perf_counter()
- tasks: List[asyncio.Task] = []
+ tasks = []
async for request in get_request(input_requests, request_rate):
prompt, prompt_len, output_len = request
request_func_input = RequestFuncInput(
@@ -318,7 +269,7 @@ async def benchmark(
pbar=pbar)))
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
- if pbar is not None:
+ if not disable_tqdm:
pbar.close()
benchmark_duration = time.perf_counter() - benchmark_start_time
@@ -355,10 +306,6 @@ async def benchmark(
print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
metrics.median_tpot_ms))
print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
- print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-'))
- print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
- print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
- print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
print("=" * 50)
result = {
@@ -375,9 +322,6 @@ async def benchmark(
"mean_tpot_ms": metrics.mean_tpot_ms,
"median_tpot_ms": metrics.median_tpot_ms,
"p99_tpot_ms": metrics.p99_tpot_ms,
- "mean_itl_ms": metrics.mean_itl_ms,
- "median_itl_ms": metrics.median_itl_ms,
- "p99_itl_ms": metrics.p99_itl_ms,
"input_lens": [output.prompt_len for output in outputs],
"output_lens": actual_output_lens,
"ttfts": [output.ttft for output in outputs],
@@ -474,7 +418,7 @@ def main(args: argparse.Namespace):
# Save config and results to json
if args.save_result:
- result_json: Dict[str, Any] = {}
+ result_json = {}
# Setup
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -507,8 +451,6 @@ def main(args: argparse.Namespace):
# Save to file
base_model_id = model_id.split("/")[-1]
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa
- if args.result_filename:
- file_name = args.result_filename
if args.result_dir:
file_name = os.path.join(args.result_dir, file_name)
with open(file_name, "w") as outfile:
@@ -516,7 +458,7 @@ def main(args: argparse.Namespace):
if __name__ == "__main__":
- parser = FlexibleArgumentParser(
+ parser = argparse.ArgumentParser(
description="Benchmark the online serving throughput.")
parser.add_argument(
"--backend",
@@ -649,15 +591,6 @@ def main(args: argparse.Namespace):
help="Specify directory to save benchmark json results."
"If not specified, results are saved in the current directory.",
)
- parser.add_argument(
- "--result-filename",
- type=str,
- default=None,
- help="Specify the filename to save benchmark json results."
- "If not specified, results will be saved in "
- "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
- " format.",
- )
args = parser.parse_args()
main(args)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index ff33e3dced66f..2e8cfd3f2ca3e 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -10,9 +10,7 @@
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
-from vllm.engine.arg_utils import EngineArgs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-from vllm.utils import FlexibleArgumentParser
def sample_requests(
@@ -80,10 +78,8 @@ def run_vllm(
enable_prefix_caching: bool,
enable_chunked_prefill: bool,
max_num_batched_tokens: int,
- distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9,
download_dir: Optional[str] = None,
- load_format: str = EngineArgs.load_format,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(
@@ -104,13 +100,11 @@ def run_vllm(
download_dir=download_dir,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
- distributed_executor_backend=distributed_executor_backend,
- load_format=load_format,
)
# Add the requests to the engine.
- prompts: List[str] = []
- sampling_params: List[SamplingParams] = []
+ prompts = []
+ sampling_params = []
for prompt, _, output_len in requests:
prompts.append(prompt)
sampling_params.append(
@@ -231,8 +225,8 @@ def main(args: argparse.Namespace):
args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching, args.enable_chunked_prefill,
- args.max_num_batched_tokens, args.distributed_executor_backend,
- args.gpu_memory_utilization, args.download_dir, args.load_format)
+ args.max_num_batched_tokens, args.gpu_memory_utilization,
+ args.download_dir)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -248,21 +242,9 @@ def main(args: argparse.Namespace):
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} tokens/s")
- # Output JSON results if specified
- if args.output_json:
- results = {
- "elapsed_time": elapsed_time,
- "num_requests": len(requests),
- "total_num_tokens": total_num_tokens,
- "requests_per_second": len(requests) / elapsed_time,
- "tokens_per_second": total_num_tokens / elapsed_time,
- }
- with open(args.output_json, "w") as f:
- json.dump(results, f, indent=4)
-
if __name__ == "__main__":
- parser = FlexibleArgumentParser(description="Benchmark the throughput.")
+ parser = argparse.ArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend",
type=str,
choices=["vllm", "hf", "mii"],
@@ -329,13 +311,15 @@ def main(args: argparse.Namespace):
action="store_true",
help="enforce eager execution")
parser.add_argument(
- '--kv-cache-dtype',
+ "--kv-cache-dtype",
type=str,
- choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
+ choices=["auto", "fp8"],
default="auto",
- help='Data type for kv cache storage. If "auto", will use model '
- 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
- 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
+ help=
+ 'Data type for kv cache storage. If "auto", will use model data type. '
+ 'FP8_E5M2 (without scaling) is only supported on CUDA versions greater '
+ 'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is supported instead for '
+ 'common inference criteria.')
parser.add_argument(
'--quantization-param-path',
type=str,
@@ -349,10 +333,9 @@ def main(args: argparse.Namespace):
parser.add_argument(
"--device",
type=str,
- default="auto",
- choices=["auto", "cuda", "cpu", "hpu", "openvino", "tpu", "xpu"],
- help='device type for vLLM execution, supporting CUDA, HPU, '
- 'OpenVINO and CPU.')
+ default="cuda",
+ choices=["cuda", "cpu", "hpu"],
+ help='device type for vLLM execution, supporting CUDA, CPU and HPU.')
parser.add_argument(
"--enable-prefix-caching",
action='store_true',
@@ -370,41 +353,6 @@ def main(args: argparse.Namespace):
default=None,
help='directory to download and load the weights, '
'default to the default cache dir of huggingface')
- parser.add_argument(
- '--output-json',
- type=str,
- default=None,
- help='Path to save the throughput results in JSON format.')
- parser.add_argument(
- '--distributed-executor-backend',
- choices=['ray', 'mp'],
- default=None,
- help='Backend to use for distributed serving. When more than 1 GPU '
- 'is used, will be automatically set to "ray" if installed '
- 'or "mp" (multiprocessing) otherwise.')
- parser.add_argument(
- '--load-format',
- type=str,
- default=EngineArgs.load_format,
- choices=[
- 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
- 'bitsandbytes'
- ],
- help='The format of the model weights to load.\n\n'
- '* "auto" will try to load the weights in the safetensors format '
- 'and fall back to the pytorch bin format if safetensors format '
- 'is not available.\n'
- '* "pt" will load the weights in the pytorch bin format.\n'
- '* "safetensors" will load the weights in the safetensors format.\n'
- '* "npcache" will load the weights in pytorch format and store '
- 'a numpy cache to speed up the loading.\n'
- '* "dummy" will initialize the weights with random values, '
- 'which is mainly for profiling.\n'
- '* "tensorizer" will load the weights using tensorizer from '
- 'CoreWeave. See the Tensorize vLLM Model script in the Examples'
- 'section for more information.\n'
- '* "bitsandbytes" will load the weights using bitsandbytes '
- 'quantization.\n')
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
deleted file mode 100644
index 377f8683c021f..0000000000000
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ /dev/null
@@ -1,353 +0,0 @@
-import argparse
-import copy
-import itertools
-import pickle as pkl
-import time
-from typing import Callable, Iterable, List, Tuple
-
-import torch
-import torch.utils.benchmark as TBenchmark
-from torch.utils.benchmark import Measurement as TMeasurement
-from weight_shapes import WEIGHT_SHAPES
-
-from vllm import _custom_ops as ops
-from vllm.utils import FlexibleArgumentParser
-
-DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
-DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
-DEFAULT_TP_SIZES = [1]
-
-# helpers
-
-
-def to_fp8(tensor: torch.tensor) -> torch.tensor:
- finfo = torch.finfo(torch.float8_e4m3fn)
- return torch.round(tensor.clamp(
- min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
-
-
-def to_int8(tensor: torch.tensor) -> torch.tensor:
- return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
-
-
-def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
- k: int) -> Tuple[torch.tensor, torch.tensor]:
-
- a = torch.randn((m, k), device='cuda') * 5
- b = torch.randn((n, k), device='cuda').t() * 5
-
- if dtype == torch.int8:
- return to_int8(a), to_int8(b)
- if dtype == torch.float8_e4m3fn:
- return to_fp8(a), to_fp8(b)
-
- raise ValueError("unsupported dtype")
-
-
-# impl
-
-
-def pytorch_mm_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
- scale_b: torch.tensor,
- out_dtype: torch.dtype) -> torch.tensor:
- return torch.mm(a, b)
-
-
-def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
- scale_b: torch.tensor,
- out_dtype: torch.dtype) -> torch.tensor:
- return torch._scaled_mm(a,
- b,
- scale_a=scale_a,
- scale_b=scale_b,
- out_dtype=out_dtype)
-
-
-def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
- scale_a: torch.tensor, scale_b: torch.tensor,
- out_dtype: torch.dtype) -> torch.tensor:
- return torch._scaled_mm(a,
- b,
- scale_a=scale_a,
- scale_b=scale_b,
- out_dtype=out_dtype,
- use_fast_accum=True)
-
-
-def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
- scale_b: torch.tensor,
- out_dtype: torch.dtype) -> torch.tensor:
- return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)
-
-
-# bench
-def bench_fn(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
- scale_b: torch.tensor, out_dtype: torch.dtype, label: str,
- sub_label: str, fn: Callable, description: str) -> TMeasurement:
-
- min_run_time = 1
-
- globals = {
- "a": a,
- "b": b,
- "scale_a": scale_a,
- "scale_b": scale_b,
- "out_dtype": out_dtype,
- "fn": fn,
- }
- return TBenchmark.Timer(
- stmt="fn(a, b, scale_a, scale_b, out_dtype)",
- globals=globals,
- label=label,
- sub_label=sub_label,
- description=description,
- ).blocked_autorange(min_run_time=min_run_time)
-
-
-def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
- sub_label: str) -> Iterable[TMeasurement]:
- assert dtype == torch.int8
- a, b = make_rand_tensors(torch.int8, m, n, k)
- scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
- scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-
- timers = []
- # pytorch impl
- timers.append(
- bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
- b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
- torch.bfloat16, label, sub_label, pytorch_mm_impl,
- "pytorch_bf16_bf16_bf16_matmul-no-scales"))
-
- # cutlass impl
- timers.append(
- bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
- cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm"))
-
- return timers
-
-
-def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
- sub_label: str) -> Iterable[TMeasurement]:
- assert dtype == torch.float8_e4m3fn
- a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
- scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
- scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-
- timers = []
-
- # pytorch impl w. bf16
- timers.append(
- bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
- b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
- torch.bfloat16, label, sub_label, pytorch_mm_impl,
- "pytorch_bf16_bf16_bf16_matmul-no-scales"))
-
- # pytorch impl: bf16 output, without fp8 fast accum
- timers.append(
- bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
- pytorch_fp8_impl, "pytorch_fp8_fp8_bf16_scaled_mm"))
-
- # pytorch impl: bf16 output, with fp8 fast accum
- timers.append(
- bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
- pytorch_fp8_impl_fast_accum,
- "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum"))
-
- # pytorch impl: fp16 output, without fp8 fast accum
- timers.append(
- bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
- pytorch_fp8_impl, "pytorch_fp8_fp8_fp16_scaled_mm"))
-
- # pytorch impl: fp16 output, with fp8 fast accum
- timers.append(
- bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
- pytorch_fp8_impl_fast_accum,
- "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum"))
-
- # cutlass impl: bf16 output
- timers.append(
- bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
- cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm"))
- # cutlass impl: fp16 output
- timers.append(
- bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
- cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm"))
- return timers
-
-
-def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
- sub_label: str) -> Iterable[TMeasurement]:
- if dtype == torch.int8:
- return bench_int8(dtype, m, k, n, label, sub_label)
- if dtype == torch.float8_e4m3fn:
- return bench_fp8(dtype, m, k, n, label, sub_label)
- raise ValueError("unsupported type")
-
-
-# runner
-def print_timers(timers: Iterable[TMeasurement]):
- compare = TBenchmark.Compare(timers)
- compare.print()
-
-
-def run(dtype: torch.dtype,
- MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
-
- results = []
- for m, k, n in MKNs:
- timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
- f"MKN=({m}x{k}x{n})")
- print_timers(timers)
- results.extend(timers)
-
- return results
-
-
-# output makers
-def make_output(data: Iterable[TMeasurement],
- MKNs: Iterable[Tuple[int, int, int]],
- base_description: str,
- timestamp=None):
-
- print(f"== All Results {base_description} ====")
- print_timers(data)
-
- # pickle all the results
- timestamp = int(time.time()) if timestamp is None else timestamp
- with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
- pkl.dump(data, f)
-
-
-# argparse runners
-
-
-def run_square_bench(args):
- dim_sizes = list(
- range(args.dim_start, args.dim_end + 1, args.dim_increment))
- MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
- data = run(args.dtype, MKNs)
-
- make_output(data, MKNs, f"square_bench-{args.dtype}")
-
-
-def run_range_bench(args):
- dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
- n = len(dim_sizes)
- Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
- Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
- Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
- MKNs = list(zip(Ms, Ks, Ns))
- data = run(args.dtype, MKNs)
-
- make_output(data, MKNs, f"range_bench-{args.dtype}")
-
-
-def run_model_bench(args):
-
- print("Benchmarking models:")
- for i, model in enumerate(args.models):
- print(f"[{i}] {model}")
-
- def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
- KNs = []
- for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
- KN[tp_split_dim] = KN[tp_split_dim] // tp_size
- KNs.append(KN)
- return KNs
-
- model_bench_data = []
- models_tps = list(itertools.product(args.models, args.tp_sizes))
- for model, tp_size in models_tps:
- Ms = args.batch_sizes
- KNs = model_shapes(model, tp_size)
- MKNs = []
- for m in Ms:
- for k, n in KNs:
- MKNs.append((m, k, n))
-
- data = run(args.dtype, MKNs)
- model_bench_data.append(data)
-
- # Print all results
- for data, model_tp in zip(model_bench_data, models_tps):
- model, tp_size = model_tp
- print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
- print_timers(data)
-
- timestamp = int(time.time())
-
- all_data = []
- for d in model_bench_data:
- all_data.extend(d)
- # pickle all data
- with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
- pkl.dump(all_data, f)
-
-
-if __name__ == '__main__':
-
- def to_torch_dtype(dt):
- if dt == "int8":
- return torch.int8
- if dt == "fp8":
- return torch.float8_e4m3fn
- raise ValueError("unsupported dtype")
-
- parser = FlexibleArgumentParser(
- description="""
-Benchmark Cutlass GEMM.
-
- To run square GEMMs:
- python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
-
- To run constant N and K and sweep M:
- python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
-
- To run dimensions from a model:
- python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
-
- Output:
- - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
- """, # noqa: E501
- formatter_class=argparse.RawTextHelpFormatter)
-
- parser.add_argument("--dtype",
- type=to_torch_dtype,
- required=True,
- help="Available options are ['int8', 'fp8']")
- subparsers = parser.add_subparsers(dest="cmd")
-
- square_parser = subparsers.add_parser("square_bench")
- square_parser.add_argument("--dim-start", type=int, required=True)
- square_parser.add_argument("--dim-end", type=int, required=True)
- square_parser.add_argument("--dim-increment", type=int, required=True)
- square_parser.set_defaults(func=run_square_bench)
-
- range_parser = subparsers.add_parser("range_bench")
- range_parser.add_argument("--dim-start", type=int, required=True)
- range_parser.add_argument("--dim-end", type=int, required=True)
- range_parser.add_argument("--dim-increment", type=int, required=True)
- range_parser.add_argument("--m-constant", type=int, default=None)
- range_parser.add_argument("--n-constant", type=int, default=None)
- range_parser.add_argument("--k-constant", type=int, default=None)
- range_parser.set_defaults(func=run_range_bench)
-
- model_parser = subparsers.add_parser("model_bench")
- model_parser.add_argument("--models",
- nargs="+",
- type=str,
- default=DEFAULT_MODELS,
- choices=WEIGHT_SHAPES.keys())
- model_parser.add_argument("--tp-sizes",
- nargs="+",
- type=int,
- default=DEFAULT_TP_SIZES)
- model_parser.add_argument("--batch-sizes",
- nargs="+",
- type=int,
- default=DEFAULT_BATCH_SIZES)
- model_parser.set_defaults(func=run_model_bench)
-
- args = parser.parse_args()
- args.func(args)
diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py
deleted file mode 100644
index 25ec9d6028627..0000000000000
--- a/benchmarks/cutlass_benchmarks/weight_shapes.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Weight Shapes are in the format
-# ([K, N], TP_SPLIT_DIM)
-# Example:
-# A shape of ([14336, 4096], 0) indicates the following GEMM shape,
-# - TP1 : K = 14336, N = 4096
-# - TP2 : K = 7168, N = 4096
-# A shape of ([4096, 6144], 1) indicates the following GEMM shape,
-# - TP1 : K = 4096, N = 6144
-# - TP4 : K = 4096, N = 1536
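-# Illustrative sketch only (mirrors model_shapes() in w8a8_benchmarks.py):
-# the per-TP shape is obtained by integer-dividing the TP_SPLIT_DIM entry
-# of a weight shape by the TP size, e.g.
-#   kn = list(KN); kn[TP_SPLIT_DIM] //= tp_size
-#   ([14336, 4096], 0) with tp_size=2 -> [7168, 4096]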
-
-# TP1 shapes
-WEIGHT_SHAPES = {
- "mistralai/Mistral-7B-v0.1": [
- ([4096, 6144], 1),
- ([4096, 4096], 0),
- ([4096, 28672], 1),
- ([14336, 4096], 0),
- ],
- "meta-llama/Llama-2-7b-hf": [
- ([4096, 12288], 1),
- ([4096, 4096], 0),
- ([4096, 22016], 1),
- ([11008, 4096], 0),
- ],
- "meta-llama/Llama-3-8b": [
- ([4096, 6144], 1),
- ([4096, 4096], 0),
- ([4096, 28672], 1),
- ([14336, 4096], 0),
- ],
- "meta-llama/Llama-2-13b-hf": [
- ([5120, 15360], 1),
- ([5120, 5120], 0),
- ([5120, 27648], 1),
- ([13824, 5120], 0),
- ],
- "meta-llama/Llama-2-70b-hf": [
- ([8192, 10240], 1),
- ([8192, 8192], 0),
- ([8192, 57344], 1),
- ([28672, 8192], 0),
- ],
-}
diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py
index 601c4ea439aea..59392947b15c8 100644
--- a/benchmarks/kernels/benchmark_aqlm.py
+++ b/benchmarks/kernels/benchmark_aqlm.py
@@ -1,3 +1,4 @@
+import argparse
import os
import sys
from typing import Optional
@@ -9,7 +10,6 @@
from vllm.model_executor.layers.quantization.aqlm import (
dequantize_weight, generic_dequantize_gemm, get_int_dtype,
optimized_dequantize_gemm)
-from vllm.utils import FlexibleArgumentParser
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
@@ -86,9 +86,9 @@ def dequant_no_scale(
# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
# the generic pytorch version.
# Just visual comparison.
-def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
+def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
- n = int(parts.sum().item())
+ n = parts.sum().item()
device = torch.device('cuda:0')
@@ -137,7 +137,7 @@ def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
def main():
- parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
+ parser = argparse.ArgumentParser(description="Benchmark aqlm performance.")
# Add arguments
parser.add_argument("--nbooks",
@@ -204,7 +204,7 @@ def main():
sys.stdout = sys.__stdout__
-def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
+def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
methods):
# I didn't see visible improvements from increasing these, but feel free :)
@@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
print('')
-def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
+def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
nbooks: int, bits: int, method) -> float:
- n = int(parts.sum().item())
+ n = parts.sum().item()
device = torch.device('cuda:0')
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
deleted file mode 100644
index 261f5829631ee..0000000000000
--- a/benchmarks/kernels/benchmark_marlin.py
+++ /dev/null
@@ -1,235 +0,0 @@
-from typing import List
-
-import torch
-import torch.utils.benchmark as benchmark
-from benchmark_shapes import WEIGHT_SHAPES
-
-from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.gptq_marlin import (
- GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
- GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
-from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
- GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
- GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
-from vllm.model_executor.layers.quantization.utils.marlin_utils import (
- MarlinWorkspace, marlin_24_quantize, marlin_quantize)
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
- gptq_pack, quantize_weights, sort_weights)
-from vllm.utils import FlexibleArgumentParser
-
-DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
-DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
-
-ACT_ORDER_OPTS = [False, True]
-K_FULL_OPTS = [False, True]
-
-
-def bench_run(results: List[benchmark.Measurement], model: str,
- act_order: bool, is_k_full: bool, num_bits: int, group_size: int,
- size_m: int, size_k: int, size_n: int):
- label = "Quant Matmul"
-
- sub_label = ("{}, act={} k_full={}, b={}, g={}, "
- "MKN=({}x{}x{})".format(model, act_order, is_k_full, num_bits,
- group_size, size_m, size_k, size_n))
-
- print(f"Testing: {sub_label}")
-
- a = torch.randn(size_m, size_k).to(torch.half).cuda()
- b = torch.rand(size_k, size_n).to(torch.half).cuda()
-
- a_tmp = (torch.zeros(size_m, size_k).to(torch.half).cuda())
-
- # Marlin quant
- (
- marlin_w_ref,
- marlin_q_w,
- marlin_s,
- marlin_g_idx,
- marlin_sort_indices,
- marlin_rand_perm,
- ) = marlin_quantize(b, num_bits, group_size, act_order)
-
- # Marlin_24 quant
- (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta,
- marlin_24_s) = marlin_24_quantize(b, num_bits, group_size)
-
- # GPTQ quant
- (w_ref, q_w, s, g_idx,
- rand_perm) = quantize_weights(b, num_bits, group_size, act_order)
- q_w_gptq = gptq_pack(q_w, num_bits, size_k, size_n)
-
- # For act_order, sort the "weights" and "g_idx"
- # so that group ids are increasing
- repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
- if act_order:
- (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
-
- # Prepare
- marlin_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
- GPTQ_MARLIN_MAX_PARALLEL)
-
- marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
- GPTQ_MARLIN_24_MAX_PARALLEL)
-
- globals = {
- # Gen params
- "num_bits": num_bits,
- "group_size": group_size,
- "size_m": size_m,
- "size_n": size_n,
- "size_k": size_k,
- "a": a,
- "a_tmp": a_tmp,
- # Marlin params
- "marlin_w_ref": marlin_w_ref,
- "marlin_q_w": marlin_q_w,
- "marlin_s": marlin_s,
- "marlin_g_idx": marlin_g_idx,
- "marlin_sort_indices": marlin_sort_indices,
- "marlin_rand_perm": marlin_rand_perm,
- "marlin_workspace": marlin_workspace,
- "is_k_full": is_k_full,
- # Marlin_24 params
- "marlin_24_w_ref": marlin_24_w_ref,
- "marlin_24_q_w_comp": marlin_24_q_w_comp,
- "marlin_24_meta": marlin_24_meta,
- "marlin_24_s": marlin_24_s,
- "marlin_24_workspace": marlin_24_workspace,
- # GPTQ params
- "q_w_gptq": q_w_gptq,
- "repack_sort_indices": repack_sort_indices,
- # Kernels
- "gptq_marlin_gemm": ops.gptq_marlin_gemm,
- "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
- "gptq_marlin_repack": ops.gptq_marlin_repack,
- }
-
- min_run_time = 1
-
- # Warmup pytorch
- for i in range(5):
- torch.matmul(a, marlin_w_ref)
-
- results.append(
- benchmark.Timer(
- stmt="torch.matmul(a, marlin_w_ref)",
- globals=globals,
- label=label,
- sub_label=sub_label,
- description="pytorch_gemm",
- ).blocked_autorange(min_run_time=min_run_time))
-
- results.append(
- benchmark.Timer(
- stmt=
- "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)", # noqa: E501
- globals=globals,
- label=label,
- sub_label=sub_label,
- description="gptq_marlin_gemm",
- ).blocked_autorange(min_run_time=min_run_time))
-
- if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
- and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES):
- results.append(
- benchmark.Timer(
- stmt=
- "output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, num_bits, size_m, size_n, size_k)", # noqa: E501
- globals=globals,
- label=label,
- sub_label=sub_label,
- description="gptq_marlin_24_gemm",
- ).blocked_autorange(min_run_time=min_run_time))
-
- results.append(
- benchmark.Timer(
- stmt=
- "q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, num_bits)", # noqa: E501
- globals=globals,
- label=label,
- sub_label=sub_label,
- description="gptq_marlin_repack",
- ).blocked_autorange(min_run_time=min_run_time))
-
-
-def main(args):
- print("Benchmarking models:")
- for i, model in enumerate(args.models):
- print(f"[{i}] {model}")
-
- results: List[benchmark.Measurement] = []
-
- for model in args.models:
- for layer in WEIGHT_SHAPES[model]:
- size_k = layer[0]
- size_n = layer[1]
-
- if len(args.limit_k) > 0 and size_k not in args.limit_k:
- continue
-
- if len(args.limit_n) > 0 and size_n not in args.limit_n:
- continue
-
- for act_order in ACT_ORDER_OPTS:
- if len(args.limit_act_order
- ) > 0 and act_order not in args.limit_act_order:
- continue
-
- for is_k_full in K_FULL_OPTS:
- if len(args.limit_k_full
- ) > 0 and is_k_full not in args.limit_k_full:
- continue
-
- for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
- if len(args.limit_num_bits
- ) > 0 and num_bits not in args.limit_num_bits:
- continue
-
- for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
- if len(
- args.limit_group_size
- ) > 0 and group_size not in args.limit_group_size:
- continue
-
- # For act_order, the group_size must be less than
- # size_k
- if act_order and (group_size == size_k
- or group_size == -1):
- continue
-
- for size_m in args.batch_sizes:
- bench_run(results, model, act_order, is_k_full,
- num_bits, group_size, size_m, size_k,
- size_n)
-
- compare = benchmark.Compare(results)
- compare.print()
-
-
-# For quick benchmarking use:
-# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
-#
-if __name__ == "__main__":
- parser = FlexibleArgumentParser(
- description="Benchmark Marlin across specified models/shapes/batches")
- parser.add_argument(
- "--models",
- nargs="+",
- type=str,
- default=DEFAULT_MODELS,
- choices=WEIGHT_SHAPES.keys(),
- )
- parser.add_argument("--batch-sizes",
- nargs="+",
- type=int,
- default=DEFAULT_BATCH_SIZES)
- parser.add_argument("--limit-k", nargs="+", type=int, default=[])
- parser.add_argument("--limit-n", nargs="+", type=int, default=[])
- parser.add_argument("--limit-group-size", nargs="+", type=int, default=[])
- parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[])
- parser.add_argument("--limit-act-order", nargs="+", type=int, default=[])
- parser.add_argument("--limit-k-full", nargs="+", type=int, default=[])
-
- args = parser.parse_args()
- main(args)
diff --git a/benchmarks/kernels/benchmark_mixtral_moe.py b/benchmarks/kernels/benchmark_mixtral_moe.py
new file mode 100644
index 0000000000000..5280b214144c9
--- /dev/null
+++ b/benchmarks/kernels/benchmark_mixtral_moe.py
@@ -0,0 +1,215 @@
+import argparse
+import json
+import os
+import sys
+
+import torch
+import torch.nn.functional as F
+import triton
+from tqdm import tqdm
+
+from vllm.model_executor.layers.fused_moe import (fused_moe,
+ get_config_file_name)
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+def main(dtype: str):
+ method = fused_moe
+ for bs in [
+ 1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
+ 2048, 3072, 4096
+ ]:
+ run_grid(bs, method=method, dtype=dtype)
+
+
+def run_grid(bs, method, dtype: str):
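+ # Sizes below correspond to Mixtral-8x7B (hidden size 4096, 8 experts,
+ # top-2 routing, FFN size 14336); tp_size=2 assumes the expert FFN is
+ # sharded across two GPUs.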
+ d_model = 4096
+ num_total_experts = 8
+ top_k = 2
+ tp_size = 2
+ model_intermediate_size = 14336
+ num_layers = 32
+ num_calls = 100
+
+ num_warmup_trials = 1
+ num_trials = 1
+
+ configs = []
+
+ for block_size_n in [32, 64, 128, 256]:
+ for block_size_m in [16, 32, 64, 128, 256]:
+ for block_size_k in [64, 128, 256]:
+ for group_size_m in [1, 16, 32, 64]:
+ for num_warps in [4, 8]:
+ for num_stages in [2, 3, 4, 5]:
+ configs.append({
+ "BLOCK_SIZE_M": block_size_m,
+ "BLOCK_SIZE_N": block_size_n,
+ "BLOCK_SIZE_K": block_size_k,
+ "GROUP_SIZE_M": group_size_m,
+ "num_warps": num_warps,
+ "num_stages": num_stages,
+ })
+
+ best_config = None
+ best_time_us = 1e20
+
+ print(f'{tp_size=} {bs=}')
+
+ for config in tqdm(configs):
+ # warmup
+ try:
+ for _ in range(num_warmup_trials):
+ run_timing(
+ num_calls=num_calls,
+ bs=bs,
+ d_model=d_model,
+ num_total_experts=num_total_experts,
+ top_k=top_k,
+ tp_size=tp_size,
+ model_intermediate_size=model_intermediate_size,
+ method=method,
+ config=config,
+ dtype=dtype,
+ )
+ except triton.runtime.autotuner.OutOfResources:
+ continue
+
+ # trial
+ for _ in range(num_trials):
+ kernel_dur_ms = run_timing(
+ num_calls=num_calls,
+ bs=bs,
+ d_model=d_model,
+ num_total_experts=num_total_experts,
+ top_k=top_k,
+ tp_size=tp_size,
+ model_intermediate_size=model_intermediate_size,
+ method=method,
+ config=config,
+ dtype=dtype,
+ )
+
+ kernel_dur_us = 1000 * kernel_dur_ms
+ model_dur_ms = kernel_dur_ms * num_layers
+
+ if kernel_dur_us < best_time_us:
+ best_config = config
+ best_time_us = kernel_dur_us
+
+ tqdm.write(
+ f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
+ f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
+ f'{d_model=} {model_intermediate_size=} {num_layers=}')
+
+ print("best_time_us", best_time_us)
+ print("best_config", best_config)
+
+ # holds Dict[str, Dict[str, int]]
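+ # e.g. {"64": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128,
+ #       "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}, ...},
+ # mapping a batch size to its best config (the values above are illustrative).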
+ filename = get_config_file_name(num_total_experts,
+ model_intermediate_size // tp_size,
+ "float8" if dtype == "float8" else None)
+ print(f"writing config to file {filename}")
+ existing_content = {}
+ if os.path.exists(filename):
+ with open(filename, "r") as f:
+ existing_content = json.load(f)
+ existing_content[str(bs)] = best_config
+ with open(filename, "w") as f:
+ json.dump(existing_content, f, indent=4)
+ f.write("\n")
+
+
+def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
+ top_k: int, tp_size: int, model_intermediate_size: int, method,
+ config, dtype: str) -> float:
+ shard_intermediate_size = model_intermediate_size // tp_size
+
+ hidden_states = torch.rand(
+ (bs, d_model),
+ device="cuda:0",
+ dtype=torch.float16,
+ )
+
+ w1 = torch.rand(
+ (num_total_experts, 2 * shard_intermediate_size, d_model),
+ device=hidden_states.device,
+ dtype=hidden_states.dtype,
+ )
+
+ w2 = torch.rand(
+ (num_total_experts, d_model, shard_intermediate_size),
+ device=hidden_states.device,
+ dtype=hidden_states.dtype,
+ )
+
+ w1_scale = None
+ w2_scale = None
+ a1_scale = None
+ a2_scale = None
+
+ if dtype == "float8":
+ w1 = w1.to(torch.float8_e4m3fn)
+ w2 = w2.to(torch.float8_e4m3fn)
+ w1_scale = torch.ones(num_total_experts,
+ device=hidden_states.device,
+ dtype=torch.float32)
+ w2_scale = torch.ones(num_total_experts,
+ device=hidden_states.device,
+ dtype=torch.float32)
+ a1_scale = torch.ones(1,
+ device=hidden_states.device,
+ dtype=torch.float32)
+ a2_scale = torch.ones(1,
+ device=hidden_states.device,
+ dtype=torch.float32)
+
+ gating_output = F.softmax(torch.rand(
+ (num_calls, bs, num_total_experts),
+ device=hidden_states.device,
+ dtype=torch.float32,
+ ),
+ dim=-1)
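+ # One softmax-normalized routing matrix per timed call, so each iteration
+ # of the loop below runs fused_moe with fresh expert-routing weights.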
+
+ start_event = torch.cuda.Event(enable_timing=True)
+ end_event = torch.cuda.Event(enable_timing=True)
+
+ start_event.record()
+ for i in range(num_calls):
+ hidden_states = method(
+ hidden_states=hidden_states,
+ w1=w1,
+ w2=w2,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ gating_output=gating_output[i],
+ topk=2,
+ renormalize=True,
+ inplace=True,
+ override_config=config,
+ use_fp8=dtype == "float8",
+ )
+ end_event.record()
+ end_event.synchronize()
+
+ dur_ms = start_event.elapsed_time(end_event) / num_calls
+ return dur_ms
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ prog='benchmark_mixtral_moe',
+ description='Benchmark and tune the fused_moe kernel',
+ )
+ parser.add_argument(
+ '--dtype',
+ type=str,
+ default='auto',
+ choices=['float8', 'float16'],
+ help='Data type used for fused_moe kernel computations',
+ )
+ args = parser.parse_args()
+ sys.exit(main(args.dtype))
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
deleted file mode 100644
index e00696d6d43cb..0000000000000
--- a/benchmarks/kernels/benchmark_moe.py
+++ /dev/null
@@ -1,333 +0,0 @@
-import argparse
-import time
-from datetime import datetime
-from typing import Any, Dict, List, Tuple, TypedDict
-
-import ray
-import torch
-import triton
-from ray.experimental.tqdm_ray import tqdm
-from transformers import AutoConfig
-
-from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.utils import FlexibleArgumentParser
-
-
-class BenchmarkConfig(TypedDict):
- BLOCK_SIZE_M: int
- BLOCK_SIZE_N: int
- BLOCK_SIZE_K: int
- GROUP_SIZE_M: int
- num_warps: int
- num_stages: int
-
-
-def benchmark_config(
- config: BenchmarkConfig,
- num_tokens: int,
- num_experts: int,
- shard_intermediate_size: int,
- hidden_size: int,
- topk: int,
- dtype: torch.dtype,
- use_fp8: bool,
- num_iters: int = 100,
-) -> float:
- init_dtype = torch.float16 if use_fp8 else dtype
- x = torch.randn(num_tokens, hidden_size, dtype=dtype)
- w1 = torch.randn(num_experts,
- shard_intermediate_size,
- hidden_size,
- dtype=init_dtype)
- w2 = torch.randn(num_experts,
- hidden_size,
- shard_intermediate_size // 2,
- dtype=init_dtype)
- gating_output = torch.randn(num_iters,
- num_tokens,
- num_experts,
- dtype=torch.float32)
-
- w1_scale = None
- w2_scale = None
- a1_scale = None
- a2_scale = None
- if use_fp8:
- w1_scale = torch.randn(num_experts, dtype=torch.float32)
- w2_scale = torch.randn(num_experts, dtype=torch.float32)
- a1_scale = torch.randn(1, dtype=torch.float32)
- a2_scale = torch.randn(1, dtype=torch.float32)
-
- w1 = w1.to(torch.float8_e4m3fn)
- w2 = w2.to(torch.float8_e4m3fn)
-
- input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
-
- def prepare(i: int):
- input_gating.copy_(gating_output[i])
-
- def run():
- fused_moe(
- x,
- w1,
- w2,
- input_gating,
- topk,
- renormalize=True,
- inplace=True,
- override_config=config,
- use_fp8=use_fp8,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a1_scale,
- a2_scale=a2_scale,
- )
-
- # JIT compilation & warmup
- run()
- torch.cuda.synchronize()
-
- # Capture 10 invocations with CUDA graph
- graph = torch.cuda.CUDAGraph()
- with torch.cuda.graph(graph):
- for _ in range(10):
- run()
- torch.cuda.synchronize()
-
- # Warmup
- for _ in range(5):
- graph.replay()
- torch.cuda.synchronize()
-
- start_event = torch.cuda.Event(enable_timing=True)
- end_event = torch.cuda.Event(enable_timing=True)
-
- latencies: List[float] = []
- for i in range(num_iters):
- prepare(i)
- torch.cuda.synchronize()
-
- start_event.record()
- graph.replay()
- end_event.record()
- end_event.synchronize()
- latencies.append(start_event.elapsed_time(end_event))
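- # Each graph.replay() executes the 10 captured run() calls, so the average
- # below divides by num_iters * 10; the final * 1000 converts ms to us.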
- avg = sum(latencies) / (num_iters * 10) * 1000 # us
- graph.reset()
- return avg
-
-
-def get_configs_compute_bound() -> List[Dict[str, int]]:
- # Reduced search space for faster tuning.
- # TODO(woosuk): Increase the search space and use a performance model to
- # prune the search space.
- configs: List[BenchmarkConfig] = []
- for num_stages in [2, 3, 4, 5]:
- for block_m in [16, 32, 64, 128, 256]:
- for block_k in [64, 128, 256]:
- for block_n in [32, 64, 128, 256]:
- for num_warps in [4, 8]:
- for group_size in [1, 16, 32, 64]:
- configs.append({
- "BLOCK_SIZE_M": block_m,
- "BLOCK_SIZE_N": block_n,
- "BLOCK_SIZE_K": block_k,
- "GROUP_SIZE_M": group_size,
- "num_warps": num_warps,
- "num_stages": num_stages,
- })
- return configs
-
-
-@ray.remote(num_gpus=1)
-class BenchmarkWorker:
-
- def __init__(self, seed: int) -> None:
- torch.set_default_device("cuda")
- torch.cuda.manual_seed_all(seed)
- self.seed = seed
-
- def benchmark(
- self,
- num_tokens: int,
- num_experts: int,
- shard_intermediate_size: int,
- hidden_size: int,
- topk: int,
- dtype: torch.dtype,
- use_fp8: bool,
- ) -> Tuple[Dict[str, int], float]:
- torch.cuda.manual_seed_all(self.seed)
-
- dtype_str = "float8" if use_fp8 else None
- # NOTE(woosuk): The current naming convention uses w2.shape[2], which
- # is the intermediate size after silu_and_mul.
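- # (e.g. with the default Mixtral-8x7B model and tp_size=2,
- # shard_intermediate_size is 14336, so the config file key uses 7168.)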
- op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
- dtype_str)
- if op_config is None:
- config = get_default_config(num_tokens, num_experts,
- shard_intermediate_size, hidden_size,
- topk, dtype_str)
- else:
- config = op_config[min(op_config.keys(),
- key=lambda x: abs(x - num_tokens))]
- kernel_time = benchmark_config(config, num_tokens, num_experts,
- shard_intermediate_size, hidden_size,
- topk, dtype, use_fp8)
- return config, kernel_time
-
- def tune(
- self,
- num_tokens: int,
- num_experts: int,
- shard_intermediate_size: int,
- hidden_size: int,
- topk: int,
- dtype: torch.dtype,
- use_fp8: bool,
- search_space: List[BenchmarkConfig],
- ) -> BenchmarkConfig:
- best_config = None
- best_time = float("inf")
- for config in tqdm(search_space):
- try:
- kernel_time = benchmark_config(config,
- num_tokens,
- num_experts,
- shard_intermediate_size,
- hidden_size,
- topk,
- dtype,
- use_fp8,
- num_iters=10)
- except triton.runtime.autotuner.OutOfResources:
- # Some configurations may be invalid and fail to compile.
- continue
-
- if kernel_time < best_time:
- best_time = kernel_time
- best_config = config
- now = datetime.now()
- print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
- assert best_config is not None
- return best_config
-
-
-def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
- return {
- "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
- "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
- "BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
- "GROUP_SIZE_M": config["GROUP_SIZE_M"],
- "num_warps": config["num_warps"],
- "num_stages": config["num_stages"],
- }
-
-
-def save_configs(
- configs: Dict[int, BenchmarkConfig],
- num_experts: int,
- shard_intermediate_size: int,
- hidden_size: int,
- topk: int,
- dtype: torch.dtype,
- use_fp8: bool,
-) -> None:
- dtype_str = "float8" if use_fp8 else None
- # NOTE(woosuk): The current naming convention uses w2.shape[2], which
- # is the intermediate size after silu_and_mul.
- filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
- dtype_str)
- print(f"Writing best config to {filename}...")
- with open(filename, "w") as f:
- json.dump(configs, f, indent=4)
- f.write("\n")
-
-
-def main(args: argparse.Namespace):
- print(args)
-
- config = AutoConfig.from_pretrained(args.model)
- if config.architectures[0] == "DbrxForCausalLM":
- E = config.ffn_config.moe_num_experts
- topk = config.ffn_config.moe_top_k
- intermediate_size = config.ffn_config.ffn_hidden_size
- shard_intermediate_size = 2 * intermediate_size // args.tp_size
- else:
- # Default: Mixtral.
- E = config.num_local_experts
- topk = config.num_experts_per_tok
- intermediate_size = config.intermediate_size
- shard_intermediate_size = 2 * intermediate_size // args.tp_size
-
- hidden_size = config.hidden_size
- dtype = config.torch_dtype
- use_fp8 = args.dtype == "fp8"
-
- if args.batch_size is None:
- batch_sizes = [
- 1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
- 2048, 3072, 4096
- ]
- else:
- batch_sizes = [args.batch_size]
-
- ray.init()
- num_gpus = int(ray.available_resources()["GPU"])
- workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
-
- def _distribute(method: str, inputs: List[Any]) -> List[Any]:
- outputs = []
- worker_idx = 0
- for input_args in inputs:
- worker = workers[worker_idx]
- worker_method = getattr(worker, method)
- output = worker_method.remote(*input_args)
- outputs.append(output)
- worker_idx = (worker_idx + 1) % num_gpus
- return ray.get(outputs)
-
- if args.tune:
- search_space = get_configs_compute_bound()
- print(f"Start tuning over {len(search_space)} configurations...")
-
- start = time.time()
- configs = _distribute(
- "tune", [(batch_size, E, shard_intermediate_size, hidden_size,
- topk, dtype, use_fp8, search_space)
- for batch_size in batch_sizes])
- best_configs = {
- M: sort_config(config)
- for M, config in zip(batch_sizes, configs)
- }
- save_configs(best_configs, E, shard_intermediate_size, hidden_size,
- topk, dtype, use_fp8)
- end = time.time()
- print(f"Tuning took {end - start:.2f} seconds")
- else:
- outputs = _distribute("benchmark",
- [(batch_size, E, shard_intermediate_size,
- hidden_size, topk, dtype, use_fp8)
- for batch_size in batch_sizes])
-
- for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
- print(f"Batch size: {batch_size}, config: {config}")
- print(f"Kernel time: {kernel_time:.2f} us")
-
-
-if __name__ == "__main__":
- parser = FlexibleArgumentParser()
- parser.add_argument("--model",
- type=str,
- default="mistralai/Mixtral-8x7B-Instruct-v0.1")
- parser.add_argument("--tp-size", "-tp", type=int, default=2)
- parser.add_argument("--dtype",
- type=str,
- choices=["auto", "fp8"],
- default="auto")
- parser.add_argument("--seed", type=int, default=0)
- parser.add_argument("--batch-size", type=int, required=False)
- parser.add_argument("--tune", action="store_true")
- args = parser.parse_args()
-
- main(args)
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 16de60477c305..ca7967c1ab0d2 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -1,12 +1,12 @@
+import argparse
import random
import time
-from typing import List, Optional
+from typing import Optional
import torch
from vllm import _custom_ops as ops
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
- create_kv_caches_with_random)
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
NUM_BLOCKS = 1024
PARTITION_SIZE = 512
@@ -54,17 +54,14 @@ def main(
# Create the block tables.
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
- block_tables_lst: List[List[int]] = []
+ block_tables = []
for _ in range(num_seqs):
block_table = [
random.randint(0, NUM_BLOCKS - 1)
for _ in range(max_num_blocks_per_seq)
]
- block_tables_lst.append(block_table)
-
- block_tables = torch.tensor(block_tables_lst,
- dtype=torch.int,
- device=device)
+ block_tables.append(block_table)
+ block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)
# Create the KV cache.
key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
@@ -161,19 +158,19 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
if __name__ == '__main__':
- parser = FlexibleArgumentParser(
+ parser = argparse.ArgumentParser(
description="Benchmark the paged attention kernel.")
parser.add_argument("--version",
type=str,
choices=["v1", "v2"],
default="v2")
parser.add_argument("--batch-size", type=int, default=8)
- parser.add_argument("--seq-len", type=int, default=4096)
+ parser.add_argument("--seq_len", type=int, default=4096)
parser.add_argument("--num-query-heads", type=int, default=64)
parser.add_argument("--num-kv-heads", type=int, default=8)
parser.add_argument("--head-size",
type=int,
- choices=[64, 80, 96, 112, 128, 192, 256],
+ choices=[64, 80, 96, 112, 128, 256],
default=128)
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
parser.add_argument("--use-alibi", action="store_true")
@@ -186,11 +183,13 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
parser.add_argument(
"--kv-cache-dtype",
type=str,
- choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"],
+ choices=["auto", "fp8"],
default="auto",
- help="Data type for kv cache storage. If 'auto', will use model "
- "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
- "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)")
+ help=
+ 'Data type for kv cache storage. If "auto", will use model data type. '
+ 'FP8_E5M2 (without scaling) is only supported on CUDA versions greater '
+ 'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is supported instead for '
+ 'common inference criteria.')
args = parser.parse_args()
print(args)
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index 78736c7a7ba6f..9188e811e2982 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -1,12 +1,11 @@
+import argparse
from itertools import accumulate
-from typing import List, Optional
+from typing import Optional
import nvtx
import torch
-from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
- get_rope)
-from vllm.utils import FlexibleArgumentParser
+from vllm.model_executor.layers.rotary_embedding import get_rope
def benchmark_rope_kernels_multi_lora(
@@ -38,7 +37,7 @@ def benchmark_rope_kernels_multi_lora(
})
# non-batched RoPE takes only one scaling factor, we create multiple
# instances to simulate the same behavior
- non_batched_ropes: List[RotaryEmbedding] = []
+ non_batched_ropes = []
for scaling_factor in scaling_factors:
non_batched_ropes.append(
get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
@@ -86,7 +85,7 @@ def benchmark_rope_kernels_multi_lora(
if __name__ == '__main__':
- parser = FlexibleArgumentParser(
+ parser = argparse.ArgumentParser(
description="Benchmark the rotary embedding kernels.")
parser.add_argument("--is-neox-style", type=bool, default=True)
parser.add_argument("--batch-size", type=int, default=16)
@@ -94,7 +93,7 @@ def benchmark_rope_kernels_multi_lora(
parser.add_argument("--num-heads", type=int, default=8)
parser.add_argument("--head-size",
type=int,
- choices=[64, 80, 96, 112, 128, 192, 256],
+ choices=[64, 80, 96, 112, 128, 256],
default=128)
parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
parser.add_argument("--dtype",
diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py
deleted file mode 100644
index 4eeeca35a37cc..0000000000000
--- a/benchmarks/kernels/benchmark_shapes.py
+++ /dev/null
@@ -1,75 +0,0 @@
-WEIGHT_SHAPES = {
- "ideal": [[4 * 256 * 32, 256 * 32]],
- "mistralai/Mistral-7B-v0.1/TP1": [
- [4096, 6144],
- [4096, 4096],
- [4096, 28672],
- [14336, 4096],
- ],
- "mistralai/Mistral-7B-v0.1/TP2": [
- [4096, 3072],
- [2048, 4096],
- [4096, 14336],
- [7168, 4096],
- ],
- "mistralai/Mistral-7B-v0.1/TP4": [
- [4096, 1536],
- [1024, 4096],
- [4096, 7168],
- [3584, 4096],
- ],
- "meta-llama/Llama-2-7b-hf/TP1": [
- [4096, 12288],
- [4096, 4096],
- [4096, 22016],
- [11008, 4096],
- ],
- "meta-llama/Llama-2-7b-hf/TP2": [
- [4096, 6144],
- [2048, 4096],
- [4096, 11008],
- [5504, 4096],
- ],
- "meta-llama/Llama-2-7b-hf/TP4": [
- [4096, 3072],
- [1024, 4096],
- [4096, 5504],
- [2752, 4096],
- ],
- "meta-llama/Llama-2-13b-hf/TP1": [
- [5120, 15360],
- [5120, 5120],
- [5120, 27648],
- [13824, 5120],
- ],
- "meta-llama/Llama-2-13b-hf/TP2": [
- [5120, 7680],
- [2560, 5120],
- [5120, 13824],
- [6912, 5120],
- ],
- "meta-llama/Llama-2-13b-hf/TP4": [
- [5120, 3840],
- [1280, 5120],
- [5120, 6912],
- [3456, 5120],
- ],
- "meta-llama/Llama-2-70b-hf/TP1": [
- [8192, 10240],
- [8192, 8192],
- [8192, 57344],
- [28672, 8192],
- ],
- "meta-llama/Llama-2-70b-hf/TP2": [
- [8192, 5120],
- [4096, 8192],
- [8192, 28672],
- [14336, 8192],
- ],
- "meta-llama/Llama-2-70b-hf/TP4": [
- [8192, 2560],
- [2048, 8192],
- [8192, 14336],
- [7168, 8192],
- ],
-}
diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh
index f491c90d0683e..64d3c4f4b3889 100755
--- a/benchmarks/launch_tgi_server.sh
+++ b/benchmarks/launch_tgi_server.sh
@@ -4,7 +4,7 @@ PORT=8000
MODEL=$1
TOKENS=$2
-docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
+docker run --gpus all --shm-size 1g -p $PORT:80 \
-v $PWD/data:/data \
ghcr.io/huggingface/text-generation-inference:1.4.0 \
--model-id $MODEL \
diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py
deleted file mode 100644
index 203699e9a8d06..0000000000000
--- a/benchmarks/overheads/benchmark_hashing.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import cProfile
-import pstats
-
-from vllm import LLM, SamplingParams
-from vllm.utils import FlexibleArgumentParser
-
-# A very long prompt, total number of tokens is about 15k.
-LONG_PROMPT = ["You are an expert in large language models, aren't you?"
- ] * 1000
-LONG_PROMPT = ' '.join(LONG_PROMPT)
-
-
-def main(args):
- llm = LLM(
- model=args.model,
- enforce_eager=True,
- enable_prefix_caching=True,
- tensor_parallel_size=args.tensor_parallel_size,
- use_v2_block_manager=args.use_v2_block_manager,
- )
-
- sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
- profiler = cProfile.Profile()
-
- print("------warm up------")
- for i in range(3):
- output = llm.generate(LONG_PROMPT, sampling_params)
- print(output[0].outputs[0].text)
-
- print("------start generating------")
- for i in range(3):
- profiler.runctx('llm.generate(LONG_PROMPT, sampling_params)',
- globals(), locals())
-
- # analyze the runtime of hashing function
- stats = pstats.Stats(profiler)
- stats.sort_stats('cumulative')
- total_time = 0
- total_calls = 0
- for func in stats.stats:
- if 'hash_of_block' in func[2]:
- total_time = stats.stats[func][3]
- total_calls = stats.stats[func][0]
- percentage = (total_time / stats.total_tt) * 100
- print(f"Hashing took {total_time:.2f} seconds,"
- f"{percentage:.2f}% of the total runtime.")
-
-
-if __name__ == "__main__":
- parser = FlexibleArgumentParser(
- description='Benchmark the performance of hashing function in'
- 'automatic prefix caching.')
- parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
- parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
- parser.add_argument('--output-len', type=int, default=10)
- parser.add_argument('--enable-prefix-caching',
- action='store_true',
- help='enable prefix caching')
- parser.add_argument('--use-v2-block-manager',
- action='store_true',
- help='Use BlockSpaceMangerV2')
- args = parser.parse_args()
- main(args)
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 690559ee265e9..0cf37769a6960 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -12,7 +12,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
#
# Check the compile flags
#
-list(APPEND CXX_COMPILE_FLAGS
+list(APPEND CXX_COMPILE_FLAGS
"-fopenmp"
"-DVLLM_CPU_EXTENSION")
@@ -33,23 +33,9 @@ function (find_isa CPUINFO TARGET OUT)
endif()
endfunction()
-function (is_avx512_disabled OUT)
- set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512})
- if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true")
- set(${OUT} ON PARENT_SCOPE)
- else()
- set(${OUT} OFF PARENT_SCOPE)
- endif()
-endfunction()
-
-is_avx512_disabled(AVX512_DISABLED)
-
-find_isa(${CPUINFO} "avx2" AVX2_FOUND)
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
-find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
-find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
-if (AVX512_FOUND AND NOT AVX512_DISABLED)
+if (AVX512_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-mavx512f"
"-mavx512vl"
@@ -58,8 +44,8 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND)
if (AVX512BF16_FOUND OR ENABLE_AVX512BF16)
- if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
- CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
+ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
+ CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16")
else()
message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3")
@@ -67,18 +53,8 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
else()
message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
endif()
-elseif (AVX2_FOUND)
- list(APPEND CXX_COMPILE_FLAGS "-mavx2")
- message(WARNING "vLLM CPU backend using AVX2 ISA")
-elseif (POWER9_FOUND OR POWER10_FOUND)
- message(STATUS "PowerPC detected")
- # Check for PowerPC VSX support
- list(APPEND CXX_COMPILE_FLAGS
- "-mvsx"
- "-mcpu=native"
- "-mtune=native")
else()
- message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
+ message(FATAL_ERROR "vLLM CPU backend requires AVX512 ISA support.")
endif()
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
@@ -97,7 +73,7 @@ set(VLLM_EXT_SRC
"csrc/cpu/cache.cpp"
"csrc/cpu/layernorm.cpp"
"csrc/cpu/pos_encoding.cpp"
- "csrc/cpu/torch_bindings.cpp")
+ "csrc/cpu/pybind.cpp")
define_gpu_extension_target(
_C
@@ -105,10 +81,10 @@ define_gpu_extension_target(
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
- USE_SABI 3
- WITH_SOABI
+ WITH_SOABI
)
add_custom_target(default)
message(STATUS "Enabling C extension.")
add_dependencies(default _C)
+
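The hunk above drops the AVX2 and PowerPC fallbacks and the VLLM_CPU_DISABLE_AVX512 escape hatch. As a rough, hedged illustration only, the removed find_isa()/is_avx512_disabled() selection behaves like the sketch below; reading /proc/cpuinfo is a Linux-only assumption, only the two AVX512 sub-flags visible in the hunk are listed, and the PowerPC branch is omitted.

import os


def read_cpu_flags(path: str = "/proc/cpuinfo") -> set:
    """Collect the feature flags reported by the CPU (Linux assumption)."""
    flags = set()
    try:
        with open(path) as f:
            for line in f:
                if line.lower().startswith("flags"):
                    flags.update(line.split(":", 1)[1].split())
    except OSError:
        pass
    return flags


def pick_isa_flags(cpu_flags: set) -> list:
    avx512_disabled = os.environ.get("VLLM_CPU_DISABLE_AVX512") == "true"
    if "avx512f" in cpu_flags and not avx512_disabled:
        return ["-mavx512f", "-mavx512vl"]  # plus the remaining AVX512 sub-flags
    if "avx2" in cpu_flags:
        return ["-mavx2"]
    raise RuntimeError("vLLM CPU backend requires AVX512 or AVX2 ISA support.")


if __name__ == "__main__":
    print(pick_isa_flags(read_cpu_flags()))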
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 4869cad541135..7c71673e36f29 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -5,7 +5,7 @@
macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
set(Python_EXECUTABLE ${EXECUTABLE})
- find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule)
+ find_package(Python COMPONENTS Interpreter Development.Module)
if (NOT Python_FOUND)
message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
endif()
@@ -99,7 +99,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
"Failed to determine torch nvcc compiler flags")
if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
- list(APPEND GPU_FLAGS "-DENABLE_FP8")
+ list(APPEND GPU_FLAGS "-DENABLE_FP8_E5M2")
endif()
if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0)
list(REMOVE_ITEM GPU_FLAGS
@@ -119,7 +119,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
list(APPEND GPU_FLAGS
"-DUSE_ROCM"
- "-DENABLE_FP8"
+ "-DENABLE_FP8_E4M3"
"-U__HIP_NO_HALF_CONVERSIONS__"
"-U__HIP_NO_HALF_OPERATORS__"
"-fno-gpu-rdc")
@@ -147,23 +147,16 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
if (${GPU_LANG} STREQUAL "HIP")
#
# `GPU_ARCHES` controls the `--offload-arch` flags.
+ # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled
+ # via the `PYTORCH_ROCM_ARCH` env variable.
#
- # If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list,
- # if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling
- # "rocm_agent_enumerator" in "enable_language(HIP)"
- # (in file Modules/CMakeDetermineHIPCompiler.cmake)
- #
- if(DEFINED ENV{PYTORCH_ROCM_ARCH})
- set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH})
- else()
- set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES})
- endif()
+
#
# Find the intersection of the supported + detected architectures to
# set the module architecture flags.
#
set(${GPU_ARCHES})
- foreach (_ARCH ${HIP_ARCHITECTURES})
+ foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES})
if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
list(APPEND ${GPU_ARCHES} ${_ARCH})
endif()
@@ -171,7 +164,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
if(NOT ${GPU_ARCHES})
message(FATAL_ERROR
- "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is"
+ "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is"
" supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
endif()
@@ -301,7 +294,6 @@ endmacro()
# INCLUDE_DIRECTORIES - Extra include directories.
# LIBRARIES - Extra link libraries.
# WITH_SOABI - Generate library with python SOABI suffix name.
-# USE_SABI - Use python stable api
#
# Note: optimization level/debug info is set via cmake build type.
#
@@ -309,7 +301,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
cmake_parse_arguments(PARSE_ARGV 1
GPU
"WITH_SOABI"
- "DESTINATION;LANGUAGE;USE_SABI"
+ "DESTINATION;LANGUAGE"
"SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
# Add hipify preprocessing step when building with HIP/ROCm.
@@ -323,11 +315,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
set(GPU_WITH_SOABI)
endif()
- if (GPU_USE_SABI)
- Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
- else()
- Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
- endif()
+ Python_add_library(${GPU_MOD_NAME} MODULE "${GPU_SOURCES}" ${GPU_WITH_SOABI})
if (GPU_LANGUAGE STREQUAL "HIP")
# Make this target dependent on the hipify preprocessor step.
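The override_gpu_arches() change above stops honoring PYTORCH_ROCM_ARCH and goes back to CMAKE_HIP_ARCHITECTURES alone. As a hedged sketch of the removed behavior (the supported-architecture list below is purely illustrative, not the project's real one):

import os

SUPPORTED_ARCHES = ["gfx906", "gfx908", "gfx90a", "gfx942", "gfx1030", "gfx1100"]


def resolve_hip_arches(cmake_hip_architectures):
    """Prefer PYTORCH_ROCM_ARCH if set, then intersect with the supported list."""
    env = os.environ.get("PYTORCH_ROCM_ARCH")
    detected = env.replace(" ", ";").split(";") if env else list(cmake_hip_architectures)
    selected = [arch for arch in detected if arch in SUPPORTED_ARCHES]
    if not selected:
        raise RuntimeError(
            f"None of the detected ROCm architectures {detected} is supported. "
            f"Supported ROCm architectures are: {SUPPORTED_ARCHES}.")
    return selected


print(resolve_hip_arches(["gfx90a", "gfx803"]))  # -> ['gfx90a']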
diff --git a/collect_env.py b/collect_env.py
index 083cb768f5399..1ecfeb8e22e2f 100644
--- a/collect_env.py
+++ b/collect_env.py
@@ -64,7 +64,6 @@
"triton",
"optree",
"nccl",
- "transformers",
}
DEFAULT_PIP_PATTERNS = {
@@ -76,7 +75,6 @@
"optree",
"onnx",
"nccl",
- "transformers",
}
@@ -603,11 +601,6 @@ def get_version_or_na(cfg, prefix):
{conda_packages}
""".strip()
-# both the above code and the following code use `strip()` to
-# remove leading/trailing whitespaces, so we need to add a newline
-# in between to separate the two sections
-env_info_fmt += "\n"
-
env_info_fmt += """
ROCM Version: {rocm_version}
Neuron SDK Version: {neuron_sdk_version}
diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu
index 5ed1dc3b8f792..24d972702c858 100644
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -1,5 +1,5 @@
#include
-#include
+#include
#include
#include
@@ -10,11 +10,11 @@
namespace vllm {
// Activation and gating kernel template.
-template
+template
__global__ void act_and_mul_kernel(
- scalar_t* __restrict__ out, // [..., d]
- const scalar_t* __restrict__ input, // [..., 2, d]
- const int d) {
+ scalar_t* __restrict__ out, // [..., d]
+ const scalar_t* __restrict__ input, // [..., 2, d]
+ const int d) {
const int64_t token_idx = blockIdx.x;
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
@@ -23,66 +23,72 @@ __global__ void act_and_mul_kernel(
}
}
-template
+template
__device__ __forceinline__ T silu_kernel(const T& x) {
// x * sigmoid(x)
- return (T)(((float)x) / (1.0f + expf((float)-x)));
+ return (T) (((float) x) / (1.0f + expf((float) -x)));
}
-template
+template
__device__ __forceinline__ T gelu_kernel(const T& x) {
// Equivalent to PyTorch GELU with 'none' approximation.
// Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
- const float f = (float)x;
+ const float f = (float) x;
constexpr float ALPHA = M_SQRT1_2;
- return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA)));
+ return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA)));
}
-template
+template
__device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
// Equivalent to PyTorch GELU with 'tanh' approximation.
// Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
- const float f = (float)x;
+ const float f = (float) x;
constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
constexpr float KAPPA = 0.044715;
float x_cube = f * f * f;
float inner = BETA * (f + KAPPA * x_cube);
- return (T)(0.5f * f * (1.0f + ::tanhf(inner)));
+ return (T) (0.5f * f * (1.0f + ::tanhf(inner)));
}
-} // namespace vllm
+} // namespace vllm
// Launch activation and gating kernel.
-#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
- int d = input.size(-1) / 2; \
- int64_t num_tokens = input.numel() / input.size(-1); \
- dim3 grid(num_tokens); \
- dim3 block(std::min(d, 1024)); \
- const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
- const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
- VLLM_DISPATCH_FLOATING_TYPES( \
- input.scalar_type(), "act_and_mul_kernel", [&] { \
- vllm::act_and_mul_kernel> \
- <<>>(out.data_ptr(), \
- input.data_ptr(), d); \
- });
-
-void silu_and_mul(torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., 2 * d]
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
+ int d = input.size(-1) / 2; \
+ int64_t num_tokens = input.numel() / input.size(-1); \
+ dim3 grid(num_tokens); \
+ dim3 block(std::min(d, 1024)); \
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
+ VLLM_DISPATCH_FLOATING_TYPES( \
+ input.scalar_type(), \
+ "act_and_mul_kernel", \
+ [&] { \
+ vllm::act_and_mul_kernel><<>>( \
+ out.data_ptr(), \
+ input.data_ptr(), \
+ d); \
+ });
+
+void silu_and_mul(
+ torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
}
-void gelu_and_mul(torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., 2 * d]
+void gelu_and_mul(
+ torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
}
-void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., 2 * d]
+void gelu_tanh_and_mul(
+ torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
}
@@ -90,11 +96,11 @@ void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
namespace vllm {
// Element-wise activation kernel template.
-template
+template
__global__ void activation_kernel(
- scalar_t* __restrict__ out, // [..., d]
- const scalar_t* __restrict__ input, // [..., d]
- const int d) {
+ scalar_t* __restrict__ out, // [..., d]
+ const scalar_t* __restrict__ input, // [..., d]
+ const int d) {
const int64_t token_idx = blockIdx.x;
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
@@ -102,61 +108,54 @@ __global__ void activation_kernel(
}
}
-} // namespace vllm
+} // namespace vllm
// Launch element-wise activation kernel.
-#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
- int d = input.size(-1); \
- int64_t num_tokens = input.numel() / d; \
- dim3 grid(num_tokens); \
- dim3 block(std::min(d, 1024)); \
- const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
- const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
- VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \
- vllm::activation_kernel> \
- <<>>(out.data_ptr(), \
- input.data_ptr(), d); \
- });
+#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
+ int d = input.size(-1); \
+ int64_t num_tokens = input.numel() / d; \
+ dim3 grid(num_tokens); \
+ dim3 block(std::min(d, 1024)); \
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
+ VLLM_DISPATCH_FLOATING_TYPES( \
+ input.scalar_type(), \
+ "activation_kernel", \
+ [&] { \
+ vllm::activation_kernel><<>>( \
+ out.data_ptr(), \
+ input.data_ptr(), \
+ d); \
+ });
namespace vllm {
-template
+template
__device__ __forceinline__ T gelu_new_kernel(const T& x) {
- const float x3 = (float)(x * x * x);
- const T t = (T)tanhf((T)(0.79788456f * (float)(x + (T)(0.044715f * x3))));
- return ((T)0.5) * x * (((T)1.0) + t);
+ const float x3 = (float) (x * x * x);
+ const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3))));
+ return ((T) 0.5) * x * (((T) 1.0) + t);
}
-template
+template
__device__ __forceinline__ T gelu_fast_kernel(const T& x) {
- const float f = (float)x;
- const T t =
- (T)tanhf(((T)(f * 0.79788456f)) * (((T)1.0) + (T)(0.044715f * f) * x));
- return ((T)0.5) * x * (((T)1.0) + t);
+ const float f = (float) x;
+ const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x));
+ return ((T) 0.5) * x * (((T) 1.0) + t);
}
-template
-__device__ __forceinline__ T gelu_quick_kernel(const T& x) {
- // x * sigmoid(1.702 * x)
- return (T)(((float)x) / (1.0f + expf(-1.702f * (float)x)));
-}
-
-} // namespace vllm
+} // namespace vllm
-void gelu_new(torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., d]
+void gelu_new(
+ torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., d]
{
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
}
-void gelu_fast(torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., d]
+void gelu_fast(
+ torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., d]
{
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
}
-
-void gelu_quick(torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., d]
-{
- LAUNCH_ACTIVATION_KERNEL(vllm::gelu_quick_kernel);
-}
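For orientation, a small NumPy sketch (editorial, not part of the patch) of what the gated-activation kernels in this file compute: the last dimension of the input holds [x1, x2] of size 2*d, and the output is act(x1) * x2, which is what silu_and_mul and gelu_tanh_and_mul launch on the GPU.

import numpy as np


def silu(x):
    # x * sigmoid(x), matching vllm::silu_kernel
    return x / (1.0 + np.exp(-x))


def gelu_tanh(x):
    # PyTorch GELU with the 'tanh' approximation, matching vllm::gelu_tanh_kernel
    beta = np.sqrt(2.0 / np.pi)  # M_SQRT2 * M_2_SQRTPI * 0.5
    kappa = 0.044715
    return 0.5 * x * (1.0 + np.tanh(beta * (x + kappa * x ** 3)))


def act_and_mul(x, act=silu):
    d = x.shape[-1] // 2
    return act(x[..., :d]) * x[..., d:]


x = np.random.randn(4, 2 * 128).astype(np.float32)
print(act_and_mul(x).shape)                 # silu_and_mul -> (4, 128)
print(act_and_mul(x, act=gelu_tanh).shape)  # gelu_tanh_and_mul -> (4, 128)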
diff --git a/csrc/attention/attention_generic.cuh b/csrc/attention/attention_generic.cuh
index 62409c0cce93e..31fb401cbe2c1 100644
--- a/csrc/attention/attention_generic.cuh
+++ b/csrc/attention/attention_generic.cuh
@@ -1,6 +1,5 @@
/*
- * Adapted from
- * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -23,31 +22,31 @@
namespace vllm {
// A vector type to store Q, K, V elements.
-template
+template
struct Vec {};
// A vector type to store FP32 accumulators.
-template
+template
struct FloatVec {};
// Template vector operations.
-template
+template
inline __device__ Acc mul(A a, B b);
-template
+template
inline __device__ float sum(T v);
-template
+template
inline __device__ float dot(T a, T b) {
return sum(mul(a, b));
}
-template
+template
inline __device__ float dot(T a, T b) {
return sum(mul(a, b));
}
-template
+template
inline __device__ void zero(T& dst) {
constexpr int WORDS = sizeof(T) / 4;
union {
@@ -62,4 +61,4 @@ inline __device__ void zero(T& dst) {
dst = tmp.raw;
}
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu
index 91083481705cb..8b1b5e098015f 100644
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -1,6 +1,5 @@
/*
- * Adapted from
- * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -17,26 +16,30 @@
* limitations under the License.
*/
-#include
+#include
#include
#include
-#include
#include "attention_dtypes.h"
#include "attention_utils.cuh"
+#if defined(ENABLE_FP8_E5M2)
+#include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh"
+#elif defined(ENABLE_FP8_E4M3)
+#include "../quantization/fp8/amd_detail/quant_utils.cuh"
+#endif
+
+#include
+
#ifdef USE_ROCM
#include
- #include "../quantization/fp8/amd/quant_utils.cuh"
-typedef __hip_bfloat16 __nv_bfloat16;
-#else
- #include "../quantization/fp8/nvidia/quant_utils.cuh"
+ typedef __hip_bfloat16 __nv_bfloat16;
#endif
#ifndef USE_ROCM
- #define WARP_SIZE 32
+#define WARP_SIZE 32
#else
- #define WARP_SIZE warpSize
+#define WARP_SIZE warpSize
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -46,7 +49,7 @@ typedef __hip_bfloat16 __nv_bfloat16;
namespace vllm {
// Utility function for attention softmax.
-template
+template
inline __device__ float block_sum(float* red_smem, float sum) {
// Decompose the thread index into warp / lane.
int warp = threadIdx.x / WARP_SIZE;
@@ -83,31 +86,31 @@ inline __device__ float block_sum(float* red_smem, float sum) {
// TODO(woosuk): Merge the last two dimensions of the grid.
// Grid: (num_heads, num_seqs, max_num_partitions).
-template // Zero means no partitioning.
+template<
+ typename scalar_t,
+ typename cache_t,
+ int HEAD_SIZE,
+ int BLOCK_SIZE,
+ int NUM_THREADS,
+ bool IS_FP8_KV_CACHE,
+ int PARTITION_SIZE = 0> // Zero means no partitioning.
__device__ void paged_attention_kernel(
- float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
- float* __restrict__ max_logits, // [num_seqs, num_heads,
- // max_num_partitions]
- scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions,
- // head_size]
- const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
- const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
- // head_size/x, block_size, x]
- const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
- // head_size, block_size]
- const int num_kv_heads, // [num_heads]
- const float scale,
- const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
- const int* __restrict__ seq_lens, // [num_seqs]
- const int max_num_blocks_per_seq,
- const float* __restrict__ alibi_slopes, // [num_heads]
- const int q_stride, const int kv_block_stride, const int kv_head_stride,
- const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
- const int blocksparse_vert_stride, const int blocksparse_block_size,
- const int blocksparse_head_sliding_step) {
+ float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
+ float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
+ scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size]
+ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
+ const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
+ const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
+ const int num_kv_heads, // [num_heads]
+ const float scale,
+ const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
+ const int* __restrict__ seq_lens, // [num_seqs]
+ const int max_num_blocks_per_seq,
+ const float* __restrict__ alibi_slopes, // [num_heads]
+ const int q_stride,
+ const int kv_block_stride,
+ const int kv_head_stride,
+ const float kv_scale) {
const int seq_idx = blockIdx.y;
const int partition_idx = blockIdx.z;
const int max_num_partitions = gridDim.z;
@@ -119,29 +122,22 @@ __device__ void paged_attention_kernel(
}
const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE);
- const int num_blocks_per_partition =
- USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_seq_blocks;
+ const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_seq_blocks;
// [start_block_idx, end_block_idx) is the range of blocks to process.
- const int start_block_idx =
- USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
- const int end_block_idx =
- MIN(start_block_idx + num_blocks_per_partition, num_seq_blocks);
+ const int start_block_idx = USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
+ const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_seq_blocks);
const int num_blocks = end_block_idx - start_block_idx;
// [start_token_idx, end_token_idx) is the range of tokens to process.
const int start_token_idx = start_block_idx * BLOCK_SIZE;
- const int end_token_idx =
- MIN(start_token_idx + num_blocks * BLOCK_SIZE, seq_len);
+ const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, seq_len);
const int num_tokens = end_token_idx - start_token_idx;
constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1);
- constexpr int NUM_THREAD_GROUPS =
- NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE
- // divides NUM_THREADS
+ constexpr int NUM_THREAD_GROUPS = NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE divides NUM_THREADS
assert(NUM_THREADS % THREAD_GROUP_SIZE == 0);
- constexpr int NUM_TOKENS_PER_THREAD_GROUP =
- DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE);
+ constexpr int NUM_TOKENS_PER_THREAD_GROUP = DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE);
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
const int thread_idx = threadIdx.x;
const int warp_idx = thread_idx / WARP_SIZE;
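A small editorial sketch of the range arithmetic at the top of paged_attention_kernel: when PARTITION_SIZE is non-zero, each (head, seq, partition) program instance handles a contiguous slice of the sequence's KV-cache blocks and tokens.

def divide_round_up(a, b):
    return (a + b - 1) // b


def partition_ranges(seq_len, block_size, partition_size, partition_idx):
    use_partitioning = partition_size > 0
    num_seq_blocks = divide_round_up(seq_len, block_size)
    blocks_per_partition = (partition_size // block_size
                            if use_partitioning else num_seq_blocks)
    start_block = partition_idx * blocks_per_partition if use_partitioning else 0
    end_block = min(start_block + blocks_per_partition, num_seq_blocks)
    start_token = start_block * block_size
    end_token = min(start_token + (end_block - start_block) * block_size, seq_len)
    return (start_block, end_block), (start_token, end_token)


# seq_len=1000, BLOCK_SIZE=16, PARTITION_SIZE=512, second partition:
print(partition_ranges(1000, 16, 512, 1))  # ((32, 63), (512, 1000))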
@@ -151,18 +147,19 @@ __device__ void paged_attention_kernel(
const int num_heads = gridDim.x;
const int num_queries_per_kv = num_heads / num_kv_heads;
const int kv_head_idx = head_idx / num_queries_per_kv;
- const float alibi_slope =
- alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx];
+ const float alibi_slope = alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx];
// A vector type to store a part of a key or a query.
- // The vector size is configured in such a way that the threads in a thread
- // group fetch or compute 16 bytes at a time. For example, if the size of a
- // thread group is 4 and the data type is half, then the vector size is 16 /
- // (4 * sizeof(half)) == 2.
+ // The vector size is configured in such a way that the threads in a thread group
+ // fetch or compute 16 bytes at a time.
+ // For example, if the size of a thread group is 4 and the data type is half,
+ // then the vector size is 16 / (4 * sizeof(half)) == 2.
constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(scalar_t)), 1);
using K_vec = typename Vec::Type;
using Q_vec = typename Vec::Type;
+#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3)
using Quant_vec = typename Vec::Type;
+#endif
constexpr int NUM_ELEMS_PER_THREAD = HEAD_SIZE / THREAD_GROUP_SIZE;
constexpr int NUM_VECS_PER_THREAD = NUM_ELEMS_PER_THREAD / VEC_SIZE;
@@ -172,21 +169,18 @@ __device__ void paged_attention_kernel(
// Load the query to registers.
// Each thread in a thread group has a different part of the query.
- // For example, if the the thread group size is 4, then the first thread in
- // the group has 0, 4, 8, ... th vectors of the query, and the second thread
- // has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because
- // q is split from a qkv tensor, it may not be contiguous.
+  // For example, if the thread group size is 4, then the first thread in the group
+ // has 0, 4, 8, ... th vectors of the query, and the second thread has 1, 5, 9, ...
+ // th vectors of the query, and so on.
+ // NOTE(woosuk): Because q is split from a qkv tensor, it may not be contiguous.
const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
__shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
#pragma unroll
- for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD;
- i += NUM_THREAD_GROUPS) {
+ for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD; i += NUM_THREAD_GROUPS) {
const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE;
- q_vecs[thread_group_offset][i] =
- *reinterpret_cast(q_ptr + vec_idx * VEC_SIZE);
+ q_vecs[thread_group_offset][i] = *reinterpret_cast(q_ptr + vec_idx * VEC_SIZE);
}
- __syncthreads(); // TODO(naed90): possible speedup if this is replaced with a
- // memory wall right before we use q_vecs
+ __syncthreads(); // TODO(naed90): possible speedup if this is replaced with a memory wall right before we use q_vecs
// Memory planning.
extern __shared__ char shared_mem[];
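The comment above describes a round-robin split of the query across a thread group; a sketch of that layout (fp16 elements assumed, so each 16-byte vector holds VEC_SIZE = 2 of them for a group of four threads):

def query_vector_assignment(head_size, thread_group_size, elem_bytes=2):
    # Threads in a group cooperatively fetch 16 bytes at a time.
    vec_size = max(16 // (thread_group_size * elem_bytes), 1)
    num_vecs_per_thread = head_size // thread_group_size // vec_size
    # Thread g of a group of size G owns vectors g, g + G, g + 2G, ...
    return vec_size, {
        g: [g + i * thread_group_size for i in range(num_vecs_per_thread)]
        for g in range(thread_group_size)
    }


vec_size, owners = query_vector_assignment(head_size=128, thread_group_size=4)
print(vec_size)       # 2
print(owners[0][:4])  # [0, 4, 8, 12]
print(owners[1][:4])  # [1, 5, 9, 13]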
@@ -205,94 +199,51 @@ __device__ void paged_attention_kernel(
// Each thread group in a warp fetches a key from the block, and computes
// dot product with the query.
const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
-
- // blocksparse specific vars
- int bs_block_offset;
- int q_bs_block_id;
- if constexpr (IS_BLOCK_SPARSE) {
- // const int num_blocksparse_blocks = DIVIDE_ROUND_UP(seq_len,
- // blocksparse_block_size);
- q_bs_block_id = (seq_len - 1) / blocksparse_block_size;
- if (blocksparse_head_sliding_step >= 0)
- // sliding on q heads
- bs_block_offset =
- (tp_rank * num_heads + head_idx) * blocksparse_head_sliding_step + 1;
- else
- // sliding on kv heads
- bs_block_offset = (tp_rank * num_kv_heads + kv_head_idx) *
- (-blocksparse_head_sliding_step) +
- 1;
- }
-
- for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx;
- block_idx += NUM_WARPS) {
- // NOTE(woosuk): The block number is stored in int32. However, we cast it to
- // int64 because int32 can lead to overflow when this variable is multiplied
- // by large numbers (e.g., kv_block_stride).
- // For blocksparse attention: skip computation on blocks that are not
- // attended
- if constexpr (IS_BLOCK_SPARSE) {
- const int k_bs_block_id = block_idx * BLOCK_SIZE / blocksparse_block_size;
- const bool is_remote =
- ((k_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0);
- const bool is_local =
- (k_bs_block_id > q_bs_block_id - blocksparse_local_blocks);
- if (!is_remote && !is_local) {
- for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
- const int physical_block_offset =
- (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE;
- const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
-
- if (thread_group_offset == 0) {
- // NOTE(linxihui): assign very large number to skipped tokens to
- // avoid contribution to the sumexp softmax normalizer. This will
- // not be used at computing sum(softmax*v) as the blocks will be
- // skipped.
- logits[token_idx - start_token_idx] = -FLT_MAX;
- }
- }
- continue;
- }
- }
- const int64_t physical_block_number =
- static_cast(block_table[block_idx]);
+ for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
+ // NOTE(woosuk): The block number is stored in int32. However, we cast it to int64
+ // because int32 can lead to overflow when this variable is multiplied by large numbers
+ // (e.g., kv_block_stride).
+ const int64_t physical_block_number = static_cast(block_table[block_idx]);
// Load a key to registers.
// Each thread in a thread group has a different part of the key.
- // For example, if the the thread group size is 4, then the first thread in
- // the group has 0, 4, 8, ... th vectors of the key, and the second thread
- // has 1, 5, 9, ... th vectors of the key, and so on.
+    // For example, if the thread group size is 4, then the first thread in the group
+ // has 0, 4, 8, ... th vectors of the key, and the second thread has 1, 5, 9, ... th
+ // vectors of the key, and so on.
for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
- const int physical_block_offset =
- (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE;
+ const int physical_block_offset = (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE;
const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
K_vec k_vecs[NUM_VECS_PER_THREAD];
#pragma unroll
for (int j = 0; j < NUM_VECS_PER_THREAD; j++) {
- const cache_t* k_ptr =
- k_cache + physical_block_number * kv_block_stride +
- kv_head_idx * kv_head_stride + physical_block_offset * x;
+ const cache_t* k_ptr = k_cache + physical_block_number * kv_block_stride
+ + kv_head_idx * kv_head_stride
+ + physical_block_offset * x;
const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE;
const int offset1 = (vec_idx * VEC_SIZE) / x;
const int offset2 = (vec_idx * VEC_SIZE) % x;
-
- if constexpr (KV_DTYPE == Fp8KVCacheDataType::kAuto) {
- k_vecs[j] = *reinterpret_cast(
- k_ptr + offset1 * BLOCK_SIZE * x + offset2);
- } else {
+ if constexpr (IS_FP8_KV_CACHE) {
+#if defined(ENABLE_FP8_E5M2)
+ Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
// Vector conversion from Quant_vec to K_vec.
- Quant_vec k_vec_quant = *reinterpret_cast(
- k_ptr + offset1 * BLOCK_SIZE * x + offset2);
- k_vecs[j] = fp8::scaled_convert(
- k_vec_quant, kv_scale);
+ k_vecs[j] = fp8_e5m2_unscaled::vec_conversion(k_vec_quant);
+#elif defined(ENABLE_FP8_E4M3)
+ Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
+ // Vector conversion from Quant_vec to K_vec. Use scaled_vec_conversion to convert FP8_E4M3 quantized k
+ // cache vec to k vec in higher precision (FP16, BFloat16, etc.)
+ k_vecs[j] = fp8_e4m3::scaled_vec_conversion(k_vec_quant, kv_scale);
+#else
+ assert(false);
+#endif
+ } else {
+ k_vecs[j] = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
}
}
// Compute dot product.
// This includes a reduction across the threads in the same thread group.
- float qk = scale * Qk_dot::dot(
- q_vecs[thread_group_offset], k_vecs);
+ float qk = scale * Qk_dot::dot(q_vecs[thread_group_offset], k_vecs);
// Add the ALiBi bias if slopes are given.
qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0;
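An editorial sketch of the per-token score produced inside the key loop: a scaled query-key dot product plus an ALiBi bias that is linear in the distance from the last token. On the FP8 paths above, the key vector is first converted back to the compute precision before this dot product.

import numpy as np


def token_score(q, k, scale, alibi_slope, token_idx, seq_len):
    qk = scale * float(np.dot(q, k))
    if alibi_slope != 0.0:
        qk += alibi_slope * (token_idx - seq_len + 1)  # 0 for the last token
    return qk


head_size = 128
q = np.random.randn(head_size).astype(np.float32)
k = np.random.randn(head_size).astype(np.float32)
print(token_score(q, k, scale=1.0 / np.sqrt(head_size),
                  alibi_slope=0.0625, token_idx=10, seq_len=512))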
@@ -347,12 +298,13 @@ __device__ void paged_attention_kernel(
// If partitioning is enabled, store the max logit and exp_sum.
if (USE_PARTITIONING && thread_idx == 0) {
- float* max_logits_ptr = max_logits +
- seq_idx * num_heads * max_num_partitions +
- head_idx * max_num_partitions + partition_idx;
+ float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions
+ + head_idx * max_num_partitions
+ + partition_idx;
*max_logits_ptr = qk_max;
- float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions +
- head_idx * max_num_partitions + partition_idx;
+ float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions
+ + head_idx * max_num_partitions
+ + partition_idx;
*exp_sums_ptr = exp_sum;
}
@@ -360,13 +312,14 @@ __device__ void paged_attention_kernel(
constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE);
using V_vec = typename Vec::Type;
using L_vec = typename Vec::Type;
+#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3)
using V_quant_vec = typename Vec::Type;
+#endif
using Float_L_vec = typename FloatVec::Type;
constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE;
constexpr int NUM_ROWS_PER_ITER = WARP_SIZE / NUM_V_VECS_PER_ROW;
- constexpr int NUM_ROWS_PER_THREAD =
- DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER);
+ constexpr int NUM_ROWS_PER_THREAD = DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER);
// NOTE(woosuk): We use FP32 for the accumulator for better accuracy.
float accs[NUM_ROWS_PER_THREAD];
@@ -377,51 +330,44 @@ __device__ void paged_attention_kernel(
scalar_t zero_value;
zero(zero_value);
- for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx;
- block_idx += NUM_WARPS) {
- // NOTE(woosuk): The block number is stored in int32. However, we cast it to
- // int64 because int32 can lead to overflow when this variable is multiplied
- // by large numbers (e.g., kv_block_stride).
- // For blocksparse attention: skip computation on blocks that are not
- // attended
- if constexpr (IS_BLOCK_SPARSE) {
- int v_bs_block_id = block_idx * BLOCK_SIZE / blocksparse_block_size;
- if (!((v_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0) &&
- !((v_bs_block_id > q_bs_block_id - blocksparse_local_blocks))) {
- continue;
- }
- }
- const int64_t physical_block_number =
- static_cast(block_table[block_idx]);
+ for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
+ // NOTE(woosuk): The block number is stored in int32. However, we cast it to int64
+ // because int32 can lead to overflow when this variable is multiplied by large numbers
+ // (e.g., kv_block_stride).
+ const int64_t physical_block_number = static_cast(block_table[block_idx]);
const int physical_block_offset = (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE;
const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
L_vec logits_vec;
- from_float(logits_vec, *reinterpret_cast(logits + token_idx -
- start_token_idx));
+ from_float(logits_vec, *reinterpret_cast(logits + token_idx - start_token_idx));
- const cache_t* v_ptr = v_cache + physical_block_number * kv_block_stride +
- kv_head_idx * kv_head_stride;
+ const cache_t* v_ptr = v_cache + physical_block_number * kv_block_stride
+ + kv_head_idx * kv_head_stride;
#pragma unroll
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
if (row_idx < HEAD_SIZE) {
const int offset = row_idx * BLOCK_SIZE + physical_block_offset;
V_vec v_vec;
-
- if constexpr (KV_DTYPE == Fp8KVCacheDataType::kAuto) {
- v_vec = *reinterpret_cast(v_ptr + offset);
- } else {
- V_quant_vec v_quant_vec =
- *reinterpret_cast(v_ptr + offset);
+ if constexpr (IS_FP8_KV_CACHE) {
+#if defined(ENABLE_FP8_E5M2)
+ V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset);
// Vector conversion from V_quant_vec to V_vec.
- v_vec = fp8::scaled_convert(v_quant_vec,
- kv_scale);
+ v_vec = fp8_e5m2_unscaled::vec_conversion(v_quant_vec);
+#elif defined(ENABLE_FP8_E4M3)
+ V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset);
+ // Vector conversion from V_quant_vec to V_vec. Use scaled_vec_conversion to convert
+ // FP8_E4M3 quantized v cache vec to v vec in higher precision (FP16, BFloat16, etc.)
+ v_vec = fp8_e4m3::scaled_vec_conversion(v_quant_vec, kv_scale);
+#else
+ assert(false);
+#endif
+ } else {
+ v_vec = *reinterpret_cast(v_ptr + offset);
}
if (block_idx == num_seq_blocks - 1) {
- // NOTE(woosuk): When v_vec contains the tokens that are out of the
- // context, we should explicitly zero out the values since they may
- // contain NaNs. See
- // https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472
+ // NOTE(woosuk): When v_vec contains the tokens that are out of the context,
+ // we should explicitly zero out the values since they may contain NaNs.
+ // See https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472
scalar_t* v_vec_ptr = reinterpret_cast(&v_vec);
#pragma unroll
for (int j = 0; j < V_VEC_SIZE; j++) {
@@ -444,8 +390,8 @@ __device__ void paged_attention_kernel(
accs[i] = acc;
}
- // NOTE(woosuk): A barrier is required because the shared memory space for
- // logits is reused for the output.
+ // NOTE(woosuk): A barrier is required because the shared memory space for logits
+ // is reused for the output.
__syncthreads();
// Perform reduction across warps.
@@ -482,9 +428,9 @@ __device__ void paged_attention_kernel(
// Write the final output.
if (warp_idx == 0) {
- scalar_t* out_ptr =
- out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
- head_idx * max_num_partitions * HEAD_SIZE + partition_idx * HEAD_SIZE;
+ scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+ + head_idx * max_num_partitions * HEAD_SIZE
+ + partition_idx * HEAD_SIZE;
#pragma unroll
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
@@ -496,84 +442,79 @@ __device__ void paged_attention_kernel(
}
// Grid: (num_heads, num_seqs, 1).
-template
+template<
+ typename scalar_t,
+ typename cache_t,
+ int HEAD_SIZE,
+ int BLOCK_SIZE,
+ int NUM_THREADS,
+ bool IS_FP8_KV_CACHE>
__global__ void paged_attention_v1_kernel(
- scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
- const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
- const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
- // head_size/x, block_size, x]
- const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
- // head_size, block_size]
- const int num_kv_heads, // [num_heads]
- const float scale,
- const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
- const int* __restrict__ seq_lens, // [num_seqs]
- const int max_num_blocks_per_seq,
- const float* __restrict__ alibi_slopes, // [num_heads]
- const int q_stride, const int kv_block_stride, const int kv_head_stride,
- const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
- const int blocksparse_vert_stride, const int blocksparse_block_size,
- const int blocksparse_head_sliding_step) {
- paged_attention_kernel(
- /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache,
- v_cache, num_kv_heads, scale, block_tables, seq_lens,
- max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride,
- kv_head_stride, kv_scale, tp_rank, blocksparse_local_blocks,
- blocksparse_vert_stride, blocksparse_block_size,
- blocksparse_head_sliding_step);
+ scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
+ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
+ const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
+ const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
+ const int num_kv_heads, // [num_heads]
+ const float scale,
+ const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
+ const int* __restrict__ seq_lens, // [num_seqs]
+ const int max_num_blocks_per_seq,
+ const float* __restrict__ alibi_slopes, // [num_heads]
+ const int q_stride,
+ const int kv_block_stride,
+ const int kv_head_stride,
+ const float kv_scale) {
+ paged_attention_kernel(
+ /* exp_sums */ nullptr, /* max_logits */ nullptr,
+ out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, seq_lens,
+ max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride, kv_scale);
}
// Grid: (num_heads, num_seqs, max_num_partitions).
-template
+template<
+ typename scalar_t,
+ typename cache_t,
+ int HEAD_SIZE,
+ int BLOCK_SIZE,
+ int NUM_THREADS,
+ bool IS_FP8_KV_CACHE,
+ int PARTITION_SIZE>
__global__ void paged_attention_v2_kernel(
- float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
- float* __restrict__ max_logits, // [num_seqs, num_heads,
- // max_num_partitions]
- scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
- // max_num_partitions, head_size]
- const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
- const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
- // head_size/x, block_size, x]
- const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
- // head_size, block_size]
- const int num_kv_heads, // [num_heads]
- const float scale,
- const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
- const int* __restrict__ seq_lens, // [num_seqs]
- const int max_num_blocks_per_seq,
- const float* __restrict__ alibi_slopes, // [num_heads]
- const int q_stride, const int kv_block_stride, const int kv_head_stride,
- const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
- const int blocksparse_vert_stride, const int blocksparse_block_size,
- const int blocksparse_head_sliding_step) {
- paged_attention_kernel(
- exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
- block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, q_stride,
- kv_block_stride, kv_head_stride, kv_scale, tp_rank,
- blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size,
- blocksparse_head_sliding_step);
+ float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
+ float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
+ scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
+ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
+ const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
+ const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
+ const int num_kv_heads, // [num_heads]
+ const float scale,
+ const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
+ const int* __restrict__ seq_lens, // [num_seqs]
+ const int max_num_blocks_per_seq,
+ const float* __restrict__ alibi_slopes, // [num_heads]
+ const int q_stride,
+ const int kv_block_stride,
+ const int kv_head_stride,
+ const float kv_scale) {
+ paged_attention_kernel(
+ exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
+ block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes,
+ q_stride, kv_block_stride, kv_head_stride, kv_scale);
}
// Grid: (num_heads, num_seqs).
-template
+template<
+ typename scalar_t,
+ int HEAD_SIZE,
+ int NUM_THREADS,
+ int PARTITION_SIZE>
__global__ void paged_attention_v2_reduce_kernel(
- scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
- const float* __restrict__ exp_sums, // [num_seqs, num_heads,
- // max_num_partitions]
- const float* __restrict__ max_logits, // [num_seqs, num_heads,
- // max_num_partitions]
- const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
- // max_num_partitions, head_size]
- const int* __restrict__ seq_lens, // [num_seqs]
- const int max_num_partitions) {
+ scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
+ const float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
+ const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
+ const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
+ const int* __restrict__ seq_lens, // [num_seqs]
+ const int max_num_partitions) {
const int num_heads = gridDim.x;
const int head_idx = blockIdx.x;
const int seq_idx = blockIdx.y;
@@ -581,11 +522,9 @@ __global__ void paged_attention_v2_reduce_kernel(
const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE);
if (num_partitions == 1) {
// No need to reduce. Only copy tmp_out to out.
- scalar_t* out_ptr =
- out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
- const scalar_t* tmp_out_ptr =
- tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
- head_idx * max_num_partitions * HEAD_SIZE;
+ scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+ const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+ + head_idx * max_num_partitions * HEAD_SIZE;
for (int i = threadIdx.x; i < HEAD_SIZE; i += blockDim.x) {
out_ptr[i] = tmp_out_ptr[i];
}
@@ -604,9 +543,8 @@ __global__ void paged_attention_v2_reduce_kernel(
// Load max logits to shared memory.
float* shared_max_logits = reinterpret_cast(shared_mem);
- const float* max_logits_ptr = max_logits +
- seq_idx * num_heads * max_num_partitions +
- head_idx * max_num_partitions;
+ const float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions
+ + head_idx * max_num_partitions;
float max_logit = -FLT_MAX;
for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
const float l = max_logits_ptr[i];
@@ -635,11 +573,9 @@ __global__ void paged_attention_v2_reduce_kernel(
max_logit = VLLM_SHFL_SYNC(max_logit, 0);
// Load rescaled exp sums to shared memory.
- float* shared_exp_sums =
- reinterpret_cast(shared_mem + sizeof(float) * num_partitions);
- const float* exp_sums_ptr = exp_sums +
- seq_idx * num_heads * max_num_partitions +
- head_idx * max_num_partitions;
+ float* shared_exp_sums = reinterpret_cast(shared_mem + sizeof(float) * num_partitions);
+ const float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions
+ + head_idx * max_num_partitions;
float global_exp_sum = 0.0f;
for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
float l = shared_max_logits[i];
@@ -652,52 +588,61 @@ __global__ void paged_attention_v2_reduce_kernel(
const float inv_global_exp_sum = __fdividef(1.0f, global_exp_sum + 1e-6f);
// Aggregate tmp_out to out.
- const scalar_t* tmp_out_ptr =
- tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
- head_idx * max_num_partitions * HEAD_SIZE;
- scalar_t* out_ptr =
- out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+ const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+ + head_idx * max_num_partitions * HEAD_SIZE;
+ scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
#pragma unroll
for (int i = threadIdx.x; i < HEAD_SIZE; i += NUM_THREADS) {
float acc = 0.0f;
for (int j = 0; j < num_partitions; ++j) {
- acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] *
- inv_global_exp_sum;
+ acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] * inv_global_exp_sum;
}
from_float(out_ptr[i], acc);
}
}
-} // namespace vllm
-
-#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \
- VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \
- ((void*)vllm::paged_attention_v1_kernel), \
- shared_mem_size); \
- vllm::paged_attention_v1_kernel \
- <<>>( \
- out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \
- scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \
- alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \
- kv_scale, tp_rank, blocksparse_local_blocks, \
- blocksparse_vert_stride, blocksparse_block_size, \
- blocksparse_head_sliding_step);
+} // namespace vllm
+
+#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \
+ VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \
+ ((void*)vllm::paged_attention_v1_kernel), shared_mem_size); \
+ vllm::paged_attention_v1_kernel<<>>( \
+ out_ptr, \
+ query_ptr, \
+ key_cache_ptr, \
+ value_cache_ptr, \
+ num_kv_heads, \
+ scale, \
+ block_tables_ptr, \
+ seq_lens_ptr, \
+ max_num_blocks_per_seq, \
+ alibi_slopes_ptr, \
+ q_stride, \
+ kv_block_stride, \
+ kv_head_stride, \
+ kv_scale);
// TODO(woosuk): Tune NUM_THREADS.
-template
+template<
+ typename T,
+ typename CACHE_T,
+ int BLOCK_SIZE,
+ bool IS_FP8_KV_CACHE,
+ int NUM_THREADS = 128>
void paged_attention_v1_launcher(
- torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
- torch::Tensor& value_cache, int num_kv_heads, float scale,
- torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
- const c10::optional& alibi_slopes, float kv_scale,
- const int tp_rank, const int blocksparse_local_blocks,
- const int blocksparse_vert_stride, const int blocksparse_block_size,
- const int blocksparse_head_sliding_step) {
+ torch::Tensor& out,
+ torch::Tensor& query,
+ torch::Tensor& key_cache,
+ torch::Tensor& value_cache,
+ int num_kv_heads,
+ float scale,
+ torch::Tensor& block_tables,
+ torch::Tensor& seq_lens,
+ int max_seq_len,
+ const c10::optional& alibi_slopes,
+ float kv_scale) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
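The per-partition max logits and exp sums written earlier are merged by paged_attention_v2_reduce_kernel in the hunks above. A NumPy sketch (editorial) of that merge for one (seq, head) pair, which is a numerically stable combination of softmax denominators:

import numpy as np


def reduce_partitions(max_logits, exp_sums, tmp_out):
    # max_logits: [P], exp_sums: [P], tmp_out: [P, head_size]
    global_max = max_logits.max()
    rescaled = exp_sums * np.exp(max_logits - global_max)  # shared_exp_sums
    inv_global_exp_sum = 1.0 / (rescaled.sum() + 1e-6)
    return (rescaled[:, None] * tmp_out).sum(axis=0) * inv_global_exp_sum


num_partitions, head_size = 4, 128
out = reduce_partitions(np.random.randn(num_partitions).astype(np.float32),
                        np.random.rand(num_partitions).astype(np.float32),
                        np.random.randn(num_partitions, head_size).astype(np.float32))
print(out.shape)  # (128,)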
@@ -710,10 +655,9 @@ void paged_attention_v1_launcher(
assert(head_size % thread_group_size == 0);
// NOTE: alibi_slopes is optional.
- const float* alibi_slopes_ptr =
- alibi_slopes
- ? reinterpret_cast(alibi_slopes.value().data_ptr())
- : nullptr;
+ const float* alibi_slopes_ptr = alibi_slopes ?
+ reinterpret_cast(alibi_slopes.value().data_ptr())
+ : nullptr;
T* out_ptr = reinterpret_cast(out.data_ptr());
T* query_ptr = reinterpret_cast(query.data_ptr());
@@ -723,8 +667,7 @@ void paged_attention_v1_launcher(
int* seq_lens_ptr = seq_lens.data_ptr();
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
- int padded_max_seq_len =
- DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
+ int padded_max_seq_len = DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
int logits_size = padded_max_seq_len * sizeof(float);
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
// Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len
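An editorial sketch of the shared-memory budget the v1 launcher computes above. Taking the larger of the logits buffer and the per-warp reduction scratch is an assumption about how the two sizes are combined, based on how the surrounding code uses them.

def divide_round_up(a, b):
    return (a + b - 1) // b


def v1_shared_mem_bytes(max_seq_len, block_size, head_size,
                        num_threads=128, warp_size=32, float_bytes=4):
    num_warps = num_threads // warp_size
    padded_max_seq_len = divide_round_up(max_seq_len, block_size) * block_size
    logits_size = padded_max_seq_len * float_bytes
    outputs_size = (num_warps // 2) * head_size * float_bytes
    return max(logits_size, outputs_size)


print(v1_shared_mem_bytes(max_seq_len=4096, block_size=16, head_size=128))
# 16384: the logits buffer dominates for long sequences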
@@ -754,9 +697,6 @@ void paged_attention_v1_launcher(
case 128:
LAUNCH_PAGED_ATTENTION_V1(128);
break;
- case 192:
- LAUNCH_PAGED_ATTENTION_V1(192);
- break;
case 256:
LAUNCH_PAGED_ATTENTION_V1(256);
break;
@@ -766,94 +706,128 @@ void paged_attention_v1_launcher(
}
}
-#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \
- paged_attention_v1_launcher( \
- out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
- seq_lens, max_seq_len, alibi_slopes, kv_scale, tp_rank, \
- blocksparse_local_blocks, blocksparse_vert_stride, \
- blocksparse_block_size, blocksparse_head_sliding_step);
-
-#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
- switch (is_block_sparse) { \
- case true: \
- CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
- break; \
- case false: \
- CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
- break; \
- }
+#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
+ paged_attention_v1_launcher( \
+ out, \
+ query, \
+ key_cache, \
+ value_cache, \
+ num_kv_heads, \
+ scale, \
+ block_tables, \
+ seq_lens, \
+ max_seq_len, \
+ alibi_slopes, \
+ kv_scale);
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
// 1, 2, 4, 64, 128, 256.
-#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \
- switch (block_size) { \
- case 8: \
- CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \
- break; \
- case 16: \
- CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \
- break; \
- case 32: \
- CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \
- break; \
- default: \
- TORCH_CHECK(false, "Unsupported block size: ", block_size); \
- break; \
+#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \
+ switch (block_size) { \
+ case 8: \
+ CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \
+ break; \
+ case 16: \
+ CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \
+ break; \
+ case 32: \
+ CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \
+ break; \
+ default: \
+ TORCH_CHECK(false, "Unsupported block size: ", block_size); \
+ break; \
}
void paged_attention_v1(
- torch::Tensor& out, // [num_seqs, num_heads, head_size]
- torch::Tensor& query, // [num_seqs, num_heads, head_size]
- torch::Tensor&
- key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
- torch::Tensor&
- value_cache, // [num_blocks, num_heads, head_size, block_size]
- int64_t num_kv_heads, // [num_heads]
- double scale,
- torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
- torch::Tensor& seq_lens, // [num_seqs]
- int64_t block_size, int64_t max_seq_len,
- const c10::optional& alibi_slopes,
- const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
- const int64_t blocksparse_local_blocks,
- const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
- const int64_t blocksparse_head_sliding_step) {
- const bool is_block_sparse = (blocksparse_vert_stride > 1);
-
- DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
- CALL_V1_LAUNCHER_BLOCK_SIZE)
+ torch::Tensor& out, // [num_seqs, num_heads, head_size]
+ torch::Tensor& query, // [num_seqs, num_heads, head_size]
+ torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
+ torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size]
+ int num_kv_heads, // [num_heads]
+ float scale,
+ torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
+ torch::Tensor& seq_lens, // [num_seqs]
+ int block_size,
+ int max_seq_len,
+ const c10::optional& alibi_slopes,
+ const std::string& kv_cache_dtype,
+ float kv_scale) {
+ if (kv_cache_dtype == "auto") {
+ if (query.dtype() == at::ScalarType::Float) {
+ CALL_V1_LAUNCHER_BLOCK_SIZE(float, float, false);
+ } else if (query.dtype() == at::ScalarType::Half) {
+ CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, uint16_t, false);
+ } else if (query.dtype() == at::ScalarType::BFloat16) {
+ CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, __nv_bfloat16, false);
+ } else {
+ TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
+ }
+ } else if (kv_cache_dtype == "fp8") {
+ if (query.dtype() == at::ScalarType::Float) {
+ CALL_V1_LAUNCHER_BLOCK_SIZE(float, uint8_t, true);
+ } else if (query.dtype() == at::ScalarType::Half) {
+ CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, uint8_t, true);
+ } else if (query.dtype() == at::ScalarType::BFloat16) {
+ CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, uint8_t, true);
+ } else {
+ TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
+ }
+ } else {
+ TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
+ }
}
-#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \
- vllm::paged_attention_v2_kernel \
- <<>>( \
- exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \
- value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
- seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
- kv_block_stride, kv_head_stride, kv_scale, tp_rank, \
- blocksparse_local_blocks, blocksparse_vert_stride, \
- blocksparse_block_size, blocksparse_head_sliding_step); \
- vllm::paged_attention_v2_reduce_kernel \
- <<>>( \
- out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \
- max_num_partitions);
-
-template
+#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \
+ vllm::paged_attention_v2_kernel \
+ <<>>( \
+ exp_sums_ptr, \
+ max_logits_ptr, \
+ tmp_out_ptr, \
+ query_ptr, \
+ key_cache_ptr, \
+ value_cache_ptr, \
+ num_kv_heads, \
+ scale, \
+ block_tables_ptr, \
+ seq_lens_ptr, \
+ max_num_blocks_per_seq, \
+ alibi_slopes_ptr, \
+ q_stride, \
+ kv_block_stride, \
+ kv_head_stride, \
+ kv_scale); \
+ vllm::paged_attention_v2_reduce_kernel \
+ <<>>( \
+ out_ptr, \
+ exp_sums_ptr, \
+ max_logits_ptr, \
+ tmp_out_ptr, \
+ seq_lens_ptr, \
+ max_num_partitions);
+
+template<
+ typename T,
+ typename CACHE_T,
+ int BLOCK_SIZE,
+ bool IS_FP8_KV_CACHE,
+ int NUM_THREADS = 128,
+ int PARTITION_SIZE = 512>
void paged_attention_v2_launcher(
- torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
- torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
- torch::Tensor& value_cache, int num_kv_heads, float scale,
- torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
- const c10::optional& alibi_slopes, float kv_scale,
- const int tp_rank, const int blocksparse_local_blocks,
- const int blocksparse_vert_stride, const int blocksparse_block_size,
- const int blocksparse_head_sliding_step) {
+ torch::Tensor& out,
+ torch::Tensor& exp_sums,
+ torch::Tensor& max_logits,
+ torch::Tensor& tmp_out,
+ torch::Tensor& query,
+ torch::Tensor& key_cache,
+ torch::Tensor& value_cache,
+ int num_kv_heads,
+ float scale,
+ torch::Tensor& block_tables,
+ torch::Tensor& seq_lens,
+ int max_seq_len,
+ const c10::optional& alibi_slopes,
+ float kv_scale) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@@ -866,10 +840,9 @@ void paged_attention_v2_launcher(
assert(head_size % thread_group_size == 0);
// NOTE: alibi_slopes is optional.
- const float* alibi_slopes_ptr =
- alibi_slopes
- ? reinterpret_cast(alibi_slopes.value().data_ptr())
- : nullptr;
+ const float* alibi_slopes_ptr = alibi_slopes ?
+ reinterpret_cast(alibi_slopes.value().data_ptr())
+ : nullptr;
T* out_ptr = reinterpret_cast(out.data_ptr());
float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr());
@@ -915,9 +888,6 @@ void paged_attention_v2_launcher(
case 128:
LAUNCH_PAGED_ATTENTION_V2(128);
break;
- case 192:
- LAUNCH_PAGED_ATTENTION_V2(192);
- break;
case 256:
LAUNCH_PAGED_ATTENTION_V2(256);
break;
@@ -927,66 +897,81 @@ void paged_attention_v2_launcher(
}
}
-#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \
- paged_attention_v2_launcher( \
- out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
- num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \
- kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, \
- blocksparse_block_size, blocksparse_head_sliding_step);
-
-#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
- switch (is_block_sparse) { \
- case true: \
- CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
- break; \
- case false: \
- CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
- break; \
- }
+#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
+ paged_attention_v2_launcher( \
+ out, \
+ exp_sums, \
+ max_logits, \
+ tmp_out, \
+ query, \
+ key_cache, \
+ value_cache, \
+ num_kv_heads, \
+ scale, \
+ block_tables, \
+ seq_lens, \
+ max_seq_len, \
+ alibi_slopes, \
+ kv_scale);
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
// 1, 2, 4, 64, 128, 256.
-#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \
- switch (block_size) { \
- case 8: \
- CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \
- break; \
- case 16: \
- CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \
- break; \
- case 32: \
- CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \
- break; \
- default: \
- TORCH_CHECK(false, "Unsupported block size: ", block_size); \
- break; \
+#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \
+ switch (block_size) { \
+ case 8: \
+ CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \
+ break; \
+ case 16: \
+ CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \
+ break; \
+ case 32: \
+ CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \
+ break; \
+ default: \
+ TORCH_CHECK(false, "Unsupported block size: ", block_size); \
+ break; \
}
void paged_attention_v2(
- torch::Tensor& out, // [num_seqs, num_heads, head_size]
- torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions]
- torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions]
- torch::Tensor&
- tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
- torch::Tensor& query, // [num_seqs, num_heads, head_size]
- torch::Tensor&
- key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
- torch::Tensor&
- value_cache, // [num_blocks, num_heads, head_size, block_size]
- int64_t num_kv_heads, // [num_heads]
- double scale,
- torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
- torch::Tensor& seq_lens, // [num_seqs]
- int64_t block_size, int64_t max_seq_len,
- const c10::optional<torch::Tensor>& alibi_slopes,
- const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
- const int64_t blocksparse_local_blocks,
- const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
- const int64_t blocksparse_head_sliding_step) {
- const bool is_block_sparse = (blocksparse_vert_stride > 1);
- DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
- CALL_V2_LAUNCHER_BLOCK_SIZE)
+ torch::Tensor& out, // [num_seqs, num_heads, head_size]
+ torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions]
+ torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions]
+ torch::Tensor& tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
+ torch::Tensor& query, // [num_seqs, num_heads, head_size]
+ torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
+ torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size]
+ int num_kv_heads, // [num_heads]
+ float scale,
+ torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
+ torch::Tensor& seq_lens, // [num_seqs]
+ int block_size,
+ int max_seq_len,
+ const c10::optional<torch::Tensor>& alibi_slopes,
+ const std::string& kv_cache_dtype,
+ float kv_scale) {
+ if (kv_cache_dtype == "auto") {
+ if (query.dtype() == at::ScalarType::Float) {
+ CALL_V2_LAUNCHER_BLOCK_SIZE(float, float, false);
+ } else if (query.dtype() == at::ScalarType::Half) {
+ CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, uint16_t, false);
+ } else if (query.dtype() == at::ScalarType::BFloat16) {
+ CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, __nv_bfloat16, false);
+ } else {
+ TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
+ }
+ } else if (kv_cache_dtype == "fp8") {
+ if (query.dtype() == at::ScalarType::Float) {
+ CALL_V2_LAUNCHER_BLOCK_SIZE(float, uint8_t, true);
+ } else if (query.dtype() == at::ScalarType::Half) {
+ CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, uint8_t, true);
+ } else if (query.dtype() == at::ScalarType::BFloat16) {
+ CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, uint8_t, true);
+ } else {
+ TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
+ }
+ } else {
+ TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
+ }
}
#undef WARP_SIZE
diff --git a/csrc/attention/attention_utils.cuh b/csrc/attention/attention_utils.cuh
index cdcee42748998..ff64c4bd8f80c 100644
--- a/csrc/attention/attention_utils.cuh
+++ b/csrc/attention/attention_utils.cuh
@@ -1,6 +1,5 @@
/*
- * Adapted from
- * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -27,7 +26,7 @@
namespace vllm {
// Q*K^T operation.
-template <int THREAD_GROUP_SIZE, typename Vec, int N>
+template<int THREAD_GROUP_SIZE, typename Vec, int N>
inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
using A_vec = typename FloatVec<Vec>::Type;
// Compute the parallel products for Q*K^T (treat vector lanes separately).
@@ -46,12 +45,12 @@ inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
return qk;
}
-template <typename T, int THREAD_GROUP_SIZE>
+template<typename T, int THREAD_GROUP_SIZE>
struct Qk_dot {
- template <typename Vec, int N>
+ template<typename Vec, int N>
static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) {
return qk_dot_<THREAD_GROUP_SIZE>(q, k);
}
};
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/attention/dtype_bfloat16.cuh b/csrc/attention/dtype_bfloat16.cuh
index 3cdcb95e08099..31e0cee01d2e1 100644
--- a/csrc/attention/dtype_bfloat16.cuh
+++ b/csrc/attention/dtype_bfloat16.cuh
@@ -1,8 +1,6 @@
/*
- * Adapted from
- * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
- * and
- * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -30,8 +28,8 @@
#include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
-typedef __hip_bfloat162 __nv_bfloat162;
-typedef __hip_bfloat16 __nv_bfloat16;
+ typedef __hip_bfloat162 __nv_bfloat162;
+ typedef __hip_bfloat16 __nv_bfloat16;
#endif
#include <stdint.h>
@@ -52,37 +50,37 @@ struct bf16_8_t {
};
// BF16 vector types for Q, K, V.
-template <>
+template<>
struct Vec<__nv_bfloat16, 1> {
using Type = __nv_bfloat16;
};
-template <>
+template<>
struct Vec<__nv_bfloat16, 2> {
using Type = __nv_bfloat162;
};
-template <>
+template<>
struct Vec<__nv_bfloat16, 4> {
using Type = bf16_4_t;
};
-template <>
+template<>
struct Vec<__nv_bfloat16, 8> {
using Type = bf16_8_t;
};
// FP32 accumulator vector types corresponding to Vec.
-template <>
+template<>
struct FloatVec<__nv_bfloat16> {
using Type = float;
};
-template <>
+template<>
struct FloatVec<__nv_bfloat162> {
using Type = float2;
};
-template <>
+template<>
struct FloatVec<bf16_4_t> {
using Type = Float4_;
};
-template <>
+template<>
struct FloatVec<bf16_8_t> {
using Type = Float8_;
};
@@ -110,9 +108,9 @@ inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) {
assert(false);
#else
#ifndef USE_ROCM
- return a + b;
+ return a + b;
#else
- return __hadd(a, b);
+ return __hadd(a, b);
#endif
#endif
}
@@ -163,7 +161,7 @@ inline __device__ Float8_ add(bf16_8_t a, Float8_ fb) {
}
// Vector multiplication.
-template <>
+template<>
inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
assert(false);
@@ -172,7 +170,7 @@ inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) {
#endif
}
-template <>
+template<>
inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
assert(false);
@@ -181,12 +179,12 @@ inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
#endif
}
-template <>
+template<>
inline __device__ __nv_bfloat162 mul(__nv_bfloat16 a, __nv_bfloat162 b) {
return mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b);
}
-template <>
+template<>
inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) {
bf16_4_t c;
c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
@@ -194,7 +192,7 @@ inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) {
return c;
}
-template <>
+template<>
inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) {
__nv_bfloat162 s = bf162bf162(a);
bf16_4_t c;
@@ -203,7 +201,7 @@ inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) {
return c;
}
-template <>
+template<>
inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) {
bf16_8_t c;
c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
@@ -213,7 +211,7 @@ inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) {
return c;
}
-template <>
+template<>
inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) {
__nv_bfloat162 s = bf162bf162(a);
bf16_8_t c;
@@ -224,26 +222,26 @@ inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) {
return c;
}
-template <>
+template<>
inline __device__ float mul(__nv_bfloat16 a, __nv_bfloat16 b) {
float fa = __bfloat162float(a);
float fb = __bfloat162float(b);
return fa * fb;
}
-template <>
+template<>
inline __device__ float2 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
float2 fa = bf1622float2(a);
float2 fb = bf1622float2(b);
return mul<float2, float2, float2>(fa, fb);
}
-template <>
+template<>
inline __device__ float2 mul(__nv_bfloat16 a, __nv_bfloat162 b) {
return mul<float2, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b);
}
-template <>
+template<>
inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) {
Float4_ fc;
fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
@@ -251,7 +249,7 @@ inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) {
return fc;
}
-template <>
+template<>
inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) {
__nv_bfloat162 s = bf162bf162(a);
Float4_ fc;
@@ -260,7 +258,7 @@ inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) {
return fc;
}
-template <>
+template<>
inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) {
Float8_ fc;
fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
@@ -270,7 +268,7 @@ inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) {
return fc;
}
-template <>
+template<>
inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) {
__nv_bfloat162 s = bf162bf162(a);
Float8_ fc;
@@ -282,8 +280,7 @@ inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) {
}
// Vector fused multiply-add.
-inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b,
- __nv_bfloat162 c) {
+inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
assert(false);
#else
@@ -291,8 +288,7 @@ inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b,
#endif
}
-inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b,
- __nv_bfloat162 c) {
+inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b, __nv_bfloat162 c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
assert(false);
#else
@@ -383,23 +379,23 @@ inline __device__ Float8_ fma(__nv_bfloat16 a, bf16_8_t b, Float8_ fc) {
}
// Vector sum.
-template <>
+template<>
inline __device__ float sum(__nv_bfloat16 v) {
return __bfloat162float(v);
}
-template <>
+template<>
inline __device__ float sum(__nv_bfloat162 v) {
float2 vf = bf1622float2(v);
return vf.x + vf.y;
}
-template <>
+template<>
inline __device__ float sum(bf16_4_t v) {
return sum(v.x) + sum(v.y);
}
-template <>
+template<>
inline __device__ float sum(bf16_8_t v) {
return sum(v.x) + sum(v.y) + sum(v.z) + sum(v.w);
}
@@ -452,4 +448,4 @@ inline __device__ void zero(__nv_bfloat16& dst) {
#endif
}
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/attention/dtype_float16.cuh b/csrc/attention/dtype_float16.cuh
index 3a1815f0ed4fc..d3271e69cd69d 100644
--- a/csrc/attention/dtype_float16.cuh
+++ b/csrc/attention/dtype_float16.cuh
@@ -1,8 +1,6 @@
/*
- * Adapted from
- * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
- * and
- * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -32,37 +30,37 @@
namespace vllm {
// FP16 vector types for Q, K, V.
-template <>
+template<>
struct Vec<uint16_t, 1> {
using Type = uint16_t;
};
-template <>
+template<>
struct Vec<uint16_t, 2> {
using Type = uint32_t;
};
-template <>
+template<>
struct Vec<uint16_t, 4> {
using Type = uint2;
};
-template <>
+template<>
struct Vec<uint16_t, 8> {
using Type = uint4;
};
// FP32 accumulator vector types corresponding to Vec.
-template <>
+template<>
struct FloatVec<uint16_t> {
using Type = float;
};
-template <>
+template<>
struct FloatVec<uint32_t> {
using Type = float2;
};
-template <>
+template<>
struct FloatVec<uint2> {
using Type = Float4_;
};
-template <>
+template<>
struct FloatVec<uint4> {
using Type = Float8_;
};
@@ -75,8 +73,8 @@ inline __device__ uint32_t h0_h0(uint16_t a) {
return b;
#else
union {
- uint32_t u32;
- uint16_t u16[2];
+ uint32_t u32;
+ uint16_t u16[2];
} tmp;
tmp.u16[0] = a;
tmp.u16[1] = a;
@@ -132,12 +130,10 @@ inline __device__ uint32_t float2_to_half2(float2 f) {
} tmp;
#ifndef USE_ROCM
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
- asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n"
- : "=r"(tmp.u32)
- : "f"(f.y), "f"(f.x));
+ asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x));
#else
- asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
- asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
+ asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
+ asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
#endif
#else
tmp.u16[0] = float_to_half(f.x);
@@ -205,7 +201,7 @@ inline __device__ Float8_ add(uint4 a, Float8_ fb) {
}
// Vector multiplication.
-template <>
+template<>
inline __device__ uint16_t mul(uint16_t a, uint16_t b) {
uint16_t c;
#ifndef USE_ROCM
@@ -216,7 +212,7 @@ inline __device__ uint16_t mul(uint16_t a, uint16_t b) {
return c;
}
-template <>
+template<>
inline __device__ uint32_t mul(uint32_t a, uint32_t b) {
uint32_t c;
#ifndef USE_ROCM
@@ -227,12 +223,12 @@ inline __device__ uint32_t mul(uint32_t a, uint32_t b) {
return c;
}
-template <>
+template<>
inline __device__ uint32_t mul(uint16_t a, uint32_t b) {
return mul<uint32_t, uint32_t, uint32_t>(h0_h0(a), b);
}
-template <>
+template<>
inline __device__ uint2 mul(uint2 a, uint2 b) {
uint2 c;
c.x = mul<uint32_t, uint32_t, uint32_t>(a.x, b.x);
@@ -240,7 +236,7 @@ inline __device__ uint2 mul(uint2 a, uint2 b) {
return c;
}
-template <>
+template<>
inline __device__ uint2 mul(uint16_t a, uint2 b) {
uint32_t s = h0_h0(a);
uint2 c;
@@ -249,7 +245,7 @@ inline __device__ uint2 mul(uint16_t a, uint2 b) {
return c;
}
-template <>
+template<>
inline __device__ uint4 mul(uint4 a, uint4 b) {
uint4 c;
c.x = mul<uint32_t, uint32_t, uint32_t>(a.x, b.x);
@@ -259,7 +255,7 @@ inline __device__ uint4 mul(uint4 a, uint4 b) {
return c;
}
-template <>
+template<>
inline __device__ uint4 mul(uint16_t a, uint4 b) {
uint32_t s = h0_h0(a);
uint4 c;
@@ -270,26 +266,26 @@ inline __device__ uint4 mul(uint16_t a, uint4 b) {
return c;
}
-template <>
+template<>
inline __device__ float mul(uint16_t a, uint16_t b) {
float fa = half_to_float(a);
float fb = half_to_float(b);
return fa * fb;
}
-template <>
+template<>
inline __device__ float2 mul(uint32_t a, uint32_t b) {
float2 fa = half2_to_float2(a);
float2 fb = half2_to_float2(b);
return mul<float2, float2, float2>(fa, fb);
}
-template <>
+template<>
inline __device__ float2 mul(uint16_t a, uint32_t b) {
return mul<float2, uint32_t, uint32_t>(h0_h0(a), b);
}
-template <>
+template<>
inline __device__ Float4_ mul(uint2 a, uint2 b) {
Float4_ fc;
fc.x = mul<float2, uint32_t, uint32_t>(a.x, b.x);
@@ -297,7 +293,7 @@ inline __device__ Float4_ mul(uint2 a, uint2 b) {
return fc;
}
-template <>
+template<>
inline __device__ Float4_ mul(uint16_t a, uint2 b) {
uint32_t s = h0_h0(a);
Float4_ fc;
@@ -306,7 +302,7 @@ inline __device__ Float4_ mul(uint16_t a, uint2 b) {
return fc;
}
-template <>
+template<>
inline __device__ Float8_ mul(uint4 a, uint4 b) {
Float8_ fc;
fc.x = mul<float2, uint32_t, uint32_t>(a.x, b.x);
@@ -316,7 +312,7 @@ inline __device__ Float8_ mul(uint4 a, uint4 b) {
return fc;
}
-template <>
+template<>
inline __device__ Float8_ mul(uint16_t a, uint4 b) {
uint32_t s = h0_h0(a);
Float8_ fc;
@@ -331,13 +327,9 @@ inline __device__ Float8_ mul(uint16_t a, uint4 b) {
inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) {
uint32_t d;
#ifndef USE_ROCM
- asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
- : "=r"(d)
- : "r"(a), "r"(b), "r"(c));
+ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c));
#else
- asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n"
- : "=v"(d)
- : "v"(a), "v"(b), "v"(c));
+ asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c));
#endif
return d;
}
@@ -431,24 +423,24 @@ inline __device__ Float8_ fma(uint16_t a, uint4 b, Float8_ fc) {
}
// Vector sum.
-template <>
+template<>
inline __device__ float sum(uint16_t v) {
return half_to_float(v);
}
-template <>
+template<>
inline __device__ float sum(uint32_t v) {
float2 tmp = half2_to_float2(v);
return tmp.x + tmp.y;
}
-template <>
+template<>
inline __device__ float sum(uint2 v) {
uint32_t c = add(v.x, v.y);
return sum(c);
}
-template <>
+template<>
inline __device__ float sum(uint4 v) {
uint32_t c = add(v.x, v.y);
c = add(c, v.z);
@@ -478,9 +470,13 @@ inline __device__ void from_float(uint4& dst, Float8_ src) {
}
// From float16 to float32.
-inline __device__ float to_float(uint16_t u) { return half_to_float(u); }
+inline __device__ float to_float(uint16_t u) {
+ return half_to_float(u);
+}
-inline __device__ float2 to_float(uint32_t u) { return half2_to_float2(u); }
+inline __device__ float2 to_float(uint32_t u) {
+ return half2_to_float2(u);
+}
inline __device__ Float4_ to_float(uint2 u) {
Float4_ tmp;
@@ -499,6 +495,8 @@ inline __device__ Float8_ to_float(uint4 u) {
}
// Zero-out a variable.
-inline __device__ void zero(uint16_t& dst) { dst = uint16_t(0); }
+inline __device__ void zero(uint16_t& dst) {
+ dst = uint16_t(0);
+}
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/attention/dtype_float32.cuh b/csrc/attention/dtype_float32.cuh
index 7c6a686db3ba9..b200d2d226eb0 100644
--- a/csrc/attention/dtype_float32.cuh
+++ b/csrc/attention/dtype_float32.cuh
@@ -1,8 +1,6 @@
/*
- * Adapted from
- * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
- * and
- * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -40,35 +38,37 @@ struct Float8_ {
};
// FP32 vector types for Q, K, V.
-template <>
+template<>
struct Vec<float, 1> {
using Type = float;
};
-template <>
+template<>
struct Vec<float, 2> {
using Type = float2;
};
-template <>
+template<>
struct Vec<float, 4> {
using Type = float4;
};
// FP32 accumulator vector types corresponding to Vec.
-template <>
+template<>
struct FloatVec<float> {
using Type = float;
};
-template <>
+template<>
struct FloatVec<float2> {
using Type = float2;
};
-template <>
+template<>
struct FloatVec<float4> {
using Type = float4;
};
// Vector addition.
-inline __device__ float add(float a, float b) { return a + b; }
+inline __device__ float add(float a, float b) {
+ return a + b;
+}
inline __device__ float2 add(float2 a, float2 b) {
float2 c;
@@ -87,12 +87,12 @@ inline __device__ float4 add(float4 a, float4 b) {
}
// Vector multiplication.
-template <>
+template<>
inline __device__ float mul(float a, float b) {
return a * b;
}
-template <>
+template<>
inline __device__ float2 mul(float2 a, float2 b) {
float2 c;
c.x = a.x * b.x;
@@ -100,7 +100,7 @@ inline __device__ float2 mul(float2 a, float2 b) {
return c;
}
-template <>
+template<>
inline __device__ float2 mul(float a, float2 b) {
float2 c;
c.x = a * b.x;
@@ -108,7 +108,7 @@ inline __device__ float2 mul(float a, float2 b) {
return c;
}
-template <>
+template<>
inline __device__ float4 mul(float4 a, float4 b) {
float4 c;
c.x = a.x * b.x;
@@ -118,7 +118,7 @@ inline __device__ float4 mul(float4 a, float4 b) {
return c;
}
-template <>
+template<>
inline __device__ float4 mul(float a, float4 b) {
float4 c;
c.x = a * b.x;
@@ -129,7 +129,9 @@ inline __device__ float4 mul(float a, float4 b) {
}
// Vector fused multiply-add.
-inline __device__ float fma(float a, float b, float c) { return a * b + c; }
+inline __device__ float fma(float a, float b, float c) {
+ return a * b + c;
+}
inline __device__ float2 fma(float2 a, float2 b, float2 c) {
float2 d;
@@ -180,33 +182,35 @@ inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) {
}
// Vector sum.
-template <>
+template<>
inline __device__ float sum(float v) {
return v;
}
-template <>
+template<>
inline __device__ float sum(float2 v) {
return v.x + v.y;
}
-template <>
+template<>
inline __device__ float sum(float4 v) {
return v.x + v.y + v.z + v.w;
}
-template <>
+template<>
inline __device__ float sum(Float4_ v) {
return v.x.x + v.x.y + v.y.x + v.y.y;
}
-template <>
+template<>
inline __device__ float sum(Float8_ v) {
return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y;
}
// Vector dot product.
-inline __device__ float dot(float a, float b) { return a * b; }
+inline __device__ float dot(float a, float b) {
+ return a * b;
+}
inline __device__ float dot(float2 a, float2 b) {
float2 c = mul<float2, float2, float2>(a, b);
@@ -228,24 +232,42 @@ inline __device__ float dot(Float8_ a, Float8_ b) {
}
// From float to float.
-inline __device__ void from_float(float& dst, float src) { dst = src; }
+inline __device__ void from_float(float& dst, float src) {
+ dst = src;
+}
-inline __device__ void from_float(float2& dst, float2 src) { dst = src; }
+inline __device__ void from_float(float2& dst, float2 src) {
+ dst = src;
+}
-inline __device__ void from_float(float4& dst, float4 src) { dst = src; }
+inline __device__ void from_float(float4& dst, float4 src) {
+ dst = src;
+}
// From float to float.
-inline __device__ float to_float(float u) { return u; }
+inline __device__ float to_float(float u) {
+ return u;
+}
-inline __device__ float2 to_float(float2 u) { return u; }
+inline __device__ float2 to_float(float2 u) {
+ return u;
+}
-inline __device__ float4 to_float(float4 u) { return u; }
+inline __device__ float4 to_float(float4 u) {
+ return u;
+}
-inline __device__ Float4_ to_float(Float4_ u) { return u; }
+inline __device__ Float4_ to_float(Float4_ u) {
+ return u;
+}
-inline __device__ Float8_ to_float(Float8_ u) { return u; }
+inline __device__ Float8_ to_float(Float8_ u) {
+ return u;
+}
// Zero-out a variable.
-inline __device__ void zero(float& dst) { dst = 0.f; }
+inline __device__ void zero(float& dst) {
+ dst = 0.f;
+}
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/attention/dtype_fp8.cuh b/csrc/attention/dtype_fp8.cuh
index e714e321b0beb..d11dee91ebe87 100644
--- a/csrc/attention/dtype_fp8.cuh
+++ b/csrc/attention/dtype_fp8.cuh
@@ -3,39 +3,33 @@
#include "attention_generic.cuh"
#include <stdint.h>
-#ifdef ENABLE_FP8
- #ifndef USE_ROCM
- #include <cuda_fp8.h>
- #endif // USE_ROCM
-#endif // ENABLE_FP8
+#ifdef ENABLE_FP8_E5M2
+#include <cuda_fp8.h>
+#endif
namespace vllm {
-
-enum class Fp8KVCacheDataType {
- kAuto = 0,
- kFp8E4M3 = 1,
- kFp8E5M2 = 2,
-};
-
+#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3)
// fp8 vector types for quantization of kv cache
-template <>
+
+template<>
struct Vec<uint8_t, 1> {
- using Type = uint8_t;
+ using Type = uint8_t;
};
-template <>
+template<>
struct Vec<uint8_t, 2> {
- using Type = uint16_t;
+ using Type = uint16_t;
};
-template <>
+template<>
struct Vec<uint8_t, 4> {
- using Type = uint32_t;
+ using Type = uint32_t;
};
-template <>
+template<>
struct Vec<uint8_t, 8> {
- using Type = uint2;
+ using Type = uint2;
};
+#endif // ENABLE_FP8_E5M2
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/cache.h b/csrc/cache.h
index 86caa9345361d..10871b3670bac 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -1,32 +1,38 @@
#pragma once
-#include
-#include <torch/all.h>
+#include <torch/extension.h>