From 9db52eab3dc0b7b2cf30fa4399d569131e90c2d4 Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 6 Sep 2024 17:26:09 -0500 Subject: [PATCH 01/54] [Kernel] [Triton] Memory optimization for awq_gemm and awq_dequantize, 2x throughput (#8248) --- .../layers/quantization/awq_triton.py | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/layers/quantization/awq_triton.py b/vllm/model_executor/layers/quantization/awq_triton.py index ad706f28a742..d0b210c3a274 100644 --- a/vllm/model_executor/layers/quantization/awq_triton.py +++ b/vllm/model_executor/layers/quantization/awq_triton.py @@ -22,7 +22,7 @@ def awq_dequantize_kernel( # Compute offsets and masks for qweight_ptr. offsets_y = pid_y * BLOCK_SIZE_Y + tl.arange(0, BLOCK_SIZE_Y) - offsets_x = pid_x * BLOCK_SIZE_X + tl.arange(0, BLOCK_SIZE_X * 8) // 8 + offsets_x = pid_x * BLOCK_SIZE_X + tl.arange(0, BLOCK_SIZE_X) offsets = num_cols * offsets_y[:, None] + offsets_x[None, :] masks_y = offsets_y < num_rows @@ -43,6 +43,9 @@ def awq_dequantize_kernel( # Load the weights. iweights = tl.load(qweight_ptr + offsets, masks) + iweights = tl.interleave(iweights, iweights) + iweights = tl.interleave(iweights, iweights) + iweights = tl.interleave(iweights, iweights) # Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7] # that will map given indices to the correct order. @@ -59,9 +62,8 @@ def awq_dequantize_kernel( iweights = (iweights >> shifts) & 0xF # Compute zero offsets and masks. - zero_offsets_y = (pid_y * BLOCK_SIZE_Y // group_size + - tl.arange(0, BLOCK_SIZE_Y) // group_size) - zero_offsets_x = pid_x * BLOCK_SIZE_X + tl.arange(0, BLOCK_SIZE_X * 8) // 8 + zero_offsets_y = pid_y * BLOCK_SIZE_Y // group_size + tl.arange(0, 1) + zero_offsets_x = pid_x * BLOCK_SIZE_X + tl.arange(0, BLOCK_SIZE_X) zero_offsets = num_cols * zero_offsets_y[:, None] + zero_offsets_x[None, :] zero_masks_y = zero_offsets_y < num_rows // group_size @@ -70,13 +72,16 @@ def awq_dequantize_kernel( # Load the zeros. zeros = tl.load(zeros_ptr + zero_offsets, zero_masks) + zeros = tl.interleave(zeros, zeros) + zeros = tl.interleave(zeros, zeros) + zeros = tl.interleave(zeros, zeros) + zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8)) # Unpack and reorder: shift out the correct 4-bit value and mask. zeros = (zeros >> shifts) & 0xF # Compute scale offsets and masks. - scale_offsets_y = (pid_y * BLOCK_SIZE_Y // group_size + - tl.arange(0, BLOCK_SIZE_Y) // group_size) + scale_offsets_y = pid_y * BLOCK_SIZE_Y // group_size + tl.arange(0, 1) scale_offsets_x = (pid_x * BLOCK_SIZE_X * 8 + tl.arange(0, BLOCK_SIZE_X * 8)) scale_offsets = (num_cols * 8 * scale_offsets_y[:, None] + @@ -87,6 +92,7 @@ def awq_dequantize_kernel( # Load the scales. scales = tl.load(scales_ptr + scale_offsets, scale_masks) + scales = tl.broadcast_to(scales, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8)) # Dequantize. 
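    # The three tl.interleave calls above replicate each packed int32 eight
    # times along the fast axis, e.g. [w0, w1] -> [w0, w0, w1, w1] -> ... ->
    # [w0]*8 + [w1]*8, so the per-lane `shifts` (derived from the reverse AWQ
    # order [0, 4, 1, 5, 2, 6, 3, 7]) can pull all eight 4-bit nibbles out of
    # one load instead of re-reading every packed word eight times. The zeros
    # get the same interleave-and-shift treatment, while the scales are stored
    # unpacked per output column and only need a broadcast along Y.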
iweights = (iweights - zeros) * scales @@ -137,12 +143,10 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K, offsets_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) masks_am = offsets_am < M - offsets_bn = (pid_n * (BLOCK_SIZE_N // 8) + - tl.arange(0, BLOCK_SIZE_N) // 8) + offsets_bn = pid_n * (BLOCK_SIZE_N // 8) + tl.arange(0, BLOCK_SIZE_N // 8) masks_bn = offsets_bn < N // 8 - offsets_zn = (pid_n * (BLOCK_SIZE_N // 8) + - tl.arange(0, BLOCK_SIZE_N) // 8) + offsets_zn = pid_n * (BLOCK_SIZE_N // 8) + tl.arange(0, BLOCK_SIZE_N // 8) masks_zn = offsets_zn < N // 8 offsets_sn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) @@ -165,22 +169,30 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K, masks_b = masks_k[:, None] & masks_bn[None, :] b = tl.load(b_ptrs, mask=masks_b) + b = tl.interleave(b, b) + b = tl.interleave(b, b) + b = tl.interleave(b, b) # Dequantize b. offsets_szk = ( (BLOCK_SIZE_K * SPLIT_K * k + pid_z * BLOCK_SIZE_K) // group_size + - tl.arange(0, BLOCK_SIZE_K) // group_size) + tl.arange(0, 1)) offsets_z = (N // 8) * offsets_szk[:, None] + offsets_zn[None, :] masks_zk = offsets_szk < K // group_size masks_z = masks_zk[:, None] & masks_zn[None, :] zeros_ptrs = zeros_ptr + offsets_z zeros = tl.load(zeros_ptrs, mask=masks_z) + zeros = tl.interleave(zeros, zeros) + zeros = tl.interleave(zeros, zeros) + zeros = tl.interleave(zeros, zeros) + zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N)) offsets_s = N * offsets_szk[:, None] + offsets_sn[None, :] masks_sk = offsets_szk < K // group_size masks_s = masks_sk[:, None] & masks_sn[None, :] scales_ptrs = scales_ptr + offsets_s scales = tl.load(scales_ptrs, mask=masks_s) + scales = tl.broadcast_to(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N)) b = (b >> shifts) & 0xF zeros = (zeros >> shifts) & 0xF From 23f322297f33a50dd1fe0870665d0c4414fd78ab Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 6 Sep 2024 18:29:03 -0400 Subject: [PATCH 02/54] [Misc] Remove `SqueezeLLM` (#8220) --- CMakeLists.txt | 1 - csrc/ops.h | 3 - .../squeezellm/quant_cuda_kernel.cu | 216 ------------------ csrc/torch_bindings.cpp | 6 - .../quantization/supported_hardware.rst | 11 - examples/fp8/README.md | 4 +- vllm/_custom_ops.py | 6 - vllm/config.py | 4 +- vllm/entrypoints/llm.py | 2 +- vllm/lora/layers.py | 2 +- .../layers/quantization/__init__.py | 2 - .../layers/quantization/squeezellm.py | 138 ----------- 12 files changed, 6 insertions(+), 389 deletions(-) delete mode 100644 csrc/quantization/squeezellm/quant_cuda_kernel.cu delete mode 100644 vllm/model_executor/layers/quantization/squeezellm.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 923ed084ffd9..9c88c31c83da 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,7 +181,6 @@ set(VLLM_EXT_SRC "csrc/pos_encoding_kernels.cu" "csrc/activation_kernels.cu" "csrc/layernorm_kernels.cu" - "csrc/quantization/squeezellm/quant_cuda_kernel.cu" "csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/fp8/common.cu" diff --git a/csrc/ops.h b/csrc/ops.h index 8d24545de898..45a3868395d1 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -170,9 +170,6 @@ void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scales); -void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, - torch::Tensor lookup_table); - torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor 
b_q_weight, torch::Tensor b_gptq_qzeros, torch::Tensor b_gptq_scales, torch::Tensor b_g_idx, diff --git a/csrc/quantization/squeezellm/quant_cuda_kernel.cu b/csrc/quantization/squeezellm/quant_cuda_kernel.cu deleted file mode 100644 index 8ed918b3d7c2..000000000000 --- a/csrc/quantization/squeezellm/quant_cuda_kernel.cu +++ /dev/null @@ -1,216 +0,0 @@ -#include -#include -#include -#include - -// half-tensor -#include -#include -#include - -#define BLOCKWIDTH 128 -#define BLOCKHEIGHT4 16 - -namespace vllm { -namespace squeezellm { - -__device__ inline unsigned int as_unsigned(int i) { - return *reinterpret_cast(&i); -} - -// 4-bit matvec kernel (LUT-based) -__global__ void NUQ4MatMulKernel( -#ifndef USE_ROCM - const half2* __restrict__ vec, -#else - const __half2* __restrict__ vec, -#endif - const int* __restrict__ mat, -#ifndef USE_ROCM - half2* __restrict__ mul, -#else - float2* __restrict__ mul, -#endif - const __half* __restrict__ lookup_table, int height, int width, int batch, - int vec_height) { - - const int blockwidth2 = BLOCKWIDTH / 2; - - int row = BLOCKHEIGHT4 * blockIdx.x; - int col = BLOCKWIDTH * blockIdx.y + threadIdx.x; - -#ifndef USE_ROCM - __shared__ half2 blockvec[blockwidth2]; -#else - __shared__ __half2 blockvec[blockwidth2]; -#endif - - __shared__ __half deq2[16][BLOCKWIDTH]; - int off = threadIdx.x; - int column_offset = col * 16; - for (int val = 0; val < 16; val += 1) { - int lut_index = column_offset + val; - deq2[val][off] = lookup_table[lut_index]; - } - - __half res; -#ifndef USE_ROCM - half2 res2; - half2 tmp2; -#else - __half2 res2; - __half2 tmp2; -#endif - - int i; - int k; - - unsigned int tmp1; - unsigned int lut_index1, lut_index2; - - for (int b = 0; b < batch; ++b) { - i = width * row + col; - res = __int2half_rd(0); - k = 0; - - __syncthreads(); - if (threadIdx.x < blockwidth2) - blockvec[threadIdx.x] = - vec[b * vec_height / 2 + (row / BLOCKHEIGHT4) * blockwidth2 + - threadIdx.x]; - __syncthreads(); - - while (k < blockwidth2) { - tmp1 = as_unsigned(mat[i]); - -#ifndef USE_ROCM - res2 = {}; - tmp2 = {}; -#else - res2.x = __half_as_ushort(__float2half(0)); - res2.y = __half_as_ushort(__float2half(0)); - tmp2.x = __half_as_ushort(__float2half(0)); - tmp2.y = __half_as_ushort(__float2half(0)); -#endif - - lut_index1 = tmp1 & 0xF; - lut_index2 = (tmp1 >> 4) & 0xF; -#ifndef USE_ROCM - tmp2.x = deq2[lut_index1][off]; - tmp2.y = deq2[lut_index2][off]; -#else - tmp2.x = __half_as_ushort(deq2[lut_index1][off]); - tmp2.y = __half_as_ushort(deq2[lut_index2][off]); -#endif - res2 = __hfma2(tmp2, blockvec[k + 0], res2); - - lut_index1 = (tmp1 >> 8) & 0xF; - lut_index2 = (tmp1 >> 12) & 0xF; -#ifndef USE_ROCM - tmp2.x = deq2[lut_index1][off]; - tmp2.y = deq2[lut_index2][off]; -#else - tmp2.x = __half_as_ushort(deq2[lut_index1][off]); - tmp2.y = __half_as_ushort(deq2[lut_index2][off]); -#endif - res2 = __hfma2(tmp2, blockvec[k + 1], res2); - - lut_index1 = (tmp1 >> 16) & 0xF; - lut_index2 = (tmp1 >> 20) & 0xF; -#ifndef USE_ROCM - tmp2.x = deq2[lut_index1][off]; - tmp2.y = deq2[lut_index2][off]; -#else - tmp2.x = __half_as_ushort(deq2[lut_index1][off]); - tmp2.y = __half_as_ushort(deq2[lut_index2][off]); -#endif - res2 = __hfma2(tmp2, blockvec[k + 2], res2); - - lut_index1 = (tmp1 >> 24) & 0xF; - lut_index2 = (tmp1 >> 28) & 0xF; -#ifndef USE_ROCM - tmp2.x = deq2[lut_index1][off]; - tmp2.y = deq2[lut_index2][off]; -#else - tmp2.x = __half_as_ushort(deq2[lut_index1][off]); - tmp2.y = __half_as_ushort(deq2[lut_index2][off]); -#endif - res2 = __hfma2(tmp2, blockvec[k + 
3], res2); - -#ifndef USE_ROCM - res = __hadd(__hadd(res2.x, res2.y), res); -#else - res = __hadd(__hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y)), - res); -#endif - - i += width; - k += 4; - } - - // col%2 -> only set one of the two values -#ifndef USE_ROCM - half2 res3 = {}; - if (col % 2 == 0) { - res3.x = res; - } else { - res3.y = res; - } -#else - __half2 res3; - res3.x = __half_as_ushort(__float2half(0)); - res3.y = __half_as_ushort(__float2half(0)); - if (col % 2 == 0) { - res3.x = __half_as_ushort(res); - } else { - res3.y = __half_as_ushort(res); - } -#endif - -#ifndef USE_ROCM - atomicAdd(&mul[b * width / 2 + col / 2], res3); -#else - int tmp_addr = b * width / 2 + col / 2; - atomicAdd(&(mul[tmp_addr].x), __half2float(__ushort_as_half(res3.x))); - atomicAdd(&(mul[tmp_addr].y), __half2float(__ushort_as_half(res3.y))); -#endif - } -} - -} // namespace squeezellm -} // namespace vllm - -// 4-bit matvec kernel (LUT-based) -void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, - torch::Tensor lookup_table) { - int height = mat.size(0); - int width = mat.size(1); - - int batch = vec.size(0); - int vec_height = vec.size(1); - - dim3 blocks((height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4, - (width + BLOCKWIDTH - 1) / BLOCKWIDTH); - dim3 threads(BLOCKWIDTH); - - const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - vllm::squeezellm::NUQ4MatMulKernel<<>>( -#ifndef USE_ROCM - (half2*)vec.data_ptr(), -#else - (__half2*)vec.data_ptr(), -#endif - mat.data_ptr(), -#ifndef USE_ROCM - (half2*)mul.data_ptr(), - (__half*)lookup_table.data_ptr(), -#else - (float2*)mul.data_ptr(), - (__half*)lookup_table.data_ptr(), -#endif - height, width, batch, vec_height); -} - -#undef BLOCKWIDTH -#undef BLOCKHEIGHT4 diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 7783acd741f5..07b14e7a6ff6 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -237,12 +237,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("gptq_shuffle(Tensor! q_weight, Tensor q_perm, int bit) -> ()"); ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle); - // Quantized GEMM for SqueezeLLM. - ops.def( - "squeezellm_gemm(Tensor vec, Tensor mat, Tensor! mul, Tensor " - "lookup_table) -> ()"); - ops.impl("squeezellm_gemm", torch::kCUDA, &squeezellm_gemm); - // Compute FP8 quantized tensor for given scaling factor. ops.def( "static_scaled_fp8_quant(Tensor! 
out, Tensor input, Tensor scale) -> ()"); diff --git a/docs/source/quantization/supported_hardware.rst b/docs/source/quantization/supported_hardware.rst index 6341b583f0cf..ea587e0525a7 100644 --- a/docs/source/quantization/supported_hardware.rst +++ b/docs/source/quantization/supported_hardware.rst @@ -119,17 +119,6 @@ The table below shows the compatibility of various quantization implementations - ✗ - ✗ - ✗ - * - SqueezeLLM - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ Notes: ^^^^^^ diff --git a/examples/fp8/README.md b/examples/fp8/README.md index 84ad76c71862..181c36558fcf 100644 --- a/examples/fp8/README.md +++ b/examples/fp8/README.md @@ -62,7 +62,7 @@ This script evaluates the inference throughput of language models using various python3 benchmarks/benchmark_throughput.py --help usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL] - [--tokenizer TOKENIZER] [--quantization {awq,gptq,squeezellm,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N] + [--tokenizer TOKENIZER] [--quantization {awq,gptq,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N] [--use-beam-search] [--num-prompts NUM_PROMPTS] [--seed SEED] [--hf-max-batch-size HF_MAX_BATCH_SIZE] [--trust-remote-code] [--max-model-len MAX_MODEL_LEN] [--dtype {auto,half,float16,bfloat16,float,float32}] [--enforce-eager] [--kv-cache-dtype {auto,fp8}] [--quantization-param-path KV_CACHE_quantization_param_path] @@ -76,7 +76,7 @@ optional arguments: --output-len OUTPUT_LEN Output length for each request. Overrides the output length from the dataset. --model MODEL --tokenizer TOKENIZER - --quantization {awq,gptq,squeezellm,None}, -q {awq,gptq,squeezellm,None} + --quantization {awq,gptq,None}, -q {awq,gptq,None} --tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE --n N Number of generated sequences per prompt. 
--use-beam-search diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index fe254732e730..151cdbee8eb0 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -209,12 +209,6 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor, torch.ops._C.gptq_shuffle(q_weight, q_perm, bit) -# squeezellm -def squeezellm_gemm(vec: torch.Tensor, mat: torch.Tensor, mul: torch.Tensor, - lookup_table: torch.Tensor) -> None: - torch.ops._C.squeezellm_gemm(vec, mat, mul, lookup_table) - - # marlin def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_scales: torch.Tensor, workspace: torch.Tensor, size_m: int, diff --git a/vllm/config.py b/vllm/config.py index e513608eca9f..1c9e30b2682b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -277,7 +277,7 @@ def _parse_quant_hf_config(self): def _verify_quantization(self) -> None: supported_quantization = [*QUANTIZATION_METHODS] - rocm_supported_quantization = ["awq", "gptq", "squeezellm", "fp8"] + rocm_supported_quantization = ["awq", "gptq", "fp8"] optimized_quantization_methods = [ "fp8", "marlin", "gptq_marlin_24", "gptq_marlin", "awq_marlin", "fbgemm_fp8", "compressed_tensors", "compressed-tensors", @@ -1537,7 +1537,7 @@ def verify_with_model_config(self, model_config: ModelConfig): if model_config.quantization and model_config.quantization not in [ "awq", "gptq" ]: - # TODO support marlin and squeezellm + # TODO support marlin logger.warning("%s quantization is not tested with LoRA yet.", model_config.quantization) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index b32c90a4df1a..f587ec300314 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -55,7 +55,7 @@ class LLM: However, if the `torch_dtype` in the config is `float32`, we will use `float16` instead. quantization: The method used to quantize the model weights. Currently, - we support "awq", "gptq", "squeezellm", and "fp8" (experimental). + we support "awq", "gptq", and "fp8" (experimental). If None, we first check the `quantization_config` attribute in the model config file. 
If that is None, we assume the model weights are not quantized and use `dtype` to determine the data type of diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index a8ea67991a37..b9ac498b23a7 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -39,7 +39,7 @@ def _get_lora_device(base_layer: nn.Module) -> torch.device: # unquantizedLinear if hasattr(base_layer, "weight"): return base_layer.weight.device - # GPTQ/AWQ/SqueezeLLM + # GPTQ/AWQ elif hasattr(base_layer, "qweight"): return base_layer.qweight.device # marlin diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index c6fb6ca0d2e0..aa5c288962d9 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -25,7 +25,6 @@ from vllm.model_executor.layers.quantization.neuron_quant import ( NeuronQuantConfig) from vllm.model_executor.layers.quantization.qqq import QQQConfig -from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = { @@ -43,7 +42,6 @@ "gptq_marlin": GPTQMarlinConfig, "awq_marlin": AWQMarlinConfig, "gptq": GPTQConfig, - "squeezellm": SqueezeLLMConfig, "compressed-tensors": CompressedTensorsConfig, "bitsandbytes": BitsAndBytesConfig, "qqq": QQQConfig, diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py deleted file mode 100644 index afb3c0497673..000000000000 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ /dev/null @@ -1,138 +0,0 @@ -from typing import Any, Dict, List, Optional - -import torch -from torch.nn.parameter import Parameter - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.linear import LinearBase -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig, QuantizeMethodBase) -from vllm.model_executor.utils import set_weight_attrs -from vllm.utils import is_hip - - -class SqueezeLLMConfig(QuantizationConfig): - """Config class for SqueezeLLM. - - Reference: https://arxiv.org/pdf/2306.07629 - """ - - def __init__( - self, - weight_bits: int, - ) -> None: - self.weight_bits = weight_bits - - if self.weight_bits != 4: - raise ValueError( - "Currently, only 4-bit weight quantization is supported for " - f"SqueezeLLM, but got {self.weight_bits} bits.") - - self.pack_factor = 32 // self.weight_bits - - def __repr__(self) -> str: - return f"SqueezeLLMConfig(weight_bits={self.weight_bits})" - - def get_name(self) -> str: - return "squeezellm" - - def get_supported_act_dtypes(self) -> List[torch.dtype]: - return [torch.half] - - @classmethod - def get_min_capability(cls) -> int: - return 70 - - @staticmethod - def get_config_filenames() -> List[str]: - return ["quant_config.json"] - - @classmethod - def from_config(cls, config: Dict[str, Any]) -> "SqueezeLLMConfig": - weight_bits = cls.get_from_keys(config, ["wbits"]) - return cls(weight_bits) - - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional[QuantizeMethodBase]: - if isinstance(layer, LinearBase): - return SqueezeLLMLinearMethod(self) - return None - - def get_scaled_act_names(self) -> List[str]: - return [] - - -class SqueezeLLMLinearMethod(QuantizeMethodBase): - """Linear method for SqueezeLLM. - - Args: - quant_config: The SqueezeLLM quantization config. 
- """ - - def __init__(self, quant_config: SqueezeLLMConfig): - self.quant_config = quant_config - - def create_weights(self, layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, - output_size: int, params_dtype: torch.dtype, - **extra_weight_attrs): - if input_size_per_partition % self.quant_config.pack_factor != 0: - raise ValueError( - "The input size is not aligned with the quantized " - "weight shape. This can be caused by too large " - "tensor parallel size.") - - output_size_per_partition = sum(output_partition_sizes) - qweight = Parameter( - torch.empty( - input_size_per_partition // self.quant_config.pack_factor, - output_size_per_partition, - dtype=torch.int32, - ), - requires_grad=False, - ) - set_weight_attrs( - qweight, { - "input_dim": 0, - "output_dim": 1, - "packed_dim": 0, - "pack_factor": self.quant_config.pack_factor, - }) - lookup_table = Parameter( - torch.empty( - output_size, - self.quant_config.weight_bits**2, - dtype=params_dtype, - ), - requires_grad=False, - ) - set_weight_attrs(lookup_table, { - "output_dim": 0, - }) - - layer.register_parameter("qweight", qweight) - set_weight_attrs(qweight, extra_weight_attrs) - layer.register_parameter("lookup_table", lookup_table) - set_weight_attrs(lookup_table, extra_weight_attrs) - - def apply(self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - qweight = layer.qweight - lookup_table = layer.lookup_table - out_shape = x.shape[:-1] + (qweight.shape[-1], ) - reshaped_x = x.reshape(-1, x.shape[-1]) - if is_hip(): - out_f = torch.zeros(out_shape, dtype=torch.float) - ops.squeezellm_gemm(reshaped_x, qweight, out_f, lookup_table) - out = out_f.to(dtype=torch.float16) - else: - # NOTE: The output tensor should be zero-initialized. 
- out = torch.zeros(out_shape, dtype=torch.float16) - ops.squeezellm_gemm(reshaped_x, qweight, out, lookup_table) - - if bias is not None: - out.add_(bias) - return out.reshape(out_shape) From 29f49cd6e3d3c5658b92ea3e97138c1ab5cb6b30 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sat, 7 Sep 2024 01:02:05 +0200 Subject: [PATCH 03/54] [Model] Allow loading from original Mistral format (#8168) Co-authored-by: Michael Goin --- tests/models/test_mistral.py | 40 +++++ vllm/config.py | 62 ++++--- vllm/engine/arg_utils.py | 21 ++- vllm/model_executor/model_loader/loader.py | 12 +- .../model_loader/weight_utils.py | 21 +-- vllm/model_executor/models/llama.py | 51 ++++++ vllm/transformers_utils/config.py | 165 ++++++++++++++---- 7 files changed, 291 insertions(+), 81 deletions(-) diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index 4965354c0016..0741174497e3 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -41,3 +41,43 @@ def test_models( name_0="hf", name_1="vllm", ) + + +@pytest.mark.parametrize("model", MODELS[1:]) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_mistral_format( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: + with vllm_runner( + model, + dtype=dtype, + tokenizer_mode="auto", + load_format="safetensors", + config_format="hf", + ) as hf_format_model: + hf_format_outputs = hf_format_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + with vllm_runner( + model, + dtype=dtype, + tokenizer_mode="mistral", + load_format="mistral", + config_format="mistral", + ) as mistral_format_model: + mistral_format_outputs = mistral_format_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + check_logprobs_close( + outputs_0_lst=hf_format_outputs, + outputs_1_lst=mistral_format_outputs, + name_0="hf", + name_1="mistral", + ) diff --git a/vllm/config.py b/vllm/config.py index 1c9e30b2682b..8f5e02e35f28 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -13,7 +13,7 @@ from vllm.model_executor.models import ModelRegistry from vllm.platforms import current_platform from vllm.tracing import is_otel_available, otel_import_error_traceback -from vllm.transformers_utils.config import (get_config, +from vllm.transformers_utils.config import (ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config) from vllm.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH, GiB_bytes, @@ -121,35 +121,37 @@ class ModelConfig: override default neuron config that are specific to Neuron devices, this argument will be used to configure the neuron config that can not be gathered from the vllm arguments. + config_format: The config format which shall be loaded. + Defaults to 'auto' which defaults to 'hf'. 
""" - def __init__( - self, - model: str, - tokenizer: str, - tokenizer_mode: str, - trust_remote_code: bool, - dtype: Union[str, torch.dtype], - seed: int, - revision: Optional[str] = None, - code_revision: Optional[str] = None, - rope_scaling: Optional[dict] = None, - rope_theta: Optional[float] = None, - tokenizer_revision: Optional[str] = None, - max_model_len: Optional[int] = None, - spec_target_max_model_len: Optional[int] = None, - quantization: Optional[str] = None, - quantization_param_path: Optional[str] = None, - enforce_eager: Optional[bool] = None, - max_context_len_to_capture: Optional[int] = None, - max_seq_len_to_capture: Optional[int] = None, - max_logprobs: int = 20, - disable_sliding_window: bool = False, - skip_tokenizer_init: bool = False, - served_model_name: Optional[Union[str, List[str]]] = None, - limit_mm_per_prompt: Optional[Mapping[str, int]] = None, - use_async_output_proc: bool = True, - override_neuron_config: Optional[Dict[str, Any]] = None) -> None: + def __init__(self, + model: str, + tokenizer: str, + tokenizer_mode: str, + trust_remote_code: bool, + dtype: Union[str, torch.dtype], + seed: int, + revision: Optional[str] = None, + code_revision: Optional[str] = None, + rope_scaling: Optional[dict] = None, + rope_theta: Optional[float] = None, + tokenizer_revision: Optional[str] = None, + max_model_len: Optional[int] = None, + spec_target_max_model_len: Optional[int] = None, + quantization: Optional[str] = None, + quantization_param_path: Optional[str] = None, + enforce_eager: Optional[bool] = None, + max_context_len_to_capture: Optional[int] = None, + max_seq_len_to_capture: Optional[int] = None, + max_logprobs: int = 20, + disable_sliding_window: bool = False, + skip_tokenizer_init: bool = False, + served_model_name: Optional[Union[str, List[str]]] = None, + limit_mm_per_prompt: Optional[Mapping[str, int]] = None, + use_async_output_proc: bool = True, + override_neuron_config: Optional[Dict[str, Any]] = None, + config_format: ConfigFormat = ConfigFormat.AUTO) -> None: self.model = model self.tokenizer = tokenizer self.tokenizer_mode = tokenizer_mode @@ -176,7 +178,8 @@ def __init__( self.skip_tokenizer_init = skip_tokenizer_init self.hf_config = get_config(self.model, trust_remote_code, revision, - code_revision, rope_scaling, rope_theta) + code_revision, rope_scaling, rope_theta, + config_format) self.hf_text_config = get_hf_text_config(self.hf_config) self.hf_image_processor_config = get_hf_image_processor_config( self.model, revision) @@ -746,6 +749,7 @@ class LoadFormat(str, enum.Enum): SHARDED_STATE = "sharded_state" GGUF = "gguf" BITSANDBYTES = "bitsandbytes" + MISTRAL = "mistral" @dataclass diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f0b866db6432..7620093660b4 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -8,10 +8,10 @@ import torch import vllm.envs as envs -from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, - EngineConfig, LoadConfig, LoadFormat, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, +from vllm.config import (CacheConfig, ConfigFormat, DecodingConfig, + DeviceConfig, EngineConfig, LoadConfig, LoadFormat, + LoRAConfig, ModelConfig, ObservabilityConfig, + ParallelConfig, PromptAdapterConfig, SchedulerConfig, SpeculativeConfig, TokenizerPoolConfig) from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger @@ -65,6 +65,7 @@ class EngineArgs: trust_remote_code: bool = False download_dir: 
Optional[str] = None load_format: str = 'auto' + config_format: str = 'auto' dtype: str = 'auto' kv_cache_dtype: str = 'auto' quantization_param_path: Optional[str] = None @@ -234,6 +235,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'section for more information.\n' '* "bitsandbytes" will load the weights using bitsandbytes ' 'quantization.\n') + parser.add_argument( + '--config-format', + default=EngineArgs.config_format, + choices=[f.value for f in ConfigFormat], + help='The format of the model config to load.\n\n' + '* "auto" will try to load the config in hf format ' + 'if available else it will try to load in mistral format ') parser.add_argument( '--dtype', type=str, @@ -813,7 +821,10 @@ def create_engine_config(self) -> EngineConfig: served_model_name=self.served_model_name, limit_mm_per_prompt=self.limit_mm_per_prompt, use_async_output_proc=not self.disable_async_output_proc, - override_neuron_config=self.override_neuron_config) + override_neuron_config=self.override_neuron_config, + config_format=self.config_format, + ) + cache_config = CacheConfig( block_size=self.block_size if self.device != "neuron" else self.max_model_len, # neuron needs block_size = max_model_len diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 553fa848489b..bcc866a19463 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -17,6 +17,7 @@ from huggingface_hub import HfApi, hf_hub_download from torch import nn from transformers import AutoModelForCausalLM, PretrainedConfig +from transformers.utils import SAFE_WEIGHTS_INDEX_NAME from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoadFormat, LoRAConfig, ModelConfig, MultiModalConfig, @@ -241,12 +242,17 @@ def _prepare_weights(self, model_name_or_path: str, is_local = os.path.isdir(model_name_or_path) load_format = self.load_config.load_format use_safetensors = False + index_file = SAFE_WEIGHTS_INDEX_NAME # Some quantized models use .pt files for storing the weights. if load_format == LoadFormat.AUTO: allow_patterns = ["*.safetensors", "*.bin"] elif load_format == LoadFormat.SAFETENSORS: use_safetensors = True allow_patterns = ["*.safetensors"] + elif load_format == LoadFormat.MISTRAL: + use_safetensors = True + allow_patterns = ["consolidated*.safetensors"] + index_file = "consolidated.safetensors.index.json" elif load_format == LoadFormat.PT: allow_patterns = ["*.pt"] elif load_format == LoadFormat.NPCACHE: @@ -284,10 +290,10 @@ def _prepare_weights(self, model_name_or_path: str, # any files not found in the index. 
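            # For --load-format mistral, index_file was set above to
            # "consolidated.safetensors.index.json"; if the repo only ships a
            # single consolidated.safetensors with no index, the download
            # helper logs that no index was found and
            # filter_duplicate_safetensors_files keeps the file list as-is.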
if not is_local: download_safetensors_index_file_from_hf( - model_name_or_path, self.load_config.download_dir, - revision) + model_name_or_path, index_file, + self.load_config.download_dir, revision) hf_weights_files = filter_duplicate_safetensors_files( - hf_weights_files, hf_folder) + hf_weights_files, hf_folder, index_file) else: hf_weights_files = filter_files_not_needed_for_inference( hf_weights_files) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 0666457756b0..075451292a8e 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -16,7 +16,6 @@ from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download from safetensors.torch import load_file, safe_open, save_file from tqdm.auto import tqdm -from transformers.utils import SAFE_WEIGHTS_INDEX_NAME from vllm.config import LoadConfig, ModelConfig from vllm.distributed import get_tensor_model_parallel_rank @@ -251,6 +250,7 @@ def download_weights_from_hf( def download_safetensors_index_file_from_hf( model_name_or_path: str, + index_file: str, cache_dir: Optional[str], revision: Optional[str] = None, ) -> None: @@ -269,36 +269,37 @@ def download_safetensors_index_file_from_hf( # Download the safetensors index file. hf_hub_download( repo_id=model_name_or_path, - filename=SAFE_WEIGHTS_INDEX_NAME, + filename=index_file, cache_dir=cache_dir, revision=revision, local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, ) # If file not found on remote or locally, we should not fail since - # only some models will have SAFE_WEIGHTS_INDEX_NAME. + # only some models will have index_file. except huggingface_hub.utils.EntryNotFoundError: - logger.info("No %s found in remote.", SAFE_WEIGHTS_INDEX_NAME) + logger.info("No %s found in remote.", index_file) except huggingface_hub.utils.LocalEntryNotFoundError: - logger.info("No %s found in local cache.", SAFE_WEIGHTS_INDEX_NAME) + logger.info("No %s found in local cache.", index_file) # For models like Mistral-7B-v0.3, there are both sharded # safetensors files and a consolidated safetensors file. # Passing both of these to the weight loader functionality breaks. -# So, we use the SAFE_WEIGHTS_INDEX_NAME to +# So, we use the index_file to # look up which safetensors files should be used. def filter_duplicate_safetensors_files(hf_weights_files: List[str], - hf_folder: str) -> List[str]: + hf_folder: str, + index_file: str) -> List[str]: # model.safetensors.index.json is a mapping from keys in the # torch state_dict to safetensors file holding that weight. - index_file_name = os.path.join(hf_folder, SAFE_WEIGHTS_INDEX_NAME) + index_file_name = os.path.join(hf_folder, index_file) if not os.path.isfile(index_file_name): return hf_weights_files # Iterate through the weight_map (weight_name: safetensors files) # to identify weights that we should use. 
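    # The index JSON maps each weight name to the shard that stores it, e.g.
    #   {"weight_map": {"model.embed_tokens.weight":
    #                   "model-00001-of-00002.safetensors", ...}}
    # so collecting its values gives exactly the set of shard files to keep.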
- with open(index_file_name) as index_file: - weight_map = json.load(index_file)["weight_map"] + with open(index_file_name, "r") as f: + weight_map = json.load(f)["weight_map"] weight_files_in_index = set() for weight_name in weight_map: weight_files_in_index.add( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index e55c01316087..5ff31e3833ec 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -375,6 +375,25 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA): "gate_proj": ("gate_up_proj", 0), "up_proj": ("gate_up_proj", 1), } + # Mistral/Llama models can also be loaded with --load-format mistral + # from consolidated.safetensors checkpoints + mistral_mapping = { + "layers": "model.layers", + "attention": "self_attn", + "wq": "q_proj", + "wk": "k_proj", + "wv": "v_proj", + "wo": "o_proj", + "attention_norm": "input_layernorm", + "feed_forward": "mlp", + "w1": "gate_proj", + "w2": "down_proj", + "w3": "up_proj", + "ffn_norm": "post_attention_layernorm", + "tok_embeddings": "model.embed_tokens", + "output": "lm_head", + "norm": "model.norm" + } def __init__( self, @@ -472,6 +491,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: + name, loaded_weight = self.maybe_remap_mistral(name, loaded_weight) + if "rotary_emb.inv_freq" in name: continue if ("rotary_emb.cos_cached" in name @@ -549,3 +570,33 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None: else: raise RuntimeError("Self attention has no KV cache scaling " "factor attribute!") + + # This function is used to remap the mistral format as + # used by Mistral and Llama <=2 + def maybe_remap_mistral( + self, name: str, + loaded_weight: torch.Tensor) -> Tuple[str, torch.Tensor]: + + def permute(w, n_heads): + attn_in = self.config.head_dim * n_heads + attn_out = self.config.hidden_size + + return w.view(n_heads, attn_in // n_heads // 2, 2, + attn_out).transpose(1, 2).reshape(attn_in, attn_out) + + mapping = self.mistral_mapping + modules = name.split(".") + + # rotary embeds should be sliced + if "wk" in modules: + loaded_weight = permute(loaded_weight, + self.config.num_key_value_heads) + elif "wq" in modules: + loaded_weight = permute(loaded_weight, + self.config.num_attention_heads) + + for item in modules: + if item in mapping and mapping[item] not in name: + name = name.replace(item, mapping[item]) + + return name, loaded_weight diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 4f4e79d10a67..13fcf6b91860 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -1,12 +1,16 @@ import contextlib +import enum +import json from pathlib import Path from typing import Any, Dict, Optional, Type, Union +from huggingface_hub import file_exists, hf_hub_download from transformers import GenerationConfig, PretrainedConfig from transformers.models.auto.image_processing_auto import ( get_image_processor_config) from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) +from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger @@ -27,6 +31,8 @@ else: from transformers import AutoConfig +MISTRAL_CONFIG_NAME = "params.json" + logger = init_logger(__name__) _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { @@ -53,6 +59,20 @@ AutoConfig.register(name, cls) +class 
ConfigFormat(str, enum.Enum): + AUTO = "auto" + HF = "hf" + MISTRAL = "mistral" + + +def file_or_path_exists(model: Union[str, Path], config_name, revision, + token) -> bool: + if Path(model).exists(): + return (Path(model) / config_name).is_file() + + return file_exists(model, HF_CONFIG_NAME, revision=revision, token=token) + + def get_config( model: Union[str, Path], trust_remote_code: bool, @@ -60,45 +80,68 @@ def get_config( code_revision: Optional[str] = None, rope_scaling: Optional[dict] = None, rope_theta: Optional[float] = None, + config_format: ConfigFormat = ConfigFormat.AUTO, **kwargs, ) -> PretrainedConfig: - # Separate model folder from file path for GGUF models + is_gguf = check_gguf_file(model) if is_gguf: kwargs["gguf_file"] = Path(model).name model = Path(model).parent - config_dict, _ = PretrainedConfig.get_config_dict( - model, revision=revision, code_revision=code_revision, **kwargs) + if config_format == ConfigFormat.AUTO: + if is_gguf or file_or_path_exists(model, + HF_CONFIG_NAME, + revision=revision, + token=kwargs.get("token")): + config_format = ConfigFormat.HF + elif file_or_path_exists(model, + MISTRAL_CONFIG_NAME, + revision=revision, + token=kwargs.get("token")): + config_format = ConfigFormat.MISTRAL + else: + raise ValueError(f"No supported config format found in {model}") + + if config_format == ConfigFormat.HF: + config_dict, _ = PretrainedConfig.get_config_dict( + model, revision=revision, code_revision=code_revision, **kwargs) + + # Use custom model class if it's in our registry + model_type = config_dict.get("model_type") + if model_type in _CONFIG_REGISTRY: + config_class = _CONFIG_REGISTRY[model_type] + config = config_class.from_pretrained(model, + revision=revision, + code_revision=code_revision) + else: + try: + config = AutoConfig.from_pretrained( + model, + trust_remote_code=trust_remote_code, + revision=revision, + code_revision=code_revision, + **kwargs, + ) + except ValueError as e: + if (not trust_remote_code + and "requires you to execute the configuration file" + in str(e)): + err_msg = ( + "Failed to load the model config. If the model " + "is a custom model not yet available in the " + "HuggingFace transformers library, consider setting " + "`trust_remote_code=True` in LLM or using the " + "`--trust-remote-code` flag in the CLI.") + raise RuntimeError(err_msg) from e + else: + raise e - # Use custom model class if it's in our registry - model_type = config_dict.get("model_type") - if model_type in _CONFIG_REGISTRY: - config_class = _CONFIG_REGISTRY[model_type] - config = config_class.from_pretrained(model, - revision=revision, - code_revision=code_revision) + elif config_format == ConfigFormat.MISTRAL: + config = load_params_config(model, revision) else: - try: - config = AutoConfig.from_pretrained( - model, - trust_remote_code=trust_remote_code, - revision=revision, - code_revision=code_revision, - **kwargs) - except ValueError as e: - if (not trust_remote_code - and "requires you to execute the configuration file" - in str(e)): - err_msg = ( - "Failed to load the model config. 
If the model is a custom " - "model not yet available in the HuggingFace transformers " - "library, consider setting `trust_remote_code=True` in LLM " - "or using the `--trust-remote-code` flag in the CLI.") - raise RuntimeError(err_msg) from e - else: - raise e + raise ValueError(f"Unsupported config format: {config_format}") # Special architecture mapping check for GGUF models if is_gguf: @@ -108,16 +151,70 @@ def get_config( model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type] config.update({"architectures": [model_type]}) - for key, value in [("rope_scaling", rope_scaling), - ("rope_theta", rope_theta)]: + for key, value in [ + ("rope_scaling", rope_scaling), + ("rope_theta", rope_theta), + ]: if value is not None: - logger.info("Updating %s from %r to %r", key, - getattr(config, key, None), value) + logger.info( + "Updating %s from %r to %r", + key, + getattr(config, key, None), + value, + ) config.update({key: value}) return config +def load_params_config(model, revision) -> PretrainedConfig: + # This function loads a params.json config which + # should be used when loading models in mistral format + + config_file_name = "params.json" + + config_path = Path(model) / config_file_name + + if not config_path.is_file(): + config_path = Path( + hf_hub_download(model, config_file_name, revision=revision)) + + with open(config_path, "r") as file: + config_dict = json.load(file) + + config_mapping = { + "dim": "hidden_size", + "norm_eps": "rms_norm_eps", + "n_kv_heads": "num_key_value_heads", + "n_layers": "num_hidden_layers", + "n_heads": "num_attention_heads", + "hidden_dim": "intermediate_size", + } + + def recurse_elems(elem: Any): + if isinstance(elem, dict): + config_dict = {} + for key, value in elem.items(): + key = config_mapping.get(key, key) + config_dict[key] = recurse_elems(value) + return PretrainedConfig(**config_dict) + else: + return elem + + config_dict["model_type"] = config_dict.get("model_type", "transformer") + config_dict["hidden_act"] = config_dict.get("activation", "silu") + config_dict["tie_word_embeddings"] = config_dict.get( + "tie_embeddings", False) + + if config_dict["model_type"] == "transformer": + if "moe" in config_dict: + config_dict["architectures"] = ["MixtralForCausalLM"] + else: + config_dict["architectures"] = ["MistralForCausalLM"] + + return recurse_elems(config_dict) + + def get_hf_image_processor_config( model: Union[str, Path], revision: Optional[str] = None, @@ -134,7 +231,7 @@ def get_hf_image_processor_config( def get_hf_text_config(config: PretrainedConfig): """Get the "sub" config relevant to llm for multi modal models. - No op for pure text models. + No op for pure text models. 
""" if hasattr(config, "text_config"): # The code operates under the assumption that text_config should have From 12dd715807ccbd7fafbb64d42571792db1cc6497 Mon Sep 17 00:00:00 2001 From: William Lin Date: Fri, 6 Sep 2024 17:48:48 -0700 Subject: [PATCH 04/54] [misc] [doc] [frontend] LLM torch profiler support (#7943) --- docs/source/dev/profiling/profiling_index.rst | 20 +++++++++-- examples/offline_inference_with_profiler.py | 33 +++++++++++++++++++ vllm/engine/llm_engine.py | 6 ++++ vllm/entrypoints/llm.py | 6 ++++ vllm/executor/cpu_executor.py | 6 ++++ vllm/executor/gpu_executor.py | 6 ++++ 6 files changed, 74 insertions(+), 3 deletions(-) create mode 100644 examples/offline_inference_with_profiler.py diff --git a/docs/source/dev/profiling/profiling_index.rst b/docs/source/dev/profiling/profiling_index.rst index af3c78c3b5a5..e22d54729344 100644 --- a/docs/source/dev/profiling/profiling_index.rst +++ b/docs/source/dev/profiling/profiling_index.rst @@ -17,14 +17,28 @@ Traces can be visualized using https://ui.perfetto.dev/. .. tip:: Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. - -Example commands: + +.. tip:: + + To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100. + Set the env variable VLLM_RPC_GET_DATA_TIMEOUT_MS to a big number before you start the server. Say something like 30 minutes. + ``export VLLM_RPC_GET_DATA_TIMEOUT_MS=1800000`` + +Example commands and usage: +=========================== + +Offline Inference: +------------------ + +Refer to `examples/offline_inference_with_profiler.py `_ for an example. + OpenAI Server: +-------------- .. code-block:: bash - VLLM_TORCH_PROFILER_DIR=/mnt/traces/ python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B + VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B benchmark_serving.py: diff --git a/examples/offline_inference_with_profiler.py b/examples/offline_inference_with_profiler.py new file mode 100644 index 000000000000..906c9502800d --- /dev/null +++ b/examples/offline_inference_with_profiler.py @@ -0,0 +1,33 @@ +import os + +from vllm import LLM, SamplingParams + +# enable torch profiler, can also be set on cmd line +os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile" + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Create an LLM. +llm = LLM(model="facebook/opt-125m") + +llm.start_profile() + +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) + +llm.stop_profile() + +# Print the outputs. 
+for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 50dcb6937eb6..78ddcd1daaf6 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1914,6 +1914,12 @@ def check_health(self) -> None: self.tokenizer.check_health() self.model_executor.check_health() + def start_profile(self) -> None: + self.model_executor.start_profile() + + def stop_profile(self) -> None: + self.model_executor.stop_profile() + def is_tracing_enabled(self) -> bool: return self.tracer is not None diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index f587ec300314..1e4432eaaa66 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -560,6 +560,12 @@ def encode( outputs = self._run_engine(use_tqdm=use_tqdm) return LLMEngine.validate_outputs(outputs, EmbeddingRequestOutput) + def start_profile(self) -> None: + self.llm_engine.start_profile() + + def stop_profile(self) -> None: + self.llm_engine.stop_profile() + # LEGACY def _convert_v1_inputs( self, diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 21ad43f64168..ec9b24ce1318 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -296,6 +296,12 @@ def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: for result in parallel_worker_tasks: result.get() + def start_profile(self) -> None: + self.driver_method_invoker(self.driver_worker, "start_profile") + + def stop_profile(self) -> None: + self.driver_method_invoker(self.driver_worker, "stop_profile") + class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase): diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 947776e5d6ef..2185c9cf6cea 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -169,6 +169,12 @@ def check_health(self) -> None: # it's running. 
return + def start_profile(self) -> None: + self.driver_worker.start_profile() + + def stop_profile(self) -> None: + self.driver_worker.stop_profile() + class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase): From 41e95c5247c9703c3e11f3b563d8bba70ed31aca Mon Sep 17 00:00:00 2001 From: Kyle Mistele Date: Fri, 6 Sep 2024 21:49:01 -0500 Subject: [PATCH 05/54] [Bugfix] Fix Hermes tool call chat template bug (#8256) Co-authored-by: Kyle Mistele --- examples/tool_chat_template_hermes.jinja | 31 ++++++++++++------------ 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/examples/tool_chat_template_hermes.jinja b/examples/tool_chat_template_hermes.jinja index b18b463032d4..0b0902c8e749 100644 --- a/examples/tool_chat_template_hermes.jinja +++ b/examples/tool_chat_template_hermes.jinja @@ -89,22 +89,23 @@ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} {%- elif message.role == "assistant" and message.tool_calls is defined %} {{- '<|im_start|>' + message.role }} - {%- for tool_call in message.tool_calls %} - {{- '\n\n' }} - {%- if tool_call.function is defined %} - {%- set tool_call = tool_call.function %} - {%- endif %} - {{- '{' }} - {{- '"name": "' }} - {{- tool_call.name }} - {{- '"}' }} + {%- for tool_call in message.tool_calls %} + {{- '\n\n' }} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '{' }} + {{- '"name": "' }} + {{- tool_call.name }} + {{- '"' }} + {%- if tool_call.arguments is defined %} {{- ', ' }} - {%- if tool_call.arguments is defined %} - {{- '"arguments": ' }} - {{- tool_call.arguments|tojson }} - {%- endif %} - {{- '\n' }} - {%- endfor %} + {{- '"arguments": ' }} + {{- tool_call.arguments|tojson }} + {%- endif %} + {{- '}' }} + {{- '\n' }} + {%- endfor %} {{- '<|im_end|>\n' }} {%- elif message.role == "tool" %} {%- if loop.previtem and loop.previtem.role != "tool" %} From 2f707fcb35c5bc4b9164cf2bbce0254a72f7348b Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 7 Sep 2024 10:57:24 +0800 Subject: [PATCH 06/54] [Model] Multi-input support for LLaVA (#8238) --- docs/source/models/supported_models.rst | 16 +- tests/conftest.py | 12 +- .../distributed/test_multimodal_broadcast.py | 6 +- tests/models/test_llava.py | 141 ++++++++++++++++-- vllm/model_executor/models/clip.py | 2 +- vllm/model_executor/models/internvl.py | 2 +- vllm/model_executor/models/llava.py | 32 ++-- vllm/model_executor/models/llava_next.py | 4 +- vllm/model_executor/models/phi3v.py | 4 +- vllm/model_executor/models/siglip.py | 2 +- 10 files changed, 176 insertions(+), 45 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 0c0a54281e3f..fe01e1681353 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -219,7 +219,7 @@ Multimodal Language Models - * - :code:`LlavaForConditionalGeneration` - LLaVA-1.5 - - Image\ :sup:`E` + - Image\ :sup:`E+` - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc. - * - :code:`LlavaNextForConditionalGeneration` @@ -227,6 +227,11 @@ Multimodal Language Models - Image\ :sup:`E+` - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - + * - :code:`MiniCPMV` + - MiniCPM-V + - Image\ :sup:`+` + - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc. 
+ - * - :code:`PaliGemmaForConditionalGeneration` - PaliGemma - Image\ :sup:`E` @@ -237,14 +242,9 @@ Multimodal Language Models - Image\ :sup:`E+` - :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc. - - * - :code:`MiniCPMV` - - MiniCPM-V - - Image\ :sup:`+` - - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc. - - * - :code:`QWenLMHeadModel` - - Qwen - - Image + - Qwen-VL + - Image\ :sup:`E` - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc. - * - :code:`UltravoxModel` diff --git a/tests/conftest.py b/tests/conftest.py index e66a14598c34..cd0091b7cba6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -278,7 +278,7 @@ def __init__( def generate( self, prompts: List[str], - images: Optional[List[Image.Image]] = None, + images: Optional[PromptImageInput] = None, **kwargs: Any, ) -> List[Tuple[List[List[int]], List[str]]]: if images: @@ -314,7 +314,7 @@ def generate_greedy( self, prompts: List[str], max_tokens: int, - images: Optional[List[Image.Image]] = None, + images: Optional[PromptImageInput] = None, **kwargs: Any, ) -> List[Tuple[List[int], str]]: outputs = self.generate(prompts, @@ -351,7 +351,7 @@ def generate_greedy_logprobs( self, prompts: List[str], max_tokens: int, - images: Optional[List[Image.Image]] = None, + images: Optional[PromptImageInput] = None, **kwargs: Any, ) -> List[List[torch.Tensor]]: all_logprobs: List[List[torch.Tensor]] = [] @@ -433,8 +433,8 @@ def generate_greedy_logprobs_limit( prompts: List[str], max_tokens: int, num_logprobs: int, - images: Optional[List[Image.Image]] = None, - audios: Optional[List[Tuple[np.ndarray, int]]] = None, + images: Optional[PromptImageInput] = None, + audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> List[Tuple[List[int], str, List[Dict[int, float]]]]: all_logprobs: List[List[Dict[int, float]]] = [] @@ -671,7 +671,7 @@ def generate_greedy( self, prompts: List[str], max_tokens: int, - images: Optional[List[Image.Image]] = None, + images: Optional[PromptImageInput] = None, ) -> List[Tuple[List[int], str]]: greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) outputs = self.generate(prompts, greedy_params, images=images) diff --git a/tests/distributed/test_multimodal_broadcast.py b/tests/distributed/test_multimodal_broadcast.py index e7723a7ae248..73ef863c2f19 100644 --- a/tests/distributed/test_multimodal_broadcast.py +++ b/tests/distributed/test_multimodal_broadcast.py @@ -35,9 +35,11 @@ def test_models(hf_runner, vllm_runner, image_assets, model: str, if model.startswith("llava-hf/llava-1.5"): from ..models.test_llava import models, run_test elif model.startswith("llava-hf/llava-v1.6"): - from ..models.test_llava_next import models, run_test + from ..models.test_llava_next import run_test # type: ignore[no-redef] + from ..models.test_llava_next import models elif model.startswith("facebook/chameleon"): - from ..models.test_chameleon import models, run_test + from ..models.test_chameleon import run_test # type: ignore[no-redef] + from ..models.test_chameleon import models else: raise NotImplementedError(f"Unsupported model: {model}") diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index 9d7da5f803ea..84ca23f6222a 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Tuple, Type +from typing import List, Optional, Tuple, Type, overload import pytest from transformers import (AutoConfig, 
AutoModelForVision2Seq, AutoTokenizer, @@ -8,11 +8,14 @@ from vllm.sequence import SampleLogprobs from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets +from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, + _ImageAssets) from .utils import check_logprobs_close pytestmark = pytest.mark.vlm +_LIMIT_IMAGE_PER_PROMPT = 4 + HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ "stop_sign": "USER: \nWhat's the content of the image?\nASSISTANT:", @@ -52,6 +55,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, return hf_output_ids, hf_output_str, out_logprobs +@overload def run_test( hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], @@ -64,6 +68,78 @@ def run_test( num_logprobs: int, tensor_parallel_size: int, distributed_executor_backend: Optional[str] = None, +): + ... + + +@overload +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + image_assets: _ImageAssets, + model: str, + *, + sizes: List[Tuple[int, int]], + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + ... + + +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + image_assets: _ImageAssets, + model: str, + *, + size_factors: Optional[List[float]] = None, + sizes: Optional[List[Tuple[int, int]]] = None, + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + images = [asset.pil_image for asset in image_assets] + + if size_factors is not None: + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + elif sizes is not None: + inputs_per_image = [( + [prompt for _ in sizes], + [image.resize(size) for size in sizes], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + else: + raise ValueError("You must provide either `size_factors` or `sizes`") + + _run_test(hf_runner, + vllm_runner, + inputs_per_image, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend) + + +def _run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + inputs: List[Tuple[List[str], PromptImageInput]], + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, ): """Inference result should be the same between hf and vllm. @@ -85,13 +161,6 @@ def run_test( else: mantis_processor = None - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. 
# if we run HF first, the cuda initialization will be done and it @@ -100,15 +169,18 @@ def run_test( # max_model_len should be greater than image_feature_size with vllm_runner(model, dtype=dtype, + max_model_len=4096, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: + enforce_eager=True, + limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT + }) as vllm_model: vllm_outputs_per_image = [ vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, images=images) - for prompts, images in inputs_per_image + for prompts, images in inputs ] if mantis_processor is not None: @@ -131,7 +203,7 @@ def process(hf_inputs: BatchEncoding): max_tokens, num_logprobs=num_logprobs, images=images) - for prompts, images in inputs_per_image + for prompts, images in inputs ] for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, @@ -181,6 +253,51 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, ) +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets, + model, dtype, max_tokens, + num_logprobs) -> None: + stop_sign = image_assets[0].pil_image + cherry_blossom = image_assets[1].pil_image + + inputs = [( + [ + "USER: \nDescribe 2 images.\nASSISTANT:", + "USER: \nDescribe 2 images.\nASSISTANT:", + "USER: \nDescribe 4 images.\nASSISTANT:", # noqa: E501 + "USER: \nWhat is the season?\nASSISTANT:", + ], + [ + [stop_sign, cherry_blossom], + # Images with different sizes and aspect-ratios + [ + rescale_image_size(stop_sign, 0.1), + stop_sign, + ], + [ + stop_sign, + rescale_image_size(stop_sign, 0.25), + cherry_blossom.resize((183, 488)), + cherry_blossom.resize((488, 183)) + ], + cherry_blossom, + ])] + + _run_test( + hf_runner, + vllm_runner, + inputs, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) + + @pytest.mark.parametrize("model", models) def test_context_length_too_short(vllm_runner, image_assets, model): images = [asset.pil_image for asset in image_assets] diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index b581a501e333..70f1522ae252 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -105,7 +105,7 @@ def input_processor_for_clip( if isinstance(image_data, Image.Image): image_feature_size = get_clip_image_feature_size(hf_config) elif isinstance(image_data, torch.Tensor): - image_feature_size = image_data.shape[0] + num_images, image_feature_size, hidden_size = image_data.shape else: raise TypeError(f"Invalid image type: {type(image_data)}") else: diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index d317fdce3ba6..10fbb5663d27 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -209,7 +209,7 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs): image_feature_size = num_blocks * num_patches elif isinstance(image_data, torch.Tensor): - image_feature_size = image_data.shape[0] + num_images, image_feature_size, hidden_size = image_data.shape else: raise TypeError(f"Invalid image type: {type(image_data)}") diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 
43c485bdf366..7a6c991fb133 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -4,6 +4,7 @@ import torch import torch.nn as nn +from PIL import Image from transformers import CLIPVisionConfig, LlavaConfig, SiglipVisionConfig from vllm.attention import AttentionMetadata @@ -16,6 +17,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sequence import IntermediateTensors +from vllm.utils import is_list_of from .clip import (CLIPVisionModel, dummy_image_for_clip, dummy_seq_data_for_clip, get_max_clip_image_tokens, @@ -24,7 +26,7 @@ from .siglip import (SiglipVisionModel, dummy_image_for_siglip, dummy_seq_data_for_siglip, get_max_siglip_image_tokens, input_processor_for_siglip) -from .utils import (filter_weights, init_vllm_registered_model, +from .utils import (filter_weights, flatten_bn, init_vllm_registered_model, merge_multimodal_embeddings) @@ -133,7 +135,18 @@ def input_processor_for_llava(ctx: InputContext, llm_inputs: LLMInputs): hf_config = ctx.get_hf_config(LlavaConfig) vision_config = hf_config.vision_config - image_feature_size = get_max_llava_image_tokens(ctx) + image_data = multi_modal_data["image"] + if isinstance(image_data, Image.Image): + image_feature_size = get_max_llava_image_tokens(ctx) + elif is_list_of(image_data, Image.Image): + image_feature_size = [get_max_llava_image_tokens(ctx) + ] * len(image_data) + elif isinstance(image_data, torch.Tensor): + num_images, image_feature_size, hidden_size = image_data.shape + elif is_list_of(image_data, torch.Tensor): + image_feature_size = [item.shape[1] for item in image_data] + else: + raise TypeError(f"Invalid image type: {type(image_data)}") if isinstance(vision_config, CLIPVisionConfig): return input_processor_for_clip( @@ -230,29 +243,24 @@ def _parse_and_validate_image_input( return None if pixel_values is not None: - if not isinstance(pixel_values, torch.Tensor): + if not isinstance(pixel_values, (torch.Tensor, list)): raise ValueError("Incorrect type of pixel values. " f"Got type: {type(pixel_values)}") - # Remove the N dimension until multiple images are supported. - pixel_values = pixel_values.squeeze(1) - return LlavaImagePixelInputs( type="pixel_values", - data=self._validate_pixel_values(pixel_values), + data=self._validate_pixel_values( + flatten_bn(pixel_values, concat=True)), ) if image_embeds is not None: - if not isinstance(image_embeds, torch.Tensor): + if not isinstance(image_embeds, (torch.Tensor, list)): raise ValueError("Incorrect type of image embeddings. " f"Got type: {type(image_embeds)}") - # Remove the N dimension until multiple images are supported. 
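
The validation change in this llava.py hunk lets pixel values arrive either as one batched tensor or as a list of per-prompt tensors with different image counts. The sketch below illustrates the flattening idea with a stand-in helper; it is a simplified illustration, not vLLM's actual flatten_bn.

from typing import List, Union

import torch


def flatten_batch(x: Union[torch.Tensor, List[torch.Tensor]],
                  concat: bool = False) -> torch.Tensor:
    # Simplified stand-in: collapse the leading batch dimension of a batched
    # tensor, or concatenate a list of per-prompt tensors whose image counts
    # may differ.
    if isinstance(x, torch.Tensor):
        return x.flatten(0, 1) if x.dim() > 2 else x
    return torch.cat(x, dim=0) if concat else torch.stack(x, dim=0)


# One prompt with 2 images and one with 3 images, each image 3x336x336.
per_prompt = [torch.zeros(2, 3, 336, 336), torch.zeros(3, 3, 336, 336)]
assert flatten_batch(per_prompt, concat=True).shape == (5, 3, 336, 336)
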
- image_embeds = image_embeds.squeeze(1) - return LlavaImageEmbeddingInputs( type="image_embeds", - data=image_embeds, + data=flatten_bn(image_embeds, concat=True), ) raise AssertionError("This line should be unreachable.") diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 5a179e960371..c6bd46dd7eda 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -234,7 +234,9 @@ def input_processor_for_llava_next(ctx: InputContext, llm_inputs: LLMInputs): for img in image_data ] elif isinstance(image_data, torch.Tensor): - image_feature_size = image_data.shape[0] + num_images, image_feature_size, hidden_size = image_data.shape + elif is_list_of(image_data, torch.Tensor): + image_feature_size = [item.shape[1] for item in image_data] else: raise TypeError(f"Invalid image type: {type(image_data)}") diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index c449e0fc759a..6f17f571ccae 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -424,7 +424,9 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs): input_width=w, input_height=h)) elif isinstance(image_data, torch.Tensor): - image_feature_size = image_data.shape[0] + num_images, image_feature_size, hidden_size = image_data.shape + elif is_list_of(image_data, torch.Tensor): + image_feature_size = [item.shape[1] for item in image_data] else: raise TypeError(f"Invalid image type: {type(image_data)}") diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 0bee75e2f0cb..fb4c30c1a13f 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -110,7 +110,7 @@ def input_processor_for_siglip( if isinstance(image_data, Image.Image): image_feature_size = get_siglip_image_feature_size(hf_config) elif isinstance(image_data, torch.Tensor): - image_feature_size = image_data.shape[0] + num_images, image_feature_size, hidden_size = image_data.shape else: raise TypeError(f"Invalid image type: {type(image_data)}") else: From 795b662cffe79fa0fa9a3f13a65113abdb4f96a9 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Fri, 6 Sep 2024 20:18:16 -0700 Subject: [PATCH 07/54] Enable Random Prefix Caching in Serving Profiling Tool (benchmark_serving.py) (#8241) --- benchmarks/benchmark_serving.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index bdfa81be4208..9ba3f649810b 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -195,8 +195,16 @@ def sample_sonnet_requests( def sample_random_requests( - input_len: int, output_len: int, num_prompts: int, range_ratio: float, - tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]: + prefix_len: int, + input_len: int, + output_len: int, + num_prompts: int, + range_ratio: float, + tokenizer: PreTrainedTokenizerBase, +) -> List[Tuple[str, int, int]]: + prefix_token_ids = np.random.randint(0, + tokenizer.vocab_size, + size=prefix_len).tolist() input_lens = np.random.randint( int(input_len * range_ratio), @@ -211,10 +219,12 @@ def sample_random_requests( offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts) input_requests = [] for i in range(num_prompts): - prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size + prompt = tokenizer.decode(prefix_token_ids + + [(offsets[i] + i 
+ j) % tokenizer.vocab_size for j in range(input_lens[i])]) + input_requests.append( - (prompt, int(input_lens[i]), int(output_lens[i]))) + (prompt, int(prefix_len + input_lens[i]), int(output_lens[i]))) return input_requests @@ -567,6 +577,7 @@ def main(args: argparse.Namespace): elif args.dataset_name == "random": input_requests = sample_random_requests( + prefix_len=args.random_prefix_len, input_len=args.random_input_len, output_len=args.random_output_len, num_prompts=args.num_prompts, @@ -765,6 +776,14 @@ def main(args: argparse.Namespace): help="Range of sampled ratio of input/output length, " "used only for random sampling.", ) + parser.add_argument( + "--random-prefix-len", + type=int, + default=0, + help="Number of fixed prefix tokens before random " + " context. The length range of context in a random " + " request is [random-prefix-len, " + " random-prefix-len + random-prefix-len * random-range-ratio).") parser.add_argument( "--request-rate", type=float, From ce2702a92356b69ec1ea35ecd46263ddf98e8e2c Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 6 Sep 2024 22:40:46 -0700 Subject: [PATCH 08/54] [tpu][misc] fix typo (#8260) --- tests/compile/test_wrapper.py | 4 ++-- vllm/compilation/wrapper.py | 2 +- vllm/worker/tpu_model_runner.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py index cef516ade27e..3668c1fab6b8 100644 --- a/tests/compile/test_wrapper.py +++ b/tests/compile/test_wrapper.py @@ -2,7 +2,7 @@ import torch -from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispacther +from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher class MyMod(torch.nn.Module): @@ -13,7 +13,7 @@ def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None): return x * 2 -class MyWrapper(TorchCompileWrapperWithCustomDispacther): +class MyWrapper(TorchCompileWrapperWithCustomDispatcher): def __init__(self, model): self.model = model diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index c3d863299dd0..e923bd36ccc0 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -10,7 +10,7 @@ import vllm.envs as envs -class TorchCompileWrapperWithCustomDispacther: +class TorchCompileWrapperWithCustomDispatcher: """ A wrapper class for torch.compile, with a custom dispatch logic. 
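
The benchmark change above prepends one shared block of random prefix tokens to every sampled request so that prefix caching can be exercised, and reports the prompt length as prefix plus context. A rough, tokenizer-free sketch of that sampling logic follows; the vocabulary size and lengths are made up for illustration.

import numpy as np

vocab_size = 32000
prefix_len, input_len, num_prompts, range_ratio = 16, 64, 4, 0.5

# One fixed prefix shared by all requests.
prefix_token_ids = np.random.randint(0, vocab_size, size=prefix_len).tolist()

# Per-request context lengths drawn from [input_len * range_ratio, input_len].
input_lens = np.random.randint(int(input_len * range_ratio),
                               input_len + 1,
                               size=num_prompts)

requests = []
for i in range(num_prompts):
    context = np.random.randint(0, vocab_size, size=input_lens[i]).tolist()
    token_ids = prefix_token_ids + context
    # The reported prompt length includes the shared prefix.
    requests.append((token_ids, int(prefix_len + input_lens[i])))

assert all(r[0][:prefix_len] == prefix_token_ids for r in requests)
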
Subclasses should: diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 684c54b7d813..db306bc743d3 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -11,7 +11,7 @@ import torch_xla.runtime as xr from vllm.attention import AttentionMetadata, get_attn_backend -from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispacther +from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig, ParallelConfig, SchedulerConfig) from vllm.logger import init_logger @@ -611,7 +611,7 @@ def _execute_model(*args): return [SamplerOutput(sampler_outputs)] -class ModelWrapper(TorchCompileWrapperWithCustomDispacther): +class ModelWrapper(TorchCompileWrapperWithCustomDispatcher): def __init__(self, model: nn.Module): self.model = model From 9f68e00d27b0f8252549be3adbb47c5b735a8103 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 7 Sep 2024 16:02:39 +0800 Subject: [PATCH 09/54] [Bugfix] Fix broken OpenAI tensorizer test (#8258) --- tests/utils.py | 12 ++-- vllm/engine/arg_utils.py | 72 ++++++++++--------- vllm/model_executor/model_loader/loader.py | 30 +++++++- .../model_executor/model_loader/tensorizer.py | 7 ++ 4 files changed, 81 insertions(+), 40 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 04067ef372ac..6e5bc05b3901 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -20,7 +20,7 @@ init_distributed_environment) from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.openai.cli_args import make_arg_parser -from vllm.model_executor.model_loader.loader import DefaultModelLoader +from vllm.model_executor.model_loader.loader import get_model_loader from vllm.platforms import current_platform from vllm.utils import FlexibleArgumentParser, get_open_port, is_hip @@ -89,11 +89,11 @@ def __init__(self, is_local = os.path.isdir(model) if not is_local: engine_args = AsyncEngineArgs.from_cli_args(args) - engine_config = engine_args.create_engine_config() - dummy_loader = DefaultModelLoader(engine_config.load_config) - dummy_loader._prepare_weights(engine_config.model_config.model, - engine_config.model_config.revision, - fall_back_to_pt=True) + model_config = engine_args.create_model_config() + load_config = engine_args.create_load_config() + + model_loader = get_model_loader(load_config) + model_loader.download_model(model_config) env = os.environ.copy() # the current process might initialize cuda, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7620093660b4..9bc03948d384 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -771,33 +771,8 @@ def from_cli_args(cls, args: argparse.Namespace): engine_args = cls(**{attr: getattr(args, attr) for attr in attrs}) return engine_args - def create_engine_config(self) -> EngineConfig: - # gguf file needs a specific model loader and doesn't use hf_repo - if check_gguf_file(self.model): - self.quantization = self.load_format = "gguf" - - # bitsandbytes quantization needs a specific model loader - # so we make sure the quant method and the load format are consistent - if (self.quantization == "bitsandbytes" or - self.qlora_adapter_name_or_path is not None) and \ - self.load_format != "bitsandbytes": - raise ValueError( - "BitsAndBytes quantization and QLoRA adapter only support " - f"'bitsandbytes' load format, but got {self.load_format}") - - if (self.load_format == "bitsandbytes" or - self.qlora_adapter_name_or_path is not None) 
and \ - self.quantization != "bitsandbytes": - raise ValueError( - "BitsAndBytes load format and QLoRA adapter only support " - f"'bitsandbytes' quantization, but got {self.quantization}") - - assert self.cpu_offload_gb >= 0, ( - "CPU offload space must be non-negative" - f", but got {self.cpu_offload_gb}") - - device_config = DeviceConfig(device=self.device) - model_config = ModelConfig( + def create_model_config(self) -> ModelConfig: + return ModelConfig( model=self.model, tokenizer=self.tokenizer, tokenizer_mode=self.tokenizer_mode, @@ -825,6 +800,42 @@ def create_engine_config(self) -> EngineConfig: config_format=self.config_format, ) + def create_load_config(self) -> LoadConfig: + return LoadConfig( + load_format=self.load_format, + download_dir=self.download_dir, + model_loader_extra_config=self.model_loader_extra_config, + ignore_patterns=self.ignore_patterns, + ) + + def create_engine_config(self) -> EngineConfig: + # gguf file needs a specific model loader and doesn't use hf_repo + if check_gguf_file(self.model): + self.quantization = self.load_format = "gguf" + + # bitsandbytes quantization needs a specific model loader + # so we make sure the quant method and the load format are consistent + if (self.quantization == "bitsandbytes" or + self.qlora_adapter_name_or_path is not None) and \ + self.load_format != "bitsandbytes": + raise ValueError( + "BitsAndBytes quantization and QLoRA adapter only support " + f"'bitsandbytes' load format, but got {self.load_format}") + + if (self.load_format == "bitsandbytes" or + self.qlora_adapter_name_or_path is not None) and \ + self.quantization != "bitsandbytes": + raise ValueError( + "BitsAndBytes load format and QLoRA adapter only support " + f"'bitsandbytes' quantization, but got {self.quantization}") + + assert self.cpu_offload_gb >= 0, ( + "CPU offload space must be non-negative" + f", but got {self.cpu_offload_gb}") + + device_config = DeviceConfig(device=self.device) + model_config = self.create_model_config() + cache_config = CacheConfig( block_size=self.block_size if self.device != "neuron" else self.max_model_len, # neuron needs block_size = max_model_len @@ -967,12 +978,7 @@ def create_engine_config(self) -> EngineConfig: self.model_loader_extra_config[ "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path - load_config = LoadConfig( - load_format=self.load_format, - download_dir=self.download_dir, - model_loader_extra_config=self.model_loader_extra_config, - ignore_patterns=self.ignore_patterns, - ) + load_config = self.create_load_config() prompt_adapter_config = PromptAdapterConfig( max_prompt_adapters=self.max_prompt_adapters, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index bcc866a19463..f59eb805ea90 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -185,6 +185,11 @@ class BaseModelLoader(ABC): def __init__(self, load_config: LoadConfig): self.load_config = load_config + @abstractmethod + def download_model(self, model_config: ModelConfig) -> None: + """Download a model so that it can be immediately loaded.""" + raise NotImplementedError + @abstractmethod def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, @@ -193,7 +198,7 @@ def load_model(self, *, model_config: ModelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: """Load a model with the given configurations.""" - ... 
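
The loader change above makes download_model a required method on every model loader, so weights can be fetched ahead of time without building a full engine. A minimal sketch of that interface shape is given below; the config class is a simplified placeholder, not vLLM's ModelConfig.

from abc import ABC, abstractmethod
from dataclasses import dataclass


@dataclass
class FakeModelConfig:
    model: str
    revision: str = "main"


class BaseLoader(ABC):

    @abstractmethod
    def download_model(self, model_config: FakeModelConfig) -> None:
        """Download a model so that it can be immediately loaded."""
        raise NotImplementedError

    @abstractmethod
    def load_model(self, model_config: FakeModelConfig) -> object:
        raise NotImplementedError


class DummyLoader(BaseLoader):

    def download_model(self, model_config: FakeModelConfig) -> None:
        print(f"would fetch {model_config.model}@{model_config.revision}")

    def load_model(self, model_config: FakeModelConfig) -> object:
        return object()


DummyLoader().download_model(FakeModelConfig("facebook/opt-125m"))
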
+ raise NotImplementedError class DefaultModelLoader(BaseModelLoader): @@ -335,6 +340,11 @@ def _xla_weights_iterator(iterator: Generator): weights_iterator = _xla_weights_iterator(weights_iterator) return weights_iterator + def download_model(self, model_config: ModelConfig) -> None: + self._prepare_weights(model_config.model, + model_config.revision, + fall_back_to_pt=True) + def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], @@ -377,6 +387,9 @@ def __init__(self, load_config: LoadConfig): raise ValueError(f"Model loader extra config is not supported for " f"load format {load_config.load_format}") + def download_model(self, model_config: ModelConfig) -> None: + pass # Nothing to download + def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], @@ -467,6 +480,12 @@ def _load_model_serialized( model = load_with_tensorizer(tensorizer_config, **extra_kwargs) return model.eval() + def download_model(self, model_config: ModelConfig) -> None: + self.tensorizer_config.verify_with_model_config(model_config) + + with self.tensorizer_config.open_stream(): + pass + def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], @@ -568,6 +587,9 @@ def _prepare_weights(self, model_name_or_path: str, ignore_patterns=self.load_config.ignore_patterns, ) + def download_model(self, model_config: ModelConfig) -> None: + self._prepare_weights(model_config.model, model_config.revision) + def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], @@ -995,6 +1017,9 @@ def _load_weights(self, model_config: ModelConfig, set_weight_attrs( param, {"matmul_state": [None] * len(quant_states)}) + def download_model(self, model_config: ModelConfig) -> None: + self._prepare_weights(model_config.model, model_config.revision) + def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], @@ -1070,6 +1095,9 @@ def _get_weights_iterator( return gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map) + def download_model(self, model_config: ModelConfig) -> None: + self._prepare_weights(model_config.model) + def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index b009ad8c882d..3aac5cd2b43a 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -99,6 +99,13 @@ def verify_with_model_config(self, model_config: "ModelConfig") -> None: "Loading a model using Tensorizer with quantization on vLLM" " is unstable and may lead to errors.") + def open_stream(self, tensorizer_args: Optional["TensorizerArgs"] = None): + if tensorizer_args is None: + tensorizer_args = self._construct_tensorizer_args() + + return open_stream(self.tensorizer_uri, + **tensorizer_args.stream_params) + def load_with_tensorizer(tensorizer_config: TensorizerConfig, **extra_kwargs) -> nn.Module: From e807125936a9db796746b67ba72c222b5c26582e Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sat, 7 Sep 2024 16:38:23 +0800 Subject: [PATCH 10/54] [Model][VLM] Support multi-images inputs for InternVL2 models (#8201) --- docs/source/models/supported_models.rst | 2 +- ...e_inference_vision_language_multi_image.py | 94 
+++++++++++++++---- tests/models/test_internvl.py | 92 ++++++++++++++---- tests/models/test_phi3v.py | 8 +- vllm/model_executor/models/internvl.py | 60 +++++++++--- 5 files changed, 199 insertions(+), 57 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index fe01e1681353..1bb3a448f2c9 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -214,7 +214,7 @@ Multimodal Language Models - * - :code:`InternVLChatModel` - InternVL2 - - Image\ :sup:`E` + - Image\ :sup:`E+` - :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc. - * - :code:`LlavaForConditionalGeneration` diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index 73543ab5da2b..dd84627b9dc5 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -6,7 +6,9 @@ from argparse import Namespace from typing import List -from vllm import LLM +from transformers import AutoTokenizer + +from vllm import LLM, SamplingParams from vllm.multimodal.utils import fetch_image from vllm.utils import FlexibleArgumentParser @@ -17,36 +19,84 @@ ] -def _load_phi3v(image_urls: List[str]): - return LLM( +def load_phi3v(question, image_urls: List[str]): + llm = LLM( model="microsoft/Phi-3.5-vision-instruct", trust_remote_code=True, max_model_len=4096, limit_mm_per_prompt={"image": len(image_urls)}, ) - - -def run_phi3v_generate(question: str, image_urls: List[str]): - llm = _load_phi3v(image_urls) - placeholders = "\n".join(f"<|image_{i}|>" for i, _ in enumerate(image_urls, start=1)) prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n" + stop_token_ids = None + return llm, prompt, stop_token_ids - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": [fetch_image(url) for url in image_urls] + +def load_internvl(question, image_urls: List[str]): + model_name = "OpenGVLab/InternVL2-2B" + + llm = LLM( + model=model_name, + trust_remote_code=True, + max_num_seqs=5, + max_model_len=4096, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = "\n".join(f"Image-{i}: \n" + for i, _ in enumerate(image_urls, start=1)) + messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] + + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + # Stop tokens for InternVL + # models variants may have different stop tokens + # please refer to the model card for the correct "stop words": + # https://huggingface.co/OpenGVLab/InternVL2-2B#service + stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] + stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + return llm, prompt, stop_token_ids + + +model_example_map = { + "phi3_v": load_phi3v, + "internvl_chat": load_internvl, +} + + +def run_generate(model, question: str, image_urls: List[str]): + llm, prompt, stop_token_ids = model_example_map[model](question, + image_urls) + + sampling_params = SamplingParams(temperature=0.0, + max_tokens=128, + stop_token_ids=stop_token_ids) + + outputs = llm.generate( + { + "prompt": prompt, + "multi_modal_data": { + "image": [fetch_image(url) for url in image_urls] + }, }, - }) + sampling_params=sampling_params) for o in outputs: generated_text = o.outputs[0].text 
print(generated_text) -def run_phi3v_chat(question: str, image_urls: List[str]): - llm = _load_phi3v(image_urls) +def run_chat(model: str, question: str, image_urls: List[str]): + llm, _, stop_token_ids = model_example_map[model](question, image_urls) + + sampling_params = SamplingParams(temperature=0.0, + max_tokens=128, + stop_token_ids=stop_token_ids) outputs = llm.chat([{ "role": @@ -63,7 +113,8 @@ def run_phi3v_chat(question: str, image_urls: List[str]): }, } for image_url in image_urls), ], - }]) + }], + sampling_params=sampling_params) for o in outputs: generated_text = o.outputs[0].text @@ -71,12 +122,13 @@ def run_phi3v_chat(question: str, image_urls: List[str]): def main(args: Namespace): + model = args.model_type method = args.method if method == "generate": - run_phi3v_generate(QUESTION, IMAGE_URLS) + run_generate(model, QUESTION, IMAGE_URLS) elif method == "chat": - run_phi3v_chat(QUESTION, IMAGE_URLS) + run_chat(model, QUESTION, IMAGE_URLS) else: raise ValueError(f"Invalid method: {method}") @@ -85,6 +137,12 @@ def main(args: Namespace): parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' 'vision language models that support multi-image input') + parser.add_argument('--model-type', + '-m', + type=str, + default="phi3_v", + choices=model_example_map.keys(), + help='Huggingface "model_type".') parser.add_argument("--method", type=str, default="generate", diff --git a/tests/models/test_internvl.py b/tests/models/test_internvl.py index 42732cebc656..fa3369dc5334 100644 --- a/tests/models/test_internvl.py +++ b/tests/models/test_internvl.py @@ -1,5 +1,5 @@ import types -from typing import List, Optional, Tuple, Type +from typing import List, Optional, Tuple, Type, Union import pytest import torch @@ -9,7 +9,8 @@ from vllm.multimodal.utils import rescale_image_size from vllm.utils import is_cpu -from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets +from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, + _ImageAssets) from .utils import check_logprobs_close pytestmark = pytest.mark.vlm @@ -20,6 +21,7 @@ "cherry_blossom": "<|im_start|>User\n\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 }) +HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: \nImage-2: \nDescribe the two images in detail.<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501 models = [ "OpenGVLab/InternVL2-1B", @@ -64,13 +66,13 @@ def generate( def run_test( hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, + inputs: List[Tuple[List[str], PromptImageInput]], model: str, *, - size_factors: List[float], dtype: str, max_tokens: int, num_logprobs: int, + mm_limit: int, tensor_parallel_size: int, distributed_executor_backend: Optional[str] = None, ): @@ -83,12 +85,6 @@ def run_test( Note, the text input is also adjusted to abide by vllm contract. The text output is sanitized to be able to compare with hf. """ - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. 
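
The multi-image example script above now keys its per-model setup off a model_example_map, so supporting a new model type only needs one loader function returning the prompt and stop token ids. A stripped-down sketch of that dispatch pattern with dummy loaders (no vLLM objects, and the stop token ids here are fake):

from typing import Callable, Dict, List, Optional, Tuple

LoaderResult = Tuple[str, Optional[List[int]]]  # (prompt, stop_token_ids)


def load_phi3v_stub(question: str, image_urls: List[str]) -> LoaderResult:
    placeholders = "\n".join(f"<|image_{i}|>"
                             for i in range(1, len(image_urls) + 1))
    return f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n", None


def load_internvl_stub(question: str, image_urls: List[str]) -> LoaderResult:
    placeholders = "\n".join(f"Image-{i}: <image>\n"
                             for i in range(1, len(image_urls) + 1))
    # Real code derives these ids from the tokenizer; fixed ids here are fake.
    return f"{placeholders}\n{question}", [92542, 92543]


model_example_map: Dict[str, Callable[[str, List[str]], LoaderResult]] = {
    "phi3_v": load_phi3v_stub,
    "internvl_chat": load_internvl_stub,
}

prompt, stop_ids = model_example_map["internvl_chat"]("Describe the images.",
                                                      ["a.jpg", "b.jpg"])
print(prompt, stop_ids)
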
@@ -110,13 +106,21 @@ def __init__(self, hf_runner: HfRunner): self.max_num = self.config.max_dynamic_patch self.image_size = self.vision_config.image_size - def __call__(self, text: str, images: Image, **kwargs): + def __call__(self, text: str, images: Union[Image, List[Image]], + **kwargs): from vllm.model_executor.models.internvl import ( IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) - pixel_values = image_to_pixel_values( - images, self.image_size, self.min_num, self.max_num, - self.use_thumbnail).to(self.dtype) - num_patches_list = [pixel_values.shape[0]] + images = [images] if isinstance(images, Image) else images + pixel_values = [ + image_to_pixel_values(image, self.image_size, self.min_num, + self.max_num, + self.use_thumbnail).to(self.dtype) + for image in images + ] + num_patches_list = [ + pixel_value.shape[0] for pixel_value in pixel_values + ] + pixel_values = torch.cat(pixel_values, dim=0) for num_patches in num_patches_list: context_tokens = IMG_CONTEXT * self.num_image_token \ * num_patches @@ -130,6 +134,7 @@ def __call__(self, text: str, images: Image, **kwargs): with vllm_runner(model, max_model_len=4096, dtype=dtype, + limit_mm_per_prompt={"image": mm_limit}, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, enforce_eager=True) as vllm_model: @@ -138,7 +143,7 @@ def __call__(self, text: str, images: Image, **kwargs): max_tokens, num_logprobs=num_logprobs, images=images) - for prompts, images in inputs_per_image + for prompts, images in inputs ] with hf_runner(model, dtype=dtype) as hf_model: @@ -156,7 +161,7 @@ def __call__(self, text: str, images: Image, **kwargs): num_logprobs=num_logprobs, images=hf_images, eos_token_id=eos_token_id) - for prompts, hf_images in inputs_per_image + for prompts, hf_images in inputs ] for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, @@ -264,15 +269,64 @@ def run_awq_test( @torch.inference_mode() def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, dtype: str, max_tokens: int, num_logprobs: int) -> None: + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + run_test( hf_runner, vllm_runner, - image_assets, + inputs_per_image, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.5, 0.75, 1.0], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +@torch.inference_mode() +def test_multi_images_models(hf_runner, vllm_runner, image_assets, model, + size_factors, dtype: str, max_tokens: int, + num_logprobs: int) -> None: + images = [asset.pil_image for asset in image_assets] + + inputs_per_case = [ + ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors], + [[rescale_image_size(image, factor) for image in images] + for factor in size_factors]) + ] + + run_test( + hf_runner, + vllm_runner, + inputs_per_case, model, - size_factors=size_factors, dtype=dtype, max_tokens=max_tokens, num_logprobs=num_logprobs, + mm_limit=2, tensor_parallel_size=1, ) diff --git a/tests/models/test_phi3v.py 
b/tests/models/test_phi3v.py index e416a85b8962..6ecbf07a08b7 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -1,16 +1,15 @@ import os import re -from typing import List, Optional, Tuple, Type, Union +from typing import List, Optional, Tuple, Type import pytest -from PIL import Image from transformers import AutoTokenizer from vllm.multimodal.utils import rescale_image_size from vllm.sequence import SampleLogprobs from vllm.utils import is_cpu, is_hip -from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner +from ..conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner from .utils import check_logprobs_close pytestmark = pytest.mark.vlm @@ -60,8 +59,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, def run_test( hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], Union[List[Image.Image], - List[List[Image.Image]]]]], + inputs: List[Tuple[List[str], PromptImageInput]], model: str, *, dtype: str, diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 10fbb5663d27..0cf63d9e1fb2 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -5,6 +5,7 @@ # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- import itertools +import re from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, TypedDict, Union) @@ -26,6 +27,7 @@ from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors +from vllm.utils import is_list_of from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, get_clip_num_patches) @@ -95,8 +97,8 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, def calculate_num_blocks(orig_width: int, orig_height: int, min_num: int, - max_num: int, - image_size: int) -> Tuple[int, int, int]: + max_num: int, image_size: int, + use_thumbnail: bool) -> Tuple[int, int, int]: aspect_ratio = orig_width / orig_height # calculate the existing image aspect ratio @@ -114,17 +116,26 @@ def calculate_num_blocks(orig_width: int, orig_height: int, min_num: int, target_width = image_size * target_aspect_ratio[0] target_height = image_size * target_aspect_ratio[1] blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + # add thumbnail image if num_blocks > 1 + if use_thumbnail and blocks > 1: + blocks += 1 return blocks, target_width, target_height # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B def dynamic_preprocess(image: Image.Image, min_num: int, max_num: int, image_size: int, - use_thumbnail: int) -> List[Image.Image]: + use_thumbnail: bool) -> List[Image.Image]: orig_width, orig_height = image.size + # calculate the number of blocks without thumbnail blocks, target_width, target_height = calculate_num_blocks( - orig_width, orig_height, min_num, max_num, image_size) + orig_width, + orig_height, + min_num, + max_num, + image_size, + use_thumbnail=False) # resize the image resized_img = image.resize((target_width, target_height)) processed_images = [] @@ -197,17 +208,23 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs): downsample_ratio) image_data = multi_modal_data["image"] + min_num = hf_config.min_dynamic_patch + max_num = hf_config.max_dynamic_patch + use_thumbnail = hf_config.use_thumbnail if isinstance(image_data, Image.Image): width, height = image_data.size - min_num = 
hf_config.min_dynamic_patch - max_num = hf_config.max_dynamic_patch num_blocks, _, _ = calculate_num_blocks(width, height, min_num, - max_num, image_size) - # add thumbnail image if num_blocks > 1 - if hf_config.use_thumbnail and num_blocks > 1: - num_blocks += 1 - image_feature_size = num_blocks * num_patches - + max_num, image_size, + use_thumbnail) + image_feature_size = [num_blocks * num_patches] + elif is_list_of(image_data, Image.Image): + image_feature_size = [] + for image in image_data: + width, height = image.size + num_blocks, _, _ = calculate_num_blocks(width, height, min_num, + max_num, image_size, + use_thumbnail) + image_feature_size.append(num_blocks * num_patches) elif isinstance(image_data, torch.Tensor): num_images, image_feature_size, hidden_size = image_data.shape else: @@ -220,8 +237,14 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs): prompt_token_ids = llm_inputs["prompt_token_ids"] if prompt is None: prompt = tokenizer.decode(prompt_token_ids) - image_prompt = IMG_START + IMG_CONTEXT * image_feature_size + IMG_END - new_prompt = prompt.replace('', image_prompt, 1) + + new_prompt = prompt + image_idx = sorted(map(int, re.findall(r"Image-(\d+): \n", prompt))) + for idx, feature_size in enumerate(image_feature_size, start=1): + image_prompt = IMG_START + IMG_CONTEXT * feature_size + IMG_END + if not image_idx: + image_prompt = f"Image-{idx}: {image_prompt}" + new_prompt = new_prompt.replace('', image_prompt, 1) new_prompt_token_ids = tokenizer.encode(new_prompt) return LLMInputs(prompt=prompt, @@ -245,6 +268,15 @@ def input_mapper_for_internvl(ctx: InputContext, data: object): use_thumbnail=use_thumbnail) # Add an N dimension for number of images per prompt (currently 1). data = data.unsqueeze(0) + elif is_list_of(data, Image.Image): + data = [ + image_to_pixel_values(img, + image_size, + min_num, + max_num, + use_thumbnail=use_thumbnail) for img in data + ] + data = torch.stack(data) model_config = ctx.model_config tokenizer = cached_get_tokenizer(model_config.tokenizer, trust_remote_code=True) From 36bf8150cc3a048d69d9d2196128462014b9599d Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 8 Sep 2024 01:45:44 +0800 Subject: [PATCH 11/54] [Model][VLM] Decouple weight loading logic for `Paligemma` (#8269) --- vllm/model_executor/models/paligemma.py | 112 ++++++++---------------- vllm/model_executor/models/siglip.py | 23 ++++- 2 files changed, 54 insertions(+), 81 deletions(-) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index b6f4275fbc94..5fd39b5e35be 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -1,3 +1,4 @@ +import itertools from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, TypedDict, Union) @@ -13,7 +14,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.gemma import GemmaModel +from vllm.model_executor.models.gemma import GemmaForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.utils import cached_get_tokenizer @@ -22,14 +23,10 @@ from .interfaces import SupportsMultiModal from .siglip import (SiglipVisionModel, dummy_image_for_siglip, dummy_seq_data_for_siglip, get_max_siglip_image_tokens) 
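
With multi-image support, the InternVL input processor above expands each image placeholder with its own feature-size-dependent run of context tokens, numbering the images when the prompt does not. A simplified sketch of that per-image expansion; the placeholder and special-token strings are illustrative, not the exact model vocabulary.

from typing import List

IMG_START, IMG_END, IMG_CONTEXT = "<img>", "</img>", "<IMG_CONTEXT>"


def expand_image_prompt(prompt: str, feature_sizes: List[int],
                        placeholder: str = "<image>") -> str:
    new_prompt = prompt
    for idx, feature_size in enumerate(feature_sizes, start=1):
        image_prompt = IMG_START + IMG_CONTEXT * feature_size + IMG_END
        # Label each image when the prompt does not number them itself.
        if f"Image-{idx}:" not in prompt:
            image_prompt = f"Image-{idx}: {image_prompt}"
        new_prompt = new_prompt.replace(placeholder, image_prompt, 1)
    return new_prompt


out = expand_image_prompt("<image>\n<image>\nDescribe both.", [4, 6])
assert out.count(IMG_CONTEXT) == 10
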
-from .utils import merge_multimodal_embeddings +from .utils import filter_weights, merge_multimodal_embeddings logger = init_logger(__name__) -_KEYS_TO_MODIFY_MAPPING = { - "language_model.model": "language_model", -} - class PaliGemmaImagePixelInputs(TypedDict): type: Literal["pixel_values"] @@ -151,8 +148,8 @@ def __init__(self, projection_dim=config.vision_config.projection_dim) self.quant_config = quant_config - self.language_model = GemmaModel(config.text_config, cache_config, - quant_config) + self.language_model = GemmaForCausalLM(config.text_config, + cache_config, quant_config) self.unpadded_vocab_size = config.text_config.vocab_size logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, @@ -252,7 +249,8 @@ def forward(self, vision_embeddings = vision_embeddings * (self.config.hidden_size** -0.5) - inputs_embeds = self.language_model.get_input_embeddings(input_ids) + inputs_embeds = self.language_model.model.get_input_embeddings( + input_ids) inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, vision_embeddings, @@ -262,87 +260,47 @@ def forward(self, else: inputs_embeds = None - hidden_states = self.language_model(input_ids, - positions, - kv_caches, - attn_metadata, - None, - inputs_embeds=inputs_embeds) + hidden_states = self.language_model.model(input_ids, + positions, + kv_caches, + attn_metadata, + None, + inputs_embeds=inputs_embeds) return hidden_states - # Copied from vllm/model_executor/models/gemma.py def compute_logits( self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.language_model.embed_tokens, - hidden_states, sampling_metadata) - return logits + return self.language_model.compute_logits(hidden_states, + sampling_metadata) - # Copied from vllm/model_executor/models/gemma.py def sample( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens + return self.language_model.sample(logits, sampling_metadata) - # Adapted from vllm/model_executor/models/gemma.py def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params = set() - for name, loaded_weight in weights: - for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in name: - name = name.replace(key_to_modify, new_key) - use_default_weight_loading = False - if "vision" not in name or self.vision_tower.shard_weight: - for (param_name, shard_name, - shard_id) in stacked_params_mapping: - if shard_name not in name: - continue - name = name.replace(shard_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # lm_head is not used in vllm as it is tied with - # embed_token. To prevent errors, skip loading - # lm_head.weight. - if "lm_head.weight" in name: - continue - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - use_default_weight_loading = True - else: - use_default_weight_loading = True - - if use_default_weight_loading: - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - - loaded_params.add(name) - - unloaded_params = params_dict.keys() - loaded_params - if unloaded_params: - logger.warning( - "Some weights are not initialized from checkpoints: %s", - unloaded_params) + # prepare weight iterators for components + vit_weights, mlp_weights, llm_weights = itertools.tee(weights, 3) + + # load vision tower + vit_weights = filter_weights(vit_weights, "vision_tower") + self.vision_tower.load_weights(vit_weights) + + # load mlp projector + mlp_weights = filter_weights(mlp_weights, "multi_modal_projector") + mlp_params_dict = dict(self.multi_modal_projector.named_parameters()) + for name, loaded_weight in mlp_weights: + param = mlp_params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + + # load llm backbone + llm_weights = filter_weights(llm_weights, "language_model") + self.language_model.load_weights(llm_weights) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index fb4c30c1a13f..13d09e4cd4c2 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -529,6 +529,12 @@ def forward( ) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] if self.shard_weight else [] params_dict = dict(self.named_parameters()) layer_count = len(self.vision_model.encoder.layers) @@ -544,7 +550,16 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if layer_idx >= layer_count: continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) From b962ee1470a019a72a1c17eddcf3a0471658a123 Mon Sep 17 00:00:00 2001 From: sumitd2 <91451282+sumitd2@users.noreply.github.com> Date: Sat, 7 Sep 2024 23:48:40 +0530 Subject: [PATCH 12/54] ppc64le: Dockerfile fixed, and a script for buildkite (#8026) --- .buildkite/run-cpu-test-ppc64le.sh | 32 ++++++++++++++++++++++++++++++ Dockerfile.ppc64le | 16 ++++++++++----- 2 files changed, 43 insertions(+), 5 deletions(-) create mode 100755 .buildkite/run-cpu-test-ppc64le.sh diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh new file mode 100755 index 000000000000..a01cf3fe6748 --- /dev/null +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -0,0 +1,32 @@ +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# Try building the docker image +docker build -t cpu-test -f Dockerfile.ppc64le . 
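
The PaliGemma loader above now tees the checkpoint weight stream once per component and filters each copy by prefix before handing it to the sub-module's own load_weights. A compact sketch of that routing idea; filter_by_prefix below is a simplified stand-in for vLLM's filter_weights helper.

import itertools
from typing import Iterable, Iterator, Tuple

import torch

Weight = Tuple[str, torch.Tensor]


def filter_by_prefix(weights: Iterable[Weight],
                     prefix: str) -> Iterator[Weight]:
    # Keep only weights under `prefix` and strip the prefix from the name.
    for name, tensor in weights:
        if name.startswith(prefix + "."):
            yield name[len(prefix) + 1:], tensor


checkpoint = [
    ("vision_tower.patch_embed.weight", torch.zeros(8)),
    ("multi_modal_projector.linear.weight", torch.zeros(8)),
    ("language_model.model.embed_tokens.weight", torch.zeros(8)),
]

vit_w, mlp_w, llm_w = itertools.tee(checkpoint, 3)
assert [n for n, _ in filter_by_prefix(vit_w, "vision_tower")] == \
    ["patch_embed.weight"]
assert [n for n, _ in filter_by_prefix(llm_w, "language_model")] == \
    ["model.embed_tokens.weight"]
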
+ +# Setup cleanup +remove_docker_container() { docker rm -f cpu-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image, setting --shm-size=4g for tensor parallel. +#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test cpu-test + +# Run basic model test +docker exec cpu-test bash -c " + pip install pytest matplotlib einops transformers_stream_generator + pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported + +# online inference +docker exec cpu-test bash -c " + python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & + timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 + python3 benchmarks/benchmark_serving.py \ + --backend vllm \ + --dataset-name random \ + --model facebook/opt-125m \ + --num-prompts 20 \ + --endpoint /v1/completions \ + --tokenizer facebook/opt-125m" diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index d4e4c483cada..16780f8ab950 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -2,21 +2,27 @@ FROM mambaorg/micromamba ARG MAMBA_DOCKERFILE_ACTIVATE=1 USER root -RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 +ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/" + +RUN apt-get update -y && apt-get install -y git wget vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential # Some packages in requirements-cpu are installed here # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba # Currently these may not be available for venv or pip directly -RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes +RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes COPY ./ /workspace/vllm WORKDIR /workspace/vllm # These packages will be in rocketce eventually -RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing +RUN pip install -v cmake torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install -WORKDIR /vllm-workspace -ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"] +WORKDIR /workspace/ + +RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks + +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] + From cfe712bf1aedbee4f26105737710ff80ae9d624e Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Sat, 7 Sep 2024 14:03:16 -0600 Subject: [PATCH 13/54] [CI/Build] Use python 3.12 in cuda image (#8133) Signed-off-by: Joe Runde --- Dockerfile | 8 ++++++-- 
requirements-common.txt | 1 + tests/test_logger.py | 6 +++--- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2375e3f4d738..0ec6655ed449 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,7 +10,7 @@ ARG CUDA_VERSION=12.4.1 # prepare basic build environment FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base ARG CUDA_VERSION=12.4.1 -ARG PYTHON_VERSION=3.10 +ARG PYTHON_VERSION=3.12 ENV DEBIAN_FRONTEND=noninteractive # Install Python and other dependencies @@ -133,7 +133,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ # image with vLLM installed FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base ARG CUDA_VERSION=12.4.1 -ARG PYTHON_VERSION=3.10 +ARG PYTHON_VERSION=3.12 WORKDIR /vllm-workspace ENV DEBIAN_FRONTEND=noninteractive @@ -179,6 +179,10 @@ FROM vllm-base AS test ADD . /vllm-workspace/ # install development dependencies (for testing) +# A newer setuptools is required for installing some test dependencies from source that do not publish python 3.12 wheels +# This installation must complete before the test dependencies are collected and installed. +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install "setuptools>=74.1.1" RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-dev.txt diff --git a/requirements-common.txt b/requirements-common.txt index e430753357ca..49a290317f81 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -27,3 +27,4 @@ gguf == 0.9.1 importlib_metadata mistral_common >= 1.3.4 pyyaml +six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 diff --git a/tests/test_logger.py b/tests/test_logger.py index 29346cd0878b..8f3d21841687 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -95,7 +95,7 @@ def test_logger_configuring_can_be_disabled(): config behavior, however mocks are used to ensure no changes in behavior or configuration occur.""" - with patch("logging.config.dictConfig") as dict_config_mock: + with patch("vllm.logger.dictConfig") as dict_config_mock: _configure_vllm_root_logger() dict_config_mock.assert_not_called() @@ -175,9 +175,9 @@ def test_custom_logging_config_is_parsed_and_used_when_provided(): logging_config_file.flush() with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name), patch( - "logging.config.dictConfig") as dict_config_mock: + "vllm.logger.dictConfig") as dict_config_mock: _configure_vllm_root_logger() - assert dict_config_mock.called_with(valid_logging_config) + dict_config_mock.assert_called_with(valid_logging_config) @patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 0) From 4ef41b84766670c1bd8079f58d35bf32b5bcb3ab Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Sun, 8 Sep 2024 00:01:51 -0400 Subject: [PATCH 14/54] [Bugfix] Fix async postprocessor in case of preemption (#8267) --- vllm/core/scheduler.py | 87 ++++++++------- vllm/engine/async_llm_engine.py | 24 ++-- vllm/engine/llm_engine.py | 149 ++++++++++++++++--------- vllm/worker/multi_step_model_runner.py | 26 +++-- 4 files changed, 172 insertions(+), 114 deletions(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 81c78bda3b50..c3fa95f57b73 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -537,13 +537,6 @@ def _schedule_running( preempted: List[SequenceGroup] = ret.preempted swapped_out: List[SequenceGroup] = ret.swapped_out - # NOTE(woosuk): Preemption 
happens only when there is no available slot - # to keep all the sequence groups in the RUNNING state. - - # Store original running requests for the case of async + preemption - if self.use_async_output_proc: - orig_running = self.running.copy() - running_queue = self.running assert len(self._async_stopped) == 0 while running_queue: @@ -552,6 +545,7 @@ def _schedule_running( seq_group, SequenceStatus.RUNNING, enable_chunking, budget) if num_running_tokens == 0: + # No budget => Stop break running_queue.popleft() @@ -565,18 +559,8 @@ def _schedule_running( self._async_stopped.append(seq_group) continue - # With async postprocessor, when preemption kicks in, we need - # first to drain the async postprocessor, so that all async - # block_table freeing is applied before the preemption freeing - # is applied. - if self.use_async_output_proc and not self._can_append_slots( - seq_group): - tmp = self.running - self.running = orig_running - assert self.output_proc_callback is not None - self.output_proc_callback() - self.running = tmp - + # NOTE(woosuk): Preemption happens only when there is no available + # slot to keep all the sequence groups in the RUNNING state. while not self._can_append_slots(seq_group): budget.subtract_num_batched_tokens(seq_group.request_id, num_running_tokens) @@ -588,24 +572,43 @@ def _schedule_running( and seq_group.lora_int_id in curr_loras): curr_loras.remove(seq_group.lora_int_id) + # Determine victim sequence + cont_loop = True if running_queue: - # Preempt the lowest-priority sequence groups. + # Preempt the lowest-priority sequence group. victim_seq_group = running_queue.pop() + else: + # No other sequence group can be preempted. + # Preempt the current sequence group. + # Note: This is also where we stop this loop + # (since there is nothing else to preempt) + victim_seq_group = seq_group + cont_loop = False + + # With async postprocessor, before preempting a sequence + # we need to ensure it has no pending async postprocessor + do_preempt = True + if self.use_async_output_proc: + assert self.output_proc_callback is not None + self.output_proc_callback( + request_id=victim_seq_group.request_id) + + # It may be that the async pending "victim_seq_group" + # becomes finished, in which case we simply free it. + if victim_seq_group.is_finished(): + self._free_finished_seq_group(victim_seq_group) + do_preempt = False + + # Do preemption + if do_preempt: preempted_mode = self._preempt(victim_seq_group, blocks_to_swap_out) if preempted_mode == PreemptionMode.RECOMPUTE: preempted.append(victim_seq_group) else: swapped_out.append(victim_seq_group) - else: - # No other sequence groups can be preempted. - # Preempt the current sequence group. - preempted_mode = self._preempt(seq_group, - blocks_to_swap_out) - if preempted_mode == PreemptionMode.RECOMPUTE: - preempted.append(seq_group) - else: - swapped_out.append(seq_group) + + if not cont_loop: break else: self._append_slots(seq_group, blocks_to_copy) @@ -1264,22 +1267,26 @@ def _free_finished_seqs(self, seq_group: SequenceGroup) -> None: if seq.is_finished(): self.free_seq(seq) + def _free_finished_seq_group(self, seq_group: SequenceGroup) -> None: + if seq_group.is_finished(): + # Free cross-attention block table, if it exists + self._free_seq_group_cross_attn_blocks(seq_group) + + # Add the finished requests to the finished requests list. + # This list will be used to update the Mamba cache in the + # next step. 
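
The scheduler change above restructures preemption so the victim is always resolved explicitly: take the lowest-priority running group if one exists, otherwise preempt the current group and stop iterating. A behavior-only sketch of that control flow with plain Python objects, with no block manager or budget accounting:

from collections import deque
from typing import Deque, List


def schedule_with_preemption(running: Deque[str],
                             can_append: List[str]) -> List[str]:
    preempted: List[str] = []
    scheduled: List[str] = []
    while running:
        seq_group = running.popleft()
        scheduled_ok = True
        while seq_group not in can_append:
            if running:
                # Preempt the lowest-priority running group first.
                victim = running.pop()
                preempted.append(victim)
                # Pretend the freed blocks now fit the current group.
                can_append.append(seq_group)
            else:
                # Nothing else left: preempt the current group itself.
                preempted.append(seq_group)
                scheduled_ok = False
                break
        if scheduled_ok:
            scheduled.append(seq_group)
    return scheduled


print(schedule_with_preemption(deque(["a", "b", "c"]), can_append=["a"]))
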
+ self._finished_requests_ids.append(seq_group.request_id) + + # Free finished seqs + self._free_finished_seqs(seq_group) + def free_finished_seq_groups(self) -> None: remaining: Deque[SequenceGroup] = deque() for seq_group in self.running: - if seq_group.is_finished(): - # Free cross-attention block table, if it exists - self._free_seq_group_cross_attn_blocks(seq_group) - # Add the finished requests to the finished requests list. - # This list will be used to update the Mamba cache in the - # next step. - self._finished_requests_ids.append(seq_group.request_id) - else: + self._free_finished_seq_group(seq_group) + if not seq_group.is_finished(): remaining.append(seq_group) - # Free finished seqs - self._free_finished_seqs(seq_group) - self.running = remaining # Handle async stopped sequence groups diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 7fe8053fffb7..6ed1a6bba08e 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -342,17 +342,17 @@ async def step_async( virtual_engine] # Execute the model. - output = await self.model_executor.execute_model_async( + outputs = await self.model_executor.execute_model_async( execute_model_req) # we need to do this here so that last step's sampled_token_ids can # be passed to the next iteration for PP. if self.scheduler_config.is_multi_step: - self._update_cached_scheduler_output(virtual_engine, output) + self._update_cached_scheduler_output(virtual_engine, outputs) else: if len(ctx.output_queue) > 0: self._process_model_outputs(ctx=ctx) - output = [] + outputs = [] # Finish the current step for all the sequence groups. if self.scheduler_config.is_multi_step: @@ -365,25 +365,25 @@ async def step_async( self.cached_scheduler_outputs[ virtual_engine] = SchedulerOutputState() - is_async = allow_async_output_proc - is_last_step = True - ctx.output_queue.append( - (output, seq_group_metadata_list, scheduler_outputs, is_async, - is_last_step)) + ctx.append_output(outputs=outputs, + seq_group_metadata_list=seq_group_metadata_list, + scheduler_outputs=scheduler_outputs, + is_async=allow_async_output_proc, + is_last_step=True) - if output and allow_async_output_proc: + if outputs and allow_async_output_proc: assert len( - output + outputs ) == 1, "Async postprocessor expects only a single output set" self._advance_to_next_step( - output[0], seq_group_metadata_list, + outputs[0], seq_group_metadata_list, scheduler_outputs.scheduled_seq_groups) if not allow_async_output_proc: self._process_model_outputs(ctx=ctx) # Log stats. 
- self.do_log_stats(scheduler_outputs, output) + self.do_log_stats(scheduler_outputs, outputs) # Tracing self.do_tracing(scheduler_outputs) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 78ddcd1daaf6..94271c4a9315 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -2,9 +2,9 @@ import time from collections import deque from contextlib import contextmanager -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import (TYPE_CHECKING, Any, ClassVar, Deque, Dict, Iterable, List, - Mapping, Optional) + Mapping, NamedTuple, Optional) from typing import Sequence as GenericSequence from typing import Set, Tuple, Type, Union @@ -90,17 +90,36 @@ class SchedulerOutputState: last_output: Optional[SamplerOutput] = None -@dataclass +class OutputData(NamedTuple): + outputs: List[SamplerOutput] + seq_group_metadata_list: List[SequenceGroupMetadata] + scheduler_outputs: SchedulerOutputs + is_async: bool + is_last_step: bool + skip: List[int] + + class SchedulerContext: - output_queue: Deque[Tuple[Optional[List[SamplerOutput]], - List[SequenceGroupMetadata], SchedulerOutputs, - bool, - bool]] = field(default_factory=lambda: deque()) - request_outputs: List[Union[RequestOutput, - EmbeddingRequestOutput]] = field( - default_factory=lambda: []) - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None - scheduler_outputs: Optional[SchedulerOutputs] = None + + def __init__(self): + self.output_queue: Deque[OutputData] = deque() + self.request_outputs: List[Union[RequestOutput, + EmbeddingRequestOutput]] = [] + self.seq_group_metadata_list: Optional[ + List[SequenceGroupMetadata]] = None + self.scheduler_outputs: Optional[SchedulerOutputs] = None + + def append_output(self, outputs: List[SamplerOutput], + seq_group_metadata_list: List[SequenceGroupMetadata], + scheduler_outputs: SchedulerOutputs, is_async: bool, + is_last_step: bool): + self.output_queue.append( + OutputData(outputs=outputs, + seq_group_metadata_list=seq_group_metadata_list, + scheduler_outputs=scheduler_outputs, + is_async=is_async, + is_last_step=is_last_step, + skip=[])) class LLMEngine: @@ -1246,23 +1265,15 @@ def _process_sequence_group_outputs( return - def _process_model_outputs(self, ctx: SchedulerContext) -> None: - """Apply the model output to the sequences in the scheduled seq groups. + def _process_model_outputs(self, + ctx: SchedulerContext, + request_id: Optional[str] = None) -> None: + """Apply the model output to the sequences in the scheduled seq groups + and return responses. - virtual_engine: The engine id to operate on + ctx: The virtual engine context to work on + request_id: If provided, then only this request is going to be processed - is_async: Indicates whether this postprocessor runs in - parallel with the GPU forward pass and is processing - tokens from the previous step. If this is true, then - no tokens need to be appended since it is already done - externally (before the next schedule() call) - - sampler_output: Used with multi-step execution to provide - sampler_output of each step - is_last_output: Used with multi-step execution to indicate - the last step (of each multi-step group) - - Returns RequestOutputs that can be returned to the client. 
""" now = time.time() @@ -1270,9 +1281,14 @@ def _process_model_outputs(self, ctx: SchedulerContext) -> None: return None # Get pending async postprocessor - (outputs, seq_group_metadata_list, scheduler_outputs, is_async, - is_last_step) = ctx.output_queue.popleft() - assert outputs is not None + if request_id: + # When we process only one request, no pop is required + # (since later we will process all of the rest) + (outputs, seq_group_metadata_list, scheduler_outputs, is_async, + is_last_step, skip) = ctx.output_queue[0] + else: + (outputs, seq_group_metadata_list, scheduler_outputs, is_async, + is_last_step, skip) = ctx.output_queue.popleft() # Sanity check assert len(seq_group_metadata_list) == len( @@ -1286,9 +1302,30 @@ def _process_model_outputs(self, ctx: SchedulerContext) -> None: else: outputs_by_sequence_group = outputs + # Determine the requests we need to operate on + if request_id: + indices = [] + for i, seq_group_meta in enumerate(seq_group_metadata_list): + if seq_group_meta.request_id == request_id: + assert i not in skip # Cannot be called twice + indices.append(i) + break + + # If the request_id was not found, then it means that + # this is a new request that has no pending async + # postprocessor + if not indices: + return + else: + indices = range(len(seq_group_metadata_list)) # type: ignore + finished_before: List[int] = [] finished_now: List[int] = [] - for i, seq_group_meta in enumerate(seq_group_metadata_list): + for i in indices: + if i in skip: + continue + + seq_group_meta = seq_group_metadata_list[i] scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i] seq_group = scheduled_seq_group.seq_group @@ -1343,6 +1380,18 @@ def _process_model_outputs(self, ctx: SchedulerContext) -> None: request_output = RequestOutputFactory.create(seq_group) ctx.request_outputs.append(request_output) + # When we process a single request, we skip it for the next time, + # and invoke the request output callback (if there was final output) + if request_id: + assert len(indices) == 1 + skip.append(indices[0]) + + if (finished_now + and self.process_request_outputs_callback is not None): + self.process_request_outputs_callback(ctx.request_outputs) + ctx.request_outputs.clear() + return + # Free currently finished requests if finished_now: for scheduler in self.scheduler: @@ -1354,17 +1403,16 @@ def _process_model_outputs(self, ctx: SchedulerContext) -> None: if (finished_now and self.process_request_outputs_callback is not None): self.process_request_outputs_callback(ctx.request_outputs) + ctx.request_outputs.clear() return # Create the outputs - # Note: scheduled_seq_groups and seq_group_metadata_list - # must match with the indices - for i, scheduled_seq_group in enumerate( - scheduler_outputs.scheduled_seq_groups): - - if i in finished_before or i in finished_now: + for i in indices: + if i in skip or i in finished_before or i in finished_now: continue # Avoids double processing + scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i] + seq_group = scheduled_seq_group.seq_group seq_group.maybe_set_first_token_time(now) if (seq_group.is_finished() @@ -1380,6 +1428,7 @@ def _process_model_outputs(self, ctx: SchedulerContext) -> None: if (ctx.request_outputs and self.process_request_outputs_callback is not None): self.process_request_outputs_callback(ctx.request_outputs) + ctx.request_outputs.clear() # For async case, we need to record the stats here. 
# For non-async case, the stats are done in the @@ -1548,20 +1597,20 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: execute_model_req.async_callback = self.async_callbacks[ virtual_engine] - output = self.model_executor.execute_model( + outputs = self.model_executor.execute_model( execute_model_req=execute_model_req) # We need to do this here so that last step's sampled_token_ids can # be passed to the next iteration for PP. if self.scheduler_config.is_multi_step: - self._update_cached_scheduler_output(virtual_engine, output) + self._update_cached_scheduler_output(virtual_engine, outputs) else: # Nothing scheduled => If there is pending async postprocessor, # then finish it here. if len(ctx.output_queue) > 0: self._process_model_outputs(ctx=ctx) # No outputs in this case - output = [] + outputs = [] # Finish the current step for all the sequence groups. if self.scheduler_config.is_multi_step: @@ -1574,18 +1623,18 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: self.cached_scheduler_outputs[0] = SchedulerOutputState() # Add results to the output_queue - is_async = allow_async_output_proc - is_last_step = True - ctx.output_queue.append( - (output, seq_group_metadata_list, scheduler_outputs, is_async, - is_last_step)) - - if output and allow_async_output_proc: - assert len(output) == 1, ( + ctx.append_output(outputs=outputs, + seq_group_metadata_list=seq_group_metadata_list, + scheduler_outputs=scheduler_outputs, + is_async=allow_async_output_proc, + is_last_step=True) + + if outputs and allow_async_output_proc: + assert len(outputs) == 1, ( "Async postprocessor expects only a single output set") self._advance_to_next_step( - output[0], seq_group_metadata_list, + outputs[0], seq_group_metadata_list, scheduler_outputs.scheduled_seq_groups) # Check if need to run the usual non-async path @@ -1593,7 +1642,7 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: self._process_model_outputs(ctx=ctx) # Log stats. - self.do_log_stats(scheduler_outputs, output) + self.do_log_stats(scheduler_outputs, outputs) # Tracing self.do_tracing(scheduler_outputs) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index b52f2a07e344..b13cf39bd846 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -274,12 +274,13 @@ def _async_process_outputs(self, model_input: StatefulModelInput, self.pinned_sampled_token_ids) if model_output.pythonized: ctx = output_proc_callback.keywords["ctx"] - is_async = False - is_last_step = False - ctx.output_queue.append( - ([model_output.sampler_output - ], ctx.seq_group_metadata_list, - ctx.scheduler_outputs, is_async, is_last_step)) + ctx.append_output( + outputs=[model_output.sampler_output], + seq_group_metadata_list=ctx.seq_group_metadata_list, + scheduler_outputs=ctx.scheduler_outputs, + is_async=False, + is_last_step=False) + output_proc_callback() else: cont = False @@ -319,12 +320,13 @@ def _final_process_outputs(self, model_input: StatefulModelInput, if not is_last_step: ctx = output_proc_callback.keywords[ # type: ignore "ctx"] # type: ignore - is_async = False - is_last_step = False - ctx.output_queue.append( - ([output.sampler_output - ], ctx.seq_group_metadata_list, - ctx.scheduler_outputs, is_async, is_last_step)) + ctx.append_output( + outputs=[output.sampler_output], + seq_group_metadata_list=ctx. 
+ seq_group_metadata_list, + scheduler_outputs=ctx.scheduler_outputs, + is_async=False, + is_last_step=False) else: outputs.append(output.sampler_output) else: From 08287ef6751e79a89bf4f060f5f9545560a6de12 Mon Sep 17 00:00:00 2001 From: Kyle Mistele Date: Mon, 9 Sep 2024 09:45:11 -0500 Subject: [PATCH 15/54] [Bugfix] Streamed tool calls now more strictly follow OpenAI's format; ensures Vercel AI SDK compatibility (#8272) --- tests/tool_use/utils.py | 2 +- vllm/entrypoints/openai/protocol.py | 7 ----- vllm/entrypoints/openai/serving_chat.py | 6 ++++- .../tool_parsers/abstract_tool_parser.py | 1 - .../openai/tool_parsers/hermes_tool_parser.py | 20 ++++---------- .../tool_parsers/mistral_tool_parser.py | 27 ++++++------------- 6 files changed, 19 insertions(+), 44 deletions(-) diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index 8ec9b05b2c52..e447469e3341 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -19,7 +19,7 @@ class ServerConfig(TypedDict): CONFIGS: Dict[str, ServerConfig] = { "hermes": { "model": - "NousResearch/Hermes-2-Pro-Llama-3-8B", + "NousResearch/Hermes-3-Llama-3.1-8B", "arguments": [ "--tool-call-parser", "hermes", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja") diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 970262a4bd35..374196044b7e 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -713,13 +713,6 @@ class DeltaToolCall(OpenAIBaseModel): function: Optional[DeltaFunctionCall] = None -# the initial delta that gets sent once a new tool call is started; -class InitialDeltaToolCall(DeltaToolCall): - id: str = Field(default_factory=lambda: f"chatcmpl-tool-{random_uuid()}") - type: Literal["function"] = "function" - index: int - - class ExtractedToolCallInformation(BaseModel): # indicate if tools were called tools_called: bool diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 78f355228012..8ed81e9c88cb 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -271,9 +271,13 @@ async def chat_completion_stream_generator( # NOTE num_choices defaults to 1 so this usually executes # once per request for i in range(num_choices): + choice_data = ChatCompletionResponseStreamChoice( index=i, - delta=DeltaMessage(role=role), + delta=DeltaMessage( + role=role, + content="", + ), logprobs=None, finish_reason=None) chunk = ChatCompletionStreamResponse( diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py index b0807e6f1e78..873f615d4325 100644 --- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -20,7 +20,6 @@ def __init__(self, tokenizer: AnyTokenizer): # the index of the tool call that is currently being parsed self.current_tool_id: int = -1 self.current_tool_name_sent: bool = False - self.current_tool_initial_sent: bool = False self.streamed_args_for_tool: List[str] = [] self.model_tokenizer = tokenizer diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index 7afbca7162ed..bde9b47ce60d 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -8,14 +8,14 @@ from vllm.entrypoints.openai.protocol import 
(DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, - FunctionCall, - InitialDeltaToolCall, ToolCall) + FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser) from vllm.entrypoints.openai.tool_parsers.utils import ( extract_intermediate_diff) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.utils import random_uuid logger = init_logger(__name__) @@ -34,7 +34,6 @@ def __init__(self, tokenizer: AnyTokenizer): self.prev_tool_call_arr: List[Dict] = [] self.current_tool_id: int = -1 self.current_tool_name_sent = False - self.current_tool_initial_sent: bool = False self.streamed_args_for_tool: List[str] = [ ] # map what has been streamed for each tool so far to a list @@ -168,7 +167,6 @@ def extract_tool_calls_streaming( # set cursors and state appropriately self.current_tool_id += 1 self.current_tool_name_sent = False - self.current_tool_initial_sent = False self.streamed_args_for_tool.append("") logger.debug("Starting on a new tool %s", self.current_tool_id) @@ -218,24 +216,16 @@ def extract_tool_calls_streaming( logger.debug('not enough tokens to parse into JSON yet') return None - # case - we haven't sent the initial delta with the tool call ID - # (it will be sent) - if not self.current_tool_initial_sent: - self.current_tool_initial_sent = True - return DeltaMessage(tool_calls=[ - InitialDeltaToolCall( - index=self.current_tool_id).model_dump( - exclude_none=True) - ]) - # case - we haven't sent the tool name yet. If it's available, send # it. otherwise, wait until it's available. - elif not self.current_tool_name_sent: + if not self.current_tool_name_sent: function_name: Union[str, None] = current_tool_call.get("name") if function_name: self.current_tool_name_sent = True return DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, + type="function", + id=f"chatcmpl-tool-{random_uuid()}", function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index d48770c792e9..4b0e1c91df97 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -8,14 +8,14 @@ from vllm.entrypoints.openai.protocol import (DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, - FunctionCall, - InitialDeltaToolCall, ToolCall) + FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser) from vllm.entrypoints.openai.tool_parsers.utils import ( extract_intermediate_diff) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.utils import random_uuid logger = init_logger(__name__) @@ -25,7 +25,7 @@ class MistralToolParser(ToolParser): Tool call parser for Mistral 7B Instruct v0.3, intended for use with the examples/tool_chat_template_mistral.jinja template. 
- Used when --enable-auto-tool-choice --tool-call-parser gmistral are all set + Used when --enable-auto-tool-choice --tool-call-parser mistral are all set """ def __init__(self, tokenizer: AnyTokenizer): @@ -42,7 +42,6 @@ def __init__(self, tokenizer: AnyTokenizer): self.prev_tool_call_arr: List[Dict] = [] self.current_tool_id: int = -1 self.current_tool_name_sent: bool = False - self.current_tool_initial_sent: bool = False self.streamed_args_for_tool: List[str] = [ ] # map what has been streamed for each tool so far to a list self.bot_token = "[TOOL_CALLS]" @@ -91,7 +90,6 @@ def extract_tool_calls(self, except Exception as e: logger.error("Error in extracting tool call from response: %s", e) - print("ERROR", e) # return information to just treat the tool call as regular JSON return ExtractedToolCallInformation(tools_called=False, tool_calls=[], @@ -109,7 +107,7 @@ def extract_tool_calls_streaming( # if the tool call token is not in the tokens generated so far, append # output to contents since it's not a tool - if self.bot_token_id not in current_token_ids: + if self.bot_token not in current_text: return DeltaMessage(content=delta_text) # if the tool call token ID IS in the tokens generated so far, that @@ -134,7 +132,7 @@ def extract_tool_calls_streaming( # replace BOT token with empty string, and convert single quotes # to double to allow parsing as JSON since mistral uses single # quotes instead of double for tool calls - parsable_arr = current_text.split(self.bot_token)[1] + parsable_arr = current_text.split(self.bot_token)[-1] # tool calls are generated in an array, so do partial JSON # parsing on the entire array @@ -186,31 +184,22 @@ def extract_tool_calls_streaming( # re-set stuff pertaining to progress in the current tool self.current_tool_id = len(tool_call_arr) - 1 self.current_tool_name_sent = False - self.current_tool_initial_sent = False self.streamed_args_for_tool.append("") logger.debug("starting on new tool %d", self.current_tool_id) return delta # case: update an existing tool - this is handled below - # if the current tool initial data incl. 
the id, type=function - # and idx not sent, send that - if not self.current_tool_initial_sent: - self.current_tool_initial_sent = True - delta = DeltaMessage(tool_calls=[ - InitialDeltaToolCall( - index=self.current_tool_id).model_dump( - exclude_none=True) - ]) - # if the current tool name hasn't been sent, send if available # - otherwise send nothing - elif not self.current_tool_name_sent: + if not self.current_tool_name_sent: function_name = current_tool_call.get("name") if function_name: delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, + type="function", + id=f"chatcmpl-tool-{random_uuid()}", function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) From 58fcc8545a149c9c5b1f91f417a68f5ba1fdabf3 Mon Sep 17 00:00:00 2001 From: Adam Lugowski Date: Mon, 9 Sep 2024 11:16:37 -0700 Subject: [PATCH 16/54] [Frontend] Add progress reporting to run_batch.py (#8060) Co-authored-by: Adam Lugowski --- vllm/entrypoints/openai/run_batch.py | 54 ++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 32bbade25697..278be8cd11a1 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -1,9 +1,11 @@ import asyncio from io import StringIO -from typing import Awaitable, Callable, List +from typing import Awaitable, Callable, List, Optional import aiohttp +import torch from prometheus_client import start_http_server +from tqdm import tqdm from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -78,6 +80,38 @@ def parse_args(): return parser.parse_args() +# explicitly use pure text format, with a newline at the end +# this makes it impossible to see the animation in the progress bar +# but will avoid messing up with ray or multiprocessing, which wraps +# each line of output with some prefix. 
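+# With the bar format defined below, each refresh is emitted as a complete
+# line, e.g. roughly:
+#   Running batch:  42% Completed | 21/50 [00:30<00:41,  0.70req/s]
+# (illustrative rendering only; exact spacing and rate text come from tqdm)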
+_BAR_FORMAT = "{desc}: {percentage:3.0f}% Completed | {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]\n" # noqa: E501 + + +class BatchProgressTracker: + + def __init__(self): + self._total = 0 + self._pbar: Optional[tqdm] = None + + def submitted(self): + self._total += 1 + + def completed(self): + if self._pbar: + self._pbar.update() + + def pbar(self) -> tqdm: + enable_tqdm = not torch.distributed.is_initialized( + ) or torch.distributed.get_rank() == 0 + self._pbar = tqdm(total=self._total, + unit="req", + desc="Running batch", + mininterval=5, + disable=not enable_tqdm, + bar_format=_BAR_FORMAT) + return self._pbar + + async def read_file(path_or_url: str) -> str: if path_or_url.startswith("http://") or path_or_url.startswith("https://"): async with aiohttp.ClientSession() as session, \ @@ -102,7 +136,8 @@ async def write_file(path_or_url: str, data: str) -> None: async def run_request(serving_engine_func: Callable, - request: BatchRequestInput) -> BatchRequestOutput: + request: BatchRequestInput, + tracker: BatchProgressTracker) -> BatchRequestOutput: response = await serving_engine_func(request.body) if isinstance(response, (ChatCompletionResponse, EmbeddingResponse)): @@ -125,6 +160,7 @@ async def run_request(serving_engine_func: Callable, else: raise ValueError("Request must not be sent in stream mode") + tracker.completed() return batch_output @@ -164,6 +200,9 @@ async def main(args): request_logger=request_logger, ) + tracker = BatchProgressTracker() + logger.info("Reading batch from %s...", args.input_file) + # Submit all requests in the file to the engine "concurrently". response_futures: List[Awaitable[BatchRequestOutput]] = [] for request_json in (await read_file(args.input_file)).strip().split("\n"): @@ -178,16 +217,19 @@ async def main(args): if request.url == "/v1/chat/completions": response_futures.append( run_request(openai_serving_chat.create_chat_completion, - request)) + request, tracker)) + tracker.submitted() elif request.url == "/v1/embeddings": response_futures.append( - run_request(openai_serving_embedding.create_embedding, - request)) + run_request(openai_serving_embedding.create_embedding, request, + tracker)) + tracker.submitted() else: raise ValueError("Only /v1/chat/completions and /v1/embeddings are" "supported in the batch endpoint.") - responses = await asyncio.gather(*response_futures) + with tracker.pbar(): + responses = await asyncio.gather(*response_futures) output_buffer = StringIO() for response in responses: From f9b4a2d41587da0692d32797221df55a02d890a6 Mon Sep 17 00:00:00 2001 From: Vladislav Kruglikov Date: Mon, 9 Sep 2024 21:20:46 +0300 Subject: [PATCH 17/54] [Bugfix] Correct adapter usage for cohere and jamba (#8292) --- vllm/model_executor/models/commandr.py | 5 +++-- vllm/model_executor/models/jamba.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index be7f19d15b62..649dc798d22d 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -47,6 +47,8 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import IntermediateTensors +from .interfaces import SupportsLoRA + @torch.compile def layer_norm_func(hidden_states, weight, variance_epsilon): @@ -292,8 +294,7 @@ def forward( return hidden_states -class CohereForCausalLM(nn.Module): - +class CohereForCausalLM(nn.Module, SupportsLoRA): packed_modules_mapping = { "qkv_proj": [ "q_proj", diff --git 
a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 73be7ffed0f8..29dd09afac5a 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -38,6 +38,8 @@ from vllm.worker.model_runner import (_BATCH_SIZES_TO_CAPTURE, _get_graph_batch_size) +from .interfaces import SupportsLoRA + KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -539,7 +541,7 @@ def forward( return hidden_states -class JambaForCausalLM(nn.Module, HasInnerState): +class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA): packed_modules_mapping = { "qkv_proj": [ "q_proj", From c7cb5c333564cb00fc4f6a99d32c35e9ebc0f1ed Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 9 Sep 2024 16:27:26 -0400 Subject: [PATCH 18/54] [Misc] GPTQ Activation Ordering (#8135) --- tests/weight_loading/models.txt | 1 + .../compressed_tensors/compressed_tensors.py | 3 +- .../schemes/compressed_tensors_wNa16.py | 45 ++++++++++++++----- .../quantization/compressed_tensors/utils.py | 30 ++++++++++++- 4 files changed, 64 insertions(+), 15 deletions(-) diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt index 1dc529037a98..c708e6d5eb89 100644 --- a/tests/weight_loading/models.txt +++ b/tests/weight_loading/models.txt @@ -21,6 +21,7 @@ compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main +compressed-tensors, nm-testing/TinyLlama-1.1B-Chat-v1.0-actorder-group, main awq, casperhansen/mixtral-instruct-awq, main awq_marlin, casperhansen/mixtral-instruct-awq, main fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 0768b37044aa..1170d55f3199 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -232,7 +232,8 @@ def _get_scheme_from_parts( return CompressedTensorsWNA16( num_bits=weight_quant.num_bits, strategy=weight_quant.strategy, - group_size=weight_quant.group_size) + group_size=weight_quant.group_size, + actorder=weight_quant.actorder) # Detect If Activation Quantization. 
# TODO @dsikka: clean-up conditions diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index 7ca8eecb9283..8897737c1c55 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -5,14 +5,18 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + ActivationOrdering) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( apply_gptq_marlin_linear, marlin_make_empty_g_idx, marlin_make_workspace, - marlin_permute_scales, replace_tensor, verify_marlin_supported, + marlin_permute_scales, marlin_repeat_scales_on_all_ranks, + marlin_sort_g_idx, replace_tensor, verify_marlin_supported, verify_marlin_supports_shape) from vllm.model_executor.parameter import (BasevLLMParameter, ChannelQuantScaleParameter, GroupQuantScaleParameter, - PackedvLLMParameter) + PackedvLLMParameter, + RowvLLMParameter) from vllm.scalar_type import scalar_types __all__ = ["CompressedTensorsWNA16"] @@ -28,11 +32,13 @@ class CompressedTensorsWNA16(CompressedTensorsScheme): def __init__(self, strategy: str, num_bits: int, - group_size: Optional[int] = None): + group_size: Optional[int] = None, + actorder: Optional[ActivationOrdering] = None): self.pack_factor = 32 // num_bits self.strategy = strategy self.group_size = -1 if group_size is None else group_size + self.has_g_idx = actorder == ActivationOrdering.GROUP if self.group_size == -1 and self.strategy != "channel": raise ValueError("Marlin kernels require group quantization or " @@ -64,12 +70,10 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, output_size_per_partition = sum(output_partition_sizes) # If group_size is -1, we are in channelwise case. - channelwise = (self.group_size == -1) group_size = self.group_size if self.group_size != -1 else input_size row_parallel = (input_size != input_size_per_partition) - # In the case of channelwise quantization, we need to replicate the - # scales across all gpus. - partition_scales = (row_parallel and not channelwise) + partition_scales = not marlin_repeat_scales_on_all_ranks( + self.has_g_idx, self.group_size, row_parallel) verify_marlin_supports_shape( output_size_per_partition=output_size_per_partition, @@ -123,6 +127,16 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, layer.register_parameter("weight_scale", weight_scale) layer.register_parameter("weight_shape", weight_shape) + # group index (for activation reordering) + if self.has_g_idx: + weight_g_idx = RowvLLMParameter(data=torch.empty( + input_size_per_partition, + dtype=torch.int32, + ), + input_dim=0, + weight_loader=weight_loader) + layer.register_parameter("weight_g_idx", weight_g_idx) + layer.input_size_per_partition = input_size_per_partition layer.output_size_per_partition = output_size_per_partition layer.input_size = input_size @@ -137,9 +151,14 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.workspace = marlin_make_workspace( layer.output_size_per_partition, device) - # Act-order not supported in compressed-tensors yet, so set to empty. 
- layer.g_idx = marlin_make_empty_g_idx(device) - layer.g_idx_sort_indices = marlin_make_empty_g_idx(device) + # Handle sorting for activation reordering if needed. + if self.has_g_idx: + g_idx, g_idx_sort_indices = marlin_sort_g_idx(layer.weight_g_idx) + layer.g_idx_sort_indices = g_idx_sort_indices + replace_tensor(layer, "weight_g_idx", g_idx) + else: + layer.weight_g_idx = marlin_make_empty_g_idx(device) + layer.g_idx_sort_indices = marlin_make_empty_g_idx(device) # No zero-point layer.weight_zp = marlin_make_empty_g_idx(device) @@ -159,9 +178,11 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: replace_tensor(layer, "weight_packed", marlin_qweight) # Permute scales from compressed-tensors format to marlin format. + # scale is required on all partitions if activation reordering marlin_scales = marlin_permute_scales( layer.weight_scale, - size_k=layer.input_size_per_partition, + size_k=(layer.input_size + if self.has_g_idx else layer.input_size_per_partition), size_n=layer.output_size_per_partition, group_size=layer.group_size) replace_tensor(layer, "weight_scale", marlin_scales) @@ -174,7 +195,7 @@ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, weight=layer.weight_packed, weight_scale=layer.weight_scale, weight_zp=layer.weight_zp, - g_idx=layer.g_idx, + g_idx=layer.weight_g_idx, g_idx_sort_indices=layer.g_idx_sort_indices, workspace=layer.workspace, wtype=self.quant_type, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 7912cbde5721..fc531b9d666e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -1,8 +1,8 @@ import re from enum import Enum -from typing import Any, Dict, Iterable, Optional +from typing import Any, Dict, Iterable, Optional, Union -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator from torch.nn import Module from vllm.model_executor.layers.quantization.utils.quant_utils import ( @@ -40,6 +40,19 @@ class QuantizationStrategy(str, Enum): TOKEN = "token" +class ActivationOrdering(str, Enum): + """ + Enum storing strategies for activation ordering + + Group: reorder groups and weight\n + Weight: only reorder weight, not groups. Slightly lower latency and + accuracy compared to group actorder\n + """ + + GROUP = "group" + WEIGHT = "weight" + + class QuantizationArgs(BaseModel): """ User facing arguments used to define a quantization config @@ -58,6 +71,8 @@ class QuantizationArgs(BaseModel): observed with every sample. Defaults to False for static quantization. Note that enabling dynamic quantization will change the default observer to a memoryless one + :param actorder: whether to apply group quantization in decreasing order of + activation. 
Defaults to None for arbitrary ordering """ num_bits: int = 8 @@ -67,6 +82,7 @@ class QuantizationArgs(BaseModel): strategy: Optional[QuantizationStrategy] = None block_structure: Optional[str] = None dynamic: bool = False + actorder: Union[ActivationOrdering, bool, None] = None observer: str = Field( default="minmax", description=("The class to use to compute the quantization param - " @@ -79,6 +95,16 @@ class QuantizationArgs(BaseModel): "Observers constructor excluding quantization range or symmetry"), ) + @field_validator("actorder", mode="before") + def validate_actorder(cls, value) -> Optional[ActivationOrdering]: + if isinstance(value, bool): + return ActivationOrdering.GROUP if value else None + + if isinstance(value, str): + return ActivationOrdering(value.lower()) + + return value + def is_activation_quantization_format(format: str) -> bool: _ACTIVATION_QUANTIZATION_FORMATS = [ From 6cd5e5b07e4415d064d93b8a66331a097bd9287e Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Mon, 9 Sep 2024 23:02:52 -0400 Subject: [PATCH 19/54] [Misc] Fused MoE Marlin support for GPTQ (#8217) --- .buildkite/test-pipeline.yaml | 13 +- csrc/moe/marlin_moe_ops.cu | 2 +- csrc/moe/marlin_moe_ops.h | 2 +- csrc/moe/torch_bindings.cpp | 1 - tests/kernels/test_moe.py | 221 ++++++++++++- tests/weight_loading/models-large.txt | 3 + tests/weight_loading/models.txt | 2 - .../layers/fused_moe/__init__.py | 14 +- .../layers/fused_moe/fused_marlin_moe.py | 219 ++++++++++++ .../layers/fused_moe/fused_moe.py | 138 ++------ vllm/model_executor/layers/fused_moe/layer.py | 75 +++-- .../compressed_tensors_moe.py | 48 +-- .../schemes/compressed_tensors_wNa16.py | 2 +- .../layers/quantization/gptq_marlin.py | 312 +++++++++++++++++- .../layers/quantization/utils/marlin_utils.py | 17 + .../quantization/utils/marlin_utils_test.py | 11 +- .../layers/quantization/utils/quant_utils.py | 19 +- vllm/model_executor/model_loader/utils.py | 8 + vllm/model_executor/models/mixtral.py | 9 +- 19 files changed, 912 insertions(+), 204 deletions(-) create mode 100644 tests/weight_loading/models-large.txt create mode 100644 vllm/model_executor/layers/fused_moe/fused_marlin_moe.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d0317b2fc48c..a0c7b7442b3b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -386,7 +386,18 @@ steps: - vllm/ - tests/weight_loading commands: - - bash weight_loading/run_model_weight_loading_test.sh + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + +- label: Weight Loading Multiple GPU Test - Large Models # optional + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + gpu: a100 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt ##### multi gpus test ##### diff --git a/csrc/moe/marlin_moe_ops.cu b/csrc/moe/marlin_moe_ops.cu index 1e170e80d2f7..92184f43c9eb 100644 --- a/csrc/moe/marlin_moe_ops.cu +++ b/csrc/moe/marlin_moe_ops.cu @@ -1737,4 +1737,4 @@ torch::Tensor marlin_gemm_moe( moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, max_par, replicate_input, apply_weights); return c; -} \ No newline at end of file +} diff --git a/csrc/moe/marlin_moe_ops.h b/csrc/moe/marlin_moe_ops.h index 01ba8ff69850..43d264e0770d 100644 --- a/csrc/moe/marlin_moe_ops.h +++ b/csrc/moe/marlin_moe_ops.h @@ -9,4 +9,4 @@ torch::Tensor marlin_gemm_moe( const torch::Tensor& 
g_idx, const torch::Tensor& perm, torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size, - bool replicate_input, bool apply_weights); \ No newline at end of file + bool replicate_input, bool apply_weights); diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index d4d43e2c601b..8a0e625b43fa 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -16,7 +16,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { "g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int " "size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, " "bool replicate_input, bool apply_weights) -> Tensor"); - m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe); #endif } diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index f526c381b333..2250cf1598b8 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -2,6 +2,8 @@ Run `pytest tests/kernels/test_moe.py`. """ +from typing import List + import pytest import torch from transformers import MixtralConfig @@ -9,7 +11,13 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( + fused_marlin_moe, single_marlin_moe) +from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk +from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( + marlin_quantize) from vllm.model_executor.models.mixtral import MixtralMoE +from vllm.scalar_type import scalar_types def torch_moe(a, w1, w2, score, topk): @@ -29,6 +37,20 @@ def torch_moe(a, w1, w2, score, topk): topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) +def torch_moe_single(a, w, score, topk): + B, D = a.shape + a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + out = torch.zeros(B * topk, w.shape[1], dtype=a.dtype, device=a.device) + score = torch.softmax(score, dim=-1, dtype=torch.float32) + _, topk_ids = torch.topk(score, topk) + topk_ids = topk_ids.view(-1) + for i in range(w.shape[0]): + mask = topk_ids == i + if mask.sum(): + out[mask] = a[mask] @ w[i].transpose(0, 1) + return (out.view(B, -1, w.shape[1])).sum(dim=1) + + @pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1]) @pytest.mark.parametrize("n", [2048, 256, 1024]) @pytest.mark.parametrize("k", [128, 511, 1024]) @@ -43,11 +65,11 @@ def test_fused_moe( topk: int, dtype: torch.dtype, ): - a = torch.randn((m, k), device='cuda', dtype=dtype) / 10 - w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10 - w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10 + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 - score = torch.randn((m, e), device='cuda', dtype=dtype) + score = torch.randn((m, e), device="cuda", dtype=dtype) triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False) torch_output = torch_moe(a, w1, w2, score, topk) torch.testing.assert_close(triton_output, torch_output, atol=1e-2, rtol=0) @@ -99,3 +121,194 @@ def test_mixtral_moe(dtype: torch.dtype): vllm_states, rtol=mixtral_moe_tol[dtype], atol=mixtral_moe_tol[dtype]) + + +def stack_and_dev(tensors: List[torch.Tensor]): + dev = tensors[0].device + return torch.stack(tensors, dim=0).to(dev) + + +def compute_max_diff(output, output_ref): + return 
torch.mean(torch.abs(output - output_ref)) / torch.mean( + torch.abs(output_ref)) + + +@pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) +@pytest.mark.parametrize("n", [128, 2048, 256, 1024]) +@pytest.mark.parametrize("k", [128, 1024, 512]) +@pytest.mark.parametrize("e", [4, 8, 64]) +@pytest.mark.parametrize("topk", [2, 6]) +@pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) +@pytest.mark.parametrize("act_order", [True, False]) +def test_fused_marlin_moe( + m: int, + n: int, + k: int, + e: int, + topk: int, + group_size: int, + act_order: bool, +): + torch.manual_seed(7) + + if topk > e: + return + + # Filter act_order + if act_order: + if group_size == -1: + return + if group_size in (k, n): + return + + quant_type = scalar_types.uint4b8 + dtype = torch.float16 + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + for i in range(w2.shape[0]): + w2[0] = torch.eye(k, n, device="cuda", dtype=dtype) + + w_ref1_l = [] + qweight1_l = [] + scales1_l = [] + g_idx1_l = [] + sort_indices1_l = [] + + for i in range(w1.shape[0]): + test_perm = torch.randperm(k) + w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = marlin_quantize( + w1[i].transpose(1, 0), quant_type, group_size, act_order, + test_perm) + w_ref1_l.append(w_ref1) + qweight1_l.append(qweight1) + scales1_l.append(scales1) + g_idx1_l.append(g_idx1) + sort_indices1_l.append(sort_indices1) + + w_ref1 = stack_and_dev(w_ref1_l) + qweight1 = stack_and_dev(qweight1_l).contiguous() + scales1 = stack_and_dev(scales1_l) + g_idx1 = stack_and_dev(g_idx1_l) + sort_indices1 = stack_and_dev(sort_indices1_l) + + w_ref2_l = [] + qweight2_l = [] + scales2_l = [] + g_idx2_l = [] + sort_indices2_l = [] + + for i in range(w2.shape[0]): + test_perm = torch.randperm(n) + w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize( + w2[i].transpose(1, 0), quant_type, group_size, act_order, + test_perm) + w_ref2_l.append(w_ref2) + qweight2_l.append(qweight2) + scales2_l.append(scales2) + g_idx2_l.append(g_idx2) + sort_indices2_l.append(sort_indices2) + + w_ref2 = stack_and_dev(w_ref2_l) + qweight2 = stack_and_dev(qweight2_l).contiguous() + scales2 = stack_and_dev(scales2_l) + g_idx2 = stack_and_dev(g_idx2_l) + sort_indices2 = stack_and_dev(sort_indices2_l) + + score = torch.randn((m, e), device="cuda", dtype=dtype) + + topk_weights, topk_ids = fused_topk(a, score, topk, False) + + triton_output = fused_moe( + a, + w_ref1.transpose(1, 2).contiguous(), + w_ref2.transpose(1, 2).contiguous(), + score, + topk, + renormalize=False, + ) + marlin_output = fused_marlin_moe( + a, + qweight1, + qweight2, + score, + g_idx1, + g_idx2, + sort_indices1, + sort_indices2, + topk_weights, + topk_ids, + w1_scale=scales1, + w2_scale=scales2, + ) + + assert compute_max_diff(marlin_output, triton_output) < 4e-2 + + +@pytest.mark.skip("This test is here for the sake of debugging, " + "don't run it in automated tests.") +@pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) +@pytest.mark.parametrize("n", [128, 2048, 256, 1024]) +@pytest.mark.parametrize("k", [128, 1024, 512]) +@pytest.mark.parametrize("e", [4, 8, 64]) +@pytest.mark.parametrize("topk", [2, 6]) +@pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) +@pytest.mark.parametrize("act_order", [True, False]) +def test_marlin_moe_mmm( + m: int, + n: int, + k: int, + e: int, + topk: int, + group_size: int, + act_order: bool, +): + if topk > e: + return + + # Filter 
act_order + if act_order: + if group_size == -1: + return + if group_size == k: + return + + quant_type = scalar_types.uint4b8 + dtype = torch.float16 + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10 + + w_ref_l = [] + qweights_l = [] + scales_l = [] + g_idx_l = [] + sort_indices_l = [] + + for i in range(w.shape[0]): + test_perm = torch.randperm(k) + w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize( + w[i].transpose(1, 0), quant_type, group_size, act_order, test_perm) + w_ref_l.append(w_ref) + qweights_l.append(qweight) + scales_l.append(scales) + g_idx_l.append(g_idx) + sort_indices_l.append(sort_indices) + + w_ref = stack_and_dev(w_ref_l) + qweight = stack_and_dev(qweights_l).contiguous() + scales = stack_and_dev(scales_l) + g_idx = stack_and_dev(g_idx_l) + sort_indices = stack_and_dev(sort_indices_l) + + score = torch.randn((m, e), device="cuda", dtype=dtype) + marlin_output = single_marlin_moe(a, + qweight, + scales, + score, + g_idx, + sort_indices, + topk, + renormalize=False) + torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk) + + assert compute_max_diff(marlin_output, torch_output) < 1e-2 diff --git a/tests/weight_loading/models-large.txt b/tests/weight_loading/models-large.txt new file mode 100644 index 000000000000..fe7670574676 --- /dev/null +++ b/tests/weight_loading/models-large.txt @@ -0,0 +1,3 @@ +compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main +compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main +gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main \ No newline at end of file diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt index c708e6d5eb89..a90b352a39bc 100644 --- a/tests/weight_loading/models.txt +++ b/tests/weight_loading/models.txt @@ -19,8 +19,6 @@ compressed-tensors, nm-testing/tinyllama-oneshot-w8a16-per-channel, main compressed-tensors, nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test, main compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main -compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main -compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main compressed-tensors, nm-testing/TinyLlama-1.1B-Chat-v1.0-actorder-group, main awq, casperhansen/mixtral-instruct-awq, main awq_marlin, casperhansen/mixtral-instruct-awq, main diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index fd6f41b90042..e9b5703ca28b 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -2,16 +2,22 @@ FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.triton_utils import HAS_TRITON -__all__ = ["FusedMoE", "FusedMoEMethodBase", "FusedMoeWeightScaleSupported"] +__all__ = [ + "FusedMoE", + "FusedMoEMethodBase", + "FusedMoeWeightScaleSupported", +] if HAS_TRITON: - + from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( + fused_marlin_moe, single_marlin_moe) from vllm.model_executor.layers.fused_moe.fused_moe import ( - fused_experts, fused_marlin_moe, fused_moe, fused_topk, - get_config_file_name, grouped_topk) + fused_experts, fused_moe, fused_topk, get_config_file_name, + grouped_topk) __all__ += [ "fused_marlin_moe", + "single_marlin_moe", "fused_moe", "fused_topk", 
"fused_experts", diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py new file mode 100644 index 000000000000..200a6148978a --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -0,0 +1,219 @@ +"""Fused MoE utilities for GPTQ.""" +import functools +from typing import Any, Dict, Optional + +import torch + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.fused_moe.fused_moe import ( + fused_topk, moe_align_block_size, try_get_optimal_moe_config) + + +def single_marlin_moe( + hidden_states: torch.Tensor, + w: torch.Tensor, + scales: torch.Tensor, + gating_output: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + topk: int, + renormalize: bool, + override_config: Optional[Dict[str, Any]] = None) -> torch.Tensor: + """ + This function computes the multiplication of hidden_states with expert + weights used in Marlin MoE, using weights w and top-k gating mechanism. + Its purpose is testing and debugging the fused MoE kernel. + + Parameters: + - hidden_states (torch.Tensor): The input tensor to the Marlin Mul. + - w (torch.Tensor): The set of expert weights. + - scales (torch.Tensor): The quantization scales. + - gating_output (torch.Tensor): The output of the gating operation + (before softmax). + - g_idx (torch.Tensor): The act_order indices. + - perm (torch.Tensor): The act_order input permutation. + - topk (int): The number of top-k experts to select. + - renormalize (bool): If True, renormalize the top-k weights to sum to 1. + - override_config (Optional[Dict[str, Any]]): Optional override + for the kernel configuration. + + Returns: + - torch.Tensor: The output tensor after applying the MoE layer. + """ + # Check constraints. 
+ assert hidden_states.shape[0] == gating_output.shape[0], ( + "Number of tokens mismatch") + assert hidden_states.shape[1] == w.shape[1] * 16, "Hidden size mismatch" + assert gating_output.shape[1] == w.shape[0], "Number of experts mismatch" + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w.is_contiguous(), "Expert weights must be contiguous" + assert hidden_states.dtype == torch.float16 + + M, K = hidden_states.shape + E = w.shape[0] + N = w.shape[2] // 2 + + topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, + renormalize) + + # This might not be an optimal config for a single MMM + get_config_func = functools.partial(try_get_optimal_moe_config, + w.shape, + w.shape, + topk_ids.shape[1], + None, + override_config=override_config, + is_marlin=True) + config = get_config_func(M) + + block_size_m = config['BLOCK_SIZE_M'] + + sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E) + + max_workspace_size = (N // 64) * 16 + workspace = torch.zeros(max_workspace_size, + dtype=torch.int, + device="cuda", + requires_grad=False) + + intermediate_cache = torch.ops._moe_C.marlin_gemm_moe( + hidden_states, w, sorted_token_ids, topk_weights, topk_ids, scales, + g_idx, perm, workspace, M, N, K, True, E, topk, block_size_m, True, + False) + + return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1) + + +def fused_marlin_moe( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + gating_output: torch.Tensor, + g_idx1: torch.Tensor, + g_idx2: torch.Tensor, + perm1: torch.Tensor, + perm2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + override_config: Optional[Dict[str, Any]] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, +) -> torch.Tensor: + """ + This function computes a Mixture of Experts (MoE) layer using two sets of + weights, w1 and w2, and top-k gating mechanism. + + Parameters: + - hidden_states (torch.Tensor): The input tensor to the MoE layer. + - w1 (torch.Tensor): The first set of expert weights. + - w2 (torch.Tensor): The second set of expert weights. + - gating_output (torch.Tensor): The output of the gating operation + (before softmax). + - g_idx1 (torch.Tensor): The first set of act_order indices. + - g_idx2 (torch.Tensor): The second set of act_order indices. + - perm1 (torch.Tensor): The first act_order input permutation. + - perm2 (torch.Tensor): The second act_order input permutation. + - topk_weights (torch.Tensor): Top-k weights. + - topk_ids (torch.Tensor): Indices of topk-k elements. + - renormalize (bool): If True, renormalize the top-k weights to sum to 1. + - override_config (Optional[Dict[str, Any]]): Optional override + for the kernel configuration. + - w1_scale (Optional[torch.Tensor]): Optional scale to be used for + w1. + - w2_scale (Optional[torch.Tensor]): Optional scale to be used for + w2. + + Returns: + - torch.Tensor: The output tensor after applying the MoE layer. + """ + # Check constraints. 
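+ # Illustrative call shape (mirrors tests/kernels/test_moe.py; qweight1/2,
+ # scales1/2, g_idx1/2 and sort_indices1/2 are the stacked per-expert
+ # Marlin-packed tensors built there, not names defined in this module):
+ #   topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
+ #   out = fused_marlin_moe(a, qweight1, qweight2, score,
+ #                          g_idx1, g_idx2, sort_indices1, sort_indices2,
+ #                          topk_weights, topk_ids,
+ #                          w1_scale=scales1, w2_scale=scales2)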
+ assert hidden_states.shape[0] == gating_output.shape[ + 0], "Number of tokens mismatch" + assert hidden_states.shape[ + 1] == w1.shape[1] * 16, "Hidden size mismatch w1" + assert hidden_states.shape[ + 1] == w2.shape[2] // 2, "Hidden size mismatch w2" + assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w1.is_contiguous(), "Expert weights1 must be contiguous" + assert w2.is_contiguous(), "Expert weights2 must be contiguous" + assert hidden_states.dtype == torch.float16 + + M, K = hidden_states.shape + E = w1.shape[0] + N = w2.shape[1] * 16 + topk = topk_ids.shape[1] + + get_config_func = functools.partial( + try_get_optimal_moe_config, + w1.shape, + w2.shape, + topk_ids.shape[1], + None, + override_config=override_config, + is_marlin=True, + ) + config = get_config_func(M) + + block_size_m = config["BLOCK_SIZE_M"] + + sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E) + + max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16 + workspace = torch.zeros(max_workspace_size, + dtype=torch.int, + device="cuda", + requires_grad=False) + + intermediate_cache2 = torch.empty( + (M * topk_ids.shape[1], N), + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + + intermediate_cache1 = torch.ops._moe_C.marlin_gemm_moe( + hidden_states, + w1, + sorted_token_ids, + topk_weights, + topk_ids, + w1_scale, + g_idx1, + perm1, + workspace, + M, + 2 * N, + K, + True, + E, + topk, + block_size_m, + True, + False, + ) + + ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) + + intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe( + intermediate_cache2, + w2, + sorted_token_ids, + topk_weights, + topk_ids, + w2_scale, + g_idx2, + perm2, + workspace, + M, + K, + N, + True, + E, + topk, + block_size_m, + False, + True, + ) + + return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), + dim=1) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 05169eaddb25..bd13d8fecbb9 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -323,15 +323,22 @@ def get_moe_configs(E: int, N: int, return None -def get_default_config(M: int, E: int, N: int, K: int, topk: int, - dtype: Optional[str], - is_marlin: bool) -> Dict[str, int]: +def get_default_config( + M: int, + E: int, + N: int, + K: int, + topk: int, + dtype: Optional[str], + is_marlin: bool, +) -> Dict[str, int]: config = { 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8 } + # A heuristic: fused marlin works faster with this config for small M if M <= E or (is_marlin and M <= 32): config = { 'BLOCK_SIZE_M': 16, @@ -342,14 +349,15 @@ def get_default_config(M: int, E: int, N: int, K: int, topk: int, return config -def try_get_optimal_moe_config(w1_shape: Tuple[int, ...], - w2_shape: Tuple[int, ...], - top_k: int, - dtype: Optional[str], - M: int, - override_config: Optional[Dict[str, - Any]] = None, - is_marlin: bool = False): +def try_get_optimal_moe_config( + w1_shape: Tuple[int, ...], + w2_shape: Tuple[int, ...], + top_k: int, + dtype: Optional[str], + M: int, + override_config: Optional[Dict[str, Any]] = None, + is_marlin: bool = False, +): if override_config: config = override_config else: @@ -391,6 +399,7 @@ def fused_topk( topk, dtype=torch.int32, device=hidden_states.device) + ops.topk_softmax( topk_weights, 
topk_ids, @@ -437,113 +446,6 @@ def grouped_topk(hidden_states: torch.Tensor, return topk_weights, topk_ids -def fused_marlin_moe(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - gating_output: torch.Tensor, - g_idx1: torch.Tensor, - g_idx2: torch.Tensor, - rand_perm1: torch.Tensor, - rand_perm2: torch.Tensor, - topk: int, - custom_routing_function: Optional[Callable] = None, - renormalize: bool = True, - override_config: Optional[Dict[str, Any]] = None, - use_fp8: bool = False, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None) -> torch.Tensor: - """ - This function computes a Mixture of Experts (MoE) layer using two sets of - weights, w1 and w2, and top-k gating mechanism. - Parameters: - - hidden_states (torch.Tensor): The input tensor to the MoE layer. - - w1 (torch.Tensor): The first set of expert weights. - - w2 (torch.Tensor): The second set of expert weights. - - gating_output (torch.Tensor): The output of the gating operation - (before softmax). - - topk (int): The number of top-k experts to select. - - renormalize (bool): If True, renormalize the top-k weights to sum to 1. - - inplace (bool): If True, perform the operation in-place. - Defaults to False. - - override_config (Optional[Dict[str, Any]]): Optional override - for the kernel configuration. - - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner - products for w1 and w2. Defaults to False. - - w1_scale (Optional[torch.Tensor]): Optional scale to be used for - w1. - - w2_scale (Optional[torch.Tensor]): Optional scale to be used for - w2. - Returns: - - torch.Tensor: The output tensor after applying the MoE layer. - """ - # Check constraints. - assert hidden_states.shape[0] == gating_output.shape[0], ( - "Number of tokens mismatch") - assert hidden_states.shape[ - 1] == w1.shape[1] * 16, "Hidden size mismatch w1" - assert hidden_states.shape[ - 1] == w2.shape[2] // 2, "Hidden size mismatch w2" - assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" - assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" - assert w1.is_contiguous(), "Expert weights1 must be contiguous" - assert w2.is_contiguous(), "Expert weights2 must be contiguous" - assert hidden_states.dtype in [ - torch.float32, torch.float16, torch.bfloat16 - ] - - #TODO fp8 is not implemented yet - assert not use_fp8 - - M, K = hidden_states.shape - E = w1.shape[0] - N = w2.shape[1] * 16 - - if custom_routing_function is None: - topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, - renormalize) - else: - topk_weights, topk_ids = custom_routing_function( - hidden_states, gating_output, topk, renormalize) - - get_config_func = functools.partial(try_get_optimal_moe_config, - w1.shape, - w2.shape, - topk_ids.shape[1], - "float8" if use_fp8 else None, - override_config=override_config, - is_marlin=True) - config = get_config_func(M) - - block_size_m = config['BLOCK_SIZE_M'] - - sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E) - - max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16 - workspace = torch.zeros(max_workspace_size, - dtype=torch.int, - device="cuda", - requires_grad=False) - - intermediate_cache2 = torch.empty((M * topk_ids.shape[1], N), - device=hidden_states.device, - dtype=hidden_states.dtype) - - intermediate_cache1 = torch.ops._moe_C.marlin_gemm_moe( - hidden_states, w1, sorted_token_ids, topk_weights, topk_ids, w1_scale, - g_idx1, rand_perm1, workspace, M, 2 * N, K, True, E, topk, - 
block_size_m, True, False) - - ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - - intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe( - intermediate_cache2, w2, sorted_token_ids, topk_weights, topk_ids, - w2_scale, g_idx2, rand_perm2, workspace, M, K, N, True, E, topk, - block_size_m, False, True) - - return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), - dim=1) - - def get_config_dtype_str(dtype: torch.dtype, use_int8_w8a16: Optional[bool] = False, use_fp8_w8a8: Optional[bool] = False): diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 3df0b61a9ebe..f6c6f5f52940 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -306,10 +306,28 @@ def _load_single_value(self, param: torch.nn.Parameter, # Input scales can be loaded directly and should be equal. param_data[expert_id] = loaded_weight + def _load_g_idx(self, shard_id: str, expert_data: torch.Tensor, + shard_dim: int, loaded_weight: torch.tensor, tp_rank: int): + + if shard_id == "w2": + self._load_w2(shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=tp_rank) + else: + assert shard_id in ("w1", "w3") + expert_data.copy_(loaded_weight) + def weight_loader(self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, weight_name: str, shard_id: str, expert_id: int) -> None: + # compressed-tensors represents weights on disk which are flipped + loaded_weight = loaded_weight.t().contiguous() if ( + self.quant_method.__class__.__name__ + == "CompressedTensorsMoEMethod") else loaded_weight + if shard_id not in ("w1", "w2", "w3"): raise ValueError(f"shard_id must be ['w1','w2','w3'] but " f"got {shard_id}.") @@ -325,19 +343,41 @@ def weight_loader(self, param: torch.nn.Parameter, expert_data = param.data[expert_id] tp_rank = get_tensor_model_parallel_rank() - # is_transposed: whether or not the parameter is transposed on disk - # If transposed, the loaded weight will be transposed and the dim - # to shard the loaded weight will be flipped. + # is_transposed: if the dim to shard the weight + # should be flipped. Required by GPTQ, compressed-tensors + # should be whatever dimension intermediate_size is is_transposed = getattr(param, "is_transposed", False) shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id] if is_transposed: - loaded_weight = loaded_weight.t().contiguous() shard_dim = ~shard_dim - # Case weight_scales - if "weight_scale" in weight_name: - # load the weight scaling based on the quantization scheme - # supported weight scales can be found in + # Case input scale: input_scale loading is only supported for fp8 + if "input_scale" in weight_name: + if param.data[expert_id] != 1 and (param.data[expert_id] - + loaded_weight).abs() > 1e-5: + raise ValueError( + "input_scales of w1 and w3 of a layer " + f"must be equal. But got {param.data[expert_id]} " + f"vs. 
{loaded_weight}") + + self._load_single_value(param=param, + loaded_weight=loaded_weight, + expert_id=expert_id) + return + + # Case g_idx + if "g_idx" in weight_name: + self._load_g_idx(shard_dim=0, + shard_id=shard_id, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=tp_rank) + return + + # Case weight scales and zero_points + if ("scale" in weight_name or "zero" in weight_name): + # load the weight scales and zp based on the quantization scheme + # supported weight scales/zp can be found in # FusedMoeWeightScaleSupported # TODO @dsikka: once hardened, refactor to use vLLM Parameters # specific to each case @@ -366,22 +406,9 @@ def weight_loader(self, param: torch.nn.Parameter, f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}") return + # Case weight_shape if "weight_shape" in weight_name: - self._load_single_value(param=param, - loaded_weight=loaded_weight, - expert_id=expert_id) - return - - # Case input scale - if "input_scale" in weight_name: - # Note: input_scale loading is only supported for fp8 - if param.data[expert_id] != 1 and (param.data[expert_id] - - loaded_weight).abs() > 1e-5: - raise ValueError( - "input_scales of w1 and w3 of a layer " - f"must be equal. But got {param.data[expert_id]} " - f"vs. {loaded_weight}") - + # only required by compressed-tensors self._load_single_value(param=param, loaded_weight=loaded_weight, expert_id=expert_id) @@ -498,4 +525,4 @@ def _load_fp8_scale(self, param: torch.nn.Parameter, param_data[expert_id][idx] = loaded_weight # If we are in the row parallel case (down_proj) else: - param_data[expert_id] = loaded_weight \ No newline at end of file + param_data[expert_id] = loaded_weight diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 36323493d601..49c29c2775cb 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -5,9 +5,7 @@ import torch from vllm import _custom_ops as ops -from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase -from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - WNA16_SUPPORTED_BITS) +from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat) from vllm.model_executor.utils import set_weight_attrs @@ -40,11 +38,10 @@ def __init__( if not (self.quant_config.quant_format == CompressionFormat.pack_quantized.value - and self.num_bits in WNA16_SUPPORTED_BITS): + and self.num_bits == 4): raise ValueError("For Fused MoE layers, only ", f"{CompressionFormat.pack_quantized.value} ", - "is supported for the following bits: ", - f"{WNA16_SUPPORTED_BITS}") + "is supported for 4 bits") def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size: int, @@ -269,19 +266,30 @@ def apply( custom_routing_function: Optional[Callable] = None, ) -> torch.Tensor: - from vllm.model_executor.layers.fused_moe.fused_moe import ( + from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( fused_marlin_moe) - return fused_marlin_moe(x, - layer.w13_weight_packed, - layer.w2_weight_packed, - router_logits, - layer.w13_g_idx, - layer.w2_g_idx, - layer.w13_g_idx_sort_indices, - layer.w2_g_idx_sort_indices, - top_k, - custom_routing_function, - 
renormalize=renormalize, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale) + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function) + + return fused_marlin_moe( + x, + layer.w13_weight_packed, + layer.w2_weight_packed, + router_logits, + layer.w13_g_idx, + layer.w2_g_idx, + layer.w13_g_idx_sort_indices, + layer.w2_g_idx_sort_indices, + topk_weights, + topk_ids, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index 8897737c1c55..3cade3d3fbcd 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -22,7 +22,7 @@ __all__ = ["CompressedTensorsWNA16"] WNA16_SUPPORTED_TYPES_MAP = { 4: scalar_types.uint4b8, - 8: scalar_types.uint8b128, + 8: scalar_types.uint8b128 } WNA16_SUPPORTED_BITS = list(WNA16_SUPPORTED_TYPES_MAP.keys()) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index b06ff7bd2bac..3617a32f80fc 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,18 +1,22 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional, Union import torch from torch.nn import Parameter from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.fused_moe.layer import ( + FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( apply_gptq_marlin_linear, check_marlin_supported, marlin_is_k_full, - marlin_make_empty_g_idx, marlin_make_workspace, marlin_permute_scales, - marlin_repeat_scales_on_all_ranks, marlin_sort_g_idx, replace_tensor, - verify_marlin_supported, verify_marlin_supports_shape) + marlin_make_empty_g_idx, marlin_make_workspace, marlin_moe_permute_scales, + marlin_permute_scales, marlin_repeat_scales_on_all_ranks, + marlin_sort_g_idx, replace_tensor, verify_marlin_supported, + verify_marlin_supports_shape) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.parameter import (ChannelQuantScaleParameter, GroupQuantScaleParameter, @@ -33,8 +37,14 @@ class GPTQMarlinConfig(QuantizationConfig): (8, True): scalar_types.uint8b128, } - def __init__(self, weight_bits: int, group_size: int, desc_act: bool, - is_sym: bool, lm_head_quantized: bool) -> None: + def __init__( + self, + weight_bits: int, + group_size: int, + desc_act: bool, + is_sym: bool, + lm_head_quantized: bool, + ) -> None: if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False # (since we have only one group per output channel) @@ -105,11 
+115,14 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["GPTQMarlinLinearMethod"]: - if (isinstance(layer, LinearBase) or - (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)): + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod"]]: + if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) + and self.lm_head_quantized): return GPTQMarlinLinearMethod(self) + elif isinstance(layer, FusedMoE): + return GPTQMarlinMoEMethod(self) return None def get_scaled_act_names(self) -> List[str]: @@ -179,7 +192,8 @@ def create_weights( output_size_per_partition=output_size_per_partition, input_size_per_partition=input_size_per_partition, input_size=input_size, - group_size=group_size) + group_size=group_size, + ) # Determine sharding if marlin_repeat_scales_on_all_ranks(self.quant_config.desc_act, @@ -299,7 +313,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: perm=layer.g_idx_sort_indices, size_k=layer.input_size_per_partition, size_n=layer.output_size_per_partition, - num_bits=self.quant_config.quant_type.size_bits) + num_bits=self.quant_config.quant_type.size_bits, + ) replace_tensor(layer, "qweight", marlin_qweight) # Permute scales from autogptq format to marlin format. @@ -308,7 +323,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: size_k=(layer.input_size if self.quant_config.desc_act else layer.input_size_per_partition), size_n=layer.output_size_per_partition, - group_size=self.quant_config.group_size) + group_size=self.quant_config.group_size, + ) replace_tensor(layer, "scales", marlin_scales) def apply( @@ -329,4 +345,270 @@ def apply( output_size_per_partition=layer.output_size_per_partition, input_size_per_partition=layer.input_size_per_partition, is_k_full=layer.is_k_full, - bias=bias) + bias=bias, + ) + + +class GPTQMarlinMoEMethod(FusedMoEMethodBase): + """MoE Marlin method with quantization.""" + + def __init__(self, quant_config: GPTQMarlinConfig) -> None: + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + # Currently assuming is_k_full is always True + # (input size per partition is the same as full input size) + # Supports only sym for now (no zp) + if self.quant_config.group_size != -1: + scales_size13 = hidden_size // self.quant_config.group_size + scales_size2 = intermediate_size // self.quant_config.group_size + strategy = FusedMoeWeightScaleSupported.GROUP.value + else: + scales_size13 = 1 + scales_size2 = 1 + strategy = FusedMoeWeightScaleSupported.CHANNEL.value + + extra_weight_attrs.update({ + "quant_method": strategy, + "is_transposed": True + }) + # Fused gate_up_proj (column parallel) + w13_qweight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size // self.quant_config.pack_factor, + 2 * intermediate_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_qweight", w13_qweight) + set_weight_attrs(w13_qweight, extra_weight_attrs) + # down_proj (row parallel) + w2_qweight = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size // self.quant_config.pack_factor, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + 
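The qweight shapes above store 4-bit values packed into int32 words along the input dimension, with `pack_factor = 32 // weight_bits`. A small sanity-check sketch, using made-up example sizes:

```python
# Illustrative shape check for the packed GPTQ tensors (example sizes are assumptions).
weight_bits = 4
pack_factor = 32 // weight_bits                      # 8 int4 values per int32 word
num_experts, hidden_size, intermediate_size = 8, 4096, 14336

w13_qweight_shape = (num_experts, hidden_size // pack_factor, 2 * intermediate_size)
w2_qweight_shape = (num_experts, intermediate_size // pack_factor, hidden_size)
# -> (8, 512, 28672) and (8, 1792, 4096): the input (K) dimension is packed
#    by pack_factor, while the output (N) dimension stays unpacked.
```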
layer.register_parameter("w2_qweight", w2_qweight) + set_weight_attrs(w2_qweight, extra_weight_attrs) + # up_proj scales + w13_scales = torch.nn.Parameter( + torch.empty(num_experts, + scales_size13, + 2 * intermediate_size, + dtype=torch.half), + requires_grad=False, + ) + layer.register_parameter("w13_scales", w13_scales) + set_weight_attrs(w13_scales, extra_weight_attrs) + # down_proj scales + w2_scales = torch.nn.Parameter( + torch.empty(num_experts, + scales_size2, + hidden_size, + dtype=torch.half), + requires_grad=False, + ) + layer.register_parameter("w2_scales", w2_scales) + set_weight_attrs(w2_scales, extra_weight_attrs) + # up_proj scales + w13_qzeros = torch.nn.Parameter( + torch.empty(num_experts, + scales_size13, + 2 * intermediate_size // self.quant_config.pack_factor, + dtype=params_dtype), + requires_grad=False, + ) + layer.register_parameter("w13_qzeros", w13_qzeros) + set_weight_attrs(w13_qzeros, extra_weight_attrs) + # down_proj scales + w2_qzeros = torch.nn.Parameter( + torch.empty(num_experts, + scales_size2, + hidden_size // self.quant_config.pack_factor, + dtype=params_dtype), + requires_grad=False, + ) + layer.register_parameter("w2_qzeros", w2_qzeros) + set_weight_attrs(w2_qzeros, extra_weight_attrs) + w13_g_idx = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_g_idx", w13_g_idx) + set_weight_attrs(w13_g_idx, extra_weight_attrs) + w2_g_idx = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_g_idx", w2_g_idx) + set_weight_attrs(w2_g_idx, extra_weight_attrs) + w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_g_idx_sort_indices", + w13_g_idx_sort_indices) + set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs) + w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_g_idx_sort_indices", + w2_g_idx_sort_indices) + set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + + # Process act_order + if self.quant_config.desc_act: + # Get sorting based on g_idx + num_experts = layer.w13_g_idx.shape[0] + w13_g_idx_sort_indices = torch.empty_like(layer.w13_g_idx) + w2_g_idx_sort_indices = torch.empty_like(layer.w2_g_idx) + w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx) + w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx) + for e in range(num_experts): + w13_g_idx_sort_indices[e] = torch.argsort( + layer.w13_g_idx[e]).to(torch.int32) + w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_g_idx[e]).to( + torch.int32) + w13_sorted_g_idx[e] = layer.w13_g_idx[e][ + w13_g_idx_sort_indices[e]] + w2_sorted_g_idx[e] = layer.w2_g_idx[e][ + w2_g_idx_sort_indices[e]] + replace_tensor(layer, "w13_g_idx", w13_sorted_g_idx) + replace_tensor(layer, "w2_g_idx", w2_sorted_g_idx) + replace_tensor(layer, "w13_g_idx_sort_indices", + w13_g_idx_sort_indices) + replace_tensor(layer, "w2_g_idx_sort_indices", + w2_g_idx_sort_indices) + else: + # Reset g_idx related tensors + num_experts = layer.w13_g_idx.shape[0] + device = layer.w13_g_idx.device + layer.w13_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + 
requires_grad=False, + ) + layer.w2_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + # Repack weights + marlin_w13_qweight = ops.gptq_marlin_moe_repack( + layer.w13_qweight, + layer.w13_g_idx_sort_indices, + layer.w13_qweight.shape[1] * self.quant_config.pack_factor, + layer.w13_qweight.shape[2], + self.quant_config.quant_type.size_bits, + ) + replace_tensor(layer, "w13_qweight", marlin_w13_qweight) + marlin_w2_qweight = ops.gptq_marlin_moe_repack( + layer.w2_qweight, + layer.w2_g_idx_sort_indices, + layer.w2_qweight.shape[1] * self.quant_config.pack_factor, + layer.w2_qweight.shape[2], + self.quant_config.quant_type.size_bits, + ) + replace_tensor(layer, "w2_qweight", marlin_w2_qweight) + # Repack scales + marlin_w13_scales = marlin_moe_permute_scales( + s=layer.w13_scales, + size_k=layer.intermediate_size_per_partition, + size_n=layer.w13_scales.shape[2], + group_size=self.quant_config.group_size, + ) + replace_tensor(layer, "w13_scales", marlin_w13_scales) + marlin_w2_scales = marlin_moe_permute_scales( + s=layer.w2_scales, + size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor, + size_n=layer.w2_scales.shape[2], + group_size=self.quant_config.group_size, + ) + replace_tensor(layer, "w2_scales", marlin_w2_scales) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + ) -> torch.Tensor: + from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( + fused_marlin_moe) + + # The input must currently be float16 + orig_dtype = x.dtype + x = x.half() + + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=None) + + return fused_marlin_moe( + x, + layer.w13_qweight, + layer.w2_qweight, + router_logits, + layer.w13_g_idx, + layer.w2_g_idx, + layer.w13_g_idx_sort_indices, + layer.w2_g_idx_sort_indices, + topk_weights, + topk_ids, + w1_scale=layer.w13_scales, + w2_scale=layer.w2_scales, + ).to(orig_dtype) diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 0ec68ac5b0f2..699d5f184414 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -176,6 +176,23 @@ def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int, return s +def marlin_moe_permute_scales( + s: torch.Tensor, + size_k: int, + size_n: int, + group_size: int, +): + num_experts = s.shape[0] + output = torch.empty( + (num_experts, s.shape[1], s.shape[2]), + device=s.device, + dtype=s.dtype, + ) + for e in range(num_experts): + output[e] = marlin_permute_scales(s[e], size_k, size_n, group_size) + return output + + def marlin_zero_points(zp: torch.Tensor, size_k: int, size_n: int, num_bits: int) -> 
torch.Tensor: # Permute zero-points in a similar way to scales, but do not use the diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py index 7d08ac6f8746..4a06c5d63d52 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py @@ -1,6 +1,6 @@ """Utility functions used for tests and benchmarks""" -from typing import List +from typing import List, Optional import numpy as np import torch @@ -92,8 +92,11 @@ def get_weight_perm(num_bits: int): return perm -def marlin_quantize(w: torch.Tensor, quant_type: ScalarType, group_size: int, - act_order: bool): +def marlin_quantize(w: torch.Tensor, + quant_type: ScalarType, + group_size: int, + act_order: bool, + test_perm: Optional[torch.Tensor] = None): size_k, size_n = w.shape num_bits = quant_type.size_bits @@ -104,7 +107,7 @@ def marlin_quantize(w: torch.Tensor, quant_type: ScalarType, group_size: int, # Quantize (and apply act_order if provided) w_ref, q_w, s, g_idx, rand_perm = gptq_quantize_weights( - w, quant_type, group_size, act_order) + w, quant_type, group_size, act_order, test_perm) # For act_order, sort the "weights" and "g_idx" so that group ids are # increasing diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 33f24ff5d54d..bdfda31de852 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -1,5 +1,5 @@ """This file is used for /tests and /benchmarks""" -from typing import List +from typing import List, Optional import numpy import torch @@ -53,7 +53,10 @@ def get_pack_factor(num_bits): return 32 // num_bits -def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int): +def permute_rows(q_w: torch.Tensor, + w_ref: torch.Tensor, + group_size: int, + test_perm: Optional[torch.Tensor] = None): assert q_w.shape == w_ref.shape orig_device = q_w.device @@ -64,7 +67,7 @@ def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int): g_idx[i] = i // group_size # Simulate act_order by doing a random permutation on K - rand_perm = torch.randperm(k_size) + rand_perm = test_perm if test_perm is not None else torch.randperm(k_size) g_idx = g_idx[rand_perm].contiguous() q_w = q_w[rand_perm, :].contiguous() @@ -164,8 +167,11 @@ def reshape_w(w): ) -def gptq_quantize_weights(w: torch.Tensor, quant_type: ScalarType, - group_size: int, act_order: bool): +def gptq_quantize_weights(w: torch.Tensor, + quant_type: ScalarType, + group_size: int, + act_order: bool, + test_perm: Optional[torch.Tensor] = None): size_k, _ = w.shape assert w.is_floating_point(), "w must be float" @@ -186,7 +192,8 @@ def gptq_quantize_weights(w: torch.Tensor, quant_type: ScalarType, ), "For act_order, groupsize = {} must be less than size_k = {}".format( group_size, size_k) - w_ref, w_q, g_idx, rand_perm = permute_rows(w_q, w_ref, group_size) + w_ref, w_q, g_idx, rand_perm = permute_rows(w_q, w_ref, group_size, + test_perm) return w_ref, w_q, w_s, g_idx, rand_perm diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 4bb943ab3afe..0052489d99dc 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -24,10 +24,18 @@ def get_model_architecture( # Special handling for quantized Mixtral. 
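The `test_perm` hook threaded through `marlin_quantize`, `gptq_quantize_weights`, and `permute_rows` above lets tests pin the simulated act_order permutation instead of drawing a fresh `torch.randperm` on every call. A hypothetical test-side usage (the names `w`, `quant_type`, and `group_size` stand in for whatever the test already constructs):

```python
# Hypothetical usage: make the act_order permutation reproducible in a test.
k_size = w.shape[0]
test_perm = torch.randperm(k_size, generator=torch.Generator().manual_seed(0))
w_ref, w_q, w_s, g_idx, rand_perm = gptq_quantize_weights(
    w, quant_type, group_size, act_order=True, test_perm=test_perm)
assert torch.equal(rand_perm, test_perm)  # the permutation is now deterministic
```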
# FIXME(woosuk): This is a temporary hack. mixtral_supported = ["fp8", "compressed-tensors"] + # for gptq_marlin, only run fused MoE for int4 + if model_config.quantization == "gptq_marlin": + hf_quant_config = getattr(model_config.hf_config, + "quantization_config", None) + if hf_quant_config and hf_quant_config.get("bits") == 4: + mixtral_supported.append("gptq_marlin") + if (model_config.quantization is not None and model_config.quantization not in mixtral_supported and "MixtralForCausalLM" in architectures): architectures = ["QuantMixtralForCausalLM"] + return ModelRegistry.resolve_model_cls(architectures) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index e744e36ac08b..10cbfcf6432b 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -435,7 +435,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): continue name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): continue # Skip layers on other devices. if is_pp_missing_parameter(name, self): @@ -454,6 +455,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, @@ -464,7 +468,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): break else: # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): continue # Skip layers on other devices. if is_pp_missing_parameter(name, self): From a1d874224d9c29ae84f3850474b4816f0ed9574b Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Mon, 9 Sep 2024 23:21:00 -0700 Subject: [PATCH 20/54] Add NVIDIA Meetup slides, announce AMD meetup, and add contact info (#8319) --- README.md | 16 ++++++++++++---- docs/source/community/meetups.rst | 1 + 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9ae30f8d2de5..53749cb36b97 100644 --- a/README.md +++ b/README.md @@ -17,15 +17,16 @@ Easy, fast, and cheap LLM serving for everyone --- -**vLLM & NVIDIA Triton User Meetup (Monday, September 9, 5pm-9pm PT) at Fort Mason, San Francisco** +**vLLM, AMD, Anyscale Meet & Greet at [Ray Summit 2024](http://raysummit.anyscale.com) (Monday, Sept 30th, 5-7pm PT) at Marriott Marquis San Francisco** -We are excited to announce our sixth vLLM Meetup, in collaboration with NVIDIA Triton Team. -Join us to hear the vLLM's recent update about performance. -Register now [here](https://lu.ma/87q3nvnh) and be part of the event! +We are excited to announce our special vLLM event in collaboration with AMD and Anyscale. +Join us to learn more about recent advancements of vLLM on MI300X. +Register [here](https://lu.ma/db5ld9n5) and be a part of the event! --- *Latest News* 🔥 +- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing). - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! 
Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html). - [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing). @@ -130,3 +131,10 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs year={2023} } ``` + +## Contact Us + +* For technical questions and feature requests, please use Github issues or discussions. +* For discussing with fellow users, please use Discord. +* For security disclosures, please use Github's security advisory feature. +* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu. \ No newline at end of file diff --git a/docs/source/community/meetups.rst b/docs/source/community/meetups.rst index 3b01b109ebf2..a3962e96e791 100644 --- a/docs/source/community/meetups.rst +++ b/docs/source/community/meetups.rst @@ -5,6 +5,7 @@ vLLM Meetups We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- `The sixth vLLM meetup `__, with NVIDIA, September 9th 2024. `[Slides] `__ - `The fifth vLLM meetup `__, with AWS, July 24th 2024. `[Slides] `__ - `The fourth vLLM meetup `__, with Cloudflare and BentoML, June 11th 2024. `[Slides] `__ - `The third vLLM meetup `__, with Roblox, April 2nd 2024. `[Slides] `__ From da1a844e61366b473cef6b3f7437ea5dc41876a1 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 10 Sep 2024 16:22:50 +0800 Subject: [PATCH 21/54] [Bugfix] Fix missing `post_layernorm` in CLIP (#8155) --- vllm/model_executor/models/clip.py | 29 +++++++++++++++++++++---- vllm/model_executor/models/siglip.py | 32 +++++++++++++++------------- 2 files changed, 42 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 70f1522ae252..078928f281c2 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -355,6 +355,19 @@ def __init__(self, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override) + if len(self.encoder.layers) > config.num_hidden_layers: + raise ValueError( + f"The original encoder only has {config.num_hidden_layers} " + f"layers, but you requested {len(self.encoder.layers)} layers." 
+ ) + elif len(self.encoder.layers) == config.num_hidden_layers: + self.post_layernorm = nn.LayerNorm(embed_dim, + eps=config.layer_norm_eps) + else: + # post_layernorm is unused when we extract intermediate features + # In this case, we can skip it to conserve memory + self.post_layernorm = None + def forward( self, pixel_values: torch.Tensor, @@ -364,7 +377,10 @@ def forward( hidden_states = self.pre_layrnorm(hidden_states) hidden_states = self.encoder(inputs_embeds=hidden_states) - return hidden_states + if self.post_layernorm is None: + return hidden_states + + return self.post_layernorm(hidden_states) class CLIPVisionModel(nn.Module): @@ -386,9 +402,12 @@ def __init__(self, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override) - def forward(self, pixel_values: Optional[torch.Tensor] = None): + @property + def _require_post_layernorm(self) -> bool: + return self.vision_model.post_layernorm is not None - return self.vision_model(pixel_values=pixel_values) + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + return self.vision_model(pixel_values) @property def device(self): @@ -408,8 +427,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for name, loaded_weight in weights: # post_layernorm is not needed in CLIPVisionModel - if "vision_model.post_layernorm" in name: + if ("vision_model.post_layernorm" in name + and not self._require_post_layernorm): continue + # omit layers when num_hidden_layers_override is set if "vision_model.encoder.layers." in name: layer_idx = int(name.split(".")[3]) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 13d09e4cd4c2..f7976eba7420 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -443,27 +443,26 @@ def __init__( self.config = config embed_dim = config.hidden_size - if (num_hidden_layers_override is None - or num_hidden_layers_override == config.num_hidden_layers): - self.need_post_layernorm = True - elif num_hidden_layers_override > config.num_hidden_layers: - raise ValueError( - "num_hidden_layers_override cannot be greater than " - "num_hidden_layers") - else: - self.need_post_layernorm = False - self.embeddings = SiglipVisionEmbeddings(config) self.encoder = SiglipEncoder( config, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, ) - if self.need_post_layernorm: + + if len(self.encoder.layers) > config.num_hidden_layers: + raise ValueError( + f"The original encoder only has {config.num_hidden_layers} " + f"layers, but you requested {len(self.encoder.layers)} layers." 
+ ) + elif len(self.encoder.layers) == config.num_hidden_layers: self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) else: - self.post_layernorm = nn.Identity() + # post_layernorm is unused when we extract intermediate features + # In this case, we can skip it to conserve memory + self.post_layernorm = None + self.use_head = (True if not hasattr(config, "vision_use_head") else config.vision_use_head) if self.use_head: @@ -482,6 +481,9 @@ def forward( encoder_outputs = self.encoder(inputs_embeds=hidden_states) + if self.post_layernorm is None: + return encoder_outputs + last_hidden_state = self.post_layernorm(encoder_outputs) # TODO: add this back when pooled_output is used in inference # if self.use_head: @@ -512,8 +514,8 @@ def __init__( ) @property - def need_post_layernorm(self): - return self.vision_model.need_post_layernorm + def _require_post_layernorm(self) -> bool: + return self.vision_model.post_layernorm is not None def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding @@ -541,7 +543,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for name, loaded_weight in weights: # post_layernorm is optional in SiglipVisionModel if ("vision_model.post_layernorm" in name - and not self.need_post_layernorm): + and not self._require_post_layernorm): continue # omit layers when num_hidden_layers_override is set From 6234385f4a826edd5c4e0ca7dbdea480be215c5e Mon Sep 17 00:00:00 2001 From: Daniele <36171005+dtrifiro@users.noreply.github.com> Date: Tue, 10 Sep 2024 17:55:08 +0200 Subject: [PATCH 22/54] [CI/Build] enable ccache/scccache for HIP builds (#8327) --- setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1e08a5bd70cd..994920ede349 100644 --- a/setup.py +++ b/setup.py @@ -170,14 +170,17 @@ def configure(self, ext: CMakeExtension) -> None: if is_sccache_available(): cmake_args += [ + '-DCMAKE_C_COMPILER_LAUNCHER=sccache', '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache', '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache', - '-DCMAKE_C_COMPILER_LAUNCHER=sccache', + '-DCMAKE_HIP_COMPILER_LAUNCHER=sccache', ] elif is_ccache_available(): cmake_args += [ + '-DCMAKE_C_COMPILER_LAUNCHER=ccache', '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache', '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache', + '-DCMAKE_HIP_COMPILER_LAUNCHER=ccache', ] # Pass the python executable to cmake so it can find an exact From 8c054b7a6290551c868451dfd449d40cf37d8b62 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 11 Sep 2024 00:49:11 +0800 Subject: [PATCH 23/54] [Frontend] Clean up type annotations for mistral tokenizer (#8314) --- tests/async_engine/test_chat_template.py | 5 +- vllm/entrypoints/chat_utils.py | 61 +++++++++++++------ vllm/entrypoints/llm.py | 26 +++++--- vllm/entrypoints/openai/serving_chat.py | 48 +++++++++------ .../openai/serving_tokenization.py | 25 +++++--- vllm/transformers_utils/tokenizers/mistral.py | 8 +-- 6 files changed, 114 insertions(+), 59 deletions(-) diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py index 4df6c0297328..61a6d77cd875 100644 --- a/tests/async_engine/test_chat_template.py +++ b/tests/async_engine/test_chat_template.py @@ -1,6 +1,7 @@ import pytest -from vllm.entrypoints.chat_utils import apply_chat_template, load_chat_template +from vllm.entrypoints.chat_utils import (apply_hf_chat_template, + load_chat_template) from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.transformers_utils.tokenizer import 
get_tokenizer @@ -87,7 +88,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt, add_generation_prompt=add_generation_prompt) # Call the function and get the result - result = apply_chat_template( + result = apply_hf_chat_template( tokenizer, conversation=mock_request.messages, chat_template=mock_request.chat_template or template_content, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index f9f9536a7c16..a42ad81b3eef 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -23,6 +23,7 @@ # yapf: enable # pydantic needs the TypedDict from typing_extensions from pydantic import ConfigDict +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from typing_extensions import Required, TypeAlias, TypedDict from vllm.config import ModelConfig @@ -31,7 +32,7 @@ from vllm.multimodal.utils import (async_get_and_parse_audio, async_get_and_parse_image, get_and_parse_audio, get_and_parse_image) -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer logger = init_logger(__name__) @@ -379,6 +380,9 @@ def _parse_chat_message_content_parts( audio_url = _AudioParser(part)["audio_url"] mm_parser.parse_audio(audio_url["url"]) + elif part_type == "refusal": + text = _RefusalParser(part)["refusal"] + texts.append(text) else: raise NotImplementedError(f"Unknown part type: {part_type}") @@ -433,6 +437,21 @@ def _parse_chat_message_content( return result +def _postprocess_messages(messages: List[ConversationMessage]) -> None: + # per the Transformers docs & maintainers, tool call arguments in + # assistant-role messages with tool_calls need to be dicts not JSON str - + # this is how tool-use chat templates will expect them moving forwards + # so, for messages that have tool_calls, parse the string (which we get + # from openAI format) to dict + for message in messages: + if (message["role"] == "assistant" and "tool_calls" in message + and isinstance(message["tool_calls"], list)): + + for item in message["tool_calls"]: + item["function"]["arguments"] = json.loads( + item["function"]["arguments"]) + + def parse_chat_messages( messages: List[ChatCompletionMessageParam], model_config: ModelConfig, @@ -446,6 +465,8 @@ def parse_chat_messages( conversation.extend(sub_messages) + _postprocess_messages(conversation) + return conversation, mm_tracker.all_mm_data() @@ -462,41 +483,41 @@ def parse_chat_messages_futures( conversation.extend(sub_messages) + _postprocess_messages(conversation) + return conversation, mm_tracker.all_mm_data() -def apply_chat_template( - tokenizer: AnyTokenizer, +def apply_hf_chat_template( + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], conversation: List[ConversationMessage], chat_template: Optional[str], *, tokenize: bool = False, # Different from HF's default **kwargs: Any, -) -> Union[str, List[int]]: +) -> str: if chat_template is None and tokenizer.chat_template is None: raise ValueError( "As of transformers v4.44, default chat template is no longer " "allowed, so you must provide a chat template if the tokenizer " "does not define one.") - # per the Transformers docs & maintainers, tool call arguments in - # assistant-role messages with tool_calls need to be dicts not JSON str - - # this is how tool-use chat templates will expect them moving forwards - # so, for messages that have tool_calls, parse the string (which we get - # from openAI format) to dict - for message in conversation: - if 
(message["role"] == "assistant" and "tool_calls" in message - and isinstance(message["tool_calls"], list)): + return tokenizer.apply_chat_template( + conversation=conversation, # type: ignore[arg-type] + chat_template=chat_template, + tokenize=tokenize, + **kwargs, + ) - for i in range(len(message["tool_calls"])): - args: str = message["tool_calls"][i]["function"]["arguments"] - parsed_args: Dict = json.loads(args) - message["tool_calls"][i]["function"]["arguments"] = parsed_args - prompt = tokenizer.apply_chat_template( - conversation=conversation, +def apply_mistral_chat_template( + tokenizer: MistralTokenizer, + messages: List[ChatCompletionMessageParam], + chat_template: Optional[str], + **kwargs: Any, +) -> List[int]: + return tokenizer.apply_chat_template( + messages=messages, chat_template=chat_template, - tokenize=tokenize, **kwargs, ) - return prompt diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 1e4432eaaa66..b1d9f386b6c3 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -6,7 +6,8 @@ from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, - apply_chat_template, + apply_hf_chat_template, + apply_mistral_chat_template, parse_chat_messages) from vllm.inputs import PromptInputs, TextPrompt, TokensPrompt from vllm.inputs.parse import parse_and_batch_prompt @@ -19,7 +20,7 @@ from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams -from vllm.transformers_utils.tokenizer import (AnyTokenizer, +from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, get_cached_tokenizer) from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.usage.usage_lib import UsageContext @@ -393,12 +394,21 @@ def chat( conversation, mm_data = parse_chat_messages(messages, model_config, tokenizer) - prompt = apply_chat_template( - tokenizer, - conversation, - chat_template=chat_template, - add_generation_prompt=add_generation_prompt, - ) + prompt: Union[str, List[int]] + if isinstance(tokenizer, MistralTokenizer): + prompt = apply_mistral_chat_template( + tokenizer, + messages=messages, + chat_template=chat_template, + add_generation_prompt=add_generation_prompt, + ) + else: + prompt = apply_hf_chat_template( + tokenizer, + conversation=conversation, + chat_template=chat_template, + add_generation_prompt=add_generation_prompt, + ) inputs: PromptInputs if is_list_of(prompt, int): diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 8ed81e9c88cb..a81d2aa989aa 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -11,7 +11,8 @@ from vllm.config import ModelConfig from vllm.engine.protocol import AsyncEngineClient from vllm.entrypoints.chat_utils import (ConversationMessage, - apply_chat_template, + apply_hf_chat_template, + apply_mistral_chat_template, load_chat_template, parse_chat_messages_futures) from vllm.entrypoints.logger import RequestLogger @@ -35,7 +36,7 @@ from vllm.sequence import Logprob from vllm.tracing import (contains_trace_headers, extract_trace_headers, log_tracing_disabled_warning) -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.utils import iterate_with_cancellation, random_uuid logger = init_logger(__name__) @@ -121,15 
+122,27 @@ async def create_chat_completion( tool.model_dump() for tool in request.tools ] - prompt = apply_chat_template( - tokenizer, - conversation=conversation, - chat_template=request.chat_template or self.chat_template, - add_generation_prompt=request.add_generation_prompt, - tools=tool_dicts, - documents=request.documents, - **(request.chat_template_kwargs or {}), - ) + prompt: Union[str, List[int]] + if isinstance(tokenizer, MistralTokenizer): + prompt = apply_mistral_chat_template( + tokenizer, + messages=request.messages, + chat_template=request.chat_template or self.chat_template, + add_generation_prompt=request.add_generation_prompt, + tools=tool_dicts, + documents=request.documents, + **(request.chat_template_kwargs or {}), + ) + else: + prompt = apply_hf_chat_template( + tokenizer, + conversation=conversation, + chat_template=request.chat_template or self.chat_template, + add_generation_prompt=request.add_generation_prompt, + tools=tool_dicts, + documents=request.documents, + **(request.chat_template_kwargs or {}), + ) except Exception as e: logger.error("Error in applying chat template from request: %s", e) return self.create_error_response(str(e)) @@ -307,11 +320,10 @@ async def chat_completion_stream_generator( # Send response to echo the input portion of the # last message if request.echo: - last_msg_content: Optional[str] = "" - if conversation and conversation[-1].get( - "content") and conversation[-1].get( - "role") == role: - last_msg_content = conversation[-1]["content"] + last_msg_content: str = "" + if conversation and "content" in conversation[ + -1] and conversation[-1].get("role") == role: + last_msg_content = conversation[-1]["content"] or "" if last_msg_content: for i in range(num_choices): @@ -659,8 +671,8 @@ async def chat_completion_full_generator( if request.echo: last_msg_content = "" - if conversation and conversation[-1].get( - "content") and conversation[-1].get("role") == role: + if conversation and "content" in conversation[-1] and conversation[ + -1].get("role") == role: last_msg_content = conversation[-1]["content"] or "" for choice in choices: diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 69a5ad5b62cf..6e802b71ae2b 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -2,7 +2,8 @@ from vllm.config import ModelConfig from vllm.engine.protocol import AsyncEngineClient -from vllm.entrypoints.chat_utils import (apply_chat_template, +from vllm.entrypoints.chat_utils import (apply_hf_chat_template, + apply_mistral_chat_template, load_chat_template, parse_chat_messages_futures) from vllm.entrypoints.logger import RequestLogger @@ -18,6 +19,7 @@ from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, OpenAIServing) from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.utils import random_uuid logger = init_logger(__name__) @@ -66,6 +68,7 @@ async def create_tokenize( tokenizer = await self.async_engine_client.get_tokenizer(lora_request) + prompt: Union[str, List[int]] if isinstance(request, TokenizeChatRequest): model_config = self.model_config @@ -77,12 +80,20 @@ async def create_tokenize( logger.warning( "Multi-modal inputs are ignored during tokenization") - prompt = apply_chat_template( - tokenizer, - conversation=conversation, - chat_template=self.chat_template, - add_generation_prompt=request.add_generation_prompt, - ) + if isinstance(tokenizer, 
MistralTokenizer): + prompt = apply_mistral_chat_template( + tokenizer, + messages=request.messages, + chat_template=self.chat_template, + add_generation_prompt=request.add_generation_prompt, + ) + else: + prompt = apply_hf_chat_template( + tokenizer, + conversation=conversation, + chat_template=self.chat_template, + add_generation_prompt=request.add_generation_prompt, + ) else: prompt = request.prompt diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 533a86b78732..17e318cb5e04 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -16,7 +16,7 @@ Tekkenizer) if TYPE_CHECKING: - from vllm.entrypoints.chat_utils import ConversationMessage + from vllm.entrypoints.chat_utils import ChatCompletionMessageParam @dataclass @@ -122,19 +122,19 @@ def get_added_vocab(self) -> List[str]: return [] def encode(self, prompt: str) -> List[int]: - # `encode ` should only be used for prompt completion + # `encode` should only be used for prompt completion # it should never be used for chat_completion. # For chat completion use `apply_chat_template` return self.tokenizer.encode(prompt, bos=True, eos=False) def apply_chat_template(self, - conversation: List["ConversationMessage"], + messages: List["ChatCompletionMessageParam"], tools: Optional[Dict[str, Any]] = None, **kwargs) -> List[int]: assert tools is None, "`tools` are not yet supported." request = ChatCompletionRequest( - messages=conversation) # type: ignore[type-var] + messages=messages) # type: ignore[type-var] encoded = self.mistral.encode_chat_completion(request) # encode-decode to get clean prompt From f421f3cefb58d968767536d745fcc6e9ac342df5 Mon Sep 17 00:00:00 2001 From: "Alexey Kondratiev(AMD)" <143633163+alexeykondrat@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:51:15 -0400 Subject: [PATCH 24/54] [CI/Build] Enabling kernels tests for AMD, ignoring some of then that fail (#8130) --- .buildkite/run-amd-test.sh | 24 +++++++++++++++++++++++- .buildkite/test-pipeline.yaml | 1 + 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 972c62a091ae..c9b72a3264e8 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -71,13 +71,35 @@ mkdir -p ${HF_CACHE} HF_MOUNT="/root/.cache/huggingface" commands=$@ +echo "Commands:$commands" +#ignore certain kernels tests +if [[ $commands == *" kernels "* ]]; then + commands="${commands} \ + --ignore=kernels/test_attention.py \ + --ignore=kernels/test_attention_selector.py \ + --ignore=kernels/test_blocksparse_attention.py \ + --ignore=kernels/test_causal_conv1d.py \ + --ignore=kernels/test_cutlass.py \ + --ignore=kernels/test_encoder_decoder_attn.py \ + --ignore=kernels/test_flash_attn.py \ + --ignore=kernels/test_flashinfer.py \ + --ignore=kernels/test_int8_quant.py \ + --ignore=kernels/test_machete_gemm.py \ + --ignore=kernels/test_mamba_ssm.py \ + --ignore=kernels/test_marlin_gemm.py \ + --ignore=kernels/test_prefix_prefill.py \ + --ignore=kernels/test_rand.py \ + --ignore=kernels/test_sampler.py" +fi + PARALLEL_JOB_COUNT=8 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
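The substitution change below matters because `${@//pattern/repl}` expands the original positional parameters, which would drop the `--ignore=...` flags appended to `$commands` above, whereas `${commands//pattern/repl}` rewrites the extended string. A minimal sketch of the expansion, with made-up input:

```bash
# Minimal illustration (made-up input) of the per-shard substitution used below.
commands='pytest -v -s kernels --shard-id= --num-shards=  --ignore=kernels/test_attention.py'
GPU=3
PARALLEL_JOB_COUNT=8
commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
echo "$commands"
# -> pytest -v -s kernels --shard-id=3 --num-shards=8  --ignore=kernels/test_attention.py
```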
if [[ $commands == *"--shard-id="* ]]; then for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do #replace shard arguments - commands=${@//"--shard-id= "/"--shard-id=${GPU} "} + commands=${commands//"--shard-id= "/"--shard-id=${GPU} "} commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} + echo "Shard ${GPU} commands:$commands" docker run \ --device /dev/kfd --device /dev/dri \ --network host \ diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a0c7b7442b3b..e4f70c5d4920 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -228,6 +228,7 @@ steps: parallelism: 4 - label: Kernels Test %N # 30min each + mirror_hardwares: [amd] source_file_dependencies: - csrc/ - vllm/attention From 02751a7a42c18454030ff35e350afab31e26f51d Mon Sep 17 00:00:00 2001 From: sumitd2 <91451282+sumitd2@users.noreply.github.com> Date: Wed, 11 Sep 2024 01:28:34 +0530 Subject: [PATCH 25/54] Fix ppc64le buildkite job (#8309) --- .buildkite/run-cpu-test-ppc64le.sh | 3 ++- Dockerfile.ppc64le | 5 ++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index a01cf3fe6748..49ae838cf069 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -11,8 +11,9 @@ trap remove_docker_container EXIT remove_docker_container # Run the image, setting --shm-size=4g for tensor parallel. +source /etc/environment #docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test cpu-test +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test # Run basic model test docker exec cpu-test bash -c " diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index 16780f8ab950..27d10e91342e 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -4,7 +4,7 @@ USER root ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/" -RUN apt-get update -y && apt-get install -y git wget vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential +RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential # Some packages in requirements-cpu are installed here # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba @@ -16,7 +16,7 @@ COPY ./ /workspace/vllm WORKDIR /workspace/vllm # These packages will be in rocketce eventually -RUN pip install -v cmake torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing +RUN pip install -v cmake xformers torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install @@ -25,4 +25,3 @@ WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] - From 5faedf1b6224f6e7348e9223f3e3107ec03954d3 Mon Sep 17 00:00:00 2001 From: Kevin Lin <42618777+kevin314@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:18:14 -0500 Subject: [PATCH 26/54] [Spec Decode] Move 
ops.advance_step to flash attn advance_step (#8224) --- vllm/attention/backends/flash_attn.py | 21 +++++++++++++++------ vllm/spec_decode/draft_model_runner.py | 16 +++------------- vllm/worker/multi_step_model_runner.py | 19 +++++-------------- 3 files changed, 23 insertions(+), 33 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 30ce715d5d05..06b178798dcd 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -16,7 +16,8 @@ from vllm.utils import async_tensor_h2d, make_tensor_with_pad if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUBuilder + from vllm.worker.model_runner import (ModelInputForGPUBuilder, + ModelInputForGPUWithSamplingMetadata) from vllm_flash_attn import flash_attn_varlen_func as _flash_attn_varlen_func from vllm_flash_attn import flash_attn_with_kvcache as _flash_attn_with_kvcache @@ -302,14 +303,12 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]: ) return self._cached_decode_metadata - def advance_step(self, num_seqs: int, num_queries: int): + def advance_step(self, model_input: "ModelInputForGPUWithSamplingMetadata", + sampled_token_ids: Optional[torch.Tensor], + block_size: int, num_seqs: int, num_queries: int): """ Update metadata in-place to advance one decode step. """ - # GPU in-place update is currently called separately through - # custom_ops.advance_step(). See draft_model_runner. TODO(will): Move - # this logic to the backend. - # When using cudagraph, the num_seqs is padded to the next captured # batch sized, but num_queries tracks the actual number of requests in # the batch. For --enforce-eager mode, num_seqs == num_queries @@ -347,6 +346,16 @@ def advance_step(self, num_seqs: int, num_queries: int): self.seq_lens[i] += 1 self.max_decode_seq_len = max(self.seq_lens) + ops.advance_step(num_seqs=num_seqs, + num_queries=num_queries, + block_size=block_size, + input_tokens=model_input.input_tokens, + sampled_token_ids=sampled_token_ids, + input_positions=model_input.input_positions, + seq_lens=self.seq_lens_tensor, + slot_mapping=self.slot_mapping, + block_tables=self.block_tables) + class FlashAttentionMetadataBuilder( AttentionMetadataBuilder[FlashAttentionMetadata]): diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 6e35e4029438..1e403637d238 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -2,7 +2,6 @@ import torch -from vllm import _custom_ops as ops from vllm.model_executor.layers.sampler import SamplerOutput try: @@ -116,18 +115,9 @@ def _gpu_advance_step( # Update attn_metadata attn_metadata = model_input.attn_metadata assert isinstance(attn_metadata, FlashAttentionMetadata) - attn_metadata.advance_step(num_seqs, num_queries) - - # Update GPU tensors - ops.advance_step(num_seqs=num_seqs, - num_queries=num_queries, - block_size=self.block_size, - input_tokens=model_input.input_tokens, - sampled_token_ids=sampled_token_ids, - input_positions=model_input.input_positions, - seq_lens=attn_metadata.seq_lens_tensor, - slot_mapping=attn_metadata.slot_mapping, - block_tables=attn_metadata.block_tables) + + attn_metadata.advance_step(model_input, sampled_token_ids, + self.block_size, num_seqs, num_queries) # Update sampling_metadata sampling_metadata = model_input.sampling_metadata diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index b13cf39bd846..9a196c3dfcd1 100644 --- 
a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -13,7 +13,6 @@ import torch -from vllm import _custom_ops as ops from vllm.distributed import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.sampler import (PromptLogprobs, SampleLogprobs, @@ -499,19 +498,11 @@ def _advance_step(self, model_input: StatefulModelInput, attn_metadata = frozen_model_input.attn_metadata assert isinstance(attn_metadata, FlashAttentionMetadata) - attn_metadata.advance_step(num_seqs, num_queries) - - # Update GPU tensors - ops.advance_step( - num_seqs=num_seqs, - num_queries=num_queries, - block_size=self.block_size, - input_tokens=frozen_model_input.input_tokens, - sampled_token_ids=model_input.cached_outputs[-1].sampled_token_ids, - input_positions=frozen_model_input.input_positions, - seq_lens=attn_metadata.seq_lens_tensor, - slot_mapping=attn_metadata.slot_mapping, - block_tables=attn_metadata.block_tables) + + attn_metadata.advance_step( + frozen_model_input, + model_input.cached_outputs[-1].sampled_token_ids, self.block_size, + num_seqs, num_queries) if frozen_model_input.seq_lens is not None: for i in range(num_queries): From 04e7c4e77118159e0b892681acd04a1b50a7ea6e Mon Sep 17 00:00:00 2001 From: Prashant Gupta Date: Tue, 10 Sep 2024 14:21:56 -0700 Subject: [PATCH 27/54] [Misc] remove peft as dependency for prompt models (#8162) --- vllm/config.py | 8 --- vllm/prompt_adapter/models.py | 2 +- vllm/prompt_adapter/utils.py | 93 +++++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+), 9 deletions(-) create mode 100644 vllm/prompt_adapter/utils.py diff --git a/vllm/config.py b/vllm/config.py index 8f5e02e35f28..9e7c107900aa 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1558,14 +1558,6 @@ class PromptAdapterConfig: prompt_adapter_dtype: Optional[torch.dtype] = None def __post_init__(self): - library_name = 'peft' - try: - __import__(library_name) - except ImportError as e: - raise ImportError( - f"'{library_name}' is not installed for prompt adapter support." - f"Please install it using 'pip install {library_name}'." 
- ) from e if self.max_prompt_adapters < 1: raise ValueError(f"max_prompt_adapters " diff --git a/vllm/prompt_adapter/models.py b/vllm/prompt_adapter/models.py index 93eb3bde646a..18a5f86c341a 100644 --- a/vllm/prompt_adapter/models.py +++ b/vllm/prompt_adapter/models.py @@ -14,6 +14,7 @@ from vllm.prompt_adapter.layers import ( VocabParallelEmbeddingWithPromptAdapter) # yapf: disable from vllm.prompt_adapter.layers import PromptAdapterMapping +from vllm.prompt_adapter.utils import load_peft_weights logger = logging.getLogger(__name__) @@ -90,7 +91,6 @@ def from_local_checkpoint( config: PromptAdapterConfig, device: str = "cuda", ) -> "PromptAdapterModel": - from peft.utils import load_peft_weights if num_virtual_tokens > config.max_prompt_adapter_token: raise ValueError( diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py new file mode 100644 index 000000000000..989cc5a0f87c --- /dev/null +++ b/vllm/prompt_adapter/utils.py @@ -0,0 +1,93 @@ +# code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420 + +import os +from typing import Optional + +import torch +from huggingface_hub import file_exists, hf_hub_download +from huggingface_hub.utils import EntryNotFoundError +from safetensors.torch import load_file as safe_load_file + +WEIGHTS_NAME = "adapter_model.bin" +SAFETENSORS_WEIGHTS_NAME = "adapter_model.safetensors" + + +# Get current device name based on available devices +def infer_device() -> str: + if torch.cuda.is_available(): + return "cuda" + return "cpu" + + +def load_peft_weights(model_id: str, + device: Optional[str] = None, + **hf_hub_download_kwargs) -> dict: + r""" + A helper method to load the PEFT weights from the HuggingFace Hub or locally + + Args: + model_id (`str`): + The local path to the adapter weights or the name of the adapter to + load from the HuggingFace Hub. + device (`str`): + The device to load the weights onto. + hf_hub_download_kwargs (`dict`): + Additional arguments to pass to the `hf_hub_download` method when + loading from the HuggingFace Hub. 
+ """ + path = (os.path.join(model_id, hf_hub_download_kwargs["subfolder"]) + if hf_hub_download_kwargs.get("subfolder", None) is not None else + model_id) + + if device is None: + device = infer_device() + + if os.path.exists(os.path.join(path, SAFETENSORS_WEIGHTS_NAME)): + filename = os.path.join(path, SAFETENSORS_WEIGHTS_NAME) + use_safetensors = True + elif os.path.exists(os.path.join(path, WEIGHTS_NAME)): + filename = os.path.join(path, WEIGHTS_NAME) + use_safetensors = False + else: + token = hf_hub_download_kwargs.get("token", None) + if token is None: + token = hf_hub_download_kwargs.get("use_auth_token", None) + + hub_filename = (os.path.join(hf_hub_download_kwargs["subfolder"], + SAFETENSORS_WEIGHTS_NAME) + if hf_hub_download_kwargs.get("subfolder", None) + is not None else SAFETENSORS_WEIGHTS_NAME) + has_remote_safetensors_file = file_exists( + repo_id=model_id, + filename=hub_filename, + revision=hf_hub_download_kwargs.get("revision", None), + repo_type=hf_hub_download_kwargs.get("repo_type", None), + token=token, + ) + use_safetensors = has_remote_safetensors_file + + if has_remote_safetensors_file: + # Priority 1: load safetensors weights + filename = hf_hub_download( + model_id, + SAFETENSORS_WEIGHTS_NAME, + **hf_hub_download_kwargs, + ) + else: + try: + filename = hf_hub_download(model_id, WEIGHTS_NAME, + **hf_hub_download_kwargs) + except EntryNotFoundError: + raise ValueError( # noqa: B904 + f"Can't find weights for {model_id} in {model_id} or \ + in the Hugging Face Hub. " + f"Please check that the file {WEIGHTS_NAME} or \ + {SAFETENSORS_WEIGHTS_NAME} is present at {model_id}.") + + if use_safetensors: + adapters_weights = safe_load_file(filename, device=device) + else: + adapters_weights = torch.load(filename, + map_location=torch.device(device)) + + return adapters_weights From b1f3e189586dce42bb3dcda20169a9308c9a25fa Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Tue, 10 Sep 2024 15:28:28 -0700 Subject: [PATCH 28/54] [MISC] Keep chunked prefill enabled by default with long context when prefix caching is enabled (#8342) --- vllm/engine/arg_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9bc03948d384..7748e1109204 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -878,7 +878,6 @@ def create_engine_config(self) -> EngineConfig: if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora and not self.enable_prompt_adapter - and not self.enable_prefix_caching and not has_seqlen_agnostic_layers): self.enable_chunked_prefill = True logger.warning( From 22f3a4bc6c6801101728d97edd25ffcdd5a7fd8c Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Tue, 10 Sep 2024 19:00:35 -0400 Subject: [PATCH 29/54] [Bugfix] lookahead block table with cuda graph max capture (#8340) [Bugfix] Ensure multistep lookahead allocation is compatible with cuda graph max capture (#8340) --- vllm/attention/backends/flash_attn.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 06b178798dcd..69faa6d343ed 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -471,9 +471,19 @@ def build(self, seq_lens: List[int], query_lens: List[int], # The shape of graph_block_tables is # [max batch size, max context len // block size]. 
input_block_tables = self.runner.graph_block_tables[:batch_size] + max_blocks = input_block_tables.shape[1] for i, block_table in enumerate(self.block_tables): if block_table: - input_block_tables[i, :len(block_table)] = block_table + num_blocks = len(block_table) + if num_blocks <= max_blocks: + input_block_tables[i, :num_blocks] = block_table + else: + # It may be possible to have more blocks allocated due + # to lookahead slots of multi-step, however, they are + # not used anyway, so can be safely ignored. + input_block_tables[ + i, :max_blocks] = block_table[:max_blocks] + block_tables = torch.from_numpy(input_block_tables).to( device=device, non_blocking=True) else: From 1d5e397aa4d94d0ccc1c9dbad533afa5cb60bb69 Mon Sep 17 00:00:00 2001 From: William Lin Date: Tue, 10 Sep 2024 16:46:08 -0700 Subject: [PATCH 30/54] [Core/Bugfix] pass VLLM_ATTENTION_BACKEND to ray workers (#8172) --- vllm/executor/ray_gpu_executor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 1359a0d310a7..b124fe2e08ea 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -242,6 +242,9 @@ def sort_by_driver_then_worker_ip(worker): VLLM_INSTANCE_ID, "VLLM_TRACE_FUNCTION": str(envs.VLLM_TRACE_FUNCTION), + **({ + "VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND + } if envs.VLLM_ATTENTION_BACKEND is not None else {}) }, ) for (node_id, _) in worker_node_and_gpu_ids] self._env_vars_for_all_workers = ( From 94144e726cfeeba0c1758751b7fd46a20b6bd3b4 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Tue, 10 Sep 2024 19:51:58 -0400 Subject: [PATCH 31/54] [CI/Build][Kernel] Update CUTLASS to 3.5.1 tag (#8043) --- CMakeLists.txt | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9c88c31c83da..f8d6a2be9fea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -195,9 +195,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") FetchContent_Declare( cutlass GIT_REPOSITORY https://github.com/nvidia/cutlass.git - # CUTLASS 3.5.1 - GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9 + GIT_TAG v3.5.1 GIT_PROGRESS TRUE + + # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. + # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags. + # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE + GIT_SHALLOW TRUE ) FetchContent_MakeAvailable(cutlass) @@ -231,6 +235,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "-gencode arch=compute_90a,code=sm_90a") endif() + # # Machete kernels @@ -289,6 +294,12 @@ define_gpu_extension_target( USE_SABI 3 WITH_SOABI) +# If CUTLASS is compiled on NVCC >= 12.5, it by default uses +# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the +# driver API. This causes problems when linking with earlier versions of CUDA. +# Setting this variable sidesteps the issue by calling the driver directly. 
+target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) + # # _moe_C extension # From e497b8aeff5799d4ca2cfd6e01105194ebd39eac Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 11 Sep 2024 08:59:19 +0800 Subject: [PATCH 32/54] [Misc] Skip loading extra bias for Qwen2-MOE GPTQ models (#8329) --- vllm/model_executor/models/qwen2_moe.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 56129515ca8d..d80064601d99 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -469,7 +469,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): continue name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): continue # Skip layers on other devices. if is_pp_missing_parameter(name, self): @@ -490,6 +491,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, @@ -500,7 +505,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): break else: # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): continue # Skip layers on other devices. if is_pp_missing_parameter(name, self): From 1230263e161caa9fd698e109d33437950769ec09 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Wed, 11 Sep 2024 10:11:01 +0800 Subject: [PATCH 33/54] [Bugfix] Fix InternVL2 vision embeddings process with pipeline parallel (#8299) --- tests/distributed/test_pipeline_parallel.py | 10 ++++++++-- vllm/model_executor/models/internvl.py | 3 ++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 637d2b30f6b1..d2219eed988e 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -32,7 +32,9 @@ (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"), (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"), (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"), - (2, 2, 1, 1, 1, "internlm/internlm2_5-7b-chat", "ray"), + (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "ray"), + (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "ray"), + (1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "ray"), ], ) @fork_new_process_for_each_test @@ -46,6 +48,8 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, # use half precision for speed and memory savings in CI environment "--dtype", "float16", + "--max-model-len", + "8192", "--pipeline-parallel-size", str(PP_SIZE), "--tensor-parallel-size", @@ -62,7 +66,9 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, tp_args = [ # use half precision for speed and memory savings in CI environment "--dtype", - "bfloat16", + "float16", + "--max-model-len", + "8192", "--tensor-parallel-size", str(max(TP_SIZE, 2)), # We only use 2 GPUs in the CI. 
"--distributed-executor-backend", diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 0cf63d9e1fb2..81819578a4d8 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -17,6 +17,7 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig +from vllm.distributed import get_pp_group from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput @@ -480,7 +481,7 @@ def forward( **kwargs: object, ) -> SamplerOutput: image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is not None: + if image_input is not None and get_pp_group().is_first_rank: inputs_embeds = self.language_model.model.get_input_embeddings( input_ids) vision_embeddings = self._process_image_input(image_input) From efcf946a158f02a597086199890b5c7673ffe467 Mon Sep 17 00:00:00 2001 From: Pavani Majety Date: Tue, 10 Sep 2024 21:38:40 -0700 Subject: [PATCH 34/54] [Hardware][NV] Add support for ModelOpt static scaling checkpoints. (#6112) --- examples/fp8/quantizer/README.md | 4 +- tests/models/test_modelopt.py | 79 +++++++++ vllm/config.py | 6 +- vllm/model_executor/layers/linear.py | 3 +- .../layers/quantization/__init__.py | 2 + .../layers/quantization/modelopt.py | 163 ++++++++++++++++++ .../model_loader/weight_utils.py | 7 + 7 files changed, 258 insertions(+), 6 deletions(-) create mode 100644 tests/models/test_modelopt.py create mode 100644 vllm/model_executor/layers/quantization/modelopt.py diff --git a/examples/fp8/quantizer/README.md b/examples/fp8/quantizer/README.md index 0b6944f688b4..d0895e97dc34 100644 --- a/examples/fp8/quantizer/README.md +++ b/examples/fp8/quantizer/README.md @@ -1,6 +1,6 @@ ### Quantizer Utilities -`quantize.py`: NVIDIA Quantization utilities using AMMO, ported from TensorRT-LLM: -`https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py` +`quantize.py`: NVIDIA Quantization utilities using TensorRT-Model-Optimizer, ported +from TensorRT-LLM: [`examples/quantization/quantize.py`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py) ### Prerequisite diff --git a/tests/models/test_modelopt.py b/tests/models/test_modelopt.py new file mode 100644 index 000000000000..e643b115d0ea --- /dev/null +++ b/tests/models/test_modelopt.py @@ -0,0 +1,79 @@ +# flake8: noqa +"""Tests Model Optimizer fp8 models against ground truth generation +Note: these tests will only pass on H100 +""" +import os +from typing import List + +import pytest +from transformers import AutoTokenizer + +from tests.quantization.utils import is_quant_method_supported +from vllm import LLM, SamplingParams + +os.environ["TOKENIZERS_PARALLELISM"] = "true" + +MAX_MODEL_LEN = 1024 + +MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"] + +EXPECTED_STRS_MAP = { + "nvidia/Llama-3.1-8B-Instruct-FP8": [ + "You're referring to VLLM, a high-performance Large Language Model (LLM) inference and", + 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', + 'The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and', + 'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne', + '**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir', + 
'The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to', + 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', + 'Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる' + ] +} + + +# This test compares against golden strings for exact match since +# there is no baseline implementation to compare against +# and is unstable w.r.t specifics of the fp8 implementation or +# the hardware being run on. +# Disabled to prevent it from breaking the build +@pytest.mark.skip( + reason= + "Prevent unstable test based on golden strings from breaking the build.") +@pytest.mark.skipif(not is_quant_method_supported("fp8"), + reason="fp8 is not supported on this GPU type.") +@pytest.mark.parametrize("model_name", MODELS) +def test_models(example_prompts, model_name) -> None: + model = LLM( + model=model_name, + max_model_len=MAX_MODEL_LEN, + trust_remote_code=True, + enforce_eager=True, + quantization="modelopt", + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name) + formatted_prompts = [ + tokenizer.apply_chat_template([{ + "role": "user", + "content": prompt + }], + tokenize=False, + add_generation_prompt=True) + for prompt in example_prompts + ] + params = SamplingParams(max_tokens=20, temperature=0) + generations: List[str] = [] + # Note: these need to be run 1 at a time due to numerical precision, + # since the expected strs were generated this way. + for prompt in formatted_prompts: + outputs = model.generate(prompt, params) + generations.append(outputs[0].outputs[0].text) + del model + + print(model_name, generations) + expected_strs = EXPECTED_STRS_MAP[model_name] + for i in range(len(example_prompts)): + generated_str = generations[i] + expected_str = expected_strs[i] + assert expected_str == generated_str, ( + f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}") diff --git a/vllm/config.py b/vllm/config.py index 9e7c107900aa..4d9310af79ed 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -282,9 +282,9 @@ def _verify_quantization(self) -> None: supported_quantization = [*QUANTIZATION_METHODS] rocm_supported_quantization = ["awq", "gptq", "fp8"] optimized_quantization_methods = [ - "fp8", "marlin", "gptq_marlin_24", "gptq_marlin", "awq_marlin", - "fbgemm_fp8", "compressed_tensors", "compressed-tensors", - "experts_int8" + "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin", + "awq_marlin", "fbgemm_fp8", "compressed_tensors", + "compressed-tensors", "experts_int8" ] tpu_supported_quantization = ["tpu_int8"] neuron_supported_quantization = ["neuron_quant"] diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index b997507ea738..cea768469aeb 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -26,7 +26,8 @@ "CompressedTensorsLinearMethod", "AWQMarlinLinearMethod", "AWQLinearMethod", "GPTQMarlinLinearMethod", "Fp8LinearMethod", "MarlinLinearMethod", "QQQLinearMethod", "GPTQMarlin24LinearMethod", - "TPUInt8LinearMethod", "GPTQLinearMethod", "FBGEMMFp8LinearMethod" + "TPUInt8LinearMethod", "GPTQLinearMethod", "FBGEMMFp8LinearMethod", + "ModelOptFp8LinearMethod" ] diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index aa5c288962d9..3c38f0a00607 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -22,6 +22,7 @@ from 
vllm.model_executor.layers.quantization.gptq_marlin_24 import ( GPTQMarlin24Config) from vllm.model_executor.layers.quantization.marlin import MarlinConfig +from vllm.model_executor.layers.quantization.modelopt import ModelOptFp8Config from vllm.model_executor.layers.quantization.neuron_quant import ( NeuronQuantConfig) from vllm.model_executor.layers.quantization.qqq import QQQConfig @@ -34,6 +35,7 @@ "tpu_int8": Int8TpuConfig, "fp8": Fp8Config, "fbgemm_fp8": FBGEMMFp8Config, + "modelopt": ModelOptFp8Config, # The order of gptq methods is important for config.py iteration over # override_quantization_method(..) "marlin": MarlinConfig, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py new file mode 100644 index 000000000000..dc5f47eb9b0f --- /dev/null +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -0,0 +1,163 @@ +from typing import Any, Dict, List, Optional + +import torch +from torch.nn import Module +from torch.nn.parameter import Parameter + +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + apply_fp8_linear, cutlass_fp8_supported, requantize_with_max_scale) +from vllm.model_executor.parameter import (ModelWeightParameter, + PerTensorScaleParameter) + +logger = init_logger(__name__) + +ACTIVATION_SCHEMES = ["static"] + + +class ModelOptFp8Config(QuantizationConfig): + """Config class for ModelOpt FP8.""" + + def __init__( + self, + is_checkpoint_fp8_serialized: bool = False, + ) -> None: + self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized + if is_checkpoint_fp8_serialized: + logger.warning("Detected ModelOpt fp8 checkpoint. Please note that" + " the format is experimental and could change.") + + @classmethod + def get_name(cls) -> str: + return "modelopt" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.bfloat16, torch.half] + + @classmethod + def get_min_capability(cls) -> int: + return 89 + + @classmethod + def get_config_filenames(cls) -> List[str]: + return ["hf_quant_config.json"] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "ModelOptFp8Config": + quant_config = cls.get_from_keys(config, ["quantization"]) + quant_method = quant_config["quant_algo"] + is_checkpoint_fp8_serialized = ("FP8" in quant_method) + if not is_checkpoint_fp8_serialized: + raise ValueError("ModelOpt currently only supports static FP8" + "quantization in vLLM. Please check the " + "`hf_quant_config.json` file for your model's " + "quant configuration.") + return cls(is_checkpoint_fp8_serialized) + + def get_quant_method(self, layer: torch.nn.Module, + prefix: str) -> Optional["QuantizeMethodBase"]: + from vllm.attention.layer import Attention # Avoid circular import + if isinstance(layer, LinearBase): + return ModelOptFp8LinearMethod(self) + elif isinstance(layer, Attention): + return ModelOptFp8KVCacheMethod(self) + return None + + def get_scaled_act_names(self) -> List[str]: + return [] + + +class ModelOptFp8KVCacheMethod(BaseKVCacheMethod): + """ + Supports loading kv-cache scaling factors from FP8 checkpoints. 
+ """ + + def __init__(self, quant_config: ModelOptFp8Config): + super().__init__(quant_config) + + +class ModelOptFp8LinearMethod(LinearMethodBase): + """Linear method for Model Optimizer static quantization. + Supports loading FP8 checkpoints with static weight scale and + activation scale. Future support might be added for dynamic + scales. + + Limitations: + 1. Only support per-tensor quantization due to torch._scaled_mm support. + 2. Only support float8_e4m3fn datatype + Args: quant_config: The ModelOpt quantization config. + """ + + def __init__(self, quant_config: ModelOptFp8Config): + self.quant_config = quant_config + self.cutlass_fp8_supported = cutlass_fp8_supported() + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + del input_size, output_size + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + weight_dtype = (torch.float8_e4m3fn + if self.quant_config.is_checkpoint_fp8_serialized else + params_dtype) + weight = ModelWeightParameter(data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=weight_dtype), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) + layer.register_parameter("weight", weight) + + if self.quant_config.is_checkpoint_fp8_serialized: + # WEIGHT SCALE + weight_scale = PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) + weight_scale[:] = torch.finfo(torch.float32).min + layer.register_parameter("weight_scale", weight_scale) + # INPUT SCALE + scale = PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) + + scale[:] = torch.finfo(torch.float32).min + layer.register_parameter("input_scale", scale) + + def process_weights_after_loading(self, layer: Module) -> None: + max_w_scale, weight = requantize_with_max_scale( + layer.weight, layer.weight_scale, layer.logical_widths) + layer.weight = Parameter(weight.t(), requires_grad=False) + layer.weight_scale = Parameter(max_w_scale, requires_grad=False) + layer.input_scale = Parameter(layer.input_scale.max(), + requires_grad=False) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + return apply_fp8_linear( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=layer.input_scale, + bias=bias, + cutlass_fp8_supported=self.cutlass_fp8_supported) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 075451292a8e..5051d45dd115 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -192,6 +192,13 @@ def get_quant_config(model_config: ModelConfig, if model_config.quantization == "bitsandbytes": config["adapter_name_or_path"] = model_name_or_path + elif model_config.quantization == "modelopt": + if config["producer"]["name"] == "modelopt": + return quant_cls.from_config(config) + else: + raise ValueError( + f"Unsupported quantization config" + f" found for {model_config.quantization} in {f}.") return 
quant_cls.from_config(config) From 6a512a00dfa306762c2878bffc3a5664a758d105 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yangshen=E2=9A=A1Deng?= Date: Wed, 11 Sep 2024 13:21:36 +0800 Subject: [PATCH 35/54] [model] Support for Llava-Next-Video model (#7559) Co-authored-by: Roger Wang Co-authored-by: Cyrus Leung Co-authored-by: Cyrus Leung --- Dockerfile | 1 + Dockerfile.cpu | 1 + Dockerfile.neuron | 4 +- Dockerfile.openvino | 3 +- Dockerfile.ppc64le | 2 +- Dockerfile.tpu | 3 + Dockerfile.xpu | 3 +- docs/source/conf.py | 1 + docs/source/models/supported_models.rst | 14 + examples/offline_inference_vision_language.py | 70 ++- requirements-test.txt | 1 + setup.py | 1 + tests/conftest.py | 56 ++- tests/models/test_llava_next_video.py | 236 +++++++++ vllm/assets/video.py | 85 ++++ vllm/model_executor/models/__init__.py | 6 +- .../model_executor/models/llava_next_video.py | 471 ++++++++++++++++++ vllm/multimodal/registry.py | 3 +- vllm/multimodal/utils.py | 42 ++ vllm/multimodal/video.py | 71 +++ vllm/transformers_utils/image_processor.py | 27 + 21 files changed, 1083 insertions(+), 18 deletions(-) create mode 100644 tests/models/test_llava_next_video.py create mode 100644 vllm/assets/video.py create mode 100644 vllm/model_executor/models/llava_next_video.py create mode 100644 vllm/multimodal/video.py diff --git a/Dockerfile b/Dockerfile index 0ec6655ed449..5484be5bc578 100644 --- a/Dockerfile +++ b/Dockerfile @@ -145,6 +145,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && apt-get update -y \ && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \ + && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ && add-apt-repository ppa:deadsnakes/ppa \ && apt-get update -y \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ diff --git a/Dockerfile.cpu b/Dockerfile.cpu index 9a570f988f3d..2b60835255cb 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -5,6 +5,7 @@ FROM ubuntu:22.04 AS cpu-test-1 RUN --mount=type=cache,target=/var/cache/apt \ apt-get update -y \ && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ + && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html diff --git a/Dockerfile.neuron b/Dockerfile.neuron index caa1b1d6c442..f0c3479625a7 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -6,7 +6,9 @@ FROM $BASE_IMAGE RUN echo "Base image is $BASE_IMAGE" # Install some basic utilities -RUN apt-get update && apt-get install python3 python3-pip -y +RUN apt-get update \ + && apt-get install python3 python3-pip -y \ + && apt-get install -y ffmpeg libsm6 libxext6 libgl1 ### Mount Point ### # When launching the container, mount the code directory to /app diff --git a/Dockerfile.openvino b/Dockerfile.openvino index 06ca4638dfeb..96b9593a2bfa 100644 --- a/Dockerfile.openvino +++ b/Dockerfile.openvino @@ -4,7 +4,8 @@ FROM ubuntu:22.04 AS dev RUN apt-get update -y && \ - apt-get install -y python3-pip git + apt-get install -y python3-pip git && \ + apt-get install -y ffmpeg libsm6 libxext6 libgl1 WORKDIR /workspace # copy requirements diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index 
27d10e91342e..3313162bf28e 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -4,7 +4,7 @@ USER root ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/" -RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential +RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 # Some packages in requirements-cpu are installed here # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba diff --git a/Dockerfile.tpu b/Dockerfile.tpu index 3a11c6721ead..04cd4d79f404 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -4,6 +4,9 @@ ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:night FROM $BASE_IMAGE WORKDIR /workspace +# Install some basic utilities +RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1 + # Install the TPU and Pallas dependencies. RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html diff --git a/Dockerfile.xpu b/Dockerfile.xpu index f91baa11a375..321da98cf6c8 100644 --- a/Dockerfile.xpu +++ b/Dockerfile.xpu @@ -9,8 +9,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO chmod 644 /usr/share/keyrings/intel-graphics.gpg RUN apt-get update -y \ -&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip - +&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1 COPY ./ /workspace/vllm WORKDIR /workspace/vllm diff --git a/docs/source/conf.py b/docs/source/conf.py index b4f5b4ab9d56..8435129e752e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -99,6 +99,7 @@ def setup(app): "aiohttp", "compressed_tensors", "cpuinfo", + "cv2", "torch", "transformers", "psutil", diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 1bb3a448f2c9..29fa5d812deb 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -227,6 +227,11 @@ Multimodal Language Models - Image\ :sup:`E+` - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - + * - :code:`LlavaNextVideoForConditionalGeneration` + - LLaVA-NeXT-Video + - Video + - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. (see note) + - * - :code:`MiniCPMV` - MiniCPM-V - Image\ :sup:`+` @@ -260,6 +265,15 @@ Multimodal Language Models For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 + For :code:`LLaVA-NeXT-Video`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now. + This can be installed by running the following command: + + + .. code-block:: bash + + pip install git+https://github.com/huggingface/transformers.git@21fac7abba2a37fae86106f87fcf9974fd1e3830 + + ---- If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. 
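To make the sentence above concrete, here is a minimal offline-inference sketch (not part of the patch itself). It reuses the cherry_blossom ImageAsset and the LLaVA-1.5 prompt template from examples/offline_inference_vision_language.py; the <image> placeholder, the multi_modal_data key, and the sampling settings are assumptions taken from that example script, and other architectures in the table expect different prompt templates.

from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

# Minimal sketch (assumption-based, not part of the patch): single-image
# question answering with a LLaVA-1.5 checkpoint. The prompt template and the
# "image" key mirror the patterns in examples/offline_inference_vision_language.py,
# which the diff below updates.
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"

outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": image},
    },
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)

The video runner added by this patch appears to follow the same shape, swapping in VideoAsset and a video-capable checkpoint such as llava-hf/LLaVA-NeXT-Video-7B-hf, as the example diff below shows.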
diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index aa1580343aee..2ec691608df6 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -9,12 +9,9 @@ from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset from vllm.utils import FlexibleArgumentParser -# Input image and question -image = ImageAsset("cherry_blossom").pil_image.convert("RGB") -question = "What is the content of this image?" - # LLaVA-1.5 def run_llava(question): @@ -30,7 +27,16 @@ def run_llava(question): def run_llava_next(question): prompt = f"[INST] \n{question} [/INST]" - llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf") + llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192) + stop_token_ids = None + return llm, prompt, stop_token_ids + + +# LlaVA-NeXT-Video +# Currently only support for video input +def run_llava_next_video(question): + prompt = f"USER: