From 8efbabcbbc905a950019f785c23beec3f2419b82 Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Mon, 13 Jan 2025 08:13:44 +0800
Subject: [PATCH] [Model] Support GGUF models newly added in `transformers` 4.46.0 (#9685)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung
Signed-off-by: ice-tong
---
 examples/offline_inference/gguf_inference.py  |  22 ++--
 .../models/decoder_only/language/test_gguf.py | 105 ++++++++++++------
 vllm/model_executor/layers/linear.py          |  58 ++++++----
 vllm/model_executor/models/gpt2.py            |  11 +-
 vllm/model_executor/models/llama.py           |   3 +-
 vllm/model_executor/models/stablelm.py        |  31 ++++--
 vllm/model_executor/models/starcoder2.py      |  19 +++-
 7 files changed, 162 insertions(+), 87 deletions(-)

diff --git a/examples/offline_inference/gguf_inference.py b/examples/offline_inference/gguf_inference.py
index 09a5fcc22e553..aa05c4c0bfaa5 100644
--- a/examples/offline_inference/gguf_inference.py
+++ b/examples/offline_inference/gguf_inference.py
@@ -3,27 +3,20 @@
 from vllm import LLM, SamplingParams
 
 
-def run_gguf_inference(model_path):
-    PROMPT_TEMPLATE = "<|system|>\n{system_message}\n<|user|>\n{prompt}\n<|assistant|>\n"  # noqa: E501
-    system_message = "You are a friendly chatbot who always responds in the style of a pirate."  # noqa: E501
+def run_gguf_inference(model_path, tokenizer):
     # Sample prompts.
     prompts = [
         "How many helicopters can a human eat in one sitting?",
         "What's the future of AI?",
     ]
-    prompts = [
-        PROMPT_TEMPLATE.format(system_message=system_message, prompt=prompt)
-        for prompt in prompts
-    ]
+    prompts = [[{"role": "user", "content": prompt}] for prompt in prompts]
     # Create a sampling params object.
     sampling_params = SamplingParams(temperature=0, max_tokens=128)
 
     # Create an LLM.
-    llm = LLM(model=model_path,
-              tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-              gpu_memory_utilization=0.95)
+    llm = LLM(model=model_path, tokenizer=tokenizer)
 
-    outputs = llm.generate(prompts, sampling_params)
+    outputs = llm.chat(prompts, sampling_params)
     # Print the outputs.
     for output in outputs:
         prompt = output.prompt
@@ -32,7 +25,8 @@ def run_gguf_inference(model_path):
 
 
 if __name__ == "__main__":
-    repo_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
-    filename = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf"
+    repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
+    filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
+    tokenizer = "microsoft/Phi-3-medium-4k-instruct"
    model = hf_hub_download(repo_id, filename=filename)
-    run_gguf_inference(model)
+    run_gguf_inference(model, tokenizer)
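Note on the example change above: `LLM.chat` renders each message list with the tokenizer's chat template before generation, and the `tokenizer` argument points at the original Hugging Face model because converting the tokenizer embedded in a GGUF file is comparatively slow. A rough sketch of the equivalent manual flow, spelled out with an explicit `apply_chat_template` call (illustration only; `llm.chat` performs this step internally):

```python
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams

# Checkpoint/tokenizer pair taken from the example above.
gguf_path = hf_hub_download("bartowski/Phi-3-medium-4k-instruct-GGUF",
                            filename="Phi-3-medium-4k-instruct-IQ2_M.gguf")
tokenizer_id = "microsoft/Phi-3-medium-4k-instruct"

# Render the chat template by hand, then use plain generate().
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What's the future of AI?"}],
    tokenize=False,
    add_generation_prompt=True)

llm = LLM(model=gguf_path, tokenizer=tokenizer_id)
outputs = llm.generate([prompt],
                       SamplingParams(temperature=0, max_tokens=128))
print(outputs[0].outputs[0].text)
```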
diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py
index 2b8f5e2faa45e..81b93ebdf0fc0 100644
--- a/tests/models/decoder_only/language/test_gguf.py
+++ b/tests/models/decoder_only/language/test_gguf.py
@@ -4,6 +4,7 @@
 """
 import os
+from typing import List, NamedTuple, Type
 
 import pytest
 from huggingface_hub import hf_hub_download
@@ -11,6 +12,7 @@
 
 from tests.quantization.utils import is_quant_method_supported
 
+from ....conftest import VllmRunner
 from ...utils import check_logprobs_close
 
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
@@ -18,31 +20,74 @@
 MAX_MODEL_LEN = 1024
 
 
+class GGUFTestConfig(NamedTuple):
+    original_model: str
+    gguf_repo: str
+    gguf_filename: str
+
+    @property
+    def gguf_model(self):
+        return hf_hub_download(self.gguf_repo, filename=self.gguf_filename)
+
+
+LLAMA_CONFIG = GGUFTestConfig(
+    original_model="meta-llama/Llama-3.2-1B-Instruct",
+    gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
+    gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf",
+)
+
+QWEN2_CONFIG = GGUFTestConfig(
+    original_model="Qwen/Qwen2.5-1.5B-Instruct",
+    gguf_repo="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
+    gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf",
+)
+
+PHI3_CONFIG = GGUFTestConfig(
+    original_model="microsoft/Phi-3.5-mini-instruct",
+    gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF",
+    gguf_filename="Phi-3.5-mini-instruct-IQ4_XS.gguf",
+)
+
+GPT2_CONFIG = GGUFTestConfig(
+    original_model="openai-community/gpt2-large",
+    gguf_repo="QuantFactory/gpt2-large-GGUF",
+    gguf_filename="gpt2-large.Q4_K_M.gguf",
+)
+
+STABLELM_CONFIG = GGUFTestConfig(
+    original_model="stabilityai/stablelm-3b-4e1t",
+    gguf_repo="afrideva/stablelm-3b-4e1t-GGUF",
+    gguf_filename="stablelm-3b-4e1t.q4_k_m.gguf",
+)
+
+STARCODER_CONFIG = GGUFTestConfig(
+    original_model="bigcode/starcoder2-3b",
+    gguf_repo="QuantFactory/starcoder2-3b-GGUF",
+    gguf_filename="starcoder2-3b.Q6_K.gguf",
+)
+
+MODELS = [
+    LLAMA_CONFIG,
+    QWEN2_CONFIG,
+    PHI3_CONFIG,
+    GPT2_CONFIG,
+    STABLELM_CONFIG,
+    # STARCODER_CONFIG, # broken
+]
+
+
 @pytest.mark.skipif(not is_quant_method_supported("gguf"),
                     reason="gguf is not supported on this GPU type.")
-@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [
-    ("meta-llama/Llama-3.2-1B-Instruct",
-     "bartowski/Llama-3.2-1B-Instruct-GGUF",
-     "Llama-3.2-1B-Instruct-Q4_K_M.gguf"),
-    ("meta-llama/Llama-3.2-1B-Instruct",
-     "bartowski/Llama-3.2-1B-Instruct-GGUF",
-     "Llama-3.2-1B-Instruct-IQ4_XS.gguf"),
-    ("Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct-GGUF",
-     "qwen2-1_5b-instruct-q4_k_m.gguf"),
-    ("Qwen/Qwen2-1.5B-Instruct", "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
-     "Qwen2-1.5B-Instruct.IQ4_XS.gguf"),
-])
+@pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("tp_size", [1, 2])
 def test_models(
-    num_gpus_available,
-    vllm_runner,
-    example_prompts,
-    original_model,
-    gguf_id,
-    gguf_path,
+    num_gpus_available: int,
+    vllm_runner: Type[VllmRunner],
+    example_prompts: List[str],
+    model: GGUFTestConfig,
     dtype: str,
     max_tokens: int,
     num_logprobs: int,
@@ -51,28 +96,26 @@ def test_models(
     if num_gpus_available < tp_size:
         pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
 
-    gguf_model = hf_hub_download(gguf_id, filename=gguf_path)
-
-    tokenizer = AutoTokenizer.from_pretrained(original_model)
-    messages = [[{
-        'role': 'user',
-        'content': prompt
-    }] for prompt in example_prompts]
-    example_prompts = tokenizer.apply_chat_template(messages,
-                                                    tokenize=False,
-                                                    add_generation_prompt=True)
+    tokenizer = AutoTokenizer.from_pretrained(model.original_model)
+    if tokenizer.chat_template is not None:
+        messages = [[{
+            'role': 'user',
+            'content': prompt
+        }] for prompt in example_prompts]
+        example_prompts = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True)
 
     # Run unquantized model.
-    with vllm_runner(model_name=original_model,
+    with vllm_runner(model_name=model.original_model,
                      dtype=dtype,
                      max_model_len=MAX_MODEL_LEN,
                      tensor_parallel_size=tp_size) as original_model:
-
         original_outputs = original_model.generate_greedy_logprobs(
             example_prompts[:-1], max_tokens, num_logprobs)
 
     # Run gguf model.
-    with vllm_runner(model_name=gguf_model,
+    with vllm_runner(model_name=model.gguf_model,
+                     tokenizer_name=model.original_model,
                      dtype=dtype,
                      max_model_len=MAX_MODEL_LEN,
                      tensor_parallel_size=tp_size) as gguf_model:
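A note on the test refactor above: `GGUFTestConfig.gguf_model` is a property, so each GGUF checkpoint is downloaded only when its parametrized case actually runs, and the `tokenizer.chat_template is not None` guard matters because base models such as GPT-2 and StableLM ship without a chat template. A minimal sketch of the lazy-download pattern in isolation, with the repo and file names copied from `LLAMA_CONFIG`:

```python
from typing import NamedTuple

from huggingface_hub import hf_hub_download


class GGUFTestConfig(NamedTuple):
    original_model: str
    gguf_repo: str
    gguf_filename: str

    @property
    def gguf_model(self) -> str:
        # Nothing is fetched until this property is read.
        return hf_hub_download(self.gguf_repo, filename=self.gguf_filename)


LLAMA_CONFIG = GGUFTestConfig(
    original_model="meta-llama/Llama-3.2-1B-Instruct",
    gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
    gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf",
)

# The download (and local caching) happens only here.
local_path = LLAMA_CONFIG.gguf_model
```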
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 48cfb1b221720..8876ca72792cf 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -447,8 +447,14 @@ def weight_loader(self,
         is_gguf_weight = getattr(param, "is_gguf_weight", False)
         is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
         if is_gguf_weight_type:
-            param.data[loaded_shard_id].copy_(loaded_weight)
-            param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
+            if loaded_shard_id is not None:
+                param.data[loaded_shard_id].copy_(loaded_weight)
+                param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
+            else:
+                param.shard_weight_type = {
+                    i: loaded_weight.item()
+                    for i, _ in enumerate(self.output_sizes)
+                }
             return
 
         if is_gguf_weight:
@@ -459,15 +465,15 @@ def weight_loader(self,
             shard_size = loaded_weight.size(output_dim) // tp_size
             start_idx = tp_rank * shard_size
 
-            loaded_weight = loaded_weight.narrow(output_dim, start_idx,
-                                                 shard_size)
-
-            param.shard_id.append(loaded_shard_id)
-            param.shard_id_map[loaded_shard_id] = len(param.data_container)
-            param.data_container.append(loaded_weight)
-            if len(param.data_container) == 2:
-                self.qweight = param.materialize_nested()
-            return
+            if loaded_shard_id is not None:
+                loaded_weight = loaded_weight.narrow(output_dim, start_idx,
+                                                     shard_size)
+                param.shard_id.append(loaded_shard_id)
+                param.shard_id_map[loaded_shard_id] = len(param.data_container)
+                param.data_container.append(loaded_weight)
+                if len(param.data_container) == 2:
+                    self.qweight = param.materialize_nested()
+                return
 
         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
@@ -811,10 +817,16 @@ def weight_loader(self,
         # initialize GGUF param after we know the quantize type
         is_gguf_weight = getattr(param, "is_gguf_weight", False)
         is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
-        if is_gguf_weight_type and loaded_shard_id is not None:
+        if is_gguf_weight_type:
             idx_map = {"q": 0, "k": 1, "v": 2}
-            param.data[idx_map[loaded_shard_id]].copy_(loaded_weight)
-            param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
+            if loaded_shard_id is not None:
+                param.data[idx_map[loaded_shard_id]].copy_(loaded_weight)
+                param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
+            else:
+                param.shard_weight_type = {
+                    k: loaded_weight.item()
+                    for k in idx_map
+                }
             return
 
         if is_gguf_weight:
@@ -825,15 +837,15 @@ def weight_loader(self,
             shard_size = loaded_weight.size(output_dim) // tp_size
             start_idx = tp_rank * shard_size
 
-            loaded_weight = loaded_weight.narrow(output_dim, start_idx,
-                                                 shard_size)
-
-            param.shard_id.append(loaded_shard_id)
-            param.shard_id_map[loaded_shard_id] = len(param.data_container)
-            param.data_container.append(loaded_weight)
-            if len(param.data_container) == 3:
-                self.qweight = param.materialize_nested()
-            return
+            if loaded_shard_id is not None:
+                loaded_weight = loaded_weight.narrow(output_dim, start_idx,
+                                                     shard_size)
+                param.shard_id.append(loaded_shard_id)
+                param.shard_id_map[loaded_shard_id] = len(param.data_container)
+                param.data_container.append(loaded_weight)
+                if len(param.data_container) == 3:
+                    self.qweight = param.materialize_nested()
+                return
 
         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
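The `weight_loader` changes above cover GGUF checkpoints where a merged projection (gate_up or qkv) arrives as one fused tensor, i.e. with `loaded_shard_id` set to `None`: the loader then records a weight type for every logical shard and lets the fused tensor fall through to the regular loading path instead of the per-shard container/`materialize_nested` route. A simplified, framework-free sketch of that branching (the `FakeGGUFParam` class below is a stand-in for illustration, not vLLM's actual parameter type):

```python
from typing import Dict, List, Optional


class FakeGGUFParam:
    """Stand-in for vLLM's uninitialized GGUF parameter (illustration only)."""

    def __init__(self) -> None:
        self.shard_weight_type: Dict[str, int] = {}


def load_weight_type(param: FakeGGUFParam, weight_type: int,
                     shard_ids: List[str],
                     loaded_shard_id: Optional[str] = None) -> None:
    if loaded_shard_id is not None:
        # Per-shard tensor (e.g. "q", "k" or "v"): record just that shard.
        param.shard_weight_type[loaded_shard_id] = weight_type
    else:
        # Fused tensor covering all shards: every shard gets the same type.
        param.shard_weight_type = {s: weight_type for s in shard_ids}


param = FakeGGUFParam()
load_weight_type(param, weight_type=2, shard_ids=["q", "k", "v"])
assert param.shard_weight_type == {"q": 2, "k": 2, "v": 2}
```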
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index fd926ff0254d4..1656a3cc9e46d 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -198,7 +198,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         assert not config.scale_attn_by_inverse_layer_idx
         assert not config.reorder_and_upcast_attn
         self.embed_dim = config.hidden_size
-        self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim)
+        self.wte = VocabParallelEmbedding(config.vocab_size,
+                                          self.embed_dim,
+                                          quant_config=quant_config,
+                                          prefix=f"{prefix}.wte")
         self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
         self.start_layer, self.end_layer, self.h = make_layers(
             config.num_hidden_layers,
@@ -259,7 +262,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.lm_head = self.transformer.wte
         else:
             self.lm_head = ParallelLMHead(self.config.vocab_size,
-                                          self.config.hidden_size)
+                                          self.config.hidden_size,
+                                          quant_config=quant_config,
+                                          prefix=f"{prefix}.lm_head")
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
@@ -304,7 +309,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: Set[str] = set()
         for name, loaded_weight in weights:
-            if "lm_head.weight" in name:
+            if name.startswith("lm_head"):
                 # GPT-2 ties the weights of the embedding layer and the final
                 # linear layer.
                 continue
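On the GPT-2 change above: the model ties `lm_head` to the token embedding, so checkpoint tensors under the `lm_head` prefix are skipped during loading, and a prefix test also covers the extra tensor names a quantized GGUF checkpoint may expose (for example a `lm_head.qweight_type`-style entry; the exact names here are illustrative), which the old substring check on `"lm_head.weight"` would miss. A trivial sketch of the filter:

```python
def is_tied_lm_head_tensor(name: str) -> bool:
    # Prefix match: covers "lm_head.weight" as well as quantized variants
    # (tensor names here are illustrative).
    return name.startswith("lm_head")


assert is_tied_lm_head_tensor("lm_head.qweight")
assert not is_tied_lm_head_tensor("transformer.wte.weight")
```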
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 8623da99574bb..17b0fbb777e8e 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -156,7 +156,8 @@ def __init__(
         )
 
         is_neox_style = True
-        if quant_config is not None and quant_config.get_name() == "gguf":
+        is_gguf = quant_config and quant_config.get_name() == "gguf"
+        if is_gguf and config.model_type == "llama":
             is_neox_style = False
 
         self.rotary_emb = get_rope(
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index 6b2107bef0a66..c9d1af78246a6 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -22,7 +22,7 @@
 
 import torch
 from torch import nn
-from transformers import PretrainedConfig
+from transformers import StableLmConfig
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, VllmConfig
@@ -50,8 +50,9 @@
 class StablelmMLP(nn.Module):
 
     def __init__(self,
-                 config: PretrainedConfig,
-                 quant_config: Optional[QuantizationConfig] = None) -> None:
+                 config: StableLmConfig,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = "") -> None:
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
@@ -59,10 +60,13 @@ def __init__(self,
         self.gate_up_proj = MergedColumnParallelLinear(
             config.hidden_size, [config.intermediate_size] * 2,
             bias=False,
-            quant_config=quant_config)
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj")
         self.down_proj = RowParallelLinear(config.intermediate_size,
                                            config.hidden_size,
-                                           bias=False)
+                                           bias=False,
+                                           quant_config=quant_config,
+                                           prefix=f"{prefix}.down_proj")
         self.act_fn = SiluAndMul()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -75,7 +79,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 class StablelmAttention(nn.Module):
 
     def __init__(self,
-                 config: PretrainedConfig,
+                 config: StableLmConfig,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None,
                  prefix: str = "") -> None:
@@ -116,11 +120,13 @@ def __init__(self,
                                           self.total_num_heads,
                                           self.total_num_key_value_heads,
                                           self.qkv_bias,
-                                          quant_config=quant_config)
+                                          quant_config=quant_config,
+                                          prefix=f"{prefix}.qkv_proj")
         self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
                                         self.hidden_size,
                                         bias=False,
-                                        quant_config=quant_config)
+                                        quant_config=quant_config,
+                                        prefix=f"{prefix}.o_proj")
         self.rotary_emb = get_rope(
             self.head_dim,
             rotary_dim=self.rotary_ndims,
@@ -154,7 +160,7 @@ class StablelmDecoderLayer(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: StableLmConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
@@ -164,7 +170,7 @@ def __init__(
                                            cache_config,
                                            quant_config,
                                            prefix=f"{prefix}.self_attn")
-        self.mlp = StablelmMLP(config, quant_config)
+        self.mlp = StablelmMLP(config, quant_config, prefix=f"{prefix}.mlp")
         norm_eps = getattr(config, "norm_eps",
                            getattr(config, "layer_norm_eps", 1e-05))
         self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
@@ -210,6 +216,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.embed_tokens",
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
@@ -270,7 +278,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                                         prefix=maybe_prefix(prefix, "model"))
         self.lm_head = ParallelLMHead(config.vocab_size,
                                       config.hidden_size,
-                                      quant_config=quant_config)
+                                      quant_config=quant_config,
+                                      prefix=f"{prefix}.lm_head")
         if self.config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
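Two notes on the hunks above. In `llama.py`, the non-NeoX rotary layout is now applied only when the checkpoint is GGUF and `config.model_type` is literally `"llama"`; this presumably keeps models that reuse the Llama implementation in vLLM (Phi-3, for instance) on the default NeoX layout. The StableLM edits are plumbing: every quantizable module now receives `quant_config` and a dotted `prefix` so embeddings and projections can be constructed through the quantized code path. A small standalone restatement of the rotary-style condition (helper name and signature are illustrative, not vLLM API):

```python
from typing import Optional


def use_neox_rope(quant_method: Optional[str], model_type: str) -> bool:
    # Mirrors the condition in the diff: only GGUF checkpoints of plain
    # Llama switch to the non-NeoX rotary layout.
    is_gguf = quant_method == "gguf"
    return not (is_gguf and model_type == "llama")


assert use_neox_rope("gguf", "llama") is False
assert use_neox_rope("gguf", "phi3") is True
assert use_neox_rope(None, "llama") is True
```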
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index 22189a517d313..1cd0dedfed2cb 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -88,12 +88,14 @@ def __init__(self,
             self.total_num_kv_heads,
             bias=self.use_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             self.hidden_size,
             bias=self.use_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
         self.rotary_emb = get_rope(
             self.head_dim,
@@ -129,19 +131,22 @@ class Starcoder2MLP(nn.Module):
 
     def __init__(self,
                  config: Starcoder2Config,
-                 quant_config: Optional[QuantizationConfig] = None):
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = ""):
         super().__init__()
         self.c_fc = ColumnParallelLinear(
             config.hidden_size,
             config.intermediate_size,
             bias=config.use_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_fc",
         )
         self.c_proj = RowParallelLinear(
             config.intermediate_size,
             config.hidden_size,
             bias=config.use_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_proj",
         )
         self.act = get_act_fn(config.hidden_act)
 
@@ -165,7 +170,9 @@ def __init__(self,
                                              cache_config,
                                              quant_config=quant_config,
                                              prefix=f"{prefix}.self_attn")
-        self.mlp = Starcoder2MLP(config, quant_config=quant_config)
+        self.mlp = Starcoder2MLP(config,
+                                 quant_config=quant_config,
+                                 prefix=f"{prefix}.mlp")
         self.input_layernorm = nn.LayerNorm(config.hidden_size,
                                             eps=config.norm_epsilon)
         self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
@@ -213,8 +220,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.vocab_size = config.vocab_size
 
         # TODO: consider padding_idx (currently removed)
-        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
-                                                   config.hidden_size)
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.embed_tokens")
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: Starcoder2DecoderLayer(
@@ -279,6 +289,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             org_num_embeddings=config.vocab_size,
             padding_size=DEFAULT_VOCAB_PADDING_SIZE,
             quant_config=quant_config,
+            prefix=f"{prefix}.lm_head",
         )
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size)
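With the Starcoder2 plumbing above, all of the GGUF checkpoints listed in the test file load through the same offline flow as the example at the top of this patch. A quick sketch for the StableLM case, using the repo and file names from `STABLELM_CONFIG` (a base model without a chat template, hence plain `generate()` rather than `chat()`):

```python
from huggingface_hub import hf_hub_download

from vllm import LLM, SamplingParams

# Repo and file names taken from STABLELM_CONFIG in the test file above.
gguf_path = hf_hub_download("afrideva/stablelm-3b-4e1t-GGUF",
                            filename="stablelm-3b-4e1t.q4_k_m.gguf")

llm = LLM(model=gguf_path, tokenizer="stabilityai/stablelm-3b-4e1t")
outputs = llm.generate(["The future of AI is"],
                       SamplingParams(temperature=0, max_tokens=32))
print(outputs[0].outputs[0].text)
```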