From 2ac300835a108dbd6625584c9541eb6f64652e7a Mon Sep 17 00:00:00 2001
From: Alex-Brooks
Date: Thu, 5 Sep 2024 06:56:14 -0400
Subject: [PATCH] Fix dummy data seq padding for multimodal qwen

Signed-off-by: Alex-Brooks
---
 vllm/model_executor/models/qwen.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 07c15c82ca0e6..da1fb6a0ea2fb 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -830,9 +830,7 @@ def dummy_data_for_qwen(
     # Build the image prompts with no imgpads; the tokenizer will add img pads
     image_prompt = ''.join(
         [get_image_text(idx, False) for idx in range(1, num_images + 1)])
-    toks = tokenizer.encode(image_prompt,
-                            add_special_tokens=False,
-                            return_tensors="pt")[0].tolist()
+    toks = tokenizer.encode(image_prompt, add_special_tokens=False)
 
     # Make sure we actually get the fixed context size per tok padding
     num_pads = toks.count(tokenizer.encode(IMG_PAD)[0])
@@ -842,6 +840,10 @@ def dummy_data_for_qwen(
             f" per image, but got {num_pads} pads for {num_images} image(s)"
             " in total. Are you using a qwen tokenizer?")
 
+    # Ensure the number of tokens is at minimum the sequence length provided
+    if len(toks) < seq_len:
+        toks += [0] * (seq_len - len(toks))
+
     # Build the input images; width/height doesn't actually matter here since
     # the data will get resized and the # of tokens per image is constant
     image = Image.new("RGB", (224, 224), color=0)
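
Note on the fix (not part of the patch above): vLLM's dummy multimodal data is
used to profile memory for a full-length sequence, and the tokenized image
prompt alone can be shorter than seq_len. The second hunk right-pads the token
list with zeros so the dummy sequence always reaches seq_len. Below is a
minimal standalone sketch of that guarantee; the seq_len value and token ids
are illustrative assumptions, not values taken from vLLM.

    # Standalone sketch of the padding guarantee introduced by the patch.
    def pad_dummy_toks(toks: list, seq_len: int) -> list:
        """Right-pad a dummy token list with 0s up to seq_len."""
        if len(toks) < seq_len:
            toks = toks + [0] * (seq_len - len(toks))
        return toks

    # Hypothetical numbers: one image's worth of tokens is far shorter than
    # a 2048-token profiling context, so the padding branch kicks in.
    toks = [101] * 260
    padded = pad_dummy_toks(toks, 2048)
    assert len(padded) == 2048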