Fix dummy data seq padding for multimodal qwen

Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
vllm-project · Sep 5, 2024 · 2ac3008
1 parent 4f25926
commit 2ac3008
Showing 1 changed file with 5 additions and 3 deletions.
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
@@ -830,9 +830,7 @@ def dummy_data_for_qwen(
     # Build the image prompts with no imgpads; the tokenizer will add img pads
     image_prompt = ''.join(
         [get_image_text(idx, False) for idx in range(1, num_images + 1)])
-    toks = tokenizer.encode(image_prompt,
-                            add_special_tokens=False,
-                            return_tensors="pt")[0].tolist()
+    toks = tokenizer.encode(image_prompt, add_special_tokens=False)
 
     # Make sure we actually get the fixed context size per tok padding
     num_pads = toks.count(tokenizer.encode(IMG_PAD)[0])
@@ -842,6 +840,10 @@ def dummy_data_for_qwen(
             f" per image, but got {num_pads} pads for {num_images} image(s)"
             " in total. Are you using a qwen tokenizer?")
 
+    # Ensure the number of tokens is at minimum the sequence length provided
+    if len(toks) < seq_len:
+        toks += [0] * (seq_len - len(toks))
+
     # Build the input images; width/height doesn't actually matter here since
     # the data will get resized and the # of tokens per image is constant
     image = Image.new("RGB", (224, 224), color=0)