From 2ac300835a108dbd6625584c9541eb6f64652e7a Mon Sep 17 00:00:00 2001
From: Alex-Brooks
Date: Thu, 5 Sep 2024 06:56:14 -0400
Subject: [PATCH] Fix dummy data seq padding for multimodal qwen

Signed-off-by: Alex-Brooks
---
 vllm/model_executor/models/qwen.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 07c15c82ca0e6..da1fb6a0ea2fb 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -830,9 +830,7 @@ def dummy_data_for_qwen(
     # Build the image prompts with no imgpads; the tokenizer will add img pads
     image_prompt = ''.join(
         [get_image_text(idx, False) for idx in range(1, num_images + 1)])
-    toks = tokenizer.encode(image_prompt,
-                            add_special_tokens=False,
-                            return_tensors="pt")[0].tolist()
+    toks = tokenizer.encode(image_prompt, add_special_tokens=False)
 
     # Make sure we actually get the fixed context size per tok padding
     num_pads = toks.count(tokenizer.encode(IMG_PAD)[0])
@@ -842,6 +840,10 @@ def dummy_data_for_qwen(
             f" per image, but got {num_pads} pads for {num_images} image(s)"
             " in total. Are you using a qwen tokenizer?")
 
+    # Ensure the number of tokens is at minimum the sequence length provided
+    if len(toks) < seq_len:
+        toks += [0] * (seq_len - len(toks))
+
     # Build the input images; width/height doesn't actually matter here since
     # the data will get resized and the # of tokens per image is constant
     image = Image.new("RGB", (224, 224), color=0)
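
Note on the fix (not part of the patch above): vLLM's dummy multimodal data is
used to profile memory for a full-length sequence, and the tokenized image
prompt alone can be shorter than seq_len. The second hunk right-pads the token
list with zeros so the dummy sequence always reaches seq_len. Below is a
minimal standalone sketch of that guarantee; the seq_len value and token ids
are illustrative assumptions, not values taken from vLLM.

    # Standalone sketch of the padding guarantee introduced by the patch.
    def pad_dummy_toks(toks: list, seq_len: int) -> list:
        """Right-pad a dummy token list with 0s up to seq_len."""
        if len(toks) < seq_len:
            toks = toks + [0] * (seq_len - len(toks))
        return toks

    # Hypothetical numbers: one image's worth of tokens is far shorter than
    # a 2048-token profiling context, so the padding branch kicks in.
    toks = [101] * 260
    padded = pad_dummy_toks(toks, 2048)
    assert len(padded) == 2048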