diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 9024831df543c..6adb1e29d6568 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -342,6 +342,8 @@ def input_mapper( elif is_list_of(data, Image.Image): # we can't stack here because images may have different num_patches data = [image_pixel_values_mapper(img) for img in data] + else: + return MultiModalInputs({"image_embeds": data}) model_config = ctx.model_config tokenizer = cached_get_tokenizer( model_config.tokenizer,