Skip to content

Commit

Permalink
[VLM] Disallow overflowing max_model_len for multimodal models (vll…
Browse files Browse the repository at this point in the history
  • Loading branch information
DarkLight1337 authored Aug 30, 2024
1 parent 0c785d3 commit 4abed65
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 3 deletions.
17 changes: 17 additions & 0 deletions tests/models/test_llava.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,3 +179,20 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
)


@pytest.mark.parametrize("model", models)
def test_context_length_too_short(vllm_runner, image_assets, model):
    """Expect a ``ValueError`` when the prompt cannot fit the context.

    The runner is created with ``max_model_len=128`` and a single image
    prompt is generated; the whole sequence must raise a ``ValueError``
    whose message matches "too long to fit into the model".
    """
    pil_images = [image_asset.pil_image for image_asset in image_assets]

    expected_failure = pytest.raises(
        ValueError,
        match="too long to fit into the model",
    )
    with expected_failure:
        # LLaVA has a feature size of 576, so 128 is deliberately too small.
        runner_kwargs = dict(max_model_len=128, enforce_eager=True)
        vllm_model = vllm_runner(model, **runner_kwargs)

        with vllm_model:
            vllm_model.generate_greedy(
                [HF_IMAGE_PROMPTS[0]],
                max_tokens=1,
                images=[pil_images[0]],
            )
21 changes: 18 additions & 3 deletions vllm/engine/llm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -2010,7 +2010,22 @@ def is_embedding_model(self):

def _validate_model_inputs(self, inputs: Union[LLMInputs,
                                               EncoderDecoderLLMInputs]):
    """Reject empty prompts and, for multimodal models, over-long ones.

    Args:
        inputs: Processed model inputs. The token IDs are read from
            ``"encoder_prompt_token_ids"`` when
            ``self.is_encoder_decoder_model()`` is true, and from
            ``"prompt_token_ids"`` otherwise.

    Raises:
        ValueError: If the selected token IDs are missing or empty, or if
            ``self.model_config.multimodal_config`` is set and the prompt
            has more tokens than ``self.model_config.max_model_len``.
    """
    if self.is_encoder_decoder_model():
        prompt_ids = inputs.get("encoder_prompt_token_ids")
    else:
        prompt_ids = inputs.get("prompt_token_ids")

    if prompt_ids is None or len(prompt_ids) == 0:
        raise ValueError("Prompt cannot be empty")

    # The length limit is enforced here only when a multimodal config is
    # present; other models are not length-checked by this method.
    if self.model_config.multimodal_config is not None:
        max_prompt_len = self.model_config.max_model_len
        prompt_len = len(prompt_ids)

        if prompt_len > max_prompt_len:
            raise ValueError(
                f"The prompt (total length {prompt_len}) is too long "
                f"to fit into the model (context length {max_prompt_len}). "
                "Make sure that `max_model_len` is no smaller than the "
                "number of text tokens plus multimodal tokens. For image "
                "inputs, the number of image tokens depends on the number "
                "of images, and possibly their aspect ratios as well.")

0 comments on commit 4abed65

Please sign in to comment.