diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 5ab4157cb358..70ac82e2005b 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -20,9 +20,9 @@ The following :ref:`engine arguments ` are specific to VLMs:
     Currently, the support for vision language models on vLLM has the following limitations:
 
     * Only single image input is supported per text prompt.
-    * Dynamic ``image_input_shape`` is not supported: the input image will be resized to the static ``image_input_shape``. This means model output might not exactly match the HuggingFace implementation.
+    * Dynamic ``image_input_shape`` is not supported: the input image will be resized to the static ``image_input_shape``. This means our LLaVA-NeXT output may not exactly match the HuggingFace implementation.
 
-    We are continuously improving user & developer experience for VLMs. Please raise an issue on GitHub if you have any feedback or feature requests.
+    We are continuously improving user & developer experience for VLMs. Please `open an issue on GitHub `_ if you have any feedback or feature requests.
 
 Offline Batched Inference
 -------------------------
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 67b32a08833b..39355b9d3ab4 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -227,7 +227,7 @@ def forward(
         attn_metadata: AttentionMetadata,
         **kwargs: object,
     ) -> SamplerOutput:
-        """Run forward pass for Llava 1.5.
+        """Run forward pass for LLaVA-1.5.
 
         One key thing to understand is the `input_ids` already accounts for the
         positions of the to-be-inserted image embeddings.
@@ -247,22 +247,25 @@ def forward(
         This way, the `positions` and `attn_metadata` are consistent
         with the `input_ids`.
 
-        The model takes two types of image inputs:
-        PIXEL_VALUES and IMAGE_FEATURES.
-        The following shows how each maps to huggingface implementation.
-        PIXEL_VALUES:
-        - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L353
-        IMAGE_FEATURES:
-        - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L430
-        before going through the multi modal projector.
+        This model has two modes of image inputs:
+        `PIXEL_VALUES` and `IMAGE_FEATURES`.
 
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
-            pixel_values: For PIXEL_VALUES, expects a batch with shape
-                [1, 3, 336, 336].
-            image_features: For IMAGE_FEATURES, expects a batch with shape
-                [1, 576, 1024].
+            pixel_values: The pixels in each input image.
+                Expects a batch with shape `[1, 3, 336, 336]`.
+                (Only applicable to `PIXEL_VALUES` mode)
+            image_features: The image features for each input image, output by
+                the vision tower before passing to the multi-modal projector.
+                Expects a batch with shape `[1, 576, 1024]`.
+                (Only applicable to `IMAGE_FEATURES` mode)
+
+        See also:
+            Each input maps to the HuggingFace implementation as follows:
+
+            - `pixel_values`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava/modeling_llava.py#L360
+            - `image_features`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava/modeling_llava.py#L437
         """
         image_input = self._parse_and_validate_image_input(**kwargs)
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 57cbd1e4a601..0ab9afea9ac6 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -108,15 +108,6 @@ def _image_pixel_processor(
 @MULTIMODAL_REGISTRY.register_image_pixel_input(_image_pixel_processor)
 @MULTIMODAL_REGISTRY.register_dummy_data(_get_dummy_image_data)
 class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
-    """
-    Args to `forward()`:
-        input_ids: Flattened (concatenated) input_ids corresponding to a
-            batch.
-        pixel_values: For PIXEL_VALUES, expects a batch with shape
-            [1, num_patches, 3, 336, 336].
-        image_features: For IMAGE_FEATURES, expects a batch with shape
-            [1, num_patches, 1176, 1024].
-    """
 
     def __init__(self,
                  config: LlavaNextConfig,
@@ -355,7 +346,7 @@ def forward(
         attn_metadata: AttentionMetadata,
         **kwargs: object,
     ) -> SamplerOutput:
-        """Run forward pass for Llava 1.5.
+        """Run forward pass for LLaVA-NeXT.
 
         One key thing to understand is the `input_ids` already accounts for the
         positions of the to-be-inserted image embeddings.
@@ -375,22 +366,19 @@ def forward(
         This way, the `positions` and `attn_metadata` are consistent
         with the `input_ids`.
 
-        The model takes two types of image inputs:
-        PIXEL_VALUES and IMAGE_FEATURES.
-        The following shows how each maps to huggingface implementation.
-        PIXEL_VALUES:
-        - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L353
-        IMAGE_FEATURES:
-        - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L430
-        before going through the multi modal projector.
-
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
-            pixel_values: For PIXEL_VALUES, expects a batch with shape
-                [1, 3, 336, 336].
-            image_features: For IMAGE_FEATURES, expects a batch with shape
-                [1, 576, 1024].
+            pixel_values: The pixels in each grid patch for each input image.
+                Expects a batch with shape `[1, num_patches, 3, 336, 336]`.
+            image_sizes: The original `(width, height)` for each input image.
+                Expects a batch with shape `[1, 2]`.
+
+        See also:
+            Each input maps to the HuggingFace implementation as follows:
+
+            - `pixel_values`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava_next/modeling_llava_next.py#L690
+            - `image_sizes`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava_next/modeling_llava_next.py#L691
         """
         image_input = self._parse_and_validate_image_input(**kwargs)
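As a quick illustration of the shapes documented above for LLaVA-1.5, here is a minimal sketch using dummy tensors. The variable names and random data are purely illustrative and are not how vLLM constructs its inputs; only the shapes are taken from the docstring.

```python
import torch

# PIXEL_VALUES mode: one 336x336 RGB image per prompt (batch size 1).
pixel_values = torch.randn(1, 3, 336, 336)

# IMAGE_FEATURES mode: precomputed vision-tower outputs for the same image,
# taken before the multi-modal projector: 576 patch tokens (a 24x24 grid
# from a 336px input with 14px patches), each with hidden size 1024.
image_features = torch.randn(1, 576, 1024)

assert pixel_values.shape == (1, 3, 336, 336)
assert image_features.shape == (1, 576, 1024)
```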
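And the corresponding sketch for LLaVA-NeXT, where each image is split into grid patches and the original image size is passed alongside the pixels. Here `num_patches` and the example image size are hypothetical placeholders; the real patch count depends on the image's aspect ratio and the model's configured grid pinpoints.

```python
import torch

# Hypothetical patch count for illustration only.
num_patches = 5

# Each grid patch is resized to 336x336, giving a 5-D pixel batch.
pixel_values = torch.randn(1, num_patches, 3, 336, 336)

# Original size of each input image, shape [1, 2] per the docstring above.
image_sizes = torch.tensor([[1344, 672]])

assert pixel_values.shape == (1, num_patches, 3, 336, 336)
assert image_sizes.shape == (1, 2)
```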