[Bugfix] Fix max image feature size for Llava-one-vision (vllm-project#12104)

Signed-off-by: Roger Wang <ywang@roblox.com>
ywang96 authored and abmfy committed Jan 24, 2025
1 parent e942bbf commit 93a9b80
Showing 3 changed files with 129 additions and 2 deletions.
61 changes: 61 additions & 0 deletions tests/models/multimodal/processing/test_llava_next.py
@@ -13,6 +13,67 @@
from ...utils import build_model_context


def _validate_image_max_tokens_one(
processor: BaseMultiModalProcessor,
max_tokens: int,
failed_size_excs: list[tuple[ImageSize, Exception]],
image_size: ImageSize,
) -> None:
info = processor.info
feature_size = info.get_num_image_tokens(image_width=image_size.width,
image_height=image_size.height)

try:
assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
except Exception as exc:
failed_size_excs.append((image_size, exc))


@pytest.mark.skip("This test takes around 5 minutes to run. "
"Comment this out to run it manually.")
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
def test_processor_max_tokens(model_id):
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
)
info = processor.info

seen_aspect_ratios = set[float]()
image_sizes = list[ImageSize]()

# The aspect ratio of the grid layout is between 1 and 2
# NOTE: Assumes that feature size calculation is the same if we
# swap the width and height of the image
for w, h in itertools.product(range(32, 4096), repeat=2):
aspect_ratio = w / h
if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
image_sizes.append(ImageSize(w, h))
seen_aspect_ratios.add(aspect_ratio)

failed_size_excs = list[tuple[ImageSize, Exception]]()

validate_one = partial(
_validate_image_max_tokens_one,
processor,
info.get_max_image_tokens(), # type: ignore
failed_size_excs,
)
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

if failed_size_excs:
msg = "Found failing image sizes:" \
+ "\n========\n".join(f"[{size}]\n{exc}"
for size, exc in failed_size_excs)
raise AssertionError(msg)


def _validate_image_prompt_replacements_one(
processor: BaseMultiModalProcessor,
num_imgs: int,
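The NOTE in the test above assumes the feature-size calculation is symmetric in width and height, which is what lets the loop restrict itself to aspect ratios >= 1. A minimal spot check of that assumption (illustrative only, not part of the commit, assuming a `processor` built as in test_processor_max_tokens):

info = processor.info
# If the symmetry assumption holds, swapping width and height must not
# change the computed feature size (640x480 is an arbitrary example).
assert info.get_num_image_tokens(image_width=640, image_height=480) \
    == info.get_num_image_tokens(image_width=480, image_height=640)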
62 changes: 62 additions & 0 deletions tests/models/multimodal/processing/test_llava_onevision.py
@@ -13,6 +13,68 @@
from ...utils import build_model_context


def _validate_image_max_tokens_one(
processor: BaseMultiModalProcessor,
max_tokens: int,
failed_size_excs: list[tuple[ImageSize, Exception]],
image_size: ImageSize,
) -> None:
info = processor.info
feature_size = info.get_num_image_tokens(image_width=image_size.width,
image_height=image_size.height)

try:
assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
except Exception as exc:
failed_size_excs.append((image_size, exc))


@pytest.mark.skip("This test takes around 5 minutes to run. "
"Comment this out to run it manually.")
@pytest.mark.parametrize("model_id",
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_processor_max_tokens(model_id):
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
)
info = processor.info

seen_aspect_ratios = set[float]()
image_sizes = list[ImageSize]()

# The aspect ratio of the grid layout is between 1 and 6
# NOTE: Assumes that feature size calculation is the same if we
# swap the width and height of the image
for w, h in itertools.product(range(32, 4096), repeat=2):
aspect_ratio = w / h
if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
image_sizes.append(ImageSize(w, h))
seen_aspect_ratios.add(aspect_ratio)

failed_size_excs = list[tuple[ImageSize, Exception]]()

validate_one = partial(
_validate_image_max_tokens_one,
processor,
info.get_max_image_tokens(), # type: ignore
failed_size_excs,
)
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

if failed_size_excs:
msg = "Found failing image sizes:" \
+ "\n========\n".join(f"[{size}]\n{exc}"
for size, exc in failed_size_excs)
raise AssertionError(msg)


def _validate_image_prompt_replacements_one(
processor: BaseMultiModalProcessor,
num_imgs: int,
8 changes: 6 additions & 2 deletions vllm/model_executor/models/llava_onevision.py
@@ -19,8 +19,8 @@
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
NestedTensors)
from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems,
VideoProcessorItems)
from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
VideoEmbeddingItems, VideoProcessorItems)
from vllm.multimodal.processing import PromptReplacement
from vllm.multimodal.profiling import ProcessorInputs
from vllm.sequence import IntermediateTensors
@@ -145,6 +145,10 @@ def _get_num_unpadded_features(

return (unpadded_features, newline_features)

def get_image_size_with_most_features(self) -> ImageSize:
# NOTE: This hardcoded value is found via processor tests
return ImageSize(width=1153, height=944)

def _get_num_frame_tokens(
self,
*,
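The hardcoded ImageSize(width=1153, height=944) above is the image size that maximizes the feature count, found by running the processor tests. A minimal sketch (illustrative, not part of the commit) of rediscovering such a value by brute force over candidate sizes, assuming an `info` object as in the tests:

from vllm.multimodal.parse import ImageSize

def find_image_size_with_most_features(info, image_sizes):
    # Exhaustively evaluate the feature-size calculation and keep the
    # size that yields the most image tokens.
    best_size, best_tokens = None, -1
    for size in image_sizes:
        num_tokens = info.get_num_image_tokens(image_width=size.width,
                                               image_height=size.height)
        if num_tokens > best_tokens:
            best_size, best_tokens = size, num_tokens
    return best_size, best_tokens

# Example candidate grid, mirroring the range used by the tests:
# image_sizes = [ImageSize(w, h)
#                for w in range(32, 4096, 16) for h in range(32, 4096, 16)]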
