Fix CI for VLMs #35690

Merged: 5 commits, Jan 20, 2025

Changes from all commits
docs/source/en/model_doc/emu3.md (8 changes: 4 additions & 4 deletions)
@@ -55,8 +55,8 @@ import torch
 from PIL import Image
 import requests

-processor = Emu3Processor.from_pretrained("Emu3-community/Emu3-Chat-hf")
-model = Emu3ForConditionalGeneration.from_pretrained("Emu3-community/Emu3-Chat-hf", torch_dtype=torch.bfloat16, device_map="cuda")
+processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")
+model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", torch_dtype=torch.bfloat16, device_map="cuda")

 # prepare image and text prompt
 url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
@@ -75,8 +75,8 @@ print(processor.decode(output[0], skip_special_tokens=True))
 Emu3 can also generate images from textual input. Here is how you can do it:

 ```python
-processor = Emu3Processor.from_pretrained("Emu3-community/Emu3-Gen-hf")
-model = Emu3ForConditionalGeneration.from_pretrained("Emu3-community/Emu3-Gen-hf", torch_dtype="bfloat16", device_map="auto", attn_implementation="flash_attention_2")
+processor = Emu3Processor.from_pretrained("BAAI/Emu3-Gen-hf")
+model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Gen-hf", torch_dtype="bfloat16", device_map="auto", attn_implementation="flash_attention_2")


 inputs = processor(
src/transformers/models/emu3/modeling_emu3.py (8 changes: 4 additions & 4 deletions)
@@ -1740,8 +1740,8 @@ def forward(
         >>> import requests
         >>> from PIL import Image

-        >>> model = Emu3ForCausalLM.from_pretrained("Emu3-community/Emu3-Chat-hf", torch_dtype=torch.bfloat16)
-        >>> processor = Emu3Processor.from_pretrained("Emu3-community/Emu3-Chat-hf")
+        >>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", torch_dtype=torch.bfloat16)
+        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

         >>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)
@@ -1884,8 +1884,8 @@ def forward(
         >>> import requests
         >>> from PIL import Image

-        >>> model = Emu3ForConditionalGeneration.from_pretrained("Emu3-community/Emu3-Chat-hf", torch_dtype=torch.bfloat16)
-        >>> processor = Emu3Processor.from_pretrained("Emu3-community/Emu3-Chat-hf")
+        >>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", torch_dtype=torch.bfloat16)
+        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

         >>> conversation = [
         ...     {
src/transformers/models/emu3/modular_emu3.py (10 changes: 5 additions & 5 deletions)
@@ -54,7 +54,7 @@


 _CONFIG_FOR_DOC = "Emu3Config"
-_CHECKPOINT_FOR_DOC = "Emu3-community/Emu3-Chat-hf"
+_CHECKPOINT_FOR_DOC = "BAAI/Emu3-Chat-hf"

 logger = logging.get_logger(__name__)

@@ -1091,8 +1091,8 @@ def forward(**super_kwargs):
         >>> import requests
         >>> from PIL import Image

-        >>> model = Emu3ForCausalLM.from_pretrained("Emu3-community/Emu3-Chat-hf", torch_dtype=torch.bfloat16)
-        >>> processor = Emu3Processor.from_pretrained("Emu3-community/Emu3-Chat-hf")
+        >>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", torch_dtype=torch.bfloat16)
+        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

         >>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)
@@ -1196,8 +1196,8 @@ def forward(
         >>> import requests
         >>> from PIL import Image

-        >>> model = Emu3ForConditionalGeneration.from_pretrained("Emu3-community/Emu3-Chat-hf", torch_dtype=torch.bfloat16)
-        >>> processor = Emu3Processor.from_pretrained("Emu3-community/Emu3-Chat-hf")
+        >>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", torch_dtype=torch.bfloat16)
+        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

         >>> conversation = [
         ...     {
tests/models/aria/test_modeling_aria.py (160 changes: 0 additions & 160 deletions)
@@ -32,7 +32,6 @@
 from transformers.testing_utils import (
     require_bitsandbytes,
     require_torch,
-    require_torch_gpu,
     require_vision,
     slow,
     torch_device,
@@ -462,63 +461,6 @@ def test_batched_generation(self):
         outputs = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
         self.assertEqual(outputs, EXPECTED_OUTPUT)

-    @slow
-    @require_bitsandbytes
-    def test_aria_index_error_bug(self):
-        # This is a reproducer of https://github.com/huggingface/transformers/pull/28032 and makes sure it does not happen anymore
-        # Please refer to that PR, or specifically https://github.com/huggingface/transformers/pull/28032#issuecomment-1860650043 for
-        # more details
-        model_id = "rhymes-ai/Aria"
-        model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
-
-        processor = AutoProcessor.from_pretrained(model_id)
-
-        # Simulate a super long prompt
-        user_prompt = "Describe the image:?\n" * 200
-        prompt = f"USER: <image>\n{user_prompt}ASSISTANT:"
-        image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
-
-        raw_image = Image.open(requests.get(image_file, stream=True).raw)
-        inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
-
-        # Make sure that `generate` works
-        _ = model.generate(**inputs, max_new_tokens=20)
-
-    @slow
-    @require_torch_gpu
-    def test_aria_merge_inputs_error_bug(self):
-        # This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore
-        model_id = "rhymes-ai/Aria"
-        model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
-
-        # Simulate some user inputs
-        pixel_values = torch.randn(
-            (1, 3, 336, 336),
-            dtype=torch.float,
-            device=torch_device,
-        )
-        input_ids = torch.tensor(
-            [
-                [32001, 32001, 1, 15043, 7084, 32000, 29871, 13, 7900],
-            ],
-            dtype=torch.long,
-            device=torch_device,
-        )
-        attention_mask = torch.tensor(
-            [[0, 0, 1, 1, 1, 1, 1, 1, 1]],
-            dtype=torch.long,
-            device=torch_device,
-        )
-
-        # Make sure that the loss is properly computed
-        loss = model(
-            pixel_values=pixel_values,
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            labels=input_ids,
-        ).loss
-        loss.backward()
-
     def test_tokenizer_integration(self):
         model_id = "rhymes-ai/Aria"
         slow_tokenizer = AutoTokenizer.from_pretrained(
@@ -552,105 +494,3 @@ def test_generation_no_images(self):

         # Make sure that `generate` works
         _ = model.generate(**inputs, max_new_tokens=20)
-
-    @slow
-    @require_bitsandbytes
-    def test_generation_siglip_backbone(self):
-        model_id = "rhymes-ai/Aria"
-        model = AriaForConditionalGeneration.from_pretrained(model_id, torch_dtype="float16", device_map=torch_device)
-        processor = AutoProcessor.from_pretrained(model_id)
-
-        # check processing with expansion of inputs (w/o expansion should work with any backbone)
-        processor.vision_feature_select_strategy = "default"
-        processor.patch_size = 14
-
-        image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        raw_image = Image.open(requests.get(image_file, stream=True).raw)
-        inputs = processor(
-            text="<|im_start|>user\n<image>\nWhat are these?<|im_end|>\n<|im_start|>assistant",
-            images=raw_image,
-            return_tensors="pt",
-        ).to(torch_device, torch.float16)
-
-        # Make sure that `generate` works
-        output = model.generate(**inputs, max_new_tokens=30)
-
-        EXPECTED_DECODED_TEXT = "user\n\nWhat are these?\nassistant The image shows two cats, one on the left and one on the right. They appear to be resting or sleeping on a pink blanket. The cat"
-        self.assertTrue(processor.batch_decode(output, skip_special_tokens=True)[0] == EXPECTED_DECODED_TEXT)
-
-    @slow
-    @require_bitsandbytes
-    def test_expansion_in_processing(self):
-        model_id = "rhymes-ai/Aria"
-        model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
-        processor = AutoProcessor.from_pretrained(model_id)
-
-        prompt = "USER: <image>\nDescribe the image:\nASSISTANT:"
-        image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        raw_image = Image.open(requests.get(image_file, stream=True).raw)
-
-        # check processing with expansion of inputs
-        processor.vision_feature_select_strategy = "default"
-        processor.patch_size = 14
-        inputs_expanded = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
-        self.assertTrue(inputs_expanded.input_ids.shape[-1] == 593)
-
-        # check processing without expansion of inputs (legacy behavior)
-        processor.vision_feature_select_strategy = None
-        processor.patch_size = None
-        inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
-        self.assertTrue(inputs.input_ids.shape[-1] == 18)
-
-        # generate exactly 20 tokens
-        output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
-        output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
-
-        # check that both inputs are handled correctly and generate the same output
-        self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
-
-    @slow
-    @require_bitsandbytes
-    def test_pixtral(self):
-        model_id = "rhymes-ai/Aria"
-        model = AriaForConditionalGeneration.from_pretrained(model_id)
-        processor = AutoProcessor.from_pretrained(model_id)
-
-        IMG_URLS = [
-            Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw),
-            Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw),
-            Image.open(requests.get("https://picsum.photos/id/27/500/500", stream=True).raw),
-            Image.open(requests.get("https://picsum.photos/id/17/150/600", stream=True).raw),
-        ]
-        PROMPT = "<s>[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]"
-
-        # image = Image.open(requests.get(url, stream=True).raw)
-        inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to("cuda")
-        generate_ids = model.generate(**inputs, max_new_tokens=500)
-        ouptut = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-
-        # fmt: off
-        EXPECTED_GENERATION = """
-Describe the images.
-Sure, let's break down each image description:
-
-1. **Image 1:**
-   - **Description:** A black dog with a glossy coat is sitting on a wooden floor. The dog has a focused expression and is looking directly at the camera.
-   - **Details:** The wooden floor has a rustic appearance with visible wood grain patterns. The dog's eyes are a striking color, possibly brown or amber, which contrasts with its black fur.
-
-2. **Image 2:**
-   - **Description:** A scenic view of a mountainous landscape with a winding road cutting through it. The road is surrounded by lush green vegetation and leads to a distant valley.
-   - **Details:** The mountains are rugged with steep slopes, and the sky is clear, indicating good weather. The winding road adds a sense of depth and perspective to the image.
-
-3. **Image 3:**
-   - **Description:** A beach scene with waves crashing against the shore. There are several people in the water and on the beach, enjoying the waves and the sunset.
-   - **Details:** The waves are powerful, creating a dynamic and lively atmosphere. The sky is painted with hues of orange and pink from the setting sun, adding a warm glow to the scene.
-
-4. **Image 4:**
-   - **Description:** A garden path leading to a large tree with a bench underneath it. The path is bordered by well-maintained grass and flowers.
-   - **Details:** The path is made of small stones or gravel, and the tree provides a shaded area with the bench invitingly placed beneath it. The surrounding area is lush and green, suggesting a well-kept garden.
-
-Each image captures a different scene, from a close-up of a dog to expansive natural landscapes, showcasing various elements of nature and human interaction with it.
-"""
-        # fmt: on
-        # check that both inputs are handled correctly and generate the same output
-        self.assertListEqual(ouptut, EXPECTED_GENERATION)
tests/models/chameleon/test_modeling_chameleon.py (6 changes: 3 additions & 3 deletions)
@@ -356,7 +356,7 @@ def test_model_7b(self):
         inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, torch.float16)

         # greedy generation outputs
-        EXPECTED_TEXT_COMPLETION = ['Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue line extending across the center of the image. The line is labeled "390 light years" and is accompanied by a small black and']  # fmt: skip
+        EXPECTED_TEXT_COMPLETION = ['Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot in the center representing the star Alpha Centauri. The star map is a representation of the night sky, showing the positions of stars in']  # fmt: skip
         generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False)
         text = processor.batch_decode(generated_ids, skip_special_tokens=True)
         self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
@@ -388,7 +388,7 @@ def test_model_7b_batched(self):
         # greedy generation outputs
         EXPECTED_TEXT_COMPLETION = [
             'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot in the center representing the star Alpha Centauri. The star map is a representation of the night sky, showing the positions of stars in',
-            'What constellation is this image showing?The image is showing the constellation of Orion.'
+            'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.'
         ]  # fmt: skip
         generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False)
         text = processor.batch_decode(generated_ids, skip_special_tokens=True)
@@ -414,7 +414,7 @@ def test_model_7b_multi_image(self):
         inputs = processor(images=[image, image_2], text=prompt, return_tensors="pt").to(model.device, torch.float16)

         # greedy generation outputs
-        EXPECTED_TEXT_COMPLETION = ['What do these two images have in common?The two images show a connection between two things that are not necessarily related. The first image shows a group of stars, while the second image shows a network of lines connecting two points. The connection between']  # fmt: skip
+        EXPECTED_TEXT_COMPLETION = ['What do these two images have in common?The two images show a connection between the night sky and the internet. The first image shows a starry night sky, with the stars arranged in a pattern that resembles the structure of the internet. The']  # fmt: skip
         generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False)
         text = processor.batch_decode(generated_ids, skip_special_tokens=True)
         self.assertEqual(EXPECTED_TEXT_COMPLETION, text)