From 49055e150d5a69cb93012e1fb6e47a8f597de463 Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 5 Dec 2024 18:19:10 +0000 Subject: [PATCH 1/7] Fix the structure of images output by the processor --- src/transformers/models/pixtral/processing_pixtral.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index 5913e8688d00..2e6d50f9e79c 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -206,10 +206,7 @@ def __call__( if is_image_or_image_url(images): images = [[images]] elif isinstance(images, list) and is_image_or_image_url(images[0]): - if isinstance(text, list): - images = [[im] for im in images] - else: - images = [images] + images = [images] elif isinstance(images, list) and isinstance(images[0], list) and is_image_or_image_url(images[0][0]): pass else: From ed0b4303e3bbfe4e452af2adc917ef5c17a5839f Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 5 Dec 2024 18:31:08 +0000 Subject: [PATCH 2/7] Fix the structure of images output by the processor --- src/transformers/models/pixtral/processing_pixtral.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index 2e6d50f9e79c..53a31abf1add 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -206,7 +206,11 @@ def __call__( if is_image_or_image_url(images): images = [[images]] elif isinstance(images, list) and is_image_or_image_url(images[0]): - images = [images] + if isinstance(text, str) or isinstance(text, list) and len(text) == 1: + # If there's a single sample, all images must belong to it + images = [images] + else: + raise ValueError("You have supplied multiple text samples, but only a flat list of images. When processing multiple samples, `images` should be a list of lists of images, one list per sample.") elif isinstance(images, list) and isinstance(images[0], list) and is_image_or_image_url(images[0][0]): pass else: From 031fdd5e103d18979a8becc72904a9c55c61c190 Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 5 Dec 2024 18:32:42 +0000 Subject: [PATCH 3/7] make fixup --- src/transformers/models/pixtral/processing_pixtral.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index 53a31abf1add..67f18eeb5cf1 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -210,7 +210,9 @@ def __call__( # If there's a single sample, all images must belong to it images = [images] else: - raise ValueError("You have supplied multiple text samples, but only a flat list of images. When processing multiple samples, `images` should be a list of lists of images, one list per sample.") + raise ValueError( + "You have supplied multiple text samples, but only a flat list of images. When processing multiple samples, `images` should be a list of lists of images, one list per sample." + ) elif isinstance(images, list) and isinstance(images[0], list) and is_image_or_image_url(images[0][0]): pass else: From 3406432db3982bb00b1f34e77f3cd20d66f9bd84 Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 5 Dec 2024 18:36:06 +0000 Subject: [PATCH 4/7] More error handling --- src/transformers/models/pixtral/processing_pixtral.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index 67f18eeb5cf1..9fb1ff485361 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -204,14 +204,20 @@ def __call__( if images is not None: if is_image_or_image_url(images): - images = [[images]] + if isinstance(text, str) or isinstance(text, list) and len(text) == 1: + # If there's a single sample, the image must belong to it + images = [[images]] + else: + raise ValueError( + "You have supplied multiple text samples, but `images` is not a nested list. When processing multiple samples, `images` should be a list of lists of images, one list per sample." + ) elif isinstance(images, list) and is_image_or_image_url(images[0]): if isinstance(text, str) or isinstance(text, list) and len(text) == 1: # If there's a single sample, all images must belong to it images = [images] else: raise ValueError( - "You have supplied multiple text samples, but only a flat list of images. When processing multiple samples, `images` should be a list of lists of images, one list per sample." + "You have supplied multiple text samples, but `images` is not a nested list. When processing multiple samples, `images` should be a list of lists of images, one list per sample." ) elif isinstance(images, list) and isinstance(images[0], list) and is_image_or_image_url(images[0][0]): pass From af9f67c9d91b2a54a249e961d95dcad811d2e1fc Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 5 Dec 2024 18:40:35 +0000 Subject: [PATCH 5/7] Correct nesting in test --- tests/models/pixtral/test_processor_pixtral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/pixtral/test_processor_pixtral.py b/tests/models/pixtral/test_processor_pixtral.py index c3496dff3cdf..8de0a5f553b3 100644 --- a/tests/models/pixtral/test_processor_pixtral.py +++ b/tests/models/pixtral/test_processor_pixtral.py @@ -253,7 +253,7 @@ def test_processor_returns_full_length_batches(self): "USER: [IMG]\nWhat's the content of the image? ASSISTANT:", ] * 5 processor.tokenizer.pad_token = "" - image_inputs = [self.image_0] * 5 + image_inputs = [[self.image_0] * 5] # Make small for checking image token expansion processor.image_processor.size = {"longest_edge": 30} From 6769700a162b4a027113c13f6ba2dd43986d70d7 Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 5 Dec 2024 18:41:21 +0000 Subject: [PATCH 6/7] Correct nesting in test --- tests/models/pixtral/test_processor_pixtral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/pixtral/test_processor_pixtral.py b/tests/models/pixtral/test_processor_pixtral.py index 8de0a5f553b3..d224c531241f 100644 --- a/tests/models/pixtral/test_processor_pixtral.py +++ b/tests/models/pixtral/test_processor_pixtral.py @@ -253,7 +253,7 @@ def test_processor_returns_full_length_batches(self): "USER: [IMG]\nWhat's the content of the image? ASSISTANT:", ] * 5 processor.tokenizer.pad_token = "" - image_inputs = [[self.image_0] * 5] + image_inputs = [[self.image_0]] * 5 # Make small for checking image token expansion processor.image_processor.size = {"longest_edge": 30} From f3ff530a45f20534fd26de5753e7c95f2c57cd3e Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 5 Dec 2024 18:45:21 +0000 Subject: [PATCH 7/7] [run-slow] pixtral